def main(args):
    """Train the pizza-slicing agent with policy gradients."""
    args_dict = vars(args)
    print('args: {}'.format(args_dict))

    with tf.Graph().as_default() as g:
        # Rollout subgraph.
        with tf.name_scope('rollout'):
            observations = tf.placeholder(shape=(None, OBSERVATION_DIM),
                                          dtype=tf.float32)

            logits = build_graph(observations)

            logits_for_sampling = tf.reshape(logits, shape=(1, len(ACTIONS)))

            # Sample the action to be played during rollout.
            sample_action = tf.squeeze(
                tf.multinomial(logits=logits_for_sampling, num_samples=1))

        optimizer = tf.train.RMSPropOptimizer(
            learning_rate=args.learning_rate, decay=args.decay)

        # Dataset subgraph for experience replay.
        with tf.name_scope('dataset'):
            # The dataset reads from MEMORY.
            ds = tf.data.Dataset.from_generator(
                gen, output_types=(tf.float32, tf.int32, tf.float32))
            ds = ds.shuffle(MEMORY_CAPACITY).repeat().batch(args.batch_size)
            iterator = ds.make_one_shot_iterator()

        # Training subgraph.
        with tf.name_scope('train'):
            # The train_op includes getting a batch of data from the dataset,
            # so we do not need to use a feed_dict when running the train_op.
            next_batch = iterator.get_next()
            train_observations, labels, processed_rewards = next_batch

            # This reuses the same weights as the rollout phase.
            train_observations.set_shape((args.batch_size, OBSERVATION_DIM))
            train_logits = build_graph(train_observations)

            cross_entropies = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_logits, labels=labels)

            # Extra loss when the paddle is moved, to encourage more natural
            # moves (currently disabled).
            probs = tf.nn.softmax(logits=train_logits)
            # move_cost = args.laziness * tf.reduce_sum(probs * [0, 1.0, 1.0], axis=1)

            loss = tf.reduce_sum(processed_rewards * cross_entropies)  # + move_cost

            global_step = tf.train.get_or_create_global_step()

            train_op = optimizer.minimize(loss, global_step=global_step)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver(max_to_keep=args.max_to_keep)

        with tf.name_scope('summaries'):
            rollout_reward = tf.placeholder(shape=(), dtype=tf.float32)

            # The weights to the hidden layer can be visualized.
            hidden_weights = tf.trainable_variables()[0]
            for h in range(args.hidden_dim):
                slice_ = tf.slice(hidden_weights, [0, h], [R * C, 1])
                image = tf.reshape(slice_, [1, R, C, 1])
                tf.summary.image('hidden_{:04d}'.format(h), image)

            for var in tf.trainable_variables():
                tf.summary.histogram(var.op.name, var)
                tf.summary.scalar('{}_max'.format(var.op.name),
                                  tf.reduce_max(var))
                tf.summary.scalar('{}_min'.format(var.op.name),
                                  tf.reduce_min(var))

            tf.summary.scalar('rollout_reward', rollout_reward)
            tf.summary.scalar('loss', loss)

            merged = tf.summary.merge_all()

        print('Number of trainable variables: {}'.format(
            len(tf.trainable_variables())))

    with tf.Session(graph=g) as sess:
        if args.restore:
            restore_path = tf.train.latest_checkpoint(args.output_dir)
            print('Restoring from {}'.format(restore_path))
            saver.restore(sess, restore_path)
        else:
            sess.run(init)

        summary_path = os.path.join(args.output_dir, 'summary')
        summary_writer = tf.summary.FileWriter(summary_path, sess.graph)

        # Lowest possible score after an episode, used as the starting value
        # of the running reward.
        _rollout_reward = -1.0

        for i in range(args.n_epoch):
            print('>>>>>>> epoch {}'.format(i + 1))

            print('>>> Rollout phase')
            epoch_memory = []
            episode_memory = []

            # Initialize a game from game.py with a random pizza configuration.
            game = Game({'max_steps': 10000})
            h = random.randint(1, R * C + 1)
            l = random.randint(1, h // 2 + 1)
            pizza_lines = [
                ''.join([random.choice("MT") for _ in range(C)])
                for _ in range(R)
            ]
            pizza_config = {
                'pizza_lines': pizza_lines,
                'r': R,
                'c': C,
                'l': l,
                'h': h
            }
            # game.init() returns a tuple; only the first value (the state)
            # is used.
            _observation = preprocess(game.init(pizza_config)[0])

            # The loop for actions/steps.
            while True:
                # Sample one action with the given probability distribution.
                _label = sess.run(sample_action,
                                  feed_dict={observations: [_observation]})
                _action = ACTIONS[_label]

                _state, _reward, _done, _ = game.step(_action)

                if args.render:
                    game.render()

                # Record the experience.
                episode_memory.append((_observation, _label, _reward))

                # Preprocess the new state for the next step.
                _observation = preprocess(_state)

                if _done:
                    obs, lbl, rwd = zip(*episode_memory)

                    # Processed rewards.
                    prwd = discount_rewards(rwd, args.gamma)
                    prwd -= np.mean(prwd)
                    # Epsilon avoids division by zero when there are no rewards.
                    prwd /= (np.std(prwd) + 1e-8)

                    # Store the processed experience to memory.
                    epoch_memory.extend(zip(obs, lbl, prwd))

                    # Calculate the running rollout reward.
                    _rollout_reward = 0.9 * _rollout_reward + 0.1 * sum(rwd)

                    episode_memory = []

                    if len(epoch_memory) >= ROLLOUT_SIZE:
                        break

                    # Start a new game with a fresh random pizza configuration.
                    game = Game({'max_steps': 10000})
                    h = random.randint(1, R * C + 1)
                    l = random.randint(1, h // 2 + 1)
                    pizza_lines = [
                        ''.join([random.choice("MT") for _ in range(C)])
                        for _ in range(R)
                    ]
                    pizza_config = {
                        'pizza_lines': pizza_lines,
                        'r': R,
                        'c': C,
                        'l': l,
                        'h': h
                    }
                    _observation = preprocess(game.init(pizza_config)[0])

            # Add to the global memory.
            MEMORY.extend(epoch_memory)

            print('>>> Train phase')
            print('rollout reward: {}'.format(_rollout_reward))

            # Here we train only once per epoch.
            _, _global_step = sess.run([train_op, global_step])

            if _global_step % args.save_checkpoint_steps == 0:
                print('Writing summary')

                feed_dict = {rollout_reward: _rollout_reward}
                summary = sess.run(merged, feed_dict=feed_dict)

                summary_writer.add_summary(summary, _global_step)

                save_path = os.path.join(args.output_dir, 'model.ckpt')
                save_path = saver.save(sess, save_path,
                                       global_step=_global_step)
                print('Model checkpoint saved: {}'.format(save_path))
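
# The policy-gradient main() above relies on helpers that are not shown in
# this section: build_graph() and preprocess() (defined elsewhere in the
# project), plus gen(), the generator backing tf.data.Dataset.from_generator(),
# and discount_rewards(). The two sketches below are assumptions about what
# gen() and discount_rewards() look like, not the original implementations.


def gen():
    """Yield (observation, label, processed_reward) tuples from MEMORY."""
    for obs, lbl, prwd in MEMORY:
        yield obs, lbl, prwd


def discount_rewards(rewards, gamma):
    """Return discounted cumulative rewards, accumulated from the episode end."""
    discounted = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted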

def main(args):
    """Train the pizza-slicing agent with a Deep Q-Network."""

    def preprocess(state_dict):
        """Convert a game state dict into a stacked (R, C, stack_size) frame."""
        stacked_frames = [
            np.zeros((R, C), dtype=np.float32) for _ in range(stack_size)
        ]

        # Frame 0: the ingredients map (tomatoes vs. mushrooms). A mushrooms
        # map ((ingredients_map - 1) * -1) could be added as an extra frame,
        # with stack_size bumped to 5.
        stacked_frames[0] = np.array(state_dict['ingredients_map'])

        # Frame 1: the cursor position as a one-hot map.
        cursor_R, cursor_C = state_dict['cursor_position']
        stacked_frames[1] = np.zeros((R, C))
        stacked_frames[1][cursor_R, cursor_C] = 1

        # Frame 2: the cells belonging to the slice currently under the cursor.
        slice_map = np.array(state_dict['slices_map'])
        current_slice_id = slice_map[cursor_R, cursor_C]
        current_slice = np.where(slice_map == current_slice_id)
        stacked_frames[2] = np.zeros((R, C))
        stacked_frames[2][current_slice] = 1

        # Frame 3: the cells belonging to all other slices.
        other_slice = np.where((slice_map != current_slice_id)
                               & (slice_map != -1))
        stacked_frames[3] = np.zeros((R, C))
        stacked_frames[3][other_slice] = 1

        # An earlier variant instead flattened ingredients_map, slices_map,
        # cursor_position, l and h into a single vector.
        stacked_state = np.stack(stacked_frames, axis=2)
        return stacked_state

    class DQNetwork:

        def __init__(self, state_size, action_size, learning_rate,
                     name='DQNetwork'):
            self.state_size = state_size
            self.action_size = action_size
            self.learning_rate = learning_rate

            with tf.variable_scope(name):
                # Placeholders. *state_size unpacks the state shape, so the
                # inputs have shape [None, R, C, stack_size]; the batch
                # dimension stays None because predict_action feeds a single
                # state while training feeds a full mini-batch.
                self.inputs_ = tf.placeholder(tf.float32, [None, *state_size],
                                              name="inputs")
                self.actions_ = tf.placeholder(tf.float32,
                                               [None, self.action_size],
                                               name="actions_")

                # target_Q is R(s, a) + gamma * max_a' Q_hat(s', a').
                self.target_Q = tf.placeholder(tf.float32, [None],
                                               name="target")

                # First convnet: CNN + ELU. Input is R x C x 4.
                self.conv1 = tf.layers.conv2d(
                    inputs=self.inputs_,
                    filters=32,
                    kernel_size=[4, 4],
                    strides=[1, 1],
                    padding="VALID",
                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                    name="conv1")
                self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")

                # The second and third conv layers (64 filters, 3x3, stride 2)
                # of the original architecture are disabled; the pizza grid is
                # too small for them.

                self.flatten = tf.contrib.layers.flatten(self.conv1_out)

                # TODO: one-hot encode l and h and append them (plus the
                # cursor) after flatten; alternatively skip the CNN and use
                # self.flatten = self.inputs_ on a flattened state.

                self.fc = tf.layers.dense(
                    inputs=self.flatten,
                    units=512,
                    activation=tf.nn.elu,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="fc1")

                self.fc2 = tf.layers.dense(
                    inputs=self.fc,
                    units=512,
                    activation=tf.nn.elu,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="fc2")

                self.output = tf.layers.dense(
                    inputs=self.fc2,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    units=self.action_size,
                    activation=None)

                # Q is the predicted Q value for the chosen action.
                self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_))

                # The loss is the difference between the predicted Q values
                # and the Q targets: mean((Q_target - Q)^2).
                self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

                self.optimizer = tf.train.AdamOptimizer(
                    self.learning_rate).minimize(self.loss)

    class Memory():
        """Experience replay buffer (plays the role of MEMORY in the
        policy-gradient version)."""

        def __init__(self, max_size):
            self.buffer = deque(maxlen=max_size)

        def add(self, experience):
            self.buffer.append(experience)

        def sample(self, batch_size):
            # Similar to the generator used for the tf.data pipeline.
            buffer_size = len(self.buffer)
            index = np.random.choice(np.arange(buffer_size),
                                     size=batch_size,
                                     replace=False)
            return [self.buffer[i] for i in index]

    def predict_action(explore_start, explore_stop, decay_rate, decay_step,
                       state, actions):
        # Epsilon-greedy strategy: choose action a from state s.
        # First, draw a random number.
        exp_exp_tradeoff = np.random.rand()

        # Improved version of the epsilon-greedy strategy used in the
        # Q-learning notebook: the exploration probability decays with
        # decay_step.
        explore_probability = explore_stop + (
            explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

        if explore_probability > exp_exp_tradeoff:
            # Make a random action (exploration).
            choice = random.randint(1, len(possible_actions)) - 1
            action = possible_actions[choice]
        else:
            # Get the action from the Q-network (exploitation):
            # estimate the Q values for this state.
            Qs = sess.run(DQNetwork.output,
                          feed_dict={
                              DQNetwork.inputs_:
                                  state.reshape((1, *state.shape))
                          })

            # Take the biggest Q value (= the best action).
            choice = np.argmax(Qs)
            action = possible_actions[choice]

        return action, explore_probability

    # Init: one-hot encode the possible actions.
    possible_actions = np.array(np.identity(action_size, dtype=int).tolist())
    print("The action size is : ", action_size)
    print(possible_actions)

    # Reset the graph.
    tf.reset_default_graph()

    # Instantiate the DQNetwork (this rebinds the name to the instance).
    DQNetwork = DQNetwork(state_size, action_size, learning_rate)

    # Instantiate memory and pre-fill it with random-play experiences.
    memory = Memory(max_size=memory_size)
    for i in range(pretrain_length):
        # If it's the first step, initialize a game from game.py.
        if i == 0:
            game = Game({'max_steps': max_steps})
            h = 6  # random.randint(1, R * C + 1)
            l = 1  # random.randint(1, h // 2 + 1)
            pizza_lines = [
                ''.join([random.choice("MT") for _ in range(C)])
                for _ in range(R)
            ]
            # Fixed pizza for now instead of the random one above.
            pizza_lines = [
                "TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM",
                "TTTTTTM"
            ]
            pizza_config = {
                'pizza_lines': pizza_lines,
                'r': R,
                'c': C,
                'l': l,
                'h': h
            }
            # game.init() returns a tuple; only the first value (the state)
            # is used.
            _state = preprocess(game.init(pizza_config)[0])

        # Take a random action to get next_state, reward and done.
        choice = random.randint(1, len(possible_actions)) - 1
        action = possible_actions[choice]  # one-hot

        # Translate the one-hot action into one of the game's action labels.
        _action = ACTIONS[np.argmax(action)]
        # next_state here is what the Game agent calls _state.
        next_state, _reward, _done, _ = game.step(_action)
        _next_state = preprocess(next_state)

        # TODO: consider using args.render instead of episode_render.
        if episode_render and i % 20 == 0:
            game.render()

        # If the episode is finished (we maxed out the number of steps):
        if _done:
            # The episode ends, so there is no next state.
            _next_state = np.zeros(_state.shape)

            # Add the experience to memory (the action is stored one-hot
            # encoded instead of as its label, e.g. 'right').
            memory.add((_state, action, _reward, _next_state, _done))

            # Start a new episode.
            game = Game({'max_steps': max_steps})
            h = 6  # random.randint(1, R * C + 1)
            l = 1  # random.randint(1, h // 2 + 1)
            pizza_lines = [
                ''.join([random.choice("MT") for _ in range(C)])
                for _ in range(R)
            ]
            # Fixed pizza for now instead of the random one above.
            pizza_lines = [
                "TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM",
                "TTTTTTM"
            ]
            pizza_config = {
                'pizza_lines': pizza_lines,
                'r': R,
                'c': C,
                'l': l,
                'h': h
            }
            _state = preprocess(game.init(pizza_config)[0])
        else:
            # Add the experience to memory.
            memory.add((_state, action, _reward, _next_state, _done))

            # Our new state is now the next_state.
            _state = _next_state

    # Set up the TensorBoard writer.
    summary_path = os.path.join(args.output_dir, 'summary')
    writer = tf.summary.FileWriter(summary_path)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
        tf.summary.scalar('{}_max'.format(var.op.name), tf.reduce_max(var))
        tf.summary.scalar('{}_min'.format(var.op.name), tf.reduce_min(var))

    tf.summary.scalar('loss', DQNetwork.loss)

    _total_reward = tf.placeholder(tf.float32, (), name="tot_reward")
    tf.summary.scalar('reward', _total_reward)

    write_op = tf.summary.merge_all()

    # Saver to save and restore the model.
    saver = tf.train.Saver()

    if training == True:
        with tf.Session() as sess:
            # Initialize the variables.
            sess.run(tf.global_variables_initializer())

            # Initialize the decay step (used to reduce epsilon).
            decay_step = 0

            rewards_list = []
            average_reward = []
            average_reward_scalar = 0

            for episode in range(total_episodes):
                # Reset the step counter and the episode statistics.
                step = 0
                episode_rewards = []
                episode_actions = []

                # Make a new episode and observe the first state.
                game = Game({'max_steps': max_steps})
                h = 6  # random.randint(1, R * C + 1)
                l = 1  # random.randint(1, h // 2 + 1)
                pizza_lines = [
                    ''.join([random.choice("MT") for _ in range(C)])
                    for _ in range(R)
                ]
                # Fixed pizza for now instead of the random one above.
                pizza_lines = [
                    "TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM",
                    "TTTTTTM"
                ]
                pizza_config = {
                    'pizza_lines': pizza_lines,
                    'r': R,
                    'c': C,
                    'l': l,
                    'h': h
                }
                _state = preprocess(game.init(pizza_config)[0])

                while step < max_steps:
                    step += 1

                    # Increase decay_step.
                    decay_step += 1

                    # Predict the action to take and take it.
                    action, explore_probability = predict_action(
                        explore_start, explore_stop, decay_rate, decay_step,
                        _state, possible_actions)

                    # The action is one-hot; translate it into one of the
                    # game's action labels.
                    _action = ACTIONS[np.argmax(action)]

                    # Perform the action and get the next_state, reward and
                    # done information.
                    next_state, _reward, _done, _ = game.step(_action)
                    _next_state = preprocess(next_state)

                    # Record the reward and the action for this episode.
                    episode_rewards.append(_reward)
                    episode_actions.append(_action)

                    # If the game is finished:
                    if _done:
                        # The episode ends, so there is no next state.
                        _next_state = np.zeros(_state.shape)

                        # Set step = max_steps to end the episode.
                        step = max_steps

                        # Get the total reward of the episode.
                        total_reward = np.sum(episode_rewards)
                        average_reward.append(total_reward)

                        if (episode % 100 == 0 and episode < 500) or (
                                episode % 1000 == 0):
                            print('Episode: {}'.format(episode),
                                  'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(explore_probability),
                                  'Training Loss {:.4f}'.format(loss))
                            print(episode_actions)
                            print(episode_rewards)

                            # Average the reward accumulated since the last
                            # printing episode.
                            if episode < 500:
                                if episode == 0:
                                    average_reward_scalar = np.sum(
                                        average_reward)
                                else:
                                    average_reward_scalar = np.sum(
                                        average_reward) / 100
                            else:
                                if episode == 1000:
                                    average_reward_scalar = np.sum(
                                        average_reward) / 600
                                else:
                                    average_reward_scalar = np.sum(
                                        average_reward) / 1000

                        rewards_list.append((episode, total_reward))

                        # Store the transition <s_t, a_t, r_t+1, s_t+1> in
                        # memory D.
                        memory.add(
                            (_state, action, _reward, _next_state, _done))

                        if episode_render and (
                                (episode % 100 == 0 and episode < 500)
                                or (episode % 1000 == 0)):
                            game.render()
                    else:
                        # Add the experience to memory.
                        memory.add(
                            (_state, action, _reward, _next_state, _done))

                        # s_t+1 is now our current state.
                        _state = _next_state

                    ### LEARNING PART
                    # Obtain a random mini-batch from memory.
                    batch = memory.sample(batch_size)

                    # ndmin=3 keeps the stacked-frame dimensions of the states.
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch],
                                              ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    target_Qs_batch = []

                    # Get the Q values for next_state (note: double-check the
                    # shape fed to DQNetwork.inputs_).
                    Qs_next_state = sess.run(
                        DQNetwork.output,
                        feed_dict={DQNetwork.inputs_: next_states_mb})

                    # Set Q_target = r if the episode ends at s+1, otherwise
                    # Q_target = r + gamma * max_a' Q(s', a').
                    for i in range(0, len(batch)):
                        terminal = dones_mb[i]

                        if terminal:
                            # Terminal state: the target is just the reward.
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(
                                Qs_next_state[i])
                            target_Qs_batch.append(target)

                    targets_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run(
                        [DQNetwork.loss, DQNetwork.optimizer],
                        feed_dict={
                            DQNetwork.inputs_: states_mb,
                            DQNetwork.target_Q: targets_mb,
                            DQNetwork.actions_: actions_mb
                        })

                # Periodically save the model and write TF summaries.
                if (episode % 100 == 0 and episode < 500) or (
                        episode % 1000 == 0):
                    save_os_path = os.path.join(args.output_dir,
                                                'models/model.ckpt')
                    save_path = saver.save(sess, save_os_path)
                    print("Model Saved")

                    summary = sess.run(write_op,
                                       feed_dict={
                                           DQNetwork.inputs_: states_mb,
                                           DQNetwork.target_Q: targets_mb,
                                           DQNetwork.actions_: actions_mb,
                                           _total_reward: average_reward[0]
                                       })
                    average_reward = []
                    writer.add_summary(summary, episode)
                    writer.flush()

    # Test phase: play one episode greedily with the restored model.
    with tf.Session() as sess:
        total_test_rewards = []

        # Load the model.
        restore_os_path = os.path.join(args.restore_dir, 'models/model.ckpt')
        saver.restore(sess, restore_os_path)

        for episode in range(1):
            total_rewards = 0

            # Initialize a game from game.py.
            game = Game({'max_steps': max_steps})
            h = 6  # random.randint(1, R * C + 1)
            l = 1  # random.randint(1, h // 2 + 1)
            pizza_lines = [
                ''.join([random.choice("MT") for _ in range(C)])
                for _ in range(R)
            ]
            # Fixed pizza for now instead of the random one above.
            pizza_lines = [
                "TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM",
                "TTTTTTM"
            ]
            pizza_config = {
                'pizza_lines': pizza_lines,
                'r': R,
                'c': C,
                'l': l,
                'h': h
            }
            _state = preprocess(game.init(pizza_config)[0])

            print("****************************************************")
            print("EPISODE ", episode)

            while True:
                _state = _state.reshape((1, *state_size))

                # Get the action from the Q-network: estimate the Q values
                # for this state.
                Qs = sess.run(DQNetwork.output,
                              feed_dict={DQNetwork.inputs_: _state})

                # Take the biggest Q value (= the best action).
                choice = np.argmax(Qs)
                action = possible_actions[choice]  # one-hot

                # Translate the one-hot action into one of the game's
                # action labels.
                _action = ACTIONS[np.argmax(action)]
                print(_action)

                # Perform the action and get the next_state, reward and done
                # information.
                next_state, _reward, _done, _ = game.step(_action)
                _next_state = preprocess(next_state)
                game.render()

                total_rewards += _reward

                if _done:
                    print("Score", total_rewards)
                    total_test_rewards.append(total_rewards)
                    break

                _state = _next_state
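
# Both versions of main() rely on module-level constants that are defined
# elsewhere in the project (numpy, deque, os, random and TensorFlow are also
# assumed to be imported at the top of the module, not shown here). The values
# below are a minimal sketch with assumed placeholder values so this section is
# self-contained; the real project may use different ones.

R, C = 6, 7                       # pizza grid size (rows, columns)
ACTIONS = ['up', 'down', 'left', 'right', 'toggle']  # hypothetical action labels
OBSERVATION_DIM = R * C           # flattened observation length (assumed)
MEMORY_CAPACITY = 100000
ROLLOUT_SIZE = 10000
MEMORY = deque([], maxlen=MEMORY_CAPACITY)

# DQN hyperparameters (assumed values).
stack_size = 4                    # number of stacked frames in preprocess()
state_size = (R, C, stack_size)
action_size = len(ACTIONS)
learning_rate = 0.0002
total_episodes = 5000
max_steps = 100
batch_size = 64
explore_start = 1.0               # initial exploration probability
explore_stop = 0.01               # final exploration probability
decay_rate = 0.0001               # epsilon decay rate
gamma = 0.95                      # discount factor
pretrain_length = batch_size      # experiences pre-filled into memory
memory_size = 1000000
training = True
episode_render = False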
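
# A minimal command-line entry point, assuming the second (DQN) main() above
# is the one being run. The flag names mirror the attributes read from `args`
# (output_dir, restore_dir); the real project likely defines more flags for
# the policy-gradient version (learning_rate, batch_size, n_epoch, ...).
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--output-dir', dest='output_dir',
                        default='/tmp/pizza')
    parser.add_argument('--restore-dir', dest='restore_dir',
                        default='/tmp/pizza')
    args = parser.parse_args()

    main(args)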