# Assumed module-level imports for this method: os, gym, numpy as np,
# datetime (from datetime), the Keras callbacks module, the prepro and
# discount_rewards helpers (Karpathy-style, see the sketch further down),
# and tflog, a small helper that logs scalars to TensorBoard.
def train(self):
    self.build_model()
    self.__model.summary()
    self.__model.compile(loss='binary_crossentropy',
                         optimizer='adam',
                         metrics=['accuracy'])

    # actions recognized by the Pong environment
    UP_ACTION = 2
    DOWN_ACTION = 3

    # hyperparameters
    gamma = 0.99  # discount factor for rewards

    # initializing variables
    x_train, y_train, rewards = [], [], []
    reward_sum = 0
    episode_nb = 0

    resume = True
    running_reward = None
    epochs_before_saving = 10
    log_dir = './log' + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

    # load pre-trained weights if they exist
    # (LOG_DIR is a module-level constant pointing at a previous run's directory)
    if resume and os.path.isfile(LOG_DIR + 'my_model_weights.h5'):
        print("loading previous weights")
        self.__model.load_weights(LOG_DIR + 'my_model_weights.h5')

    # add a TensorBoard callback to visualize learning
    tbCallBack = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0,
                                       write_graph=True, write_images=True)

    # initializing the environment
    env = gym.make('Pong-v0')
    observation = env.reset()
    prev_input = None

    # main loop
    while True:
        # preprocess the observation, set input as the difference between frames
        cur_input = prepro(observation)
        x = cur_input - prev_input if prev_input is not None else np.zeros(80 * 80)
        prev_input = cur_input

        # forward the policy network and sample an action from the returned probability
        proba = self.__model.predict(np.expand_dims(x, axis=1).T)
        action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION
        y = 1 if action == UP_ACTION else 0  # 0 and 1 are our labels

        # log the input and label to train on later
        x_train.append(x)
        y_train.append(y)

        # do one step in our environment
        observation, reward, done, info = env.step(action)
        rewards.append(reward)
        reward_sum += reward

        # end of an episode
        if done:
            print('At the end of episode', episode_nb, 'the total reward was:', reward_sum)

            # increment episode number
            episode_nb += 1

            # training: the discounted rewards weight each sampled action's gradient
            self.__model.fit(x=np.vstack(x_train), y=np.vstack(y_train),
                             verbose=1, callbacks=[tbCallBack],
                             sample_weight=discount_rewards(rewards, gamma))

            # saving the weights used by our model
            if episode_nb % epochs_before_saving == 0:
                self.__model.save_weights(
                    'my_model_weights' + datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')

            # log the running reward
            running_reward = reward_sum if running_reward is None \
                else running_reward * 0.99 + reward_sum * 0.01
            tflog('running_reward', running_reward, custom_dir=log_dir)

            # reinitialization for the next episode
            x_train, y_train, rewards = [], [], []
            observation = env.reset()
            reward_sum = 0
            prev_input = None
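# build_model() itself is not shown in this excerpt. Below is a minimal sketch of the
# kind of policy network it could create, consistent with the 80*80 flattened input
# and the binary_crossentropy loss used above. The function name make_policy_network
# and the 200-unit hidden layer are illustrative assumptions (the hidden size is
# borrowed from Karpathy's setup), not the author's exact choices.

from keras.models import Sequential
from keras.layers import Dense


def make_policy_network(input_dim=80 * 80):
    """Hypothetical helper: one hidden layer, sigmoid output giving P(action == UP)."""
    model = Sequential()
    model.add(Dense(200, activation='relu', input_dim=input_dim,
                    kernel_initializer='glorot_uniform'))
    model.add(Dense(1, activation='sigmoid'))
    return model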
# Playing with a trained model. The names model, env, observation, STEPS, frames,
# UP_ACTION and DOWN_ACTION are assumed to be defined earlier: the policy network,
# the Pong-v0 environment, its initial observation, the number of frames to play,
# the list collecting frames, and the two Pong actions.
import os

import numpy as np
from karpathy import prepro, discount_rewards

# load pre-trained model weights if they exist
if os.path.isfile('my_model_weights.h5'):
    print("loading previous weights")
    model.load_weights('my_model_weights.h5')

prev_input = None

# main loop
for i in range(STEPS):
    # preprocess the observation, set input as the difference between frames
    cur_input = prepro(observation)
    x = cur_input - prev_input if prev_input is not None else np.zeros(80 * 80)
    prev_input = cur_input

    # forward the policy network and sample an action from the returned probability
    proba = model.predict(np.expand_dims(x, axis=1).T)
    action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION

    # run one step
    observation, reward, done, info = env.step(action)
    frames.append(observation)  # collect the raw frame, e.g. to render the game later

    # if the episode is over, reset to the beginning
    if done:
        observation = env.reset()
        frames.append(observation)
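# The prepro and discount_rewards helpers imported from the karpathy module are not
# shown here. Below is a minimal sketch of what they typically look like, adapted
# from Karpathy's pg-pong.py; the helpers actually shipped with this code may differ
# (for example in whether the discounted rewards are standardized).

import numpy as np


def prepro(I):
    """Preprocess a 210x160x3 uint8 Pong frame into a 6400 (80x80) float vector."""
    I = I[35:195]              # crop the playing field
    I = I[::2, ::2, 0].copy()  # downsample by 2, keep one channel (copy so the stored frame is untouched)
    I[I == 144] = 0            # erase background type 1
    I[I == 109] = 0            # erase background type 2
    I[I != 0] = 1              # paddles and ball become 1
    return I.astype(float).ravel()


def discount_rewards(r, gamma):
    """Discount rewards back through time, resetting at each game boundary (Pong-specific)."""
    r = np.array(r, dtype=float)
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(r.size)):
        if r[t] != 0:
            running_add = 0  # a non-zero reward marks the end of a single game
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    # standardizing is common before using these as per-sample weights in fit()
    discounted_r -= discounted_r.mean()
    discounted_r /= discounted_r.std()
    return discounted_r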