def main(args):
    """Evaluate a trained REINFORCE agent on Gym's Pong-v0 by rendering episodes.

    Args:
        args: parsed CLI namespace; reads args.gpu (device flag passed to
            construct_model), args.model_path (checkpoint to restore, or None
            to initialize fresh variables), and args.ep (number of episodes).
    """

    def preprocess(obser):
        """Preprocess a 210x160x3 Atari frame into a 6400 (80x80) flat float vector."""
        obser = obser[35:195]        # crop scoreboard/border -> 160x160x3 play area
        obser = obser[::2, ::2, 0]   # downsample by 2 and keep one channel -> 80x80
        obser[obser == 144] = 0      # erase background (type 1)
        obser[obser == 109] = 0      # erase background (type 2)
        obser[obser != 0] = 1        # paddles and ball -> 1
        # np.float was removed in NumPy 1.20+; np.float64 is the equivalent dtype.
        return obser.astype(np.float64).ravel()

    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    # load agent
    agent = REINFORCE(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load saved model, or initialize new variables if no checkpoint given
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # load env
    env = gym.make("Pong-v0")

    # evaluation loop (Python 3: range/print(), not xrange/print-statement)
    for ep in range(args.ep):
        total_rewards = 0
        state = env.reset()
        while True:
            env.render()
            # preprocess raw frame into the flat vector the agent expects
            state = preprocess(state)
            # sample an action from the policy; add batch axis for the model
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break
        print('Ep%s Reward: %s ' % (ep + 1, total_rewards))
def main(args):
    """Evaluate a trained REINFORCE agent on Gym's Pong-v0 by rendering episodes.

    Args:
        args: parsed CLI namespace; reads args.gpu (device flag passed to
            construct_model), args.model_path (checkpoint to restore, or None
            to initialize fresh variables), and args.ep (number of episodes).
    """

    def preprocess(obs):
        """Preprocess a 210x160x3 Atari frame into a 6400 (80x80) flat float vector."""
        obs = obs[35:195]        # crop scoreboard/border -> 160x160x3 play area
        obs = obs[::2, ::2, 0]   # downsample by 2 and keep one channel -> 80x80
        obs[obs == 144] = 0      # erase background (type 1)
        obs[obs == 109] = 0      # erase background (type 2)
        obs[obs != 0] = 1        # paddles and ball -> 1
        # np.float was removed in NumPy 1.20+; np.float64 is the equivalent dtype.
        return obs.astype(np.float64).ravel()

    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    # load agent
    agent = REINFORCE(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load saved model, or initialize new variables if no checkpoint given
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # load env
    env = gym.make('Pong-v0')

    # evaluation
    for ep in range(args.ep):
        total_rewards = 0
        state = env.reset()
        while True:
            env.render()
            # preprocess raw frame into the flat vector the agent expects
            state = preprocess(state)
            # sample an action from the policy; add batch axis for the model
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break
        print('Ep%s Reward: %s ' % (ep+1, total_rewards))
def main(args):
    """Evaluate a trained REINFORCE agent on Gym's Pong-v0 by rendering episodes.

    Args:
        args: parsed CLI namespace; reads args.gpu (device flag passed to
            construct_model), args.model_path (checkpoint to restore, or None
            to initialize fresh variables), and args.ep (number of episodes).
    """

    def preprocess(obs):
        """Preprocess a 210x160x3 Atari frame into a 6400 (80x80) flat float vector."""
        obs = obs[35:195]        # crop scoreboard/border -> 160x160x3 play area
        obs = obs[::2, ::2, 0]   # downsample by 2 and keep one channel -> 80x80
        obs[obs == 144] = 0      # erase background (type 1)
        obs[obs == 109] = 0      # erase background (type 2)
        obs[obs != 0] = 1        # paddles and ball -> 1
        # np.float was removed in NumPy 1.20+; np.float64 is the equivalent dtype.
        return obs.astype(np.float64).ravel()

    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    # load agent
    agent = REINFORCE(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load saved model, or initialize new variables if no checkpoint given
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # load env
    env = gym.make('Pong-v0')

    # evaluation
    for ep in range(args.ep):
        total_rewards = 0
        state = env.reset()
        while True:
            env.render()
            # preprocess raw frame into the flat vector the agent expects
            state = preprocess(state)
            # sample an action from the policy; add batch axis for the model
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break
        print('Ep%s Reward: %s ' % (ep + 1, total_rewards))