def train(env_id, num_timesteps, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(SEED + rank)
            gym.logger.setLevel(logging.WARN)
            env = wrap_deepmind(env)
            # wrap the env one more time for getting total reward
            env = Monitor(env, rank)
            return env
        return _thunk

    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    learn(CNN, env, SEED, total_timesteps=int(num_timesteps * 1.1))
    env.close()
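# A minimal sketch of how train() above could be driven from the command line.
# The flag names and default values here are illustrative assumptions, not part
# of the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='BreakoutNoFrameskip-v4')      # assumed default
    parser.add_argument('--num-timesteps', type=int, default=int(10e6)) # assumed default
    parser.add_argument('--num-cpu', type=int, default=16)              # assumed default
    args = parser.parse_args()

    train(args.env, args.num_timesteps, args.num_cpu)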
def main(env_id, num_timesteps, seed, policy, nstack, nsteps, lrschedule, optimizer,
         num_cpu, model_file, use_static_wrapper, use_encoded_imagination,
         use_decoded_imagination):
    num_timesteps //= 4
    # The two imagination wrappers are mutually exclusive.
    assert not (use_encoded_imagination and use_decoded_imagination)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if use_static_wrapper:
                env = StaticWrapper(env)
            if policy == 'cnn' or use_encoded_imagination:
                env = RenderWrapper(env, 400, 600)
                env = DownsampleWrapper(env, 4)
            if use_encoded_imagination or use_decoded_imagination:
                env = FrameStack(env, 3)
            if use_encoded_imagination:
                env = EncodedImaginationWrapper(env, model_file, num_cpu)
            if use_decoded_imagination:
                env = DecodedImaginationWrapper(env, model_file, num_cpu)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

    if policy == 'fc':
        policy_fn = FcPolicy
    if policy == 'cnn':
        policy_fn = CnnPolicy
    learn(policy_fn, env, seed, nsteps=nsteps, nstack=nstack,
          total_timesteps=num_timesteps, lrschedule=lrschedule,
          optimizer=optimizer, max_episode_length=195)
    env.close()
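# Example invocation of main() above. Every literal value below (environment id,
# timestep budget, policy name, model path, flags) is an illustrative assumption;
# per the assert at the top of main(), at most one imagination wrapper may be set.
main(env_id='CartPole-v0',            # assumed environment
     num_timesteps=int(1e6),          # assumed budget
     seed=0,
     policy='cnn',
     nstack=4,
     nsteps=5,
     lrschedule='linear',
     optimizer='rmsprop',
     num_cpu=8,
     model_file='models/env_model.ckpt',  # assumed path
     use_static_wrapper=False,
     use_encoded_imagination=True,
     use_decoded_imagination=False)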
def main():
    cumulative_avg_rewards = []
    for seed_ in [10, 50, 100, 200, 500]:
        seed(seed_)
        set_random_seed(seed_)
        print("Seed: ", seed_)
        episode = 0

        # initialize environment
        env_id = get_args().env
        # env = make_atari(env_id)
        # env = wrap_deepmind(env, frame_stack=True, clip_rewards=False, episode_life=False)
        # env = Monitor(env)
        env = SubprocVecEnv([make_env(seed_, i) for i in range(6)])  # 24
        print("CHECK_ENV", env.reset().__array__().shape)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n
        agent = get_agent(env)
        save_path = os.path.join('models_entropy_coeff1', "Space_inv_A2C_LSTM_nstep8_MAX_rew_546")
        agent.load(save_path)
        lstm_state = np.zeros((6, 256), dtype=np.float32)  # 24

        # run for 100 episodes
        # for i in range(100):
        counter = 0
        episodic_reward_lis = []
        for i in range(wandb.config.episodes):
            # Set reward received in this episode = 0 at the start of the episode
            episodic_reward = np.zeros((6))  # 24
            episodic_reward_m = np.zeros((6))  # 24
            reset = False
            # env = gym.wrappers.Monitor(env, 'test/' + str(i), force=True)
            obs = env.reset()
            renders = []
            count = 0
            action_count = 0
            done = False
            # done1: per-step done flags; done2: envs that have finished at least once.
            done1 = np.zeros(6)  # 24
            done2 = np.zeros(6)  # 24
            while not done:
                a, v, lstm_state = agent.step(obs, S_=lstm_state, M_=done1)
                obs, reward, done1, info = env.step(a, done1, cond="eval")
                done = done2.all()
                if done:
                    episodic_reward_m1 = episodic_reward_m.max()
                    break
                if done1.any():
                    # Record each env's episode reward the first time it finishes.
                    episodic_reward_m[np.logical_and(done2 <= 0, done1)] = \
                        episodic_reward[np.logical_and(done2 <= 0, done1)]
                    for j in np.nonzero(done1)[0]:
                        episodic_reward[j] = 0
                episodic_reward += reward
                done2 = np.logical_or(done1, done2)
            if i == 0:
                reset = True
            cumulative_avg_reward = evaluate(episodic_reward_m1, reset)

        tf.reset_default_graph()
        env.close()
        # your models will be evaluated on 100-episode average reward
        # therefore, we stop logging after 100 episodes
        print("*************************************************************")
        print("CUMULATIVE_AVG_REWARD", cumulative_avg_reward)
        print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
        cumulative_avg_rewards.append(cumulative_avg_reward)
    print("Final score: ", np.mean(cumulative_avg_rewards))
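# The evaluation loop above builds its vectorized env with make_env(seed_, i),
# which is not defined in this snippet. A minimal sketch of what it might look
# like, assuming the same make_atari / wrap_deepmind / Monitor pipeline as the
# commented-out single-env setup; the exact wrapper arguments are assumptions,
# and the real helper may differ (the custom env.step(a, done1, cond="eval")
# call suggests additional project-specific wrapping).
def make_env(seed_, rank):
    def _thunk():
        env = make_atari(get_args().env)
        env.seed(seed_ + rank)
        env = wrap_deepmind(env, frame_stack=True, clip_rewards=False, episode_life=False)
        env = Monitor(env)
        return env
    return _thunk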
def runTrain(gymId='BreakoutNoFrameskip-v4', numEnvs=16, seed=0,
             filePathBrain='training/breakout-v1.pth', numSteps=5, numBatches=20000,
             outputBatchInterval=1000, joinEnvs=1, epsilon=0.00001):
    def make_env(rank):
        def _thunk():
            env = make_atari(gymId)
            env.seed(seed + rank)
            gym.logger.setLevel(logging.WARN)
            env = wrap_deepmind(env)
            # wrap the env one more time for getting total reward
            env = Monitor(env, rank)
            return env
        return _thunk

    print('training starting', numBatches, outputBatchInterval, 'epsilon', epsilon)
    env = SubprocVecEnv([make_env(i) for i in range(numEnvs)])
    numActions = env.action_space.n

    torchDevice = 'cpu'
    if torch.cuda.is_available():
        torchDevice = 'cuda'
    agent = ai_a2c.A2C(numActions, device=torchDevice)
    if filePathBrain:
        agent.load(filePath=filePathBrain)

    timingStart = date_time.now()
    batchCount = 0
    states, actions, rewards, dones, values = [], [], [], [], []
    for ii in range(numEnvs):
        states.append([])
        actions.append([])
        rewards.append([])
        dones.append([])
        values.append([])

    # Set first state.
    # Environment returns 1 frame, but we want multiple, so we stack the new
    # state on top of the past ones.
    nh, nw, nc = env.observation_space.shape
    nstack = 4
    batchStateShape = (numEnvs * numSteps, nh, nw, nc * nstack)
    emptyState = np.zeros((numEnvs, nh, nw, nc * nstack), dtype=np.uint8)
    obs = env.reset()
    # states = updateState(obs, emptyState, nc)
    lastStates = updateState(obs, emptyState, nc)
    lastDones = [False for _ in range(numEnvs)]
    totalRewards = []
    realTotalRewards = []
    # All actions are always valid.
    validActions = [0, 1, 2, 3]

    while batchCount < numBatches:
        states, actions, rewards, dones, values = [], [], [], [], []
        stepCount = 0
        while stepCount < numSteps:
            actionsStep, valuesStep = agent.selectActions(
                lastStates, validActions=validActions, randomRatio=epsilon)
            # print('actionsStep', actionsStep)
            states.append(np.copy(lastStates))
            actions.append(actionsStep)
            values.append(valuesStep)
            if stepCount > 0:
                dones.append(lastDones)

            # Input the action (run a step) for all environments.
            statesStep, rewardsStep, donesStep, infosStep = env.step(actionsStep)

            # Clear the frame stack for any envs that finished, then stack the
            # newest observations onto the running state.
            for n, done in enumerate(donesStep):
                if done:
                    lastStates[n] = lastStates[n] * 0
            lastStates = updateState(statesStep, lastStates, nc)

            # Update rewards for logging / tracking.
            for done, info in zip(donesStep, infosStep):
                if done:
                    totalRewards.append(info['reward'])
                    if info['total_reward'] != -1:
                        realTotalRewards.append(info['total_reward'])

            lastDones = donesStep
            rewards.append(rewardsStep)
            stepCount += 1

        # Dones is one off, so add the last one.
        dones.append(lastDones)

        # discount/bootstrap off value fn
        # lastValues = self.agent.value(lastStates).tolist()
        # Can skip this as it is done in the learn function with calcActualStateValues?

        # Join all (combine batches and steps).
        states = np.asarray(states, dtype='float32').swapaxes(1, 0).reshape(batchStateShape)
        actions = np.asarray(actions).swapaxes(1, 0).flatten()
        rewards = np.asarray(rewards).swapaxes(1, 0).flatten()
        dones = np.asarray(dones).swapaxes(1, 0).flatten()
        values = np.asarray(values).swapaxes(1, 0).flatten()

        agent.learn(states, actions, rewards, dones, values)

        batchCount += 1
        if batchCount % outputBatchInterval == 0:
            runTime = date_time.diff(date_time.now(), timingStart, 'minutes')
            totalSteps = batchCount * numSteps
            runTimePerStep = runTime / totalSteps
            runTimePerStepUnit = 'minutes'
            if runTimePerStep < 0.02:
                runTimePerStep *= 60
                runTimePerStepUnit = 'seconds'
            print(batchCount, numBatches, '(batch done)', number.toFixed(runTime),
                  'run time minutes,', totalSteps, 'steps,',
                  number.toFixed(runTimePerStep), runTimePerStepUnit, 'per step')
            r = totalRewards[-100:]  # get last 100
            tr = realTotalRewards[-100:]
            if len(r) == 100:
                print("avg reward (last 100):", np.mean(r))
            if len(tr) == 100:
                print("avg total reward (last 100):", np.mean(tr))
                print("max (last 100):", np.max(tr))
            # Only save periodically as well.
            if filePathBrain:
                agent.save(filePathBrain)

    env.close()
    if filePathBrain:
        agent.save(filePathBrain)

    runTime = date_time.diff(date_time.now(), timingStart, 'minutes')
    totalSteps = numBatches * numSteps
    runTimePerStep = runTime / totalSteps
    runTimePerStepUnit = 'minutes'
    if runTimePerStep < 0.02:
        runTimePerStep *= 60
        runTimePerStepUnit = 'seconds'
    print('training done:', number.toFixed(runTime), 'run time minutes,', totalSteps,
          'steps,', number.toFixed(runTimePerStep), runTimePerStepUnit, 'per step')
    return None
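# runTrain() depends on an updateState(obs, state, nc) helper that is not shown
# in this snippet. Below is a minimal sketch, assuming the usual baselines-style
# frame stacking (roll the stacked channels and append the newest frame); the
# original implementation may differ.
import numpy as np

def updateState(obs, state, nc):
    # Drop the oldest nc channels and write the newest observation into the
    # last nc channels of the (numEnvs, h, w, nc * nstack) state tensor.
    state = np.roll(state, shift=-nc, axis=3)
    state[:, :, :, -nc:] = obs
    return state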