def test_make_output(_format):
    """
    test make output

    :param _format: (str) output format
    """
    writer = make_output_format(_format, LOG_DIR)
    writer.writekvs(KEY_VALUES)
    if _format == 'tensorboard':
        read_tb(LOG_DIR)
    elif _format == "csv":
        read_csv(LOG_DIR + 'progress.csv')
    elif _format == 'json':
        read_json(LOG_DIR + 'progress.json')
    writer.close()
def configure(dir, format_strs=None, custom_output_formats=None):
    if not dir:
        return
    assert isinstance(dir, str)
    os.makedirs(dir, exist_ok=True)

    if format_strs is None:
        strs = os.getenv('OPENAI_LOG_FORMAT')
        format_strs = strs.split(',') if strs else LOG_OUTPUT_FORMATS
    output_formats = [make_output_format(f, dir) for f in format_strs]

    if custom_output_formats is not None:
        assert isinstance(custom_output_formats, list)
        for custom_output_format in custom_output_formats:
            assert isinstance(custom_output_format, KVWriter)
        output_formats.extend(custom_output_formats)

    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
    log('Logging to %s' % dir)
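
# A minimal usage sketch for the configure() variant above, assuming it lives in the
# same baselines-style logger module that defines KVWriter, logkv, and dumpkvs.
# StdoutWriter is hypothetical and only illustrates the custom_output_formats hook.
class StdoutWriter(KVWriter):
    def writekvs(self, kvs):
        # print each key/value pair on its own line
        for key, value in sorted(kvs.items()):
            print('{}: {}'.format(key, value))

    def close(self):
        pass

configure('/tmp/logs', format_strs=['csv'], custom_output_formats=[StdoutWriter()])
logkv('loss', 0.25)
dumpkvs()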
input_height, input_width = (86, 80)
batch_size = 32
update_freq = 10000
learn_freq = 4
save_freq = 500000
action_space_size = env.action_space.n
NUM_STEPS = 4000000
replay_memory_size = 40000
replay_alpha = 0.6
replay_beta = 0.4
replay_epsilon = 1e-6
is_load_model = True
watch_flag = True
fps = 30  # frames shown per second when watch_flag == True

log_json_writer = logger.make_output_format("json", "logs")
log = logger.Logger("logs", [log_json_writer])


def preprocess_frame(frame):
    """Given a frame, scale it and convert it to grayscale."""
    im = resize(color.rgb2gray(frame)[:176, :], (input_height, input_width),
                mode='constant')
    return im


"""
model arch
----------
Conv1 (8x8x32 filter) -> ReLU ->
Conv2 (4x4x64 filter) -> ReLU ->
Conv3 (3x3x64 filter) -> ReLU ->
FC4 (512 neurons) -> ReLU ->
FC5 (9 neurons) ->
Output Q-value for each action
"""


def q_function_nn(obs, action_space_size, scope, reuse=False):
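
# The q_function_nn definition above is truncated. Below is a sketch of a body that
# follows the "model arch" description, assuming TF1-style tf.layers and the usual
# DQN strides (4, 2, 1), which the description does not specify. The name
# q_function_nn_sketch is hypothetical; it is not the original implementation.
import tensorflow as tf

def q_function_nn_sketch(obs, action_space_size, scope, reuse=False):
    with tf.variable_scope(scope, reuse=reuse):
        out = tf.layers.conv2d(obs, filters=32, kernel_size=8, strides=4, activation=tf.nn.relu)
        out = tf.layers.conv2d(out, filters=64, kernel_size=4, strides=2, activation=tf.nn.relu)
        out = tf.layers.conv2d(out, filters=64, kernel_size=3, strides=1, activation=tf.nn.relu)
        out = tf.layers.flatten(out)
        out = tf.layers.dense(out, units=512, activation=tf.nn.relu)
        # one output per action; the Q-value head is left linear here
        q_values = tf.layers.dense(out, units=action_space_size)
    return q_values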
def eval(self):
    # create base_dir to save results
    env_id = self.args['env_id'] if self.args['env_kind'] == 'mario' else self.args['eval_type']
    # base_dir = os.path.join(self.args['log_dir'], self.args['exp_name'], env_id)
    # os.makedirs(base_dir, exist_ok=True)

    # I forgot to restore the model here; do not forget again
    # load_path = self.args['load_path']

    # args['IS_HIGH_RES'] is used to signal whether to save videos
    nlevels = self.args['NUM_LEVELS']
    save_video = False

    # train progress results logger
    format_strs = ['csv']
    format_strs = filter(None, format_strs)
    dirc = os.path.join(self.args['log_dir'], 'inter')
    output_formats = [logger.make_output_format(f, dirc) for f in format_strs]
    self.result_logger = logger.Logger(dir=dirc, output_formats=output_formats)

    if self.args['env_kind'] == 'mario':
        # do NOT forget to change this
        nlevels = 20

    # curr_iter = 0
    # results_list = []
    restore_iter = [25 * i for i in range(117)] + [2929]
    for r in restore_iter:
        load_path = os.path.join(self.args['load_dir'], 'model-{}'.format(r))
        print(load_path)
        self.agent.load(load_path)

        save_video = False
        nlevels = 20 if self.args['env_kind'] == 'mario' else self.args['NUM_LEVELS']
        results, _ = self.agent.evaluate(nlevels, save_video)
        results['iter'] = r
        for (k, v) in results.items():
            self.result_logger.logkv(k, v)
        self.result_logger.dumpkvs()

    '''
    results['iter'] = curr_iter = int(l.split('/')[-1].split('-')[-1])
    print(results)
    results_list.append(results)

    csv_columns = results_list[0].keys()
    print(csv_columns)
    curr_dir = os.path.join(base_dir, str(curr_iter))
    os.makedirs(curr_dir, exist_ok=True)
    csv_save_path = os.path.join(curr_dir, 'results.csv')
    with open(csv_save_path, 'w') as file:
        writer = csv.DictWriter(file, fieldnames=csv_columns)
        writer.writeheader()
        for data in results_list:
            writer.writerow(data)
    print('results are dumped to {}'.format(csv_save_path))
    '''
def train(self):
    curr_iter = 0

    # train progress results logger
    format_strs = ['csv']
    format_strs = filter(None, format_strs)
    dirc = os.path.join(self.args['log_dir'], 'inter')
    if self.restore_iter > -1:
        dirc = os.path.join(self.args['log_dir'], 'inter-{}'.format(self.restore_iter))
    output_formats = [logger.make_output_format(f, dirc) for f in format_strs]
    self.result_logger = logger.Logger(dir=dirc, output_formats=output_formats)

    # in case we are restoring the training
    if self.restore_iter > -1:
        self.agent.load(self.load_path)
        if not self.args['transfer_load']:
            curr_iter = self.restore_iter

    print('max_iter: {}'.format(self.max_iter))

    # interim saves to compare in the future (for 128M frames)
    inter_save = []
    for i in range(3):
        divisor = (2 ** (i + 1))
        inter_save.append(
            int(self.args['num_timesteps'] // divisor) //
            (self.args['nsteps'] * self.args['NUM_ENVS'] * self.args['nframeskip']))
    print('inter_save: {}'.format(inter_save))

    total_time = 0.0
    # results_list = []
    while curr_iter < self.early_max_iter:
        frac = 1.0 - (float(curr_iter) / self.max_iter)

        # self.agent.update calls rollout
        start_time = time.time()

        # linearly annealed learning rate and clip range
        curr_lr = self.lr(frac)
        curr_cr = self.cliprange(frac)

        # Within-training evaluation was removed (flag_sum never worked properly).
        # Instead, evaluate every 100 runs on the 20 training levels
        # (mario only: evaluate first, then update) to measure zero-shot
        # generalization without extra effort.
        if curr_iter % (self.args['save_interval']) == 0:
            save_video = False
            nlevels = 20 if self.args['env_kind'] == 'mario' else self.args['NUM_LEVELS']
            results, _ = self.agent.evaluate(nlevels, save_video)
            results['iter'] = curr_iter
            for (k, v) in results.items():
                self.result_logger.logkv(k, v)
            self.result_logger.dumpkvs()

        # representation learning every 25 steps
        info = self.agent.update(lr=curr_lr, cliprange=curr_cr)
        end_time = time.time()

        # additional info
        info['frac'] = frac
        info['curr_lr'] = curr_lr
        info['curr_cr'] = curr_cr
        info['curr_iter'] = curr_iter
        # info['max_iter'] = self.max_iter
        info['elapsed_time'] = end_time - start_time
        # info['total_time'] = total_time = (total_time + info['elapsed_time']) / 3600.0
        info['expected_time'] = self.max_iter * info['elapsed_time'] / 3600.0

        # logging results using baselines' logger
        logger.logkvs(info)
        logger.dumpkvs()

        if curr_iter % self.args['save_interval'] == 0:
            self.agent.save(curr_iter, cliprange=curr_cr)
        if curr_iter in inter_save:
            self.agent.save(curr_iter, cliprange=curr_cr)

        curr_iter += 1

    self.agent.save(curr_iter, cliprange=curr_cr)

    # final evaluation for mario
    save_video = False
    nlevels = 20 if self.args['env_kind'] == 'mario' else self.args['NUM_LEVELS']
    results, _ = self.agent.evaluate(nlevels, save_video)
    results['iter'] = curr_iter
    for (k, v) in results.items():
        self.result_logger.logkv(k, v)
    self.result_logger.dumpkvs()
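
# train() above treats self.lr and self.cliprange as schedules called with
# frac = 1 - curr_iter / max_iter. A minimal sketch of such a linearly annealed
# schedule, assuming that is how they are constructed (the original code may
# build them differently):
def linear_schedule(initial_value):
    """Return f(frac) = frac * initial_value, i.e. linear decay towards zero."""
    def schedule(frac):
        return frac * initial_value
    return schedule

# e.g. self.lr = linear_schedule(3e-4); self.cliprange = linear_schedule(0.2)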
def create_json_logger(log_dir):
    return logger.Logger(
        log_dir,
        [logger.make_output_format(f, log_dir) for f in ['json']])
def create_logger(log_dir):
    return logger.Logger(
        log_dir,
        [logger.make_output_format(f, log_dir) for f in logger.LOG_OUTPUT_FORMATS])
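
# A usage sketch for the two helpers above, assuming the baselines-style Logger API
# (logkv / dumpkvs / close); the directory is only an example path.
run_logger = create_json_logger('/tmp/run0')
run_logger.logkv('episode_reward', 12.5)
run_logger.logkv('episode_length', 200)
run_logger.dumpkvs()   # writes one JSON line to /tmp/run0/progress.json
run_logger.close()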
def test_make_output_fail():
    """
    test value error on logger
    """
    with pytest.raises(ValueError):
        make_output_format('dummy_format', LOG_DIR)
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2000,
          ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, load_path=None, **network_kwargs):
    '''
    Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347).

    Parameters:
    ----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
        cnn_small, conv_only - see baselines.common/models.py for the full list) specifying a
        standard network architecture, or a function that takes a tensorflow tensor as input and
        returns a tuple (output_tensor, extra_feed), where output_tensor is the last network layer
        output, and extra_feed is None for feed-forward nets or a dictionary describing how to feed
        state into the network for recurrent nets. See common/models.py/lstm for more details on
        using recurrent nets in policies.

    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel
        environment simulation. Environments produced by gym.make can be wrapped using
        baselines.common.vec_env.DummyVecEnv.

    nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is
        nsteps * nenv, where nenv is the number of environment copies simulated in parallel)

    total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float, policy entropy coefficient in the optimization objective

    lr: float or function, learning rate; constant or a schedule function [0,1] -> R+, where 1 is
        the beginning of training and 0 is the end

    vf_coef: float, value function loss coefficient in the optimization objective

    max_grad_norm: float or None, gradient norm clipping coefficient

    gamma: float, discount factor

    lam: float, advantage estimation discount factor (lambda in the paper)

    log_interval: int, number of updates between logging events

    nminibatches: int, number of training minibatches per update. For recurrent policies, should be
        smaller than or equal to the number of environments run in parallel.

    noptepochs: int, number of training epochs per update

    cliprange: float or function, clipping range; constant or a schedule function [0,1] -> R+, where
        1 is the beginning of training and 0 is the end

    save_interval: int, number of updates between saving events

    load_path: str, path to load the model from

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and the arguments to a particular type of network.
        For instance, the 'mlp' network architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the number of envs
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                               nbatch_act=nenvs, nbatch_train=nbatch_train,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    model = make_model()
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.time()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        # Extra CSV writer for the individual loss values (note: re-creating the
        # writer on every update re-opens, and may truncate, the file)
        csv_writer = logger.make_output_format('csv', '/home/jin/project/rlnabi/PPO/csv', 1)
        ave_return = returns.sum() / len(returns)

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns
            # (ev close to 1) or worse than predicting nothing (ev =< 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            logger.logkv('average_return', ave_return)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
                lossdict = {lossname: lossval}
                csv_writer.writekvs(lossdict)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (
                MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
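
# A minimal usage sketch for learn() above, assuming the standard baselines setup
# (gym + DummyVecEnv); the environment id and hyperparameters are examples only.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
model = learn(network='mlp', env=env, total_timesteps=100000,
              nsteps=128, nminibatches=4, log_interval=1)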