def main():
    set_global_seeds(2021)
    args = parse_args()

    train_data = pd.read_pickle(args.train_file)
    valid_data = pd.read_pickle(args.valid_file)
    test_data = pd.read_pickle(args.test_file)

    word2vec = Word2Vec.load(args.emb_file).wv
    vocab_size = word2vec.vectors.shape[0]
    args.embed_size = word2vec.vectors.shape[1]
    embeddings = np.zeros((vocab_size + 1, args.embed_size), dtype="float32")
    embeddings[:vocab_size] = word2vec.vectors
    if not args.use_pretrain:
        embeddings = None

    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    log_file = os.path.join(args.log_dir, '%d.log' % time.time())

    model = CodeModel(args, vocab_size + 1, label_size=104,
                      log_file=log_file, pretrain_emb=embeddings)
    if not args.only_test:
        model.train(train_data, valid_data, test_data)

    model_path = os.path.join(args.model_dir, 'best.model')
    model.load_model(model_path)
    test_acc = model.evaluate(test_data)
    print("Using model %s, Test Acc: %.4f" % (model_path, test_acc))
def train(args, n_actors, batch_queue, prios_queue, param_queue):
    env = wrapper.make_atari(args.env)
    env = wrapper.wrap_atari_dqn(env, args)
    utils.set_global_seeds(args.seed, use_torch=True)

    model = DuelingDQN(env, args).to(args.device)
    # model.load_state_dict(torch.load('model_30h.pth'))
    tgt_model = DuelingDQN(env, args).to(args.device)
    tgt_model.load_state_dict(model.state_dict())

    writer = SummaryWriter(comment="-{}-learner".format(args.env))
    optimizer = torch.optim.Adam(model.parameters(), args.lr)
    # optimizer = torch.optim.RMSprop(model.parameters(), args.lr, alpha=0.95, eps=1.5e-7, centered=True)

    check_connection(n_actors)
    param_queue.put(model.state_dict())
    learn_idx = 0
    ts = time.time()
    tb_dict = {k: [] for k in ['loss', 'grad_norm', 'max_q', 'mean_q', 'min_q']}

    while True:
        *batch, idxes = batch_queue.get()
        loss, prios, q_values = utils.compute_loss(model, tgt_model, batch,
                                                   args.n_steps, args.gamma)
        grad_norm = utils.update_parameters(loss, model, optimizer, args.max_norm)
        prios_queue.put((idxes, prios))
        batch, idxes, prios = None, None, None
        learn_idx += 1

        tb_dict["loss"].append(float(loss))
        tb_dict["grad_norm"].append(float(grad_norm))
        tb_dict["max_q"].append(float(torch.max(q_values)))
        tb_dict["mean_q"].append(float(torch.mean(q_values)))
        tb_dict["min_q"].append(float(torch.min(q_values)))

        if args.soft_target_update:
            tau = args.tau
            for p_tgt, p in zip(tgt_model.parameters(), model.parameters()):
                p_tgt.data *= 1 - tau
                p_tgt.data += tau * p
        elif learn_idx % args.target_update_interval == 0:
            print("Updating Target Network..")
            tgt_model.load_state_dict(model.state_dict())
        if learn_idx % args.save_interval == 0:
            print("Saving Model..")
            torch.save(model.state_dict(), "model.pth")
        if learn_idx % args.publish_param_interval == 0:
            param_queue.put(model.state_dict())
        if learn_idx % args.tb_interval == 0:
            bps = args.tb_interval / (time.time() - ts)
            print("Step: {:8} / BPS: {:.2f}".format(learn_idx, bps))
            writer.add_scalar("learner/BPS", bps, learn_idx)
            for k, v in tb_dict.items():
                writer.add_scalar(f'learner/{k}', np.mean(v), learn_idx)
                v.clear()
            ts = time.time()
def main():
    loadpath = "./data/yelp_short_s10.p"
    x = cPickle.load(open(loadpath, "rb"))
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    set_global_seeds(opt.seed)
    opt.n_words = len(ixtoword)

    sys.stdout = open(opt.log_path + '.log.txt', 'w')
    print datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
    print dict(opt)
    print('Total words: %d' % opt.n_words)

    if opt.part_data:
        # np.random.seed(123)
        train_ind = np.random.choice(
            len(train_lab),
            int(len(train_lab) * opt.train_percent / 100),
            replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    run_model(opt, train, val, test, train_lab, val_lab, test_lab,
              wordtoix, ixtoword)
async def send_batch_worker(buffer, exe, event, lock, batch_size, beta,
                            actor_num, actor_ips):
    """
    coroutine to send training batches to learner
    """
    seed = int(str(time.time())[-4:])
    utils.set_global_seeds(seed, use_torch=False)
    loop = asyncio.get_event_loop()

    ctx = Context.instance()
    socket = ctx.socket(zmq.DEALER)
    socket.connect("ipc:///tmp/5103.ipc")

    # Use distinct names for the per-actor sockets so they do not overwrite
    # the learner socket created above.
    actors_sockets = []
    for i in range(actor_num):
        actor_ctx = zmq.Context()
        actor_socket = actor_ctx.socket(zmq.DEALER)
        actor_socket.connect('tcp://{}:51004'.format(actor_ips[i]))
        actors_sockets.append(actor_socket)

    await event.wait()
    while True:
        identity, _ = await socket.recv_multipart(copy=False)
        # TODO: Is there any other great way to support lock but make sampling faster?
        async with lock:
            batch = await loop.run_in_executor(exe, sample_batch, buffer,
                                               batch_size, beta, actors_sockets)
        await socket.send_multipart([identity, batch], copy=False)
        batch = None
    return True
async def main():
    """
    main event loop
    """
    args = argparser()
    utils.set_global_seeds(args.seed, use_torch=False)

    procs = [
        Process(target=recv_batch_device),
        Process(target=recv_prios_device),
        Process(target=send_batch_device),
    ]
    for p in procs:
        p.start()

    buffer = CustomPrioritizedReplayBuffer(args.replay_buffer_size, args.alpha)
    exe = ThreadPoolExecutor()
    event = asyncio.Event()
    lock = asyncio.Lock()

    # TODO: How to decide the proper number of asyncio workers?
    workers = []
    for _ in range(args.n_recv_batch_worker):
        w = recv_batch_worker(buffer, exe, event, lock, args.threshold_size)
        workers.append(w)
    for _ in range(args.n_recv_prios_worker):
        w = recv_prios_worker(buffer, exe, event, lock)
        workers.append(w)
    for _ in range(args.n_send_batch_worker):
        w = send_batch_worker(buffer, exe, event, lock, args.batch_size, args.beta)
        workers.append(w)

    await asyncio.gather(*workers)
    return True
def main():
    args = argparser()
    args.clip_rewards = False
    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    seed = args.seed + 1122
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env)
    model.load_state_dict(torch.load('model.pth', map_location='cpu'))

    episode_reward, episode_length = 0, 0
    state = env.reset()
    while True:
        if args.render:
            env.render()
        action, _ = model.act(torch.FloatTensor(np.array(state)), 0.)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            print("Episode Length / Reward: {} / {}".format(
                episode_length, episode_reward))
            episode_reward = 0
            episode_length = 0
def main():
    args = parse_args()
    set_global_seeds(666)
    config = read_config(args.config, "TRAIN")
    config_main = read_config(args.config, "MAIN")
    pprint(config)

    factory = Factory(config['train_params'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    callbacks = create_callbacks(config['train_params']['name'], config['dumps'])
    trainer = Runner(stages=config['stages'], factory=factory,
                     callbacks=callbacks, device=device)

    aug_train = AUGMENTATIONS_TRAIN_CROP if config['train_params']['type'] == 'crop' else AUGMENTATIONS_TRAIN
    aug_test = AUGMENTATIONS_TEST_CROP if config['train_params']['type'] == 'crop' else AUGMENTATIONS_TEST

    train_dataset = SegmentationDataset(
        data_folder=config_main['path_to_data'],
        transforms=aug_train,
        phase='train',
        activation=config_main['activation'],
        fold=config['fold'],
        empty_mask_params=config['data_params']['empty_mask_increase'])
    val_dataset = SegmentationDataset(
        data_folder=config_main['path_to_data'],
        transforms=aug_test,
        phase='val',
        fold=config['fold'],
        activation=config_main['activation'])
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'],
                              shuffle=True, num_workers=16, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'],
                            shuffle=False, num_workers=16)

    os.makedirs(os.path.join(config['dumps']['path'], config['dumps']['weights'],
                             config['train_params']['name']), exist_ok=True)
    shutil.copy(args.config,
                os.path.join(config['dumps']['path'], config['dumps']['weights'],
                             config['train_params']['name'],
                             args.config.split('/')[-1]))

    trainer.fit(train_loader, val_loader)
def exploration(args, actor_id, param_queue):
    writer = SummaryWriter(comment="-{}-eval".format(args.env))
    args.clip_rewards = False
    args.episode_life = False
    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    seed = args.seed + actor_id
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env, args)
    param = param_queue.get(block=True)
    model.load_state_dict(param)
    param = None
    print("Received First Parameter!")

    episode_reward, episode_length, episode_idx = 0, 0, 0
    state = env.reset()
    tb_dict = {k: [] for k in ['episode_reward', 'episode_length']}
    while True:
        action, _ = model.act(torch.FloatTensor(np.array(state)), 0.)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done or episode_length == args.max_episode_length:
            state = env.reset()
            tb_dict["episode_reward"].append(episode_reward)
            tb_dict["episode_length"].append(episode_length)
            episode_reward = 0
            episode_length = 0
            episode_idx += 1

            param = param_queue.get()
            model.load_state_dict(param)
            print(f"{datetime.now()} Updated Parameter..")

            if (episode_idx * args.num_envs_per_worker) % args.tb_interval == 0:
                writer.add_scalar('evaluator/episode_reward_mean',
                                  np.mean(tb_dict['episode_reward']), episode_idx)
                writer.add_scalar('evaluator/episode_reward_max',
                                  np.max(tb_dict['episode_reward']), episode_idx)
                writer.add_scalar('evaluator/episode_reward_min',
                                  np.min(tb_dict['episode_reward']), episode_idx)
                writer.add_scalar('evaluator/episode_reward_std',
                                  np.std(tb_dict['episode_reward']), episode_idx)
                writer.add_scalar('evaluator/episode_length_mean',
                                  np.mean(tb_dict['episode_length']), episode_idx)
                tb_dict['episode_reward'].clear()
                tb_dict['episode_length'].clear()
def exploration(args, actor_id, n_actors, param_queue, send_queue, req_param_queue):
    writer = SummaryWriter(comment="-{}-actor{}".format(args.env, actor_id))

    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)
    seed = args.seed + actor_id
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env)
    epsilon = args.eps_base**(1 + actor_id / (n_actors - 1) * args.eps_alpha)
    storage = BatchStorage(args.n_steps, args.gamma)

    req_param_queue.put(True)
    param = param_queue.get(block=True)
    model.load_state_dict(param)
    param = None
    print("Received First Parameter!")

    episode_reward, episode_length, episode_idx, actor_idx = 0, 0, 0, 0
    state = env.reset()
    while True:
        action, q_values = model.act(torch.FloatTensor(np.array(state)), epsilon)
        next_state, reward, done, _ = env.step(action)

        com_state = zlib.compress(np.array(state).tobytes())
        storage.add(com_state, reward, action, done, q_values)

        state = next_state
        episode_reward += reward
        episode_length += 1
        actor_idx += 1

        if done or episode_length == args.max_episode_length:
            state = env.reset()
            writer.add_scalar("actor/episode_reward", episode_reward, episode_idx)
            writer.add_scalar("actor/episode_length", episode_length, episode_idx)
            episode_reward = 0
            episode_length = 0
            episode_idx += 1

        if actor_idx % args.update_interval == 0:
            try:
                req_param_queue.put(True)
                param = param_queue.get(block=True)
                model.load_state_dict(param)
                print("Updated Parameter..")
            except queue.Empty:
                pass

        if len(storage) == args.send_interval:
            batch, prios = storage.make_batch()
            send_queue.put((batch, prios))
            batch, prios = None, None
            storage.reset()
def set_random_seed(self, seed):
    if seed is None:
        return
    set_global_seeds(seed)
    if self.env is not None:
        self.env.seed(seed)
        self.env.action_space.np_random.seed(seed)
    self.action_space.seed(seed)
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)

    args = parser.parse_args()
    logger.configure(dir=logger.get_dir(),
                     format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps)
def train(args, n_actors, batch_queue, prios_queue, param_queue):
    env = RunTagEnv(width=5, height=5, number_of_subordinates=1, max_steps=1000)
    # env = wrapper.make_atari(args.env)
    # env = wrapper.wrap_atari_dqn(env, args)
    utils.set_global_seeds(args.seed, use_torch=True)

    model = DuelingDQN(env).to(args.device)
    tgt_model = DuelingDQN(env).to(args.device)
    tgt_model.load_state_dict(model.state_dict())

    writer = SummaryWriter(comment="-{}-learner".format(args.env))
    # optimizer = torch.optim.Adam(model.parameters(), args.lr)
    optimizer = torch.optim.RMSprop(model.parameters(), args.lr,
                                    alpha=0.95, eps=1.5e-7, centered=True)

    check_connection(n_actors)
    param_queue.put(model.state_dict())
    learn_idx = 0
    ts = time.time()
    while True:
        *batch, idxes = batch_queue.get()
        loss, prios = utils.compute_loss(model, tgt_model, batch,
                                         args.n_steps, args.gamma)
        grad_norm = utils.update_parameters(loss, model, optimizer, args.max_norm)
        print('Updated parameters!')
        prios_queue.put((idxes, prios))
        batch, idxes, prios = None, None, None
        learn_idx += 1

        writer.add_scalar("learner/loss", loss, learn_idx)
        writer.add_scalar("learner/grad_norm", grad_norm, learn_idx)

        if learn_idx % args.target_update_interval == 0:
            print("Updating Target Network..")
            tgt_model.load_state_dict(model.state_dict())
        if learn_idx % args.save_interval == 0:
            print("Saving Model..")
            torch.save(model.state_dict(), "model.pth")
        if learn_idx % args.publish_param_interval == 0:
            param_queue.put(model.state_dict())
        if learn_idx % args.bps_interval == 0:
            bps = args.bps_interval / (time.time() - ts)
            print("Step: {:8} / BPS: {:.2f}".format(learn_idx, bps))
            writer.add_scalar("learner/BPS", bps, learn_idx)
            ts = time.time()
def main():
    learner_ip = get_environ()
    args = argparser()
    writer = SummaryWriter(comment="-{}-eval".format(args.env))

    ctx = zmq.Context()
    param_socket = ctx.socket(zmq.SUB)
    param_socket.setsockopt(zmq.SUBSCRIBE, b'')
    param_socket.setsockopt(zmq.CONFLATE, 1)
    param_socket.connect('tcp://{}:52001'.format(learner_ip))

    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    seed = args.seed + 1122
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env)
    data = param_socket.recv(copy=False)
    param = pickle.loads(data)
    model.load_state_dict(param)
    print("Loaded first parameter from learner")

    episode_reward, episode_length, episode_idx = 0, 0, 0
    state = env.reset()
    while True:
        if args.render:
            env.render()
        action, _ = model.act(torch.FloatTensor(np.array(state)), 0.01)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            writer.add_scalar("eval/episode_reward", episode_reward, episode_idx)
            writer.add_scalar("eval/episode_length", episode_length, episode_idx)
            episode_reward = 0
            episode_length = 0
            episode_idx += 1

            if episode_idx % args.eval_update_interval == 0:
                data = param_socket.recv(copy=False)
                param = pickle.loads(data)
                model.load_state_dict(param)
def make_ple_envs(env_id, num_env, seed, start_index=0, *args, **kwargs):
    """
    Create a monitored SubprocVecEnv for PLE.
    """
    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank, *args, **kwargs)  # TODO should be after the monitor command!
            env = Monitor(env, None, **kwargs)
            # env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), **kwargs)
            return env
        return _thunk
    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)], env_id)
def train(self):
    utils.set_global_seeds(self.seed, use_torch=True)

    learn_idx = 0
    while True:
        beta = self.beta_by_frame(learn_idx)
        states, actions, rewards, next_states, dones, weights, idxes = self.buffer.sample(
            self.batch_size, beta)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        weights = torch.FloatTensor(weights).to(self.device)
        batch = (states, actions, rewards, next_states, dones, weights)

        loss, prios = utils.compute_loss(self.model, self.tgt_model, batch,
                                         self.n_step, self.gamma)
        self.scheduler.step()
        grad_norm = utils.update_parameters(loss, self.model, self.optimizer,
                                            self.max_norm)

        self.buffer.update_priorities(idxes, prios)
        batch, idxes, prios = None, None, None
        learn_idx += 1

        self.writer.add_scalar("learner/loss", loss, learn_idx)
        self.writer.add_scalar("learner/grad_norm", grad_norm, learn_idx)

        if learn_idx % self.target_update_interval == 0:
            print("Updating Target Network..")
            self.tgt_model.load_state_dict(self.model.state_dict())
        if learn_idx % self.save_interval == 0:
            print("Saving Model..")
            torch.save(self.model.state_dict(), "model{}.pth".format(learn_idx))
        if learn_idx % self.publish_param_interval == 0:
            self.batch_recorder.set_worker_weights(copy.deepcopy(self.model))
        if learn_idx >= self.max_step:
            torch.save(self.model.state_dict(), "model{}.pth".format(learn_idx))
            self.batch_recorder.cleanup()
            break
async def send_batch_worker(buffer, exe, event, lock, batch_size, beta):
    """
    coroutine to send training batches to learner
    """
    seed = int(str(time.time())[-4:])
    utils.set_global_seeds(seed, use_torch=False)
    loop = asyncio.get_event_loop()

    ctx = Context.instance()
    socket = ctx.socket(zmq.DEALER)
    socket.connect("ipc:///tmp/5103.ipc")

    await event.wait()
    while True:
        identity, _ = await socket.recv_multipart(copy=False)
        # TODO: Is there any other great way to support lock but make sampling faster?
        async with lock:
            batch = await loop.run_in_executor(exe, sample_batch, buffer,
                                               batch_size, beta)
        print('Replay: Sending batch...')
        await socket.send_multipart([identity, batch], copy=False)
        batch = None
    return True
def main():
    args = get_args()
    utils.set_global_seeds(args.seed)

    env = make_atari_env(args.env, args.seed)
    benchmark_env = make_atari_env(args.env, args.seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4)

    n_timesteps = 10000000
    learning_starts = 50000

    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0), (learning_starts, 1.0), (learning_starts + 1e6, 0.1)],
        outside_value=0.1,
    )

    replay_memory = NStepReplayMemory(
        size=1000000,
        history_len=args.history_len,
        discount=0.99,
        nsteps=args.nsteps,
    )

    q_func = AtariRecurrentConvNet() if args.recurrent else AtariConvNet()

    dqn.learn(
        env,
        benchmark_env,
        q_func,
        replay_memory,
        optimizer=optimizer,
        exploration=exploration_schedule,
        max_timesteps=n_timesteps,
        batch_size=32,
        learning_starts=learning_starts,
        learning_freq=4,
        target_update_freq=10000,
        grad_clip=40.,
        log_every_n_steps=50000,
    )
    env.close()
def main():
    loadpath = "./data/yelp_short_s10.p"
    x = cPickle.load(open(loadpath, "rb"))
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    set_global_seeds(opt.seed)
    opt.n_words = len(ixtoword)

    sys.stdout = open(opt.log_path + '.log.txt', 'w')
    print datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
    print dict(opt)
    print('Total words: %d' % opt.n_words)

    run_model(opt, train, val, test, test_lab, wordtoix, ixtoword)
def exploration_eval(args, actor_id, param_queue):
    writer = SummaryWriter(comment="-{}-eval".format(args.env))
    args.clip_rewards = False
    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    seed = args.seed + actor_id
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env)
    param = param_queue.get(block=True)
    model.load_state_dict(param)
    param = None
    print("Received First Parameter!")

    episode_reward, episode_length, episode_idx = 0, 0, 0
    state = env.reset()
    while True:
        action, _ = model.act(torch.FloatTensor(np.array(state)), 0.)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done or episode_length == args.max_episode_length:
            state = env.reset()
            writer.add_scalar("evaluator/episode_reward", episode_reward, episode_idx)
            writer.add_scalar("evaluator/episode_length", episode_length, episode_idx)
            episode_reward = 0
            episode_length = 0
            episode_idx += 1

            param = param_queue.get()
            model.load_state_dict(param)
            print("Updated Parameter..")
def main():
    seed = 0
    utils.set_global_seeds(seed)

    name = 'CartPole-v0'
    env = make_continuouscontrol_env(name, seed)
    benchmark_env = make_continuouscontrol_env(name, seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

    n_timesteps = 500000
    learning_starts = 50000

    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0), (learning_starts, 1.0), (learning_starts + 3e5, 0.1)],
        outside_value=0.1,
    )

    replay_memory = NStepReplayMemory(
        size=500000,
        history_len=1,
        discount=0.99,
        nsteps=1,
    )

    dqn.learn(
        env,
        benchmark_env,
        CartPoleNet(),
        replay_memory,
        optimizer=optimizer,
        exploration=exploration_schedule,
        max_timesteps=n_timesteps,
        batch_size=32,
        learning_starts=learning_starts,
        learning_freq=4,
        target_update_freq=10000,
        log_every_n_steps=10000,
    )
    env.close()
def start_experiment(**args):
    # create environment
    # coinrun environment is already vectorized
    env, test_env = make_env_all_params(args=args)

    # set random seeds for reproducibility
    utils.set_global_seeds(seed=args['seed'])

    # create tf.session
    tf_sess = utils.setup_tensorflow_session()

    if args['server_type'] == 'local':
        logger_context = logger.scoped_configure(dir=args['log_dir'],
                                                 format_strs=['stdout', 'csv'])
    else:
        logger_context = logger.scoped_configure(dir=args['log_dir'],
                                                 format_strs=['csv'])

    with logger_context, tf_sess:
        print("logging directory: {}".format(args['log_dir']))

        # create trainer
        trainer = Trainer(env=env, test_env=test_env, args=args)

        if args['evaluation'] == 1:
            # load_path is changed to model_path
            print('run.py, def start_experiment, evaluating model: {}'.format(
                args['load_path']))
            trainer.eval()
        # this is for visualizing the loss landscape
        elif args['visualize'] == 1:
            print('running visualization...')
            trainer.visualize()
        else:
            print('run.py, def start_experiment, training begins...')
            trainer.train()
def learn(policy, env, test_env, seed, total_timesteps, log_interval, test_interval, show_interval, logdir, lr, max_grad_norm, units_per_hlayer, activ_fcn, gamma=0.99, vf_coef=0.5, ent_coef=0.01, batch_size=5, early_stop=False, keep_model=2, save_model=True, restore_model=False, save_traj=False): logger = logging.getLogger(__name__) tf.reset_default_graph() set_global_seeds(seed) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, lr=lr, max_grad_norm=max_grad_norm, activ_fcn=activ_fcn, units_per_hlayer=units_per_hlayer, log_interval=log_interval, logdir=logdir, nenvs=nenvs, batch_size=batch_size, ent_coef=ent_coef, vf_coef=vf_coef, keep_model=keep_model # total_timesteps=total_timesteps, ) sum_write = model.get_summary_writer() result_path = os.path.join(logdir, 'train_results.csv') if save_traj: rew_traj = [] rew_results_path = os.path.join( logdir, ('lr' + str(lr) + '_tracking_results.csv')) else: rew_results_path = None i_sample, i_train = 0, 0 return_threshold = -0.05 horizon = 100 avg_rm = deque(maxlen=30) runner = Runner(env, model, nsteps=batch_size, gamma=gamma, horizon=horizon, show_interval=show_interval, summary_writer=sum_write) if restore_model: for el in os.listdir(logdir): if 'final' in el and '.meta' in el: # Load pre trained model and set network parameters model.load(os.path.join(logdir, el[:-5])) # Reset global step parameter. model.sess.run(model.global_step.assign(0)) logger.info('Start Training') breaked = False nbatch = nenvs * batch_size tstart = time.time() max_returns = deque([50], maxlen=7) # returns of the 7 best training episodes for update in range(1, total_timesteps // nbatch + 1): obs, states, rewards, actions, values, reward_window, raw_rewards = runner.run( ) if rew_results_path is not None: rew_traj.append(raw_rewards) policy_loss, value_loss, policy_entropy, ap = model.train( obs, states, rewards, actions, values) if test_interval > 0 and i_train > 0 and (update % test_interval == 0): ep_return = model.test_run( test_env, n_eps=10, n_pipes=2000 ) # TODO test, whether results.csv is saved properly with open(result_path, "a") as csvfile: writer = csv.writer(csvfile) ep_return[0:0] = [i_sample, i_train] writer.writerow(ep_return) # Log the performance during training at every update step. # Save the current model if the average reward of the last # 100 time steps is above the return threshold if ('ContFlappyBird' in env.env_id): saved = False for i, rw in enumerate(reward_window): rm = sum(rw) / horizon if sum_write is not None: s_summary = tf.Summary() s_summary.value.add( tag='envs/environment%s/isample_return' % i, simple_value=rm) sum_write.add_summary(s_summary, i_sample) t_summary = tf.Summary() t_summary.value.add( tag='envs/environment%s/itrain_return' % i, simple_value=rm) sum_write.add_summary(t_summary, i_train) sum_write.flush() # logger.info(rm) if save_model and not saved and rm > return_threshold: return_threshold = rm logger.info('Save model at max rolling mean %s' % return_threshold) model.save('inter_model') saved = True avg_rm.append(rm) if early_stop: if (i_sample > 500000) and ( i_sample <= 500000 + nbatch ): # TODO how to determine early-stopping criteria non-heuristically, but automatically? - BOHB algorithm? if (sum(avg_rm) / 30) <= -0.88: print('breaked') breaked = True break i_sample += nbatch i_train += 1 if save_model: model.save('final_model') logger.info('Finished Training. 
Saving Final model.') if rew_results_path is not None: with open(rew_results_path, "a") as csvfile: writer = csv.writer(csvfile) traj = np.asanyarray(rew_traj).reshape(-1).tolist() traj[0:0] = [np.mean(traj)] # i_train, i_sample writer.writerow(traj) logger.info('*******************************************************') logger.info('Total number of interactions with the environment: %s' % i_sample) logger.info( 'Total number of finished episodes during training: sum(%s) = %s' % (runner.ep_idx, sum(runner.ep_idx))) logger.info('Total number of parameter updates during training: %s' % i_train) logger.info('*******************************************************\n') return breaked
def main(): parser = arg_parser() parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--max_episode_steps', type=int, default=4500) parser.add_argument('--num-timesteps', type=int, default=int(1e8)) parser.add_argument('--num_env', type=int, default=128) parser.add_argument('--use_news', type=int, default=0) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--lam', type=float, default=0.95) parser.add_argument('--update_ob_stats_every_step', type=int, default=0) parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0) parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1) parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.) parser.add_argument('--tag', type=str, default='') parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn']) parser.add_argument('--int_coeff', type=float, default=1.) parser.add_argument('--ext_coeff', type=float, default=0.) parser.add_argument('--beta', type=float, default=1e-3) parser.add_argument('--exploration_type', type=str, default='bottleneck') parser.add_argument('--noise_type', type=str, default='none', choices=['none', 'box']) parser.add_argument('--noise_p', type=float, default=0.1) parser.add_argument('--use_sched', type=int, default=0) parser.add_argument('--exp_name', type=str, default='none') args = parser.parse_args() if args.policy == 'rnn': args.gamma_ext = 0.999 else: args.gamma_ext = 0.99 logger_dir = './results/' + args.env.replace("NoFrameskip-v4", "") logger_dir += datetime.datetime.now().strftime("-%m-%d-%H-%M-%S") logger.configure(dir=logger_dir, format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else []) if MPI.COMM_WORLD.Get_rank() == 0: with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f: f.write(args.tag) seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank() set_global_seeds(seed) hps = dict( frame_stack=4, nminibatches=4, nepochs=4, lr=0.0001, max_grad_norm=0.0, use_news=args.use_news, gamma=args.gamma, gamma_ext=args.gamma_ext, max_episode_steps=args.max_episode_steps, lam=args.lam, update_ob_stats_every_step=args.update_ob_stats_every_step, update_ob_stats_independently_per_gpu=args. update_ob_stats_independently_per_gpu, update_ob_stats_from_random_agent=args. update_ob_stats_from_random_agent, proportion_of_exp_used_for_predictor_update=args. proportion_of_exp_used_for_predictor_update, policy=args.policy, int_coeff=args.int_coeff, ext_coeff=args.ext_coeff, exploration_type=args.exploration_type, beta=args.beta, noise_type=args.noise_type, noise_p=args.noise_p, use_sched=args.use_sched, exp_name=args.exp_name, ) tf_util.make_session(make_default=True) train(env_id=args.env, num_env=args.num_env, seed=seed, num_timesteps=args.num_timesteps, hps=hps)
def learn(policy, env, test_env, seed, total_timesteps, log_interval, test_interval, show_interval, logdir, lr, max_grad_norm, units_per_hlayer, activ_fcn, gamma=0.99, vf_coef=0.5, ent_coef=0.01, nsteps=5, lam=0.95, nminibatches=4, noptepochs=4, cliprange=0.2, early_stop=False, keep_model=2, save_model=True, restore_model=False, save_traj=False): total_timesteps = int(total_timesteps) logger = logging.getLogger(__name__) tf.reset_default_graph() set_global_seeds(seed) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # TODO number of samples per minibatch in an optimization episode make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, activ_fcn=activ_fcn, units_per_hlayer=units_per_hlayer, log_interval=log_interval, logdir=logdir, keep_model=keep_model, lr=lr, cliprange=cliprange) model = make_model() sum_write = model.get_summary_writer() result_path = os.path.join(logdir, 'train_results.csv') if save_traj: rew_traj = [] rew_results_path = os.path.join( logdir, ('lr' + str(lr) + '_tracking_results.csv')) else: rew_results_path = None i_sample, i_train = 0, 0 return_threshold = -2. horizon = 100 avg_rm = deque(maxlen=30) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, horizon=horizon, show_interval=show_interval, summary_writer=sum_write) if restore_model: for el in os.listdir(logdir): if 'final' in el and '.meta' in el: # Load pre trained model and set network parameters model.load(os.path.join(logdir, el[:-5])) # Reset global step parameter. model.sess.run(model.global_step.assign(0)) logger.info('Start Training') breaked = False nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # nbatch should be a multiple of nminibatches obs, returns, masks, actions, values, neglogpacs, states, reward_window, rewards = \ runner.run() #pylint: disable=E0632 # returns are estimates of the discounted reward if rew_results_path is not None: rew_traj.append(rewards) nbatch_train = nbatch // nminibatches # number of samples per minibatch tstart = time.time() # frac = 1.0 - (update - 1.0) / nupdates # converges to 0 # lrnow = lr(frac) # # cliprangenow = cliprange(frac) # cliprange converges to 0 # Update step mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) # for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) # mblossvals.append(model.train(lrnow, cliprangenow, *slices)) mblossvals.append(model.train(*slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches # minibatch contains batch data from several envs. envinds = np.arange(nenvs, dtype=np.int32) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = np.array( envinds[start:end] ) # TODO int() does not work here. 
ensure that indices are integers beforehand mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) if nenvs == 1: mbstates = states[:] else: if type(states) == tuple or type( states ) == tf.contrib.rnn.LSTMStateTuple: # LSTM state mbstates = [el[mbenvinds] for el in states] else: # GRU state mbstates = states[mbenvinds] # mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) mblossvals.append(model.train(*slices, mbstates)) if test_interval > 0 and i_train > 0 and (update % test_interval == 0): ep_return = model.test_run( test_env, n_eps=10, n_pipes=2000 ) # TODO test, whether results.csv is saved properly with open(result_path, "a") as csvfile: writer = csv.writer(csvfile) ep_return[0:0] = [i_sample, i_train] writer.writerow(ep_return) if ('ContFlappyBird' in env.env_id): saved = False for i, rw in enumerate(reward_window): rm = sum(rw) / horizon if sum_write is not None: s_summary = tf.Summary() s_summary.value.add( tag='envs/environment%s/isample_return' % i, simple_value=rm) sum_write.add_summary(s_summary, i_sample) t_summary = tf.Summary() t_summary.value.add( tag='envs/environment%s/itrain_return' % i, simple_value=rm) sum_write.add_summary(t_summary, i_train) sum_write.flush() if save_model and not saved and rm > return_threshold: return_threshold = rm logger.info('Save model at max rolling mean %s' % return_threshold) model.save('inter_model') saved = True avg_rm.append(rm) if early_stop: if (i_sample > 500000) and ( i_sample <= 500000 + nbatch ): # TODO how to determine early-stopping criteria non-heuristically, but automatically? - BOHB algorithm? if (sum(avg_rm) / 30) <= -0.88: print('breaked') breaked = True break i_sample += nbatch i_train += 1 if save_model: model.save('final_model') logger.info('Finished Training. Saving Final model.') if rew_results_path is not None: with open(rew_results_path, "a") as csvfile: writer = csv.writer(csvfile) traj = np.asanyarray(rew_traj).reshape(-1).tolist() traj[0:0] = [np.mean(traj)] # i_train, i_sample writer.writerow(traj) logger.info('*******************************************************') logger.info('Total number of interactions with the environment: %s' % i_sample) logger.info( 'Total number of finished episodes during training: sum(%s) = %s' % (runner.ep_idx, sum(runner.ep_idx))) logger.info('Total number of parameter updates during training: %s' % i_train) logger.info('*******************************************************\n') return breaked
def main(): parser = arg_parser() add_env_params(parser) parser.add_argument( "--num-timesteps", type=int, default=int(1e12), ) parser.add_argument( "--num_env", type=int, default=32, ) parser.add_argument( "--use_news", type=int, default=0, ) parser.add_argument( "--gamma", type=float, default=0.99, ) parser.add_argument( "--gamma_ext", type=float, default=0.999, ) parser.add_argument( "--lam", type=float, default=0.95, ) parser.add_argument( "--update_ob_stats_every_step", type=int, default=0, ) parser.add_argument( "--update_ob_stats_independently_per_gpu", type=int, default=0, ) parser.add_argument( "--update_ob_stats_from_random_agent", type=int, default=1, ) parser.add_argument( "--proportion_of_exp_used_for_predictor_update", type=float, default=1.0, ) parser.add_argument( "--tag", type=str, default="", ) parser.add_argument( "--policy", type=str, default="cnn", choices=["cnn", "rnn", "ffnn"], ) parser.add_argument( "--int_coeff", type=float, default=1.0, ) parser.add_argument( "--ext_coeff", type=float, default=2.0, ) parser.add_argument( "--dynamics_bonus", type=int, default=0, ) parser.add_argument( "--meta_rl", type=lambda x: True if x.lower() in {'true', 't'} else False, default=False, ) args = parser.parse_args() logger.configure( dir=logger.get_dir(), format_strs=["stdout", "log", "csv"] if MPI.COMM_WORLD.Get_rank() == 0 else [], ) if MPI.COMM_WORLD.Get_rank() == 0: with open(os.path.join(logger.get_dir(), "experiment_tag.txt"), "w") as f: f.write(args.tag) mpi_util.setup_mpi_gpus() seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank() set_global_seeds(seed) hps = dict( frame_stack=4, nminibatches=4, nepochs=4, lr=0.0001, max_grad_norm=0.0, use_news=args.use_news, gamma=args.gamma, gamma_ext=args.gamma_ext, max_episode_steps=args.max_episode_steps, lam=args.lam, update_ob_stats_every_step=args.update_ob_stats_every_step, update_ob_stats_independently_per_gpu=args. update_ob_stats_independently_per_gpu, update_ob_stats_from_random_agent=args. update_ob_stats_from_random_agent, proportion_of_exp_used_for_predictor_update=args. proportion_of_exp_used_for_predictor_update, policy=args.policy, int_coeff=args.int_coeff, ext_coeff=args.ext_coeff, dynamics_bonus=args.dynamics_bonus, meta_rl=args.meta_rl, ) tf_util.make_session(make_default=True) train( env_id=args.env, num_env=args.num_env, seed=seed, num_timesteps=args.num_timesteps, hps=hps, )
def train(variant): set_global_seeds(variant['seed']) if variant['mode'] == 'local': import colored_traceback.always ''' Set-up folder and files ''' snapshot_dir = logger.get_snapshot_dir() working_dir = config.PROJECT_PATH param_path = os.path.join(working_dir, 'params/params.json') # copyfile(param_path, os.path.join(snapshot_dir,'params.json')) try: ''' Save parameters ''' if 'params' in variant: logger.log('Load params from variant.') params = variant['params'] else: logger.log('Load params from file.') with open(param_path, 'r') as f: params = json.load(f) # Save to snapshot dir new_param_path = os.path.join(snapshot_dir, 'params.json') with open(new_param_path, 'w') as f: json.dump(params, f, sort_keys=True, indent=4, separators=(',', ': ')) # TODO: can use variant to modify here. dynamics_opt_params = params['dynamics_opt_params'] dynamics_opt_params['stop_critereon'] = stop_critereon( threshold=dynamics_opt_params['stop_critereon']['threshold'], offset=dynamics_opt_params['stop_critereon']['offset']) dynamics_opt_params = Dynamics_opt_params(**dynamics_opt_params) policy_opt_params = params['policy_opt_params'] policy_opt_params['stop_critereon'] = stop_critereon( threshold=policy_opt_params['stop_critereon']['threshold'], offset=policy_opt_params['stop_critereon']['offset'], percent_models_threshold=policy_opt_params['stop_critereon'] ['percent_models_threshold']) policy_opt_params = Policy_opt_params(**policy_opt_params) rollout_params = params['rollout_params'] rollout_params['monitorpath'] = os.path.join(snapshot_dir, 'videos') rollout_params = Rollout_params(**rollout_params) assert params['rollout_params']['max_timestep'] == \ params['policy_opt_params']['oracle_maxtimestep'] == \ params['policy_opt_params']['T'] ''' Policy model ''' def build_policy_from_rllab(scope_name='training_policy'): ''' Return both rllab policy and policy model function. ''' sess = tf.get_default_session() ### Initialize training_policy to copy from policy from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy output_nonlinearity = eval(params['policy']['output_nonlinearity']) training_policy = GaussianMLPPolicy( name=scope_name, env_spec=env.spec, hidden_sizes=params['policy']['hidden_layers'], init_std=policy_opt_params.trpo['init_std'], output_nonlinearity=output_nonlinearity) training_policy_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope='training_policy') sess.run([tf.variables_initializer(training_policy_vars)]) ### Compute policy model function using the same weights. 
training_layers = training_policy._mean_network.layers def policy_model(x, stochastic=0.0, collect_summary=False): assert (training_layers[0].shape[1] == x.shape[1]) h = x for i, layer in enumerate(training_layers[1:]): w = layer.W b = layer.b pre_h = tf.matmul(h, w) + b h = layer.nonlinearity(pre_h, name='policy_out') if collect_summary: with tf.name_scope(scope_name + '/observation'): variable_summaries(x) with tf.name_scope(scope_name + '/layer%d' % i): with tf.name_scope('weights'): variable_summaries(w) with tf.name_scope('biases'): variable_summaries(b) with tf.name_scope('Wx_plus_b'): tf.summary.histogram('pre_activations', pre_h) tf.summary.histogram('activations', h) std = training_policy._l_std_param.param h += stochastic * tf.random_normal( shape=(tf.shape(x)[0], n_actions)) * tf.exp(std) return h return training_policy, policy_model ''' Dynamics model ''' def get_value(key, dict): return key in dict and dict[key] def prepare_input(xgu, xgu_norm, scope_name, variable_name, collect_summary, prediction_type): name_scope = '%s/%s' % (scope_name, variable_name) assert n_states > 1 and n_actions > 1 \ and xgu.shape[1] == n_states + n_actions + n_goals xu = tf.concat([xgu[:, :n_states], xgu[:, n_states + n_goals:]], axis=1) xu_norm = tf.concat( [xgu_norm[:, :n_states], xgu_norm[:, n_states + n_goals:]], axis=1) # Collect data summaries if collect_summary: with tf.name_scope(name_scope + '/inputs'): with tf.name_scope('states'): data_summaries(xgu[:, :n_states]) with tf.name_scope('goals'): data_summaries(xgu[:, n_states:n_states + n_goals]) with tf.name_scope('actions'): data_summaries(xgu[:, n_states + n_goals:]) # Ignore xy in the current state. if get_value('ignore_xy_input', params['dynamics_model']): n_inputs = n_states + n_actions - 2 nn_input = xu_norm[:, 2:] elif get_value('ignore_x_input', params['dynamics_model']): n_inputs = n_states + n_actions - 1 nn_input = xu_norm[:, 1:] else: n_inputs = n_states + n_actions nn_input = xu_norm hidden_layers = list(params['dynamics_model']['hidden_layers']) nonlinearity = [ eval(_x) for _x in params['dynamics_model']['nonlinearity'] ] assert (len(nonlinearity) == len(hidden_layers)) # Verify if the input type is valid. 
if prediction_type == 'state_change' or \ prediction_type == 'state_change_goal': n_outputs = n_states else: assert prediction_type == 'second_derivative' or \ prediction_type == 'second_derivative_goal' n_outputs = int(n_states / 2) nonlinearity.append(tf.identity) hidden_layers.append(n_outputs) return xu, nn_input, n_inputs, n_outputs, \ nonlinearity, hidden_layers def build_ff_neural_net(nn_input, n_inputs, hidden_layers, nonlinearity, scope_name, variable_name, collect_summary, logit_weights=None, initializer=layers.xavier_initializer()): assert len(hidden_layers) == len(nonlinearity) name_scope = '%s/%s' % (scope_name, variable_name) h = nn_input n_hiddens = n_inputs n_hiddens_next = hidden_layers[0] for i in range(len(hidden_layers)): w = get_scope_variable(scope_name, "%s/layer%d/weights" % (variable_name, i), shape=(n_hiddens, n_hiddens_next), initializer=initializer) b = get_scope_variable(scope_name, "%s/layer%d/biases" % (variable_name, i), shape=(n_hiddens_next), initializer=initializer) if collect_summary: with tf.name_scope(name_scope + '/layer%d' % i): with tf.name_scope('weights'): variable_summaries(w) with tf.name_scope('biases'): variable_summaries(b) with tf.name_scope('Wx_plus_b'): pre_h = tf.matmul(h, w) + b tf.summary.histogram('pre_activations', pre_h) h = nonlinearity[i](pre_h, name='activation') tf.summary.histogram('activations', h) else: pre_h = tf.matmul(h, w) + b h = nonlinearity[i](pre_h, name='activation') n_hiddens = hidden_layers[i] if i + 1 < len(hidden_layers): n_hiddens_next = hidden_layers[i + 1] if logit_weights is not None and i == len(hidden_layers) - 2: h *= logit_weights return h def build_dynamics_model(n_states, n_actions, n_goals, dt=None, input_rms=None, diff_rms=None): prediction_type = params['dynamics_model']['prediction_type'] def dynamics_model(xgu, scope_name, variable_name, collect_summary=False): ''' :param xu: contains states, goals, actions :param scope_name: :param variable_name: :param dt: :return: ''' xu, nn_input, n_inputs, n_outputs, nonlinearity, hidden_layers = \ prepare_input(xgu, (xgu - input_rms.mean)/input_rms.std, scope_name, variable_name, collect_summary, prediction_type) if "use_logit_weights" in params["dynamics_model"] and params[ "dynamics_model"]["use_logit_weights"]: logit_weights = build_ff_neural_net( nn_input, n_inputs, hidden_layers[:-1], nonlinearity[:-2] + [tf.nn.sigmoid], scope_name, variable_name + '_sig', collect_summary) else: logit_weights = None nn_output = build_ff_neural_net(nn_input, n_inputs, hidden_layers, nonlinearity, scope_name, variable_name, collect_summary, logit_weights=logit_weights) # predict the delta instead (x_next-x_current) if 'state_change' in prediction_type: next_state = tf.add( diff_rms.mean[:n_states] + diff_rms.std[:n_outputs] * nn_output, xu[:, :n_states]) else: assert 'second_derivative' in prediction_type # We train 'out' to match state_dot_dot # Currently only works for swimmer. 
qpos = xu[:, :n_outputs] + dt * xu[:, n_outputs:n_states] qvel = xu[:, n_outputs:n_states] + dt * nn_output next_state = tf.concat([qpos, qvel], axis=1) if '_goal' in prediction_type: assert n_goals > 1 g = xgu[:, n_states:n_states + n_goals] next_state = tf.concat([next_state, g], axis=1) return tf.identity(next_state, name='%s/%s/dynamics_out' % (scope_name, variable_name)) return dynamics_model def get_regularizer_loss(scope_name, variable_name): if params['dynamics_model']['regularization']['method'] in [ None, '' ]: return tf.constant(0.0, dtype=tf.float32) constant = params['dynamics_model']['regularization']['constant'] regularizer = eval( params['dynamics_model']['regularization']['method']) hidden_layers = params['dynamics_model']['hidden_layers'] reg_loss = 0.0 for i in range(len(hidden_layers) + 1): w = get_scope_variable( scope_name, "%s/layer%d/weights" % (variable_name, i)) b = get_scope_variable( scope_name, "%s/layer%d/biases" % (variable_name, i)) reg_loss += regularizer(w) + regularizer(b) return constant * reg_loss ''' Main ''' # with get_session() as sess: if variant['mode'] == 'local': sess = get_session(interactive=True, mem_frac=0.1) else: sess = get_session(interactive=True, mem_frac=1.0, use_gpu=variant['use_gpu']) # data = joblib.load(os.path.join(working_dir, params['trpo_path'])) env = get_env(variant['params']['env']) # policy = data['policy'] training_policy, policy_model = build_policy_from_rllab() if hasattr(env._wrapped_env, '_wrapped_env'): inner_env = env._wrapped_env._wrapped_env else: inner_env = env._wrapped_env.env.unwrapped n_obs = inner_env.observation_space.shape[0] n_actions = inner_env.action_space.shape[0] cost_np = inner_env.cost_np cost_tf = inner_env.cost_tf cost_np_vec = inner_env.cost_np_vec if hasattr(inner_env, 'n_goals'): n_goals = inner_env.n_goals n_states = inner_env.n_states assert n_goals + n_states == n_obs else: n_goals = 0 n_states = n_obs dt = None # Only necessary for second_derivative if hasattr(inner_env, 'model') and hasattr(inner_env, 'frame_skip'): dt = inner_env.model.opt.timestep * inner_env.frame_skip from running_mean_std import RunningMeanStd with tf.variable_scope('input_rms'): input_rms = RunningMeanStd(epsilon=0.0, shape=(n_states + n_goals + n_actions)) with tf.variable_scope('diff_rms'): diff_rms = RunningMeanStd(epsilon=0.0, shape=(n_states + n_goals)) dynamics_model = build_dynamics_model(n_states=n_states, n_actions=n_actions, n_goals=n_goals, dt=dt, input_rms=input_rms, diff_rms=diff_rms) kwargs = {} kwargs['input_rms'] = input_rms kwargs['diff_rms'] = diff_rms kwargs['mode'] = variant['mode'] if params['algo'] == 'vpg': from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from algos.vpg import VPG baseline = LinearFeatureBaseline(env_spec=env.spec) algo = VPG( env=env, policy=training_policy, baseline=baseline, batch_size=policy_opt_params.vpg['batch_size'], max_path_length=policy_opt_params.T, discount=policy_opt_params.vpg['discount'], ) kwargs['rllab_algo'] = algo if params["policy_opt_params"]["vpg"]["reset"]: kwargs['reset_opt'] = tf.assign( training_policy._l_std_param.param, np.log(params["policy_opt_params"]["vpg"]["init_std"]) * np.ones(n_actions)) elif params['algo'] == 'trpo': ### Write down baseline and algo from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from algos.trpo import TRPO baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=training_policy, baseline=baseline, batch_size=policy_opt_params.trpo['batch_size'], 
max_path_length=policy_opt_params.T, discount=policy_opt_params.trpo['discount'], step_size=policy_opt_params.trpo['step_size'], ) kwargs['rllab_algo'] = algo if params["policy_opt_params"]["trpo"]["reset"]: kwargs['reset_opt'] = tf.assign( training_policy._l_std_param.param, np.log(params["policy_opt_params"]["trpo"]["init_std"]) * np.ones(n_actions)) # if "decay_rate" in params["policy_opt_params"]["trpo"]: # kwargs['trpo_std_decay'] = tf.assign_sub(training_policy._l_std_param.param, # np.log(params["policy_opt_params"]["trpo"]["decay_rate"])*np.ones(n_actions)) kwargs['inner_env'] = inner_env kwargs['algo_name'] = params['algo'] kwargs['logstd'] = training_policy._l_std_param.param # Save initial policy joblib.dump(training_policy, os.path.join(snapshot_dir, 'params-initial.pkl')) train_models(env=env, dynamics_model=dynamics_model, dynamics_opt_params=dynamics_opt_params, get_regularizer_loss=get_regularizer_loss, policy_model=policy_model, policy_opt_params=policy_opt_params, rollout_params=rollout_params, cost_np=cost_np, cost_np_vec=cost_np_vec, cost_tf=cost_tf, snapshot_dir=snapshot_dir, working_dir=working_dir, n_models=params['n_models'], sweep_iters=params['sweep_iters'], sample_size=params['sample_size'], verbose=False, variant=variant, saved_policy=training_policy, **kwargs) # Make sure not to reinitialize TRPO policy. # Save the final policy joblib.dump(training_policy, os.path.join(snapshot_dir, 'params.pkl')) except Exception as e: rmtree(snapshot_dir) import sys, traceback # traceback.print_exception(*sys.exc_info()) from IPython.core.ultratb import ColorTB c = ColorTB() exc = sys.exc_info() print(''.join(c.structured_traceback(*exc))) print('Removed the experiment folder %s.' % snapshot_dir)
def q_learning(q_network, env, test_env, seed, total_timesteps, log_interval, test_interval, show_interval, logdir, lr, max_grad_norm, units_per_hlayer, activ_fcn, gamma=0.95, epsilon=0.4, epsilon_decay=.95, buffer_size=4000, batch_size=128, trace_length=32, tau=0.99, update_interval=30, early_stop=False, keep_model=2, save_model=True, restore_model=False, save_traj=False): # """ # Q-Learning algorithm for off-policy TD control using Function Approximation. # Finds the optimal greedy policy while following an epsilon-greedy policy. # Implements the options of online learning or using experience replay and also # target calculation by target networks, depending on the flags. You can reuse # your Q-learning implementation of the last exercise. # # Args: # env: PLE game # approx: Action-Value function estimator # num_episodes: Number of episodes to run for. # max_time_per_episode: maximum number of time steps before episode is terminated # discount_factor: gamma, discount factor of future rewards. # epsilon: Chance to sample a random action. Float betwen 0 and 1. # epsilon_decay: decay rate of epsilon parameter # use_experience_replay: Indicator if experience replay should be used. # batch_size: Number of samples per batch. # target: Slowly updated target network to calculate the targets. Ignored if None. # # Returns: # An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. # """ logger = logging.getLogger(__name__) # logger.info(datetime.time) tf.reset_default_graph() set_global_seeds(seed) # Params ob_space = env.observation_space ac_space = env.action_space nd, = ob_space.shape n_ac = ac_space.n # Create learning agent and the replay buffer agent = DQNAgent(q_network=q_network, ob_space=ob_space, ac_space=ac_space, lr=lr, max_grad_norm=max_grad_norm, units_per_hlayer=units_per_hlayer, activ_fcn=activ_fcn, log_interval=log_interval, logdir=logdir, batch_size=batch_size, trace_length=trace_length, update_interval=update_interval, tau=tau, keep_model=keep_model) summary_writer = agent.get_summary_writer() result_path = os.path.join(logdir, 'train_results.csv') if save_traj: rew_traj = [] rew_results_path = os.path.join( logdir, ('lr' + str(lr) + '_tracking_results.csv')) else: rew_results_path = None replay_buffer = ReplayBuffer(buffer_size) # Keeps track of useful statistics stats = EpisodeStats if restore_model: for el in os.listdir(logdir): if 'final' in el and '.meta' in el: # Load pre trained model and set network parameters logger.info('load %s' % os.path.join(logdir, el[:-5])) agent.load(os.path.join(logdir, el[:-5])) # Reset global step parameter. agent.sess.run(agent.global_step.assign(0)) # ------------------ TRAINING -------------------------------------------- logger.info("Start Training") early_stopped = False i_episode, i_sample, i_train = 0, 0, 0 len, rew = 0, 0 horizon = 100 reward_window = deque(maxlen=horizon) avg_rm = deque(maxlen=30) nbatch = batch_size * trace_length return_threshold = -0.05 # 40 # Reset envnn obs = env.reset() obs = normalize_obs(obs) done = False rnn_state0 = agent.step_initial_state if rnn_state0 is None: # If we use a normal feed forward architecture, we sample a batch of single samples, not a batch of sequences. 
trace_length = 1 # Set the target network to be equal to the primary network agent.update_target(agent.target_ops) while i_sample < total_timesteps: if np.random.rand(1) < epsilon: _, next_rnn_state = agent.step([obs], rnn_state0) # epsilon greedy action action = np.random.randint(0, n_ac) else: AP, next_rnn_state = agent.step( [obs], rnn_state0) # epsilon greedy action action = AP[0] next_obs, reward, done, _ = env.step(action) next_obs = normalize_obs(next_obs) i_sample += 1 # render only every i-th episode if show_interval != 0: if i_episode % show_interval == 0: env.render() len += 1 rew += reward reward_window.append(reward) # When episode is done, add episode information to tensorboard summary and stats if done: # env.game_over(): next_obs = list(np.zeros_like(next_obs, dtype=np.float32)) stats['episode_lengths'].append(len) stats['episode_rewards'].append(rew) if summary_writer is not None: summary = tf.Summary() summary.value.add( tag='envs/ep_return', simple_value=stats['episode_rewards'][i_episode]) summary.value.add( tag="envs/ep_length", simple_value=stats['episode_lengths'][i_episode]) summary_writer.add_summary(summary, i_episode) summary_writer.flush() if save_model and rew > return_threshold: return_threshold = rew logger.info('Save model at max reward %s' % return_threshold) agent.save('inter_model') i_episode += 1 len, rew = 0, 0 # Update replay buffer replay_buffer.add_transition(obs, action, next_obs, reward, done) if save_traj: rew_traj.append(reward) # Update model parameters every #update_interval steps. Use real experience and replayed experience. if replay_buffer.size() > nbatch and (i_sample % update_interval == 0): if (env.spec._env_name == 'ContFlappyBird'): rm = sum(reward_window) / horizon if summary_writer is not None: s_summary = tf.Summary() s_summary.value.add(tag='envs/isample_return', simple_value=rm) summary_writer.add_summary(s_summary, i_sample) summary_writer.flush() if save_model and rm > return_threshold: return_threshold = rm logger.info('Save model at max rolling mean %s' % return_threshold) agent.save('inter_model') avg_rm.append(rm) if early_stop: if (i_sample > 60000) and (i_sample <= (60000 + update_interval)): if (sum(avg_rm) / 30) <= -0.88: print('breaked') early_stopped = True break agent.update_target(agent.target_ops) # reset rnn state (history knowledge) before every training step rnn_state_train = agent.train_initial_state # Sample training mini-batch from replay buffer if rnn_state_train is not None: mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \ replay_buffer.recent_and_next_batch_of_seq(batch_size, trace_length) else: mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \ replay_buffer.recent_and_next_batch(batch_size) # Calculate TD target for batch. Use "old" fixed parameters if target network is available # to compute targets else use "old" parameters of value function estimate. 
            # mb_next_obs = np.reshape(mb_next_obs, (-1, nd))
            mb_next_q_values, _ = agent.target_model.predict(
                mb_next_obs, rnn_state_train)
            mb_best_next_action = np.argmax(mb_next_q_values, axis=1)
            # Do not bootstrap from the successor state of terminal transitions.
            mb_td_target = [
                mb_rewards[j] + (1. - batch_dones[j]) * gamma *
                mb_next_q_values[j][mb_best_next_action[j]]
                for j in range(nbatch)
            ]

            # Update the Q-value estimator parameters by minimizing the loss
            # between the Q-network predictions and the Q-learning targets.
            loss = agent.train(mb_obs, mb_actions, mb_td_target, rnn_state_train)
            i_train += 1

            # If test_interval > 0, the learned model is evaluated every "test_interval" gradient updates
            if test_interval > 0 and i_train > 0 and (i_train % test_interval == 0):
                ep_return = agent.test_run(test_env, n_eps=10, n_pipes=2000)
                with open(result_path, "a") as csvfile:
                    writer = csv.writer(csvfile)
                    ep_return[0:0] = [i_sample, i_train]
                    writer.writerow(ep_return)

        if done:
            # Reset the environment and decay the exploration rate.
            next_obs = env.reset()
            next_obs = normalize_obs(next_obs)
            epsilon *= epsilon_decay

        obs = next_obs
        rnn_state0 = next_rnn_state

    # Save final model when training is finished.
    if save_model:
        agent.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        logger.info('Save reward trajectory to %s' % rew_results_path)
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' % i_sample)
    logger.info('Total number of parameter updates during training: %s' % i_train)
    logger.info('*******************************************************\n')

    return early_stopped, i_sample
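# --------------------------------------------------------------------------
# Illustrative sketch (not part of the training code above): the per-sample
# TD target built in the update step can also be written in vectorised form.
# All names here (td_targets_sketch, mb_dones) are hypothetical and assume
# plain NumPy arrays of shape (nbatch,) and (nbatch, n_actions).
import numpy as np


def td_targets_sketch(mb_rewards, mb_next_q_values, mb_dones, gamma=0.95):
    """r + gamma * max_a' Q_target(s', a'), with no bootstrap at terminal states."""
    next_v = np.max(mb_next_q_values, axis=1)
    return mb_rewards + gamma * (1.0 - mb_dones) * next_v
# --------------------------------------------------------------------------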
print("---------------------------------------") print("Settings: %s" % (file_name)) print("---------------------------------------") writer = SummaryWriter(comment=file_name) if not os.path.exists("./results"): os.makedirs("./results") if args.save_models and not os.path.exists("./pytorch_models"): os.makedirs("./pytorch_models") env = gym.make(args.env_name) # Set seeds env.seed(args.seed) set_global_seeds(args.seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Initialize policy if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action) elif args.policy_name == "OurDDPG": policy = OurDDPG.DDPG(state_dim, action_dim, max_action) elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action) replay_buffer = utils.ReplayBuffer()
def eval_model(render, nepisodes, test_steps, save_traj=False, result_file='test_results.csv', **params): logger = logging.getLogger(__name__) logger.info('Evaluating learning algorithm...\n') logger.info(params["eval_model"]) logger.debug('\nMake Environment with seed %s' % params["seed"]) # TODO use different seed for every run!#, allow_early_resets=True) # TODO make non-clipped env, even if agent is trained on clipped env ple_env = make_ple_env(params["test_env"], seed=params["seed"]) tf.reset_default_graph() set_global_seeds(params["seed"]) model_idx = [] if save_traj: result_path = os.path.join(params["logdir"], result_file) else: result_path = None recurrent = (params["architecture"] == 'lstm' or params["architecture"] == 'gru') if params["eval_model"] == 'final': avg_performances = [] var_performances = [] maximal_returns = [] for f in glob.glob( os.path.join(params["logdir"], '*final_model-*.meta')): logger.info('Restore model: %s' % f) idx = f.find('final_model') f_name = f[idx:-5] model_idx.append(f_name) with tf.Session() as sess: OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model( sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent) logger.info('Run %s evaluation episodes' % nepisodes) model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render, OBS, RNN_S_IN, RNN_S_OUT, PRED_Q, result_path, params["seed"]) # Add model performance metrics avg_performances.append(np.mean(model_performance)) var_performances.append(np.var(model_performance)) maximal_returns.append(np.max(model_performance)) tf.reset_default_graph() elif params["eval_model"] == 'inter': # Use all stored maximum performance models and the final model. # print('Eval now!') avg_performances = [] var_performances = [] maximal_returns = [] for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')): logger.info('Restore model: %s' % f) idx = f.find('_model') f_name = f[idx - 5:-5] model_idx.append(f_name) with tf.Session() as sess: OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model( sess, logdir=params["logdir"], f_name=f_name, isrnn=recurrent) logger.info('Run %s evaluation episodes' % nepisodes) model_performance = run_episodes(sess, ple_env, nepisodes, test_steps, render, OBS, RNN_S_IN, RNN_S_OUT, PRED_Q, result_path, params["seed"]) # Add model performance metrics avg_performances.append(np.mean(model_performance)) var_performances.append(np.var(model_performance)) maximal_returns.append(np.max(model_performance)) tf.reset_default_graph() elif params["eval_model"] == 'analysis': # Use all stored maximum performance models and the final model. 
        avg_performances = []
        std_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            # print(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

            # Add model performance metrics
            avg_performances.append(np.mean(model_performance))
            std_performances.append(np.std(model_performance))
            maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

        # Close the evaluation environment before the early return.
        ple_env.close()
        return model_idx, avg_performances, std_performances

    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s' % maximal_returns)

    ple_env.close()

    if avg_performances:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(
            maximal_returns)
    else:
        return -3000, 3000, -3000
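# --------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not used by eval_model): the loops
# above derive the checkpoint prefix expected by the TensorFlow saver from a
# ".meta" file path by slicing off the extension, e.g.
# "<logdir>/final_model-100000.meta" -> "final_model-100000".
import glob
import os


def list_checkpoint_prefixes_sketch(logdir, pattern='*_model-*.meta'):
    return [os.path.basename(f)[:-len('.meta')]
            for f in glob.glob(os.path.join(logdir, pattern))]
# --------------------------------------------------------------------------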
def main(): parser = arg_parser() add_env_params(parser) parser.add_argument('--num_timesteps', type=float, default=100e6) parser.add_argument('--num_env', type=int, default=128) parser.add_argument('--use_news', type=int, default=0) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--gamma_ext', type=float, default=0.99) parser.add_argument('--gamma_div', type=float, default=0.999) parser.add_argument('--lam', type=float, default=0.95) parser.add_argument('--update_ob_stats_every_step', type=int, default=0) parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=1) parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1) parser.add_argument('--proportion_of_exp_used_for_predictor_updated', type=float, default=1.) parser.add_argument('--tag', type=str, default='') parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn']) parser.add_argument('--int_coeff', type=float, default=1.) parser.add_argument('--ext_coeff', type=float, default=2.) parser.add_argument('--dynamics_bonus', type=int, default=0) parser.add_argument('--save_dir', help="dir to save and log", type=str, default="save_dir") parser.add_argument('--load_path', help="dir to load model", type=str, default=None) parser.add_argument('--base_load_path', help="dir to load model", type=str, default=None) parser.add_argument('--r_path', help="dir to load r network", type=str, default=None) parser.add_argument('--play', default=False, action='store_true') parser.add_argument('--only_train_r', default=False, action='store_true') parser.add_argument('--online_train_r', default=False, action='store_true') #parser.add_argument('--ec_type', type=str, default='episodic_curiosity', choices=['episodic_curiosity', 'none','oracle']) parser.add_argument('--rnd_type', type=str, default='rnd', choices=['rnd', 'oracle']) parser.add_argument('--reset', default=False, action='store_true') parser.add_argument('--dynamics_sample', default=False, action='store_true') parser.add_argument('--num_agents', type=int, default=1) parser.add_argument('--div_type', type=str, default='oracle', choices=['oracle', 'cls', 'rnd']) parser.add_argument('--load_ram', default=False, action='store_true') parser.add_argument('--debug', default=False, action='store_true') parser.add_argument('--rnd_mask_prob', type=float, default=1.) 
parser.add_argument('--rnd_mask_type', type=str, default='indep', choices=['prog', 'indep', 'shared']) parser.add_argument('--indep_rnd', default=False, action='store_true') parser.add_argument('--indep_policy', default=True, action='store_true') parser.add_argument('--sd_type', type=str, default='oracle', choices=['oracle', 'sd']) parser.add_argument('--from_scratch', default=False, action='store_true') parser.add_argument('--kl', default=False, action='store_true') args = parser.parse_args() log_path = os.path.join(args.save_dir, 'logs') save_path = os.path.join(args.save_dir, 'models') logger.configure(dir=log_path, format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else []) if MPI.COMM_WORLD.Get_rank() == 0: with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f: f.write(args.tag) # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code')) mpi_util.setup_mpi_gpus() seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank() set_global_seeds(seed) hps = dict( frame_stack=4, nminibatches=4, nepochs=4, lr=0.0001, max_grad_norm=0.0, use_news=args.use_news, gamma=args.gamma, gamma_ext=args.gamma_ext, gamma_div=args.gamma_div, max_episode_steps=args.max_episode_steps, lam=args.lam, update_ob_stats_every_step=args.update_ob_stats_every_step, update_ob_stats_independently_per_gpu=args. update_ob_stats_independently_per_gpu, update_ob_stats_from_random_agent=args. update_ob_stats_from_random_agent, proportion_of_exp_used_for_predictor_update=args. proportion_of_exp_used_for_predictor_updated, policy=args.policy, int_coeff=args.int_coeff, ext_coeff=args.ext_coeff, dynamics_bonus=args.dynamics_bonus, log_interval=10, save_path=save_path, load_path=args.load_path, r_path=args.r_path, play=args.play, only_train_r=args.only_train_r, online_train_r=args.online_train_r, #ec_type = args.ec_type, rnd_type=args.rnd_type, reset=args.reset, dynamics_sample=args.dynamics_sample, num_agents=args.num_agents, div_type=args.div_type, load_ram=args.load_ram, debug=args.debug, rnd_mask_prob=args.rnd_mask_prob, rnd_mask_type=args.rnd_mask_type, indep_rnd=args.indep_rnd, indep_policy=args.indep_policy, sd_type=args.sd_type, from_scratch=args.from_scratch, base_load_path=args.base_load_path, use_kl=args.kl) if args.play: args.num_env = 1 tf_util.make_session(make_default=True) train(env_id=args.env, num_env=args.num_env, seed=seed, num_timesteps=args.num_timesteps, hps=hps)
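# --------------------------------------------------------------------------
# Design note (illustrative sketch, hypothetical helper): most entries of the
# hps dict above mirror an argparse flag by hand, which makes it easy for a
# flag name and a dict key to drift apart (cf. the
# "--proportion_of_exp_used_for_predictor_updated" flag feeding the
# "proportion_of_exp_used_for_predictor_update" key). One alternative is to
# start from vars(args) and only override what differs.
def hps_from_args_sketch(args, **overrides):
    hps = dict(vars(args))  # every parsed flag, keyed by its dest name
    hps.update(overrides)   # fixed settings and renamed keys
    return hps
# --------------------------------------------------------------------------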