# NOTE: project-internal imports (GymEnv, A2C, PPO, DDPG, IQL, Evaluator,
# init_out_dir, init_env, Trainer, AsyncTrainer, ...) are omitted here; their
# module paths are repo-specific.
import configparser
import logging
import signal
import threading

import tensorflow as tf  # TF1-style API (tf.Session, tf.train.Saver)


def gym_evaluate(parser, n_episode, algo):
    seed = parser.getint('TRAIN_CONFIG', 'SEED')
    env_name = parser.get('ENV_CONFIG', 'NAME')
    is_discrete = parser.getboolean('ENV_CONFIG', 'DISCRETE')
    env = GymEnv(env_name, is_discrete)
    env.seed(seed)
    n_a = env.n_a
    n_s = env.n_s
    sess = tf.Session()
    total_step = int(parser.getfloat('TRAIN_CONFIG', 'MAX_STEP'))
    if algo == 'a2c':
        model = A2C(sess, n_s, n_a, -1, model_config=parser['MODEL_CONFIG'],
                    discrete=is_discrete)
    elif algo == 'ppo':
        model = PPO(sess, n_s, n_a, -1, model_config=parser['MODEL_CONFIG'],
                    discrete=is_discrete)
    elif algo == 'ddpg':
        assert not is_discrete
        model = DDPG(sess, n_s, n_a, total_step, model_config=parser['MODEL_CONFIG'])
    else:
        # fail fast; the original fell through with model = None and crashed
        # later at model.load with an opaque AttributeError
        raise ValueError('unknown algo: %s' % algo)
    base_dir = parser.get('TRAIN_CONFIG', 'BASE_DIR')
    save_path, log_path = init_out_dir(base_dir, 'evaluate')
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    model.load(saver, save_path)
    evaluator = Evaluator(env, model, log_path, n_episode)
    evaluator.run()
    if parser.getboolean('ENV_CONFIG', 'ISDRONEENV'):
        env.get_results_df().to_csv(log_path + '/evaluate_RL_model_statistics.csv')
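# Hedged usage sketch (not part of the original source): gym_evaluate expects a
# configparser.ConfigParser exposing the TRAIN_CONFIG / ENV_CONFIG / MODEL_CONFIG
# sections read above (SEED, MAX_STEP, BASE_DIR; NAME, DISCRETE, ISDRONEENV).
# The ini path and episode count below are hypothetical placeholders.
def _example_gym_evaluate():
    parser = configparser.ConfigParser()
    parser.read('config/gym_a2c.ini')  # hypothetical config file
    gym_evaluate(parser, n_episode=50, algo='a2c')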
def evaluate_fn(agent_dir, output_dir, seeds, port, demo, policy_type):
    agent = agent_dir.split('/')[-1]
    if not check_dir(agent_dir):
        logging.error('Evaluation: %s does not exist!', agent)
        return
    # load config file for env
    config_dir = find_file(agent_dir + '/data/')
    if not config_dir:
        return
    config = configparser.ConfigParser()
    config.read(config_dir)
    # init env
    env, greedy_policy = init_env(config['ENV_CONFIG'], port=port, naive_policy=True)
    logging.info('Evaluation: s dim: %d, a dim %d, s dim ls: %r, a dim ls: %r',
                 env.n_s, env.n_a, env.n_s_ls, env.n_a_ls)
    env.init_test_seeds(seeds)
    # load model for agent
    if agent != 'greedy':
        # init centralized or multi agent
        if agent == 'a2c':
            model = A2C(env.n_s, env.n_a, 0, config['MODEL_CONFIG'])
        elif agent == 'ia2c':
            model = IA2C(env.n_s_ls, env.n_a_ls, env.n_w_ls, 0, config['MODEL_CONFIG'])
        elif agent == 'ma2c':
            model = MA2C(env.n_s_ls, env.n_a_ls, env.n_w_ls, env.n_f_ls, 0,
                         config['MODEL_CONFIG'])
        elif agent == 'iqld':
            model = IQL(env.n_s_ls, env.n_a_ls, env.n_w_ls, 0, config['MODEL_CONFIG'],
                        seed=0, model_type='dqn')
        else:
            model = IQL(env.n_s_ls, env.n_a_ls, env.n_w_ls, 0, config['MODEL_CONFIG'],
                        seed=0, model_type='lr')
        if not model.load(agent_dir + '/model/'):
            return
    else:
        model = greedy_policy
    env.agent = agent
    # collect evaluation data
    evaluator = Evaluator(env, model, output_dir, demo=demo, policy_type=policy_type)
    evaluator.run()
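# Hedged usage sketch (assumption): evaluate_fn derives the agent name from the
# last path component and expects an ini file under <agent_dir>/data/ plus a
# checkpoint under <agent_dir>/model/. The paths, seed list, port, and
# policy_type value below are hypothetical.
def _example_evaluate_fn():
    seeds = list(range(2000, 2010))  # held-out test seeds
    evaluate_fn('outputs/ma2c', 'outputs/ma2c/eva_data', seeds,
                port=1, demo=False, policy_type='default')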
def RL_offload_evaluate(parser=None, test_seeds=None, algo=None,
                        env_name='alwaysQueryEdge', query_budget_fraction_list=None,
                        model_save_path=None, log_path=None):
    if env_name == 'stochastic':
        env = StochasticInputOffloadEnv()
    elif env_name in ('AQE', 'alwaysQueryEdge'):
        # accept the default value too; previously only 'AQE' matched, so the
        # default fell through to a silent pass and left env unbound
        env = AlwaysQueryEdgeOffloadEnv()
    elif env_name == 'FourAction':
        env = FourActionOffloadEnv()
    else:
        raise ValueError('unknown env_name: %s' % env_name)
    n_a = env.n_a
    n_s = env.n_s
    sess = tf.Session()
    if algo == 'a2c':
        model = A2C(sess, n_s, n_a, -1, model_config=parser['MODEL_CONFIG'],
                    discrete=True)
    elif algo == 'ppo':
        model = PPO(sess, n_s, n_a, -1, model_config=parser['MODEL_CONFIG'],
                    discrete=True)
    else:
        raise ValueError('unknown algo: %s' % algo)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    model.load(saver, model_save_path)
    evaluator = RLOffloadEvaluator(env, model, log_path, test_seeds,
                                   query_budget_fraction_list)
    evaluator.run()
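# Hedged usage sketch (assumption): RL_offload_evaluate restores a trained
# policy and sweeps the given query-budget fractions. All literal values below
# (config path, seeds, fractions, output dirs) are hypothetical placeholders.
def _example_rl_offload_evaluate():
    parser = configparser.ConfigParser()
    parser.read('config/offload_a2c.ini')  # hypothetical; must define MODEL_CONFIG
    RL_offload_evaluate(parser=parser, test_seeds=[0, 1, 2], algo='a2c',
                        env_name='FourAction',
                        query_budget_fraction_list=[0.1, 0.5, 1.0],
                        model_save_path='output/model/', log_path='output/log/')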
def evaluate_fn(agent_dir, output_dir, seeds, port, demo):
    agent = agent_dir.split('/')[-1]
    # 'ddqn' selects the DQN model with double Q-learning enabled;
    # doubleQ=False denotes vanilla DQN (the flag assignments previously
    # contradicted this comment)
    doubleQ = False
    if agent == 'ddqn':
        doubleQ = True
        agent = 'dqn'
    if not check_dir(agent_dir):
        logging.error('Evaluation: %s does not exist!', agent)
        return
    # load config file for env
    config_dir = find_file(agent_dir + '/data/')
    if not config_dir:
        return
    config = configparser.ConfigParser()
    config.read(config_dir)
    # init env
    env, greedy_policy = init_env(config['ENV_CONFIG'], port=port, naive_policy=True)
    logging.info('Evaluation: s dim: %d, a dim %d, s dim ls: %r, a dim ls: %r',
                 env.n_s, env.n_a, env.n_s_ls, env.n_a_ls)
    env.init_test_seeds(seeds)
    # load model for agent
    if agent != 'greedy':
        # init centralized or multi agent
        if agent == 'a2c':
            model = A2C(env.n_s, env.n_a, 0, config['MODEL_CONFIG'])
        elif agent == 'ia2c':
            model = IA2C(env.n_s_ls, env.n_a_ls, env.n_w_ls, 0, config['MODEL_CONFIG'])
        elif agent == 'ma2c':
            model = MA2C(env.n_s_ls, env.n_a_ls, env.n_w_ls, env.n_f_ls, 0,
                         config['MODEL_CONFIG'])
        elif agent == 'codql':
            logging.info('Evaluation: using CoDQL')
            model = MFQ(nb_agent=len(env.n_s_ls), a_dim=env.n_a_ls[0],
                        s_dim=env.n_s_ls[0], s_dim_wave=env.n_s_ls[0] - env.n_w_ls[0],
                        s_dim_wait=env.n_w_ls[0], config=config['MODEL_CONFIG'])
        elif agent == 'dqn':
            model = DQN(nb_agent=len(env.n_s_ls), a_dim=env.n_a_ls[0],
                        s_dim=env.n_s_ls[0], s_dim_wave=env.n_s_ls[0] - env.n_w_ls[0],
                        s_dim_wait=env.n_w_ls[0], config=config['MODEL_CONFIG'],
                        doubleQ=doubleQ)
        elif agent == 'ddpg':
            model = DDPGEN(nb_agent=len(env.n_s_ls), share_params=True,
                           a_dim=env.n_a_ls[0], s_dim=env.n_s_ls[0],
                           s_dim_wave=env.n_s_ls[0] - env.n_w_ls[0],
                           s_dim_wait=env.n_w_ls[0])
        elif agent == 'iqld':
            model = IQL(env.n_s_ls, env.n_a_ls, env.n_w_ls, 0, config['MODEL_CONFIG'],
                        seed=0, model_type='dqn')
        else:
            model = IQL(env.n_s_ls, env.n_a_ls, env.n_w_ls, 0, config['MODEL_CONFIG'],
                        seed=0, model_type='lr')
        if not model.load(agent_dir + '/model/'):
            return
    else:
        model = greedy_policy
    env.agent = agent
    # collect evaluation data
    evaluator = Evaluator(env, model, output_dir, demo=demo)
    evaluator.run()
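# Hedged driver sketch (assumption): evaluating several trained agents plus the
# greedy baseline against the same test seeds. Directory names are hypothetical;
# each must follow the <base>/<agent> layout evaluate_fn expects, and the port
# is assumed to separate concurrent simulator instances.
def _example_evaluate_all():
    seeds = list(range(2000, 2010))
    for agent in ['a2c', 'ia2c', 'ma2c', 'codql', 'dqn', 'ddqn', 'greedy']:
        evaluate_fn('outputs/' + agent, 'outputs/eva_data', seeds,
                    port=2, demo=False)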
def gym_train(parser, algo):
    seed = parser.getint('TRAIN_CONFIG', 'SEED')
    num_env = parser.getint('TRAIN_CONFIG', 'NUM_ENV')
    env_name = parser.get('ENV_CONFIG', 'NAME')
    is_discrete = parser.getboolean('ENV_CONFIG', 'DISCRETE')
    print('\nSTARTING TO TRAIN:', env_name)
    print('is discrete:', is_discrete, '\n')
    env = GymEnv(env_name, is_discrete)
    env.seed(seed)
    n_a = env.n_a
    n_s = env.n_s
    total_step = int(parser.getfloat('TRAIN_CONFIG', 'MAX_STEP'))
    base_dir = parser.get('TRAIN_CONFIG', 'BASE_DIR')
    save_step = int(parser.getfloat('TRAIN_CONFIG', 'SAVE_INTERVAL'))
    log_step = int(parser.getfloat('TRAIN_CONFIG', 'LOG_INTERVAL'))
    save_path, log_path = init_out_dir(base_dir, 'train')
    tf.set_random_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.Session(config=config)
    if algo == 'a2c':
        global_model = A2C(sess, n_s, n_a, total_step,
                           model_config=parser['MODEL_CONFIG'], discrete=is_discrete)
    elif algo == 'ppo':
        global_model = PPO(sess, n_s, n_a, total_step,
                           model_config=parser['MODEL_CONFIG'], discrete=is_discrete)
    elif algo == 'ddpg':
        assert not is_discrete
        global_model = DDPG(sess, n_s, n_a, total_step,
                            model_config=parser['MODEL_CONFIG'])
    else:
        raise ValueError('unknown algo: %s' % algo)
    global_counter = GlobalCounter(total_step, save_step, log_step)
    coord = tf.train.Coordinator()
    threads = []
    trainers = []
    model_summary = init_model_summary(global_model.name)
    if num_env == 1:
        # regular training
        summary_writer = tf.summary.FileWriter(log_path, sess.graph)
        trainer = Trainer(env, global_model, save_path, summary_writer,
                          global_counter, model_summary)
        trainers.append(trainer)
    else:
        assert algo in ['a2c', 'ppo']
        # asynchronous training: workers share the global model's optimizer
        lr_scheduler = global_model.lr_scheduler
        beta_scheduler = global_model.beta_scheduler
        optimizer = global_model.optimizer
        lr = global_model.lr
        clip_scheduler = None
        if algo == 'ppo':
            clip = global_model.clip
            clip_scheduler = global_model.clip_scheduler
        wt_summary = None
        reward_summary = None
        summary_writer = tf.summary.FileWriter(log_path)
        for i in range(num_env):
            env = GymEnv(env_name, is_discrete)
            env.seed(seed + i)
            if algo == 'a2c':
                model = A2C(sess, n_s, n_a, total_step, i_thread=i,
                            optimizer=optimizer, lr=lr,
                            model_config=parser['MODEL_CONFIG'], discrete=is_discrete)
            else:
                model = PPO(sess, n_s, n_a, total_step, i_thread=i,
                            optimizer=optimizer, lr=lr, clip=clip,
                            model_config=parser['MODEL_CONFIG'], discrete=is_discrete)
            trainer = AsyncTrainer(env, model, save_path, summary_writer,
                                   global_counter, i, lr_scheduler, beta_scheduler,
                                   model_summary, wt_summary,
                                   reward_summary=reward_summary,
                                   clip_scheduler=clip_scheduler)
            if i == 0:
                reward_summary = (trainer.reward_summary, trainer.total_reward)
            trainers.append(trainer)
    sess.run(tf.global_variables_initializer())
    global_model.init_train()
    saver = tf.train.Saver(max_to_keep=20)
    global_model.load(saver, save_path)

    def train_fn(i_thread):
        trainers[i_thread].run(sess, saver, coord)

    for i in range(num_env):
        thread = threading.Thread(target=train_fn, args=(i,))
        thread.start()
        threads.append(thread)
    # block until Ctrl-C, then shut down the worker threads cleanly
    signal.signal(signal.SIGINT, signal_handler)
    signal.pause()
    coord.request_stop()
    coord.join(threads)
    save_flag = input('save final model? Y/N: ')
    if save_flag.lower().startswith('y'):
        print('saving model at step %d ...' % global_counter.cur_step)
        global_model.save(saver, save_path + 'checkpoint', global_counter.cur_step)
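# Hedged usage sketch (assumption): gym_train runs until interrupted with
# Ctrl-C; signal_handler (defined elsewhere in the repo) lets signal.pause()
# return so the coordinator can stop the workers before the final save prompt.
# The ini path is a hypothetical placeholder; with NUM_ENV > 1 it trains
# asynchronously across that many environment copies.
def _example_gym_train():
    parser = configparser.ConfigParser()
    parser.read('config/gym_ppo.ini')  # hypothetical; see keys read in gym_train
    gym_train(parser, algo='ppo')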
# (fragment: the opening of this script is missing; the keyword arguments below
# close an env-factory call in the branch paired with make_lunarlander)
                rank=0, seed=i, log_dir='mission_records/' + log_dir,
                record_fn=lambda x: x % 400 == 0)
        else:
            _thunk = make_lunarlander(task_id, rank=0, seed=i)
        thunk.append(_thunk)
    envs = SubprocVecEnv(thunk, minecraft=True)
    torch.manual_seed(0)
    torch.cuda.manual_seed(0)
    a2c = A2C(envs, model=MLP, nstep=nsteps, lr=lr,
              e_coeff=e_coeff, v_coeff=v_coeff, render=False)
    total = 0
    for e in range(50000):
        episode_obs, episode_rws, episode_values, episode_actions, \
            episode_dones, returns = a2c.run_episode(e)
        loss, policy_loss, mse, advantage, train_values, entropy = a2c.train(
            returns, episode_obs, episode_actions)
        if e % 1000 == 0:
            torch.save(a2c.model.state_dict(),
                       'checkpoints/' + log_dir + '/model.pth')
            # .data[0] is legacy (PyTorch < 0.4); use .item() on newer versions
            print(policy_loss.data[0], mse.data[0], entropy.data[0])
    envs.close()
else:
    # (fragment: continues the else branch above; the opening of this
    # env-factory call is also missing)
            rank=0, seed=i)
        thunk.append(_thunk)
    envs = SubprocVecEnv(thunk, minecraft=True)
    from agents.policies import CNNPolicy
    from agents.models import A2C
    import torch
    torch.manual_seed(0)
    torch.cuda.manual_seed(0)
    a2c = A2C(envs, model=CNNPolicy, nstep=nsteps, nstack=nstack, lr=lr,
              e_coeff=e_coeff, v_coeff=v_coeff, render=False)
    total = 0
    for e in range(50000):
        episode_obs, episode_rws, episode_values, episode_actions, \
            episode_dones, returns = a2c.run_episode(e)
        loss, policy_loss, mse, advantage, train_values, entropy = a2c.train(
            returns, episode_obs, episode_actions)
        if e % 1000 == 0:
            torch.save(a2c.model.state_dict(),
                       'checkpoints/' + log_dir + '/model.pth')
            print(policy_loss.data[0], mse.data[0], entropy.data[0])
    envs.close()
else:
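# Hedged sketch (assumption, appended after the truncated fragment above):
# reloading a checkpoint written by the training loops. The policy's constructor
# arguments are not shown in the fragments, so `policy` stands in for a model
# built with the same architecture and arguments used at training time; the
# path is hypothetical.
import torch

def load_checkpoint(policy, path='checkpoints/run0/model.pth'):
    # map_location allows CPU-only evaluation of a GPU-trained model
    state_dict = torch.load(path, map_location='cpu')
    policy.load_state_dict(state_dict)
    policy.eval()  # disable dropout/batch-norm updates for evaluation
    return policy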