Example #1
def gym_evaluate(parser, n_episode, algo):
    seed = parser.getint('TRAIN_CONFIG', 'SEED')
    env_name = parser.get('ENV_CONFIG', 'NAME')
    is_discrete = parser.getboolean('ENV_CONFIG', 'DISCRETE')
    env = GymEnv(env_name, is_discrete)

    env.seed(seed)
    n_a = env.n_a
    n_s = env.n_s
    sess = tf.Session()
    total_step = int(parser.getfloat('TRAIN_CONFIG', 'MAX_STEP'))
    if algo == 'a2c':
        model = A2C(sess, n_s, n_a, -1, model_config=parser['MODEL_CONFIG'],
                    discrete=is_discrete)
    elif algo == 'ppo':
        model = PPO(sess, n_s, n_a, -1, model_config=parser['MODEL_CONFIG'],
                    discrete=is_discrete)
    elif algo == 'ddpg':
        assert(not is_discrete)
        model = DDPG(sess, n_s, n_a, total_step, model_config=parser['MODEL_CONFIG'])
    else:
        model = None
    base_dir = parser.get('TRAIN_CONFIG', 'BASE_DIR')
    save_path, log_path = init_out_dir(base_dir, 'evaluate')
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    model.load(saver, save_path)
    evaluator = Evaluator(env, model, log_path, n_episode)
    evaluator.run()

    if parser.getboolean('ENV_CONFIG', 'ISDRONEENV'):
        env.get_results_df().to_csv(log_path + '/evaluate_RL_model_statistics.csv')
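A hypothetical driver for the entry point above (not part of the original source). It builds the INI-style configuration that gym_evaluate reads through configparser; the section and key names mirror the lookups inside the function, while all concrete values are placeholders.

import configparser

# Hypothetical usage sketch; values are placeholders.
parser = configparser.ConfigParser()
parser['TRAIN_CONFIG'] = {'SEED': '42',
                          'MAX_STEP': '1e6',
                          'BASE_DIR': './output/cartpole/'}
parser['ENV_CONFIG'] = {'NAME': 'CartPole-v1',
                        'DISCRETE': 'True',
                        'ISDRONEENV': 'False'}
parser['MODEL_CONFIG'] = {}  # hyperparameters expected by A2C/PPO/DDPG go here

gym_evaluate(parser, n_episode=20, algo='a2c')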
Example #2
def evaluate_fn(agent_dir, output_dir, seeds, port, demo, policy_type):
    agent = agent_dir.split('/')[-1]
    if not check_dir(agent_dir):
        logging.error('Evaluation: %s does not exist!' % agent)
        return
    # load config file for env
    config_dir = find_file(agent_dir + '/data/')
    if not config_dir:
        return
    config = configparser.ConfigParser()
    config.read(config_dir)

    # init env
    env, greedy_policy = init_env(config['ENV_CONFIG'],
                                  port=port,
                                  naive_policy=True)
    logging.info(
        'Evaluation: s dim: %d, a dim %d, s dim ls: %r, a dim ls: %r' %
        (env.n_s, env.n_a, env.n_s_ls, env.n_a_ls))
    env.init_test_seeds(seeds)

    # load model for agent
    if agent != 'greedy':
        # init centralized or multi agent
        if agent == 'a2c':
            model = A2C(env.n_s, env.n_a, 0, config['MODEL_CONFIG'])
        elif agent == 'ia2c':
            model = IA2C(env.n_s_ls, env.n_a_ls, env.n_w_ls, 0,
                         config['MODEL_CONFIG'])
        elif agent == 'ma2c':
            model = MA2C(env.n_s_ls, env.n_a_ls, env.n_w_ls, env.n_f_ls, 0,
                         config['MODEL_CONFIG'])
        elif agent == 'iqld':
            model = IQL(env.n_s_ls,
                        env.n_a_ls,
                        env.n_w_ls,
                        0,
                        config['MODEL_CONFIG'],
                        seed=0,
                        model_type='dqn')
        else:
            model = IQL(env.n_s_ls,
                        env.n_a_ls,
                        env.n_w_ls,
                        0,
                        config['MODEL_CONFIG'],
                        seed=0,
                        model_type='lr')
        if not model.load(agent_dir + '/model/'):
            return
    else:
        model = greedy_policy
    env.agent = agent
    # collect evaluation data
    evaluator = Evaluator(env,
                          model,
                          output_dir,
                          demo=demo,
                          policy_type=policy_type)
    evaluator.run()
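A hypothetical invocation of evaluate_fn (not from the original source). The function expects the trained agent's directory to contain a data/ subdirectory holding the .ini config and a model/ subdirectory holding the checkpoint, and it takes the agent type from the last path component of agent_dir.

# Hypothetical call; paths and seeds are placeholders.
evaluate_fn(agent_dir='experiments/ma2c',        # last path component -> agent = 'ma2c'
            output_dir='experiments/ma2c/eval',
            seeds=[2000, 2001, 2002],
            port=0,
            demo=False,
            policy_type='default')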
Example #3
def RL_offload_evaluate(parser=None,
                        test_seeds=None,
                        algo=None,
                        env_name='alwaysQueryEdge',
                        query_budget_fraction_list=None,
                        model_save_path=None,
                        log_path=None):

    if env_name == 'stochastic':
        env = StochasticInputOffloadEnv()
    elif env_name == 'AQE':
        env = AlwaysQueryEdgeOffloadEnv()
    elif env_name == 'FourAction':
        env = FourActionOffloadEnv()
    else:
        # no branch above matches env_name (including the default
        # 'alwaysQueryEdge'), so fail fast instead of leaving env undefined
        raise ValueError('unsupported env_name: %s' % env_name)

    n_a = env.n_a
    n_s = env.n_s
    sess = tf.Session()
    if algo == 'a2c':
        model = A2C(sess,
                    n_s,
                    n_a,
                    -1,
                    model_config=parser['MODEL_CONFIG'],
                    discrete=True)
    elif algo == 'ppo':
        model = PPO(sess,
                    n_s,
                    n_a,
                    -1,
                    model_config=parser['MODEL_CONFIG'],
                    discrete=True)
    else:
        model = None

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    model.load(saver, model_save_path)
    evaluator = RLOffloadEvaluator(env, model, log_path, test_seeds,
                                   query_budget_fraction_list)
    evaluator.run()
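A hypothetical call to RL_offload_evaluate (not in the original source). env_name must be one of the branches handled above ('stochastic', 'AQE', or 'FourAction'), and only the MODEL_CONFIG section of the parser is read here; all paths, seeds, and budget fractions are placeholders.

import configparser

# Hypothetical usage sketch; values are placeholders.
config = configparser.ConfigParser()
config['MODEL_CONFIG'] = {}  # hyperparameters expected by A2C/PPO go here

RL_offload_evaluate(parser=config,
                    test_seeds=[0, 1, 2, 3, 4],
                    algo='ppo',
                    env_name='AQE',
                    query_budget_fraction_list=[0.1, 0.25, 0.5, 1.0],
                    model_save_path='./runs/offload/model/',
                    log_path='./runs/offload/logs/')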
Example #4
def evaluate_fn(agent_dir, output_dir, seeds, port, demo):
    agent = agent_dir.split('/')[-1]
    doubleQ = True
    if agent == 'ddqn':
        doubleQ = False
        agent = 'dqn'
    if not check_dir(agent_dir):
        logging.error('Evaluation: %s does not exist!' % agent)
        return
    # load config file for env
    config_dir = find_file(agent_dir + '/data/')
    if not config_dir:
        return
    config = configparser.ConfigParser()
    config.read(config_dir)

    # init env
    env, greedy_policy = init_env(config['ENV_CONFIG'],
                                  port=port,
                                  naive_policy=True)
    logging.info(
        'Evaluation: s dim: %d, a dim %d, s dim ls: %r, a dim ls: %r' %
        (env.n_s, env.n_a, env.n_s_ls, env.n_a_ls))
    env.init_test_seeds(seeds)

    # load model for agent
    if agent != 'greedy':
        # init centralized or multi agent
        if agent == 'a2c':
            model = A2C(env.n_s, env.n_a, 0, config['MODEL_CONFIG'])
        elif agent == 'ia2c':
            model = IA2C(env.n_s_ls, env.n_a_ls, env.n_w_ls, 0,
                         config['MODEL_CONFIG'])
        elif agent == 'ma2c':
            model = MA2C(env.n_s_ls, env.n_a_ls, env.n_w_ls, env.n_f_ls, 0,
                         config['MODEL_CONFIG'])
        elif agent == 'codql':
            print('This is codql')
            model = MFQ(nb_agent=len(env.n_s_ls),
                        a_dim=env.n_a_ls[0],
                        s_dim=env.n_s_ls[0],
                        s_dim_wave=env.n_s_ls[0] - env.n_w_ls[0],
                        s_dim_wait=env.n_w_ls[0],
                        config=config['MODEL_CONFIG'])
        elif agent == 'dqn':
            model = DQN(nb_agent=len(env.n_s_ls),
                        a_dim=env.n_a_ls[0],
                        s_dim=env.n_s_ls[0],
                        s_dim_wave=env.n_s_ls[0] - env.n_w_ls[0],
                        s_dim_wait=env.n_w_ls[0],
                        config=config['MODEL_CONFIG'],
                        doubleQ=doubleQ)  #doubleQ=False denotes dqn else ddqn
        elif agent == 'ddpg':
            model = DDPGEN(nb_agent=len(env.n_s_ls),
                           share_params=True,
                           a_dim=env.n_a_ls[0],
                           s_dim=env.n_s_ls[0],
                           s_dim_wave=env.n_s_ls[0] - env.n_w_ls[0],
                           s_dim_wait=env.n_w_ls[0])
        elif agent == 'iqld':
            model = IQL(env.n_s_ls,
                        env.n_a_ls,
                        env.n_w_ls,
                        0,
                        config['MODEL_CONFIG'],
                        seed=0,
                        model_type='dqn')
        else:
            model = IQL(env.n_s_ls,
                        env.n_a_ls,
                        env.n_w_ls,
                        0,
                        config['MODEL_CONFIG'],
                        seed=0,
                        model_type='lr')
        if not model.load(agent_dir + '/model/'):
            return
    else:
        model = greedy_policy
    env.agent = agent
    # collect evaluation data
    evaluator = Evaluator(env, model, output_dir, demo=demo)
    evaluator.run()
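Because evaluate_fn takes a simulator port, one hypothetical way to use it (not from the original source) is to evaluate several trained agents in parallel processes, each bound to its own port.

from multiprocessing import Process

# Hypothetical batch evaluation; agent names, paths, and seeds are placeholders.
agents = ['a2c', 'ia2c', 'ma2c', 'dqn', 'greedy']
procs = []
for port, name in enumerate(agents):
    p = Process(target=evaluate_fn,
                args=('experiments/' + name, 'experiments/eval',
                      [10000 + i for i in range(10)], port, False))
    p.start()
    procs.append(p)
for p in procs:
    p.join()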
Example #5
def gym_train(parser, algo):
    seed = parser.getint('TRAIN_CONFIG', 'SEED')
    num_env = parser.getint('TRAIN_CONFIG', 'NUM_ENV')
    env_name = parser.get('ENV_CONFIG', 'NAME')
    is_discrete = parser.getboolean('ENV_CONFIG', 'DISCRETE')

    print(' ')
    print('STARTING TO TRAIN: ', env_name)
    print('is discrete: ', is_discrete)
    print(' ')

    env = GymEnv(env_name, is_discrete)
    
    env.seed(seed)
    n_a = env.n_a
    n_s = env.n_s
    total_step = int(parser.getfloat('TRAIN_CONFIG', 'MAX_STEP'))
    base_dir = parser.get('TRAIN_CONFIG', 'BASE_DIR')
    save_step = int(parser.getfloat('TRAIN_CONFIG', 'SAVE_INTERVAL'))
    log_step = int(parser.getfloat('TRAIN_CONFIG', 'LOG_INTERVAL'))
    save_path, log_path = init_out_dir(base_dir, 'train')

    tf.set_random_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.Session(config=config)
    if algo == 'a2c':
        global_model = A2C(sess, n_s, n_a, total_step, model_config=parser['MODEL_CONFIG'],
                           discrete=is_discrete)
    elif algo == 'ppo':
        global_model = PPO(sess, n_s, n_a, total_step, model_config=parser['MODEL_CONFIG'],
                           discrete=is_discrete)
    elif algo == 'ddpg':
        assert(not is_discrete)
        global_model = DDPG(sess, n_s, n_a, total_step, model_config=parser['MODEL_CONFIG'])
    else:
        global_model = None
    global_counter = GlobalCounter(total_step, save_step, log_step)
    coord = tf.train.Coordinator()
    threads = []
    trainers = []
    model_summary = init_model_summary(global_model.name)

    if num_env == 1:
        # regular training
        summary_writer = tf.summary.FileWriter(log_path, sess.graph)
        trainer = Trainer(env, global_model, save_path, summary_writer, global_counter, model_summary)
        trainers.append(trainer)
    else:
        assert(algo in ['a2c', 'ppo'])
        # asynchronous training
        lr_scheduler = global_model.lr_scheduler
        beta_scheduler = global_model.beta_scheduler
        optimizer = global_model.optimizer
        lr = global_model.lr
        clip_scheduler = None
        if algo == 'ppo':
            clip = global_model.clip
            clip_scheduler = global_model.clip_scheduler
        wt_summary = None
        reward_summary = None
        summary_writer = tf.summary.FileWriter(log_path)

        for i in range(num_env):

            env = GymEnv(env_name, is_discrete)
            env.seed(seed + i)
            if algo == 'a2c':
                model = A2C(sess, n_s, n_a, total_step, i_thread=i, optimizer=optimizer,
                            lr=lr, model_config=parser['MODEL_CONFIG'], discrete=is_discrete)
            else:
                model = PPO(sess, n_s, n_a, total_step, i_thread=i, optimizer=optimizer,
                            lr=lr, clip=clip, model_config=parser['MODEL_CONFIG'], discrete=is_discrete)

            trainer = AsyncTrainer(env, model, save_path, summary_writer, global_counter,
                                   i, lr_scheduler, beta_scheduler, model_summary, wt_summary,
                                   reward_summary=reward_summary, clip_scheduler=clip_scheduler)
            if i == 0:
                reward_summary = (trainer.reward_summary, trainer.total_reward)
            trainers.append(trainer)

    sess.run(tf.global_variables_initializer())
    global_model.init_train()
    saver = tf.train.Saver(max_to_keep=20)
    global_model.load(saver, save_path)

    def train_fn(i_thread):
        trainers[i_thread].run(sess, saver, coord)

    for i in range(num_env):
        thread = threading.Thread(target=train_fn, args=(i,))
        thread.start()
        threads.append(thread)
    signal.signal(signal.SIGINT, signal_handler)
    signal.pause()
    coord.request_stop()
    coord.join(threads)
    save_flag = input('save final model? Y/N: ')
    if save_flag.lower().startswith('y'):
        print('saving model at step %d ...' % global_counter.cur_step)
        global_model.save(saver, save_path + 'checkpoint', global_counter.cur_step)
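A hypothetical training configuration for gym_train (not part of the original source). The keys mirror the ones the function reads; setting NUM_ENV above 1 selects the asynchronous branch, which the assertion above restricts to a2c and ppo.

import configparser

# Hypothetical usage sketch; values are placeholders.
parser = configparser.ConfigParser()
parser['TRAIN_CONFIG'] = {'SEED': '0',
                          'NUM_ENV': '4',         # > 1 spawns one worker thread per env copy
                          'MAX_STEP': '1e6',
                          'SAVE_INTERVAL': '1e5',
                          'LOG_INTERVAL': '1e4',
                          'BASE_DIR': './runs/cartpole/'}
parser['ENV_CONFIG'] = {'NAME': 'CartPole-v1',
                        'DISCRETE': 'True'}
parser['MODEL_CONFIG'] = {}  # hyperparameters expected by A2C/PPO/DDPG go here

gym_train(parser, algo='a2c')  # runs until interrupted; the function installs a SIGINT handler and waits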
Example #6
                                          rank=0,
                                          seed=i,
                                          log_dir='mission_records/' + log_dir,
                                          record_fn=lambda x: x % 400 == 0)
            else:
                _thunk = make_lunarlander(task_id, rank=0, seed=i)
            thunk.append(_thunk)
        envs = SubprocVecEnv(thunk, minecraft=True)

        torch.manual_seed(0)
        torch.cuda.manual_seed(0)

        a2c = A2C(envs,
                  model=MLP,
                  nstep=nsteps,
                  lr=lr,
                  e_coeff=e_coeff,
                  v_coeff=v_coeff,
                  render=False)
        total = 0
        for e in range(0, 50000):
            episode_obs, episode_rws, episode_values, episode_actions, episode_dones, returns = a2c.run_episode(
                e)
            loss, policy_loss, mse, advantage, train_values, entropy = a2c.train(
                returns, episode_obs, episode_actions)
            if e % 1000 == 0:
                torch.save(a2c.model.state_dict(),
                           'checkpoints/' + log_dir + '/model.pth')
            print(policy_loss.data[0], mse.data[0], entropy.data[0])
        envs.close()
    else:
                                        rank=0,
                                        seed=i)
            thunk.append(_thunk)
        envs = SubprocVecEnv(thunk, minecraft=True)

        from agents.policies import CNNPolicy
        from agents.models import A2C
        import torch

        torch.manual_seed(0)
        torch.cuda.manual_seed(0)

        a2c = A2C(envs,
                  model=CNNPolicy,
                  nstep=nsteps,
                  nstack=nstack,
                  lr=lr,
                  e_coeff=e_coeff,
                  v_coeff=v_coeff,
                  render=False)
        total = 0
        for e in range(0, 50000):
            episode_obs, episode_rws, episode_values, episode_actions, episode_dones, returns = a2c.run_episode(
                e)
            loss, policy_loss, mse, advantage, train_values, entropy = a2c.train(
                returns, episode_obs, episode_actions)
            if e % 1000 == 0:
                torch.save(a2c.model.state_dict(),
                           'checkpoints/' + log_dir + '/model.pth')
            print(policy_loss.data[0], mse.data[0], entropy.data[0])
        envs.close()
    else: