Example 1
import copy
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf

# The project-specific classes used below (CVS, KFoldCV, DQNAgent,
# BehaviorCloning, AdvantageLearner, FQE) and the base `config` dict are
# assumed to be defined or imported elsewhere in the module.


def one_step(seed):
    np.random.seed(seed)
    tf.random.set_seed(seed)

    path = 'data/mh/trajs_mh.pkl'
    nfolds = 5
    n_splits = 5
    ckpts = (np.arange(10) + 1) * 5000  # checkpoints at 5k, 10k, ..., 50k training steps

    num_actions = 5
    # agent / training configuration
    config['online'] = False
    config['hiddens'] = [64, 64]
    config['double'] = False
    config['dueling'] = False
    config['lr'] = 5e-4
    config['decay_steps'] = 50000
    config['max_training_steps'] = 50000
    config['training_steps_to_checkpoint'] = 5000
    config['training_steps_to_eval'] = 100000

    index = pd.MultiIndex.from_product([np.arange(nfolds), ckpts])
    columns = ['dqn', 'dml', 'sale']
    rets = pd.DataFrame(index=index, columns=columns)

    print('-' * 20, 'start', '-' * 20)
    cvs = CVS(path, n_splits=nfolds, random_state=seed)
    cvs.split()
    for fold in range(nfolds):
        train_path = cvs.train_paths[fold] + 'trajs.pkl'
        kf = KFoldCV(train_path,
                     n_trajs=None,
                     n_splits=n_splits,
                     shuffle=False,
                     random_state=seed)
        kf.split()

        print('-' * 20, 'training agent', '-' * 20)
        # agent
        config['persistent_directory'] = kf.agent_path
        config['checkpoint_path'] = kf.ckpt_path
        agent = DQNAgent(num_actions=num_actions, config=config)
        agent.learn()

        print('-' * 20, 'training agents', '-' * 20)
        # agent_1, ..., agent_K
        for idx in range(kf.n_splits):
            config_idx = copy.deepcopy(config)
            config_idx['persistent_directory'] = kf.agent_paths[idx]
            config_idx['checkpoint_path'] = kf.ckpt_paths[idx]
            agent_idx = DQNAgent(num_actions=num_actions, config=config_idx)
            agent_idx.learn()

        # held-out test trajectories for fitted Q evaluation (FQE)
        test_path = cvs.test_paths[fold] + 'trajs.pkl'
        with open(test_path, 'rb') as f:
            trajs = pickle.load(f)

        print('-' * 20, 'behavior cloning', '-' * 20)
        # behavior cloning
        bc = BehaviorCloning(num_actions=num_actions)
        states = np.array(
            [transition[0] for traj in kf.trajs for transition in traj])
        actions = np.array(
            [transition[1] for traj in kf.trajs for transition in traj])
        bc.train(states, actions)

        for ckpt in ckpts:
            print('-' * 20, 'ckpt: ', ckpt, '-' * 20)
            # reload the main agent at this checkpoint
            agent = DQNAgent(num_actions=num_actions, config=config)
            agent.load(kf.ckpt_path + 'dqn_{}.ckpt'.format(ckpt))

            # reload agent_1, ..., agent_K at this checkpoint
            agents = []
            for idx in range(kf.n_splits):
                config_idx = copy.deepcopy(config)
                config_idx['persistent_directory'] = kf.agent_paths[idx]
                config_idx['checkpoint_path'] = kf.ckpt_paths[idx]
                agent_idx = DQNAgent(num_actions=num_actions,
                                     config=config_idx)
                agent_idx.load(kf.ckpt_paths[idx] + 'dqn_{}.ckpt'.format(ckpt))
                agents.append(agent_idx)
            states, qvalues, qtildes = kf.update_q(agents, bc)

            print('-' * 20, 'adv learner', '-' * 20)
            # advantage learner on the qvalues (evaluated below as 'dml')
            advs1 = qvalues - qvalues.mean(axis=1, keepdims=True)
            agent1 = AdvantageLearner(num_actions=num_actions)
            agent1._train(states, advs1)

            # advantage learner on the qtildes (evaluated below as 'sale')
            advs2 = qtildes - qtildes.mean(axis=1, keepdims=True)
            agent2 = AdvantageLearner(num_actions=num_actions)
            agent2._train(states, advs2)

            print('-' * 20, 'fqe on dqn & dml & sale', '-' * 20)
            fqe_dqn = FQE(agent.greedy_actions, num_actions=num_actions)
            fqe_dqn.train(trajs)
            fqe_dml = FQE(agent1.greedy_actions, num_actions=num_actions)
            fqe_dml.train(trajs)
            fqe_sale = FQE(agent2.greedy_actions, num_actions=num_actions)
            fqe_sale.train(trajs)

            rets.loc[(fold, ckpt), 'dqn'] = fqe_dqn.values
            rets.loc[(fold, ckpt), 'dml'] = fqe_dml.values
            rets.loc[(fold, ckpt), 'sale'] = fqe_sale.values

    return rets
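
A minimal driver sketch (not part of the original example): it runs one_step for a few seeds and averages the per-checkpoint FQE estimates over folds. The seed list and the `if __name__` guard are illustrative assumptions, and the aggregation assumes each stored FQE `.values` entry is a scalar estimate.

import pandas as pd

# Hypothetical driver: repeat the whole cross-fitting procedure for a few
# seeds; the seed list is an illustrative assumption.
if __name__ == '__main__':
    all_rets = {seed: one_step(seed) for seed in (0, 1, 2)}

    for seed, rets in all_rets.items():
        # average over folds (index level 0), one row per checkpoint,
        # assuming the stored FQE values are scalars
        per_ckpt = rets.astype(float).groupby(level=1).mean()
        print('seed {}:'.format(seed))
        print(per_ckpt)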
Example 2
import copy
import os
from collections import defaultdict
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# The project-specific classes used below (DQNAgent, AdvantageLearner) and the
# `kf` (cross-validation splitter) and `bc` (behavior-cloning model) arguments
# are assumed to be constructed elsewhere, as in the previous example.


def compare_within_ckpt(kf, bc, config, working_directory,
                        strategy='random',
                        num_trajectories=200,
                        agent_name='dqn',
                        num_kf=2,
                        replica=1):
    # per-checkpoint mean rewards; in the final DataFrame the columns are
    # ordered 0: sale, 1: dml, 2: single agent, then one column per CV agent
    ckpt_result = defaultdict(list)

    # evaluate at every 10k-step checkpoint up to max_training_steps
    ckpts = [i * int(1e4) for i in range(1, int(config['max_training_steps'] / 1e4) + 1)]
    for ckpt in ckpts:
        print('Evaluate with ckpt {}...'.format(ckpt))
        agents = []

        for idx in range(kf.n_splits):
            config_idx = copy.deepcopy(config)
            config_idx['persistent_directory'] = kf.agent_paths[idx]
            config_idx['checkpoint_path'] = kf.ckpt_paths[idx]

            agent_idx = DQNAgent(name='LunarLander-v2', num_actions=4, config=config_idx)
            agent_idx.load(kf.ckpt_paths[idx] + 'dqn_{}.ckpt'.format(ckpt))
            agents.append(agent_idx)

        states, qvalues, qtildes = kf.update_q(agents, bc)

        # advantage learner on the qvalues (reported below as 'dml')
        advs1 = qvalues - qvalues.mean(axis=1, keepdims=True)
        adv_learner1 = AdvantageLearner()
        adv_learner1._train(states, advs1)
        adv_learner1._eval(100)

        # advantage learner on the qtildes (reported below as 'sale')
        advs2 = qtildes - qtildes.mean(axis=1, keepdims=True)
        adv_learner2 = AdvantageLearner()
        adv_learner2._train(states, advs2)
        adv_learner2._eval(100)

        eval_episode_rewards1 = np.array(adv_learner1.eval_episode_rewards)
        eval_episode_rewards2 = np.array(adv_learner2.eval_episode_rewards)

        # histograms of evaluation rewards for the dml and sale policies
        fig, axes = plt.subplots(1, 2, figsize=(18, 5))
        axes[0].hist(eval_episode_rewards1)
        axes[0].set_title('dml mean reward: {:.2f}'.format(eval_episode_rewards1.mean()))
        axes[1].hist(eval_episode_rewards2)
        axes[1].set_title('sale mean reward: {:.2f}'.format(eval_episode_rewards2.mean()))

        ad_pic_file_path = os.path.join(
            working_directory, 'pic', 'ag_{}-sp_{}-nt_{}-kf_{}-ckpt_{}-dt_{}_adv.jpg'.format(
                agent_name, strategy, num_trajectories, num_kf, ckpt,
                datetime.now().strftime('%Y%m%d_%H-%M-%S')))
        plt.savefig(ad_pic_file_path)
        plt.close(fig)
        
        # record dml, sale rewards
        ckpt_result['dml_mean_reward'].append(eval_episode_rewards1.mean())
        ckpt_result['sale_mean_reward'].append(eval_episode_rewards2.mean())

        for idx in range(kf.n_splits):
            agents[idx]._eval(100)

        # record cv agent rewards
        print('Evaluating cv agent score...')
        cv_agent_rewards = [np.array(agents[idx].eval_episode_rewards).mean() for idx in range(kf.n_splits)]
        for idx, ar in enumerate(cv_agent_rewards):
            ckpt_result['cv{}'.format(idx)].append(ar)
        print(cv_agent_rewards)

        print('Evaluating single score...')
        config['persistent_directory'] = kf.agent_path
        config['checkpoint_path'] = kf.ckpt_path

        agent = DQNAgent(name='LunarLander-v2', num_actions=4, config=config)
        agent.load(config['checkpoint_path']+'dqn_{}.ckpt'.format(ckpt))

        agent._eval(100)

        eval_episode_rewards = np.array(agent.eval_episode_rewards)
        ckpt_result['single_agent_mean_reward'].append(eval_episode_rewards.mean())

        # plot the single-agent reward histogram on a fresh figure
        fig_single = plt.figure()
        plt.hist(eval_episode_rewards)
        plt.title('single agent mean reward: {:.2f}'.format(eval_episode_rewards.mean()))

        single_pic_file_path = os.path.join(
            working_directory, 'pic',
            'ag_{}-sp_{}-nt_{}-kf_{}-ckpt_{}-dt_{}_single.jpg'.format(
                agent_name, strategy, num_trajectories, num_kf, ckpt,
                datetime.now().strftime('%Y%m%d_%H-%M-%S')))
        plt.savefig(single_pic_file_path)
        plt.close(fig_single)
        
        print('Recording checkpoint results...')

    ckpt_result_pdf = pd.DataFrame(ckpt_result)
    ckpt_result_pdf = ckpt_result_pdf[['sale_mean_reward', 'dml_mean_reward', 
                                       'single_agent_mean_reward'] + ['cv{}'.format(i) for i in range(num_kf)]]
    
    file_directory = os.path.join(working_directory, 'csv')
    if not os.path.isdir(file_directory):
        os.mkdir(file_directory)
    file_path = os.path.join(file_directory, 'ag_{}-sp_{}-nt_{}-kf_{}-rca_{}.csv'.format(
        agent_name, strategy, num_trajectories, num_kf, replica))
    print('Save all records to {}'.format(file_path))
    ckpt_result_pdf.to_csv(file_path, index=False, encoding='UTF-8', header=True)

    return ckpt_result_pdf
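
A hypothetical invocation sketch, assuming kf, bc and config have been prepared as in the first example (folds split, per-fold agents trained, behavior cloning fitted) and that the 'pic' subfolder already exists under the working directory; the directory name and argument values are illustrative, not from the original example.

# Hypothetical call: kf, bc, config and the 'pic' subfolder under
# working_directory are assumed to exist already.
result_pdf = compare_within_ckpt(kf, bc, config,
                                 working_directory='results/lunarlander',
                                 strategy='random',
                                 num_trajectories=200,
                                 agent_name='dqn',
                                 num_kf=kf.n_splits,
                                 replica=1)

# num_kf should match kf.n_splits so that one 'cv{i}' column is kept
# per cross-fitted agent in the saved CSV.
print(result_pdf[['sale_mean_reward', 'dml_mean_reward', 'single_agent_mean_reward']])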