import copy
import os
import pickle
from collections import defaultdict
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

# Project-specific components (CVS, KFoldCV, DQNAgent, BehaviorCloning,
# AdvantageLearner, FQE) and the module-level `config` dict holding the remaining
# DQN defaults are assumed to be defined or imported elsewhere in this package.


def one_step(seed):
    """Run one replication on the mh trajectories: an outer CV split into train/test,
    K-fold cross-fitted DQN training, advantage learning on the (cross-fitted)
    Q-values, and FQE evaluation of the dqn/dml/sale policies at each checkpoint."""
    np.random.seed(seed)
    tf.random.set_seed(seed)
    path = 'data/mh/trajs_mh.pkl'
    nfolds = 5
    n_splits = 5
    ckpts = (np.arange(10) + 1) * 5000
    num_actions = 5

    # DQN configuration (any keys not set here come from the module-level `config`)
    config['online'] = False
    config['hiddens'] = [64, 64]
    config['double'] = False
    config['dueling'] = False
    config['lr'] = 5e-4
    config['decay_steps'] = 50000
    config['max_training_steps'] = 50000
    config['training_steps_to_checkpoint'] = 5000
    config['training_steps_to_eval'] = 100000

    # one row per (outer fold, checkpoint), one column per evaluated policy
    index = pd.MultiIndex.from_product([np.arange(nfolds), ckpts])
    columns = ['dqn', 'dml', 'sale']
    rets = pd.DataFrame(index=index, columns=columns)

    print('-' * 20, 'start', '-' * 20)
    cvs = CVS(path, n_splits=nfolds, random_state=seed)
    cvs.split()

    for fold in range(nfolds):
        train_path = cvs.train_paths[fold] + 'trajs.pkl'
        kf = KFoldCV(train_path, n_trajs=None, n_splits=n_splits,
                     shuffle=False, random_state=seed)
        kf.split()

        print('-' * 20, 'training agent', '-' * 20)
        # single agent trained on the full training split
        config['persistent_directory'] = kf.agent_path
        config['checkpoint_path'] = kf.ckpt_path
        agent = DQNAgent(num_actions=num_actions, config=config)
        agent.learn()

        print('-' * 20, 'training agents', '-' * 20)
        # cross-fitted agents: agent_1, ..., agent_K
        for idx in range(kf.n_splits):
            config_idx = copy.deepcopy(config)
            config_idx['persistent_directory'] = kf.agent_paths[idx]
            config_idx['checkpoint_path'] = kf.ckpt_paths[idx]
            agent_idx = DQNAgent(num_actions=num_actions, config=config_idx)
            agent_idx.learn()

        # held-out trajectories for fitted Q evaluation
        test_path = cvs.test_paths[fold] + 'trajs.pkl'
        with open(test_path, 'rb') as f:
            trajs = pickle.load(f)

        print('-' * 20, 'behavior cloning', '-' * 20)
        # behavior cloning on the pooled training transitions
        bc = BehaviorCloning(num_actions=num_actions)
        states = np.array(
            [transition[0] for traj in kf.trajs for transition in traj])
        actions = np.array(
            [transition[1] for traj in kf.trajs for transition in traj])
        bc.train(states, actions)

        for ckpt in ckpts:
            print('-' * 20, 'ckpt: ', ckpt, '-' * 20)
            # reload the single agent and the K cross-fitted agents at this checkpoint
            agent = DQNAgent(num_actions=num_actions, config=config)
            agent.load(kf.ckpt_path + 'dqn_{}.ckpt'.format(ckpt))
            agents = []
            for idx in range(kf.n_splits):
                config_idx = copy.deepcopy(config)
                config_idx['persistent_directory'] = kf.agent_paths[idx]
                config_idx['checkpoint_path'] = kf.ckpt_paths[idx]
                agent_idx = DQNAgent(num_actions=num_actions, config=config_idx)
                agent_idx.load(kf.ckpt_paths[idx] + 'dqn_{}.ckpt'.format(ckpt))
                agents.append(agent_idx)
            states, qvalues, qtildes = kf.update_q(agents, bc)

            print('-' * 20, 'adv learner', '-' * 20)
            # dml: advantages from the cross-fitted qvalues
            advs1 = qvalues - qvalues.mean(axis=1, keepdims=True)
            agent1 = AdvantageLearner(num_actions=num_actions)
            agent1._train(states, advs1)
            # sale: advantages from the qtildes returned by kf.update_q
            advs2 = qtildes - qtildes.mean(axis=1, keepdims=True)
            agent2 = AdvantageLearner(num_actions=num_actions)
            agent2._train(states, advs2)

            print('-' * 20, 'fqe on dqn & dml & sale', '-' * 20)
            fqe_dqn = FQE(agent.greedy_actions, num_actions=num_actions)
            fqe_dqn.train(trajs)
            fqe_dml = FQE(agent1.greedy_actions, num_actions=num_actions)
            fqe_dml.train(trajs)
            fqe_sale = FQE(agent2.greedy_actions, num_actions=num_actions)
            fqe_sale.train(trajs)
            rets.loc[(fold, ckpt), 'dqn'] = fqe_dqn.values
            rets.loc[(fold, ckpt), 'dml'] = fqe_dml.values
            rets.loc[(fold, ckpt), 'sale'] = fqe_sale.values
    return rets
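
# The helper below is a usage sketch, not part of the original experiment code: it
# shows how one_step could be repeated over several seeds and the per-fold FQE
# results pooled into a single table. The seed count and output file name are
# illustrative assumptions.
def run_replications(num_seeds=5, out_path='one_step_results.csv'):
    results = {}
    for seed in range(num_seeds):
        results[seed] = one_step(seed)
    # stack the (fold, ckpt)-indexed frames, adding the seed as an outer index level
    all_rets = pd.concat(results, names=['seed'])
    all_rets.to_csv(out_path)
    return all_rets
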
def compare_within_ckpt(kf, bc, config, working_directory, strategy='random',
                        num_trajectories=200, agent_name='dqn', num_kf=2, replica=1):
    """For each DQN checkpoint, compare the online evaluation rewards of the sale and
    dml advantage learners, the single agent, and the K cross-fitted agents on
    LunarLander-v2, saving reward histograms and a per-checkpoint CSV summary."""
    ckpt_result = defaultdict(list)
    # checkpoints were saved every 10,000 training steps
    for ckpt in [i * int(1e4) for i in range(1, int(config['max_training_steps'] / 1e4) + 1)]:
        print('Evaluate with ckpt {}...'.format(ckpt))

        # reload the K cross-fitted agents at this checkpoint
        agents = []
        for idx in range(kf.n_splits):
            config_idx = copy.deepcopy(config)
            config_idx['persistent_directory'] = kf.agent_paths[idx]
            config_idx['checkpoint_path'] = kf.ckpt_paths[idx]
            agent_idx = DQNAgent(name='LunarLander-v2', num_actions=4, config=config_idx)
            agent_idx.load(kf.ckpt_paths[idx] + 'dqn_{}.ckpt'.format(ckpt))
            agents.append(agent_idx)
        states, qvalues, qtildes = kf.update_q(agents, bc)

        # dml learner: advantages from the cross-fitted qvalues
        advs1 = qvalues - qvalues.mean(axis=1, keepdims=True)
        adv_learner1 = AdvantageLearner()
        adv_learner1._train(states, advs1)
        adv_learner1._eval(100)
        # sale learner: advantages from the qtildes returned by kf.update_q
        advs2 = qtildes - qtildes.mean(axis=1, keepdims=True)
        adv_learner2 = AdvantageLearner()
        adv_learner2._train(states, advs2)
        adv_learner2._eval(100)

        eval_episode_rewards1 = np.array(adv_learner1.eval_episode_rewards)
        eval_episode_rewards2 = np.array(adv_learner2.eval_episode_rewards)

        # histogram of episode rewards for the two advantage learners
        fig, axes = plt.subplots(1, 2, figsize=(18, 5))
        axes[0].hist(eval_episode_rewards1)
        axes[0].set_title(eval_episode_rewards1.mean())
        axes[1].hist(eval_episode_rewards2)
        axes[1].set_title(eval_episode_rewards2.mean())
        ad_pic_file_path = os.path.join(
            working_directory, 'pic',
            'ag_{}-sp_{}-nt_{}-kf_{}-ckpt_{}-dt_{}_adv.jpg'.format(
                agent_name, strategy, num_trajectories, num_kf, ckpt,
                datetime.now().strftime('%Y%m%d_%H-%M-%S')))
        plt.savefig(ad_pic_file_path)
        plt.close(fig)

        # record dml, sale rewards
        ckpt_result['dml_mean_reward'].append(eval_episode_rewards1.mean())
        ckpt_result['sale_mean_reward'].append(eval_episode_rewards2.mean())

        # record cv agent rewards
        print('Evaluating cv agent score...')
        for idx in range(kf.n_splits):
            agents[idx]._eval(100)
        cv_agent_rewards = [np.array(agents[idx].eval_episode_rewards).mean()
                            for idx in range(kf.n_splits)]
        for idx, ar in enumerate(cv_agent_rewards):
            ckpt_result['cv{}'.format(idx)].append(ar)
        print(cv_agent_rewards)

        # record single-agent rewards
        print('Evaluating single score...')
        config['persistent_directory'] = kf.agent_path
        config['checkpoint_path'] = kf.ckpt_path
        agent = DQNAgent(name='LunarLander-v2', num_actions=4, config=config)
        agent.load(config['checkpoint_path'] + 'dqn_{}.ckpt'.format(ckpt))
        agent._eval(100)
        eval_episode_rewards = np.array(agent.eval_episode_rewards)
        ckpt_result['single_agent_mean_reward'].append(eval_episode_rewards.mean())

        # histogram of episode rewards for the single agent (on a fresh figure)
        plt.figure()
        plt.hist(eval_episode_rewards)
        plt.title(eval_episode_rewards.mean())
        single_pic_file_path = os.path.join(
            working_directory, 'pic',
            'ag_{}-sp_{}-nt_{}-kf_{}-ckpt_{}-dt_{}_single.jpg'.format(
                agent_name, strategy, num_trajectories, num_kf, ckpt,
                datetime.now().strftime('%Y%m%d_%H-%M-%S')))
        plt.savefig(single_pic_file_path)
        plt.close()

    print('Recording check point results...')
    ckpt_result_pdf = pd.DataFrame(ckpt_result)
    ckpt_result_pdf = ckpt_result_pdf[
        ['sale_mean_reward', 'dml_mean_reward', 'single_agent_mean_reward']
        + ['cv{}'.format(i) for i in range(num_kf)]]
    file_directory = os.path.join(working_directory, 'csv')
    if not os.path.isdir(file_directory):
        os.mkdir(file_directory)
    file_path = os.path.join(
        file_directory, 'ag_{}-sp_{}-nt_{}-kf_{}-rca_{}.csv'.format(
            agent_name, strategy, num_trajectories, num_kf, replica))
    print('Save all records to {}'.format(file_path))
    ckpt_result_pdf.to_csv(file_path, index=False, encoding='UTF-8', header=True)
    return ckpt_result_pdf
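
# Hypothetical usage sketch (not part of the original script) showing how
# compare_within_ckpt might be driven for LunarLander-v2. It assumes the offline
# trajectories already exist at `train_path`, that the single and K-fold DQN agents
# have already been trained and checkpointed under the KFoldCV directories (as in
# one_step above), and that `dqn_config` holds the hyperparameters used for that
# training. The path, seed, and fold count below are illustrative assumptions.
def run_lunarlander_comparison(dqn_config,
                               train_path='data/lunarlander/trajs.pkl',
                               working_directory='results/lunarlander',
                               num_kf=2, seed=0):
    kf = KFoldCV(train_path, n_trajs=None, n_splits=num_kf,
                 shuffle=False, random_state=seed)
    kf.split()
    # behavior cloning on the pooled training transitions, as in one_step
    bc = BehaviorCloning(num_actions=4)
    states = np.array([t[0] for traj in kf.trajs for t in traj])
    actions = np.array([t[1] for traj in kf.trajs for t in traj])
    bc.train(states, actions)
    return compare_within_ckpt(kf, bc, dqn_config, working_directory,
                               strategy='random', num_trajectories=200,
                               agent_name='dqn', num_kf=num_kf)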