def main():
    """Evaluate a saved policy with greedy rollouts and log the scores.

    Parses the command-line arguments, loads the policy weights from
    ``gcp/weights/<game>/round_<round>/<uct_type>_weight_policy_best_seed<seed>.pickle``,
    then plays one episode for each environment seed in ``range(5)``.  At
    every step the valid action with the largest value returned by
    ``policy.calculate_probs`` is executed.  A step-by-step trace is printed
    to stdout, and a one-line summary (number of episodes, mean and standard
    deviation of the final scores) is appended to
    ``outputs/eval_result_<game>_<uct_type>.txt``.
    """
    args = parse_args()
    print(args)

    # Length limits forwarded to utils.state_representation.
    maxlen_obs = 150
    maxlen_look = 150
    maxlen_inv = 50
    max_len_action = 12

    # NOTE(review): `sp` is not referenced again inside this function; the
    # load is kept because it fails early when the model file is missing.
    # Confirm whether it can be dropped.
    sp = spm.SentencePieceProcessor()
    sp.Load('../spm_models/unigram_8k.model')

    rom_path = args.rom_path + utils.game_file(args.game_name)

    policy = Policy(args)
    policy.load_weights(
        'gcp/weights/%s/round_%d/%s_weight_policy_best_seed%d.pickle'
        % (args.game_name, args.round, args.uct_type, args.seed))

    # NOTE(review): this environment is replaced on the first loop iteration
    # below; it is kept in case construction has side effects - confirm.
    env = JerichoEnv(rom_path, 1, args.env_step_limit)
    env.create()

    scores = []
    for seed in range(5):
        env = JerichoEnv(rom_path, seed, args.env_step_limit)
        env.create()
        obs, info = env.reset()
        cum_reward = 0
        step = 0
        prev_action = '<s>'  # placeholder used before any action is taken

        for _ in range(args.max_episode_len):
            print('#################################################')
            print('STEP: %s' % step)
            print()
            print(info['look'])
            print()
            print(info['inv'])
            print()

            obs, look, inv, prev_action, score = utils.state_representation(
                obs, info['look'], info['inv'], prev_action, info['score'],
                maxlen_obs, maxlen_look, maxlen_inv, max_len_action)
            probs = policy.calculate_probs(
                obs, look, inv, prev_action, score, info['valid'])
            print(info['valid'])
            print(probs)

            # Greedy selection.  A stochastic alternative would sample the
            # index from `probs` with np.random.choice instead.
            idx = np.argmax(probs)
            action = info['valid'][idx]

            obs, reward, done, info = env.step(action)
            cum_reward += reward
            step += 1

            print('ACTION: %s' % action)
            print()
            print('Reward: %s, Score: %s' % (reward, info['score']))
            print()
            print(obs + info['look'] + info['inv'])
            print()

            prev_action = action

            # Stop once the environment reports the episode as finished, so
            # the recorded score is not affected by stepping a finished game.
            if done:
                break

        scores.append(info['score'])

    mean_score = np.mean(scores)
    print(scores)
    print('AVERAGE SCORE: %s' % mean_score)

    # The context manager closes the file even if formatting or writing fails.
    result_path = 'outputs/eval_result_%s_%s.txt' % (args.game_name, args.uct_type)
    with open(result_path, 'a') as f:
        f.write("- Round %d (learning) : num_eval=%d, mean_ep_return=%.3f, std_ep_return=%.3f\n"
                % (args.round, len(scores), mean_score, np.std(scores)))