def reset(self):
    if self.__trainable and self.model.training:
        assert len(self.sa_buffer) == 0
    else:
        self.sa_buffer = StateActionBuffer(
            max_buffer_size=self.args.max_table_size,
            buffer_add_prob=self.args.sampling_freq,
        )
    self.result = ResultStat("reward", None)
    self.traj_dict = defaultdict(list)
    self.result_dict = {}
def __init__(
    self,
    coach,
    executor,
    device,
    args,
    writer,
    trainable=False,
    exec_sample=False,
    pg="ppo",
    tag="",
):
    self.__coach = coach
    self.__executor = executor
    self.__trainable = trainable
    self.device = device
    self.args = args
    self.tb_log = args.tb_log
    self.save_folder = None
    self.tag = tag
    self.best_test_win_pct = 0
    self.tb_writer = writer
    self.traj_dict = defaultdict(list)
    self.result_dict = {}
    self.result = ResultStat("reward", None)
    self.model = self.load_model(self.__coach, self.__executor, self.args)
    self.exec_sample = exec_sample
    print("Using pg {} algorithm".format(args.pg))

    if self.__trainable:
        # if args.split_train:
        #     self.executor_optimizer = optim.Adam(
        #         self.model.executor.parameters(), lr=args.lr
        #     )
        #     self.coach_optimizer = optim.Adam(
        #         self.model.coach.parameters(), lr=args.lr
        #     )
        # else:
        self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
        self.sa_buffer = StateActionBuffer(
            max_buffer_size=args.max_table_size,
            buffer_add_prob=args.sampling_freq,
        )
        # wandb.watch(self.model)
        self.pg = pg
    else:
        self.optimizer = None
        self.sa_buffer = None
        self.pg = None
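# Illustrative sketch only: the real StateActionBuffer is defined elsewhere in
# this repo. The hypothetical class below merely shows the interface assumed by
# the calls above (max_buffer_size, buffer_add_prob, len()): new state-action
# entries are admitted with probability buffer_add_prob (args.sampling_freq)
# and the table never grows beyond max_buffer_size. Method and field names here
# are assumptions, not the repo's API.
import random
from collections import defaultdict


class StateActionBufferSketch:
    def __init__(self, max_buffer_size, buffer_add_prob):
        self.max_buffer_size = max_buffer_size
        self.buffer_add_prob = buffer_add_prob
        self._table = defaultdict(list)  # game_id -> list of state-action pairs
        self._size = 0

    def __len__(self):
        return self._size

    def maybe_add(self, game_id, state_action):
        # Sample before adding, and respect the size cap.
        if self._size >= self.max_buffer_size:
            return False
        if random.random() >= self.buffer_add_prob:
            return False
        self._table[game_id].append(state_action)
        self._size += 1
        return True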
# Copyright (c) Facebook, Inc. and its affiliates.
device = torch.device('cuda:%d' % args.gpu)
model1 = load_model(args.coach1, args.executor1, args)
model2 = load_model(args.coach2, args.executor2, args)

game_option = get_game_option(args)
ai1_option, ai2_option = get_ai_options(
    args, [model1.coach.num_instructions, model2.coach.num_instructions])

context, act1_dc, act2_dc = create_game(
    args.num_thread, ai1_option, ai2_option, game_option)
context.start()
dc = DataChannelManager([act1_dc, act2_dc])

result1 = ResultStat('reward', None)
result2 = ResultStat('reward', None)

i = 0
while not context.terminated():
    i += 1
    if i % 1000 == 0:
        print('%d, progress agent1: win %d, loss %d'
              % (i, result1.win, result1.loss))

    data = dc.get_input(max_timeout_s=1)
    if len(data) == 0:
        continue

    for key in data:
        # print(key)
        batch = to_device(data[key], device)
        if key == 'act1':
device = torch.device('cuda:%d' % args.gpu)

coach = ConvRnnCoach.load(args.coach_path).to(device)
coach.max_raw_chars = args.max_raw_chars
executor = Executor.load(args.model_path).to(device)
executor_wrapper = ExecutorWrapper(
    coach, executor, coach.num_instructions, args.max_raw_chars,
    args.cheat, args.inst_mode)
executor_wrapper.train(False)

game_option = get_game_option(args)
ai1_option, ai2_option = get_ai_options(args, coach.num_instructions)

context, act_dc = create_game(
    args.num_thread, ai1_option, ai2_option, game_option)
context.start()
dc = DataChannelManager([act_dc])

result_stat = ResultStat('reward', None)
while not context.terminated():
    data = dc.get_input(max_timeout_s=1)
    if len(data) == 0:
        continue
    data = to_device(data['act'], device)
    result_stat.feed(data)
    reply = executor_wrapper.forward(data)
    dc.set_reply('act', reply)

print(result_stat.log(0))
dc.terminate()
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
import argparse
import os
import sys
import pprint

from set_path import append_sys_path
append_sys_path()

import torch
import tube
from pytube import DataChannelManager
import minirts
import numpy as np
import random
import pickle
from collections import defaultdict

from rnn_coach import ConvRnnCoach
from onehot_coach import ConvOneHotCoach
from rnn_generator import RnnGenerator
from itertools import groupby

from executor_wrapper import ExecutorWrapper
from executor import Executor
from common_utils import to_device, ResultStat, Logger
from best_models import best_executors, best_coaches
from tqdm import tqdm

p1dict = defaultdict(list)
def run_eval(args, model1, model2, device, num_games=100):
    num_eval_games = num_games
    result1 = ResultStat("reward", None)
    result2 = ResultStat("reward", None)

    game_option = get_game_option(args)
    ai1_option, ai2_option = get_ai_options(
        args, [model1.coach.num_instructions, model2.coach.num_instructions])

    if args.opponent == "sp":
        context, act1_dc, act2_dc = init_mt_games(
            num_eval_games, 0, args, ai1_option, ai2_option, game_option)
        pbar = tqdm(total=num_eval_games * 2)
    else:
        context, act1_dc, act2_dc = init_mt_games(
            0, num_eval_games, args, ai1_option, ai2_option, game_option)
        pbar = tqdm(total=num_eval_games)
    # context, act1_dc, act2_dc = init_games(
    #     num_eval_games, ai1_option, ai2_option, game_option)

    context.start()
    dc = DataChannelManager([act1_dc, act2_dc])

    i = 0
    model1.eval()
    model2.eval()
    while not context.terminated():
        i += 1
        # if i % 1000 == 0:
        #     print('%d, progress agent1: win %d, loss %d' % (i, result1.win, result1.loss))
        data = dc.get_input(max_timeout_s=1)
        if len(data) == 0:
            continue

        for key in data:
            # print(key)
            batch = to_device(data[key], device)
            if key == "act1":
                batch["actor"] = "act1"
                # Add batches to the state table, sampling before adding and
                # keying on the game_id.
                result1.feed(batch)
                with torch.no_grad():
                    reply, _ = model1.forward(batch)  # , exec_sample=True)
            elif key == "act2":
                batch["actor"] = "act2"
                result2.feed(batch)
                with torch.no_grad():
                    reply, _ = model2.forward(batch)
            else:
                assert False
            dc.set_reply(key, reply)

            # Count finished games to advance the progress bar. Use a separate
            # index so the outer step counter `i` is not shadowed.
            game_ids = batch["game_id"].cpu().numpy()
            terminals = batch["terminal"].cpu().numpy().flatten()
            for j, g_id in enumerate(game_ids):
                if terminals[j] == 1:
                    pbar.update(1)

    model1.eval()
    model2.eval()
    pbar.close()
    return result1, result2
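# Example usage (a hedged sketch, not taken from this repo): wiring run_eval
# into a main entry point. `parse_args` is a hypothetical stand-in for the
# script's argument parsing; load_model(coach, executor, args), run_eval, and
# ResultStat's win/loss/log interface come from the surrounding code.
if __name__ == "__main__":
    args = parse_args()  # hypothetical helper; replace with the script's parser
    device = torch.device("cuda:%d" % args.gpu)
    model1 = load_model(args.coach1, args.executor1, args)
    model2 = load_model(args.coach2, args.executor2, args)

    result1, result2 = run_eval(args, model1, model2, device, num_games=100)
    print(result1.log(0))
    print(result2.log(0))
    total = max(1, result1.win + result1.loss)
    print("agent1 win rate: %.2f%%" % (100.0 * result1.win / total))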