class Trainer(): """docstring for Trainer.""" def __init__(self, config): with open(config, 'r') as f: config = json.load(f) self.epochs = config['train']['epochs'] self.policy_epochs = config['train']['policy_epochs'] self.test_iters = config['test']['iters'] layers = config['model']['layers'] conv_size = config['model']['conv_size'] logheat = config['model']['logheat'] self.net = ReversiNet(hidden_size=conv_size, layers=layers, logheat=logheat) env_samples = config['train']['env_samples'] self.factory = RolloutFactory(self.net, env_samples) self.value_loss = nn.MSELoss() epsilon = config['train']['epsilon'] self.ppo_low_bnd = 1 - epsilon self.ppo_up_bnd = 1 + epsilon lr = config['train']['lr'] weight_decay = config['train']['weight_decay'] self.optim = optim.Adam(self.net.parameters(), lr=lr, weight_decay=weight_decay) self.plosses = [] self.vlosses = [] self.avg_wins = [] self.stand_time = [] if torch.cuda.is_available(): torch.cuda.set_device(1) self.net.cuda() self.device = torch.device("cuda") print("Using GPU") else: self.device = torch.device("cpu") print("No GPU detected") self.write_interval = config['model']['write_interval'] self.train_info_path = config['model']['trainer_save_path'] self.policy_path = config['model']['policy_save_path'].split('.pt')[0] self.graph_path = config['model']['graph_save_path'].split('.png')[0] def train(self, itr=0): acc = self.test() for i in range(self.epochs): avg_policy_loss = 0 avg_val_loss = 0 rollouts = self.factory.get_rollouts() # Update the policy experience_dataset = ExperienceDataset(rollouts) data_loader = DataLoader(experience_dataset, batch_size=256, shuffle=True, pin_memory=True) self.net.train() for _ in range(self.policy_epochs): avg_policy_loss = 0 avg_val_loss = 0 for state, aprob, value in data_loader: state = _prepare_tensor_batch(state, self.device).unsqueeze(1) aprob = _prepare_tensor_batch(aprob, self.device) value = _prepare_tensor_batch(value, self.device).unsqueeze(1) # Calculate the ratio term pdist, pval = self.net(state) policy_loss = loss_pi(aprob, pdist) val_loss = loss_v(value, pval) # For logging avg_val_loss += val_loss.item() avg_policy_loss += policy_loss.item() # Backpropagate self.optim.zero_grad() loss = policy_loss + val_loss loss.backward() self.optim.step() # Log info avg_val_loss /= len(data_loader) avg_val_loss /= self.policy_epochs avg_policy_loss /= len(data_loader) avg_policy_loss /= self.policy_epochs self.vlosses.append(avg_val_loss) self.plosses.append(avg_policy_loss) if (itr + i) % self.write_interval == 0: acc = self.test() self.avg_wins.append(acc) print( 'itr: % i, avg wins: % 6.2f, value loss: % 6.2f, policy loss: % 6.2f' \ % ((itr+i), acc, avg_val_loss, avg_policy_loss) ) self.write_out(itr + i) def test(self): self.net.eval() env = ReversiEnv() rounds = env.length() // 2 tot_rew = 0 tot_wins = 0 runs = self.test_iters for _ in range(runs): state, turn = env.reset() actions = env.action_space() done = False for i in range(rounds): in_state = torch.FloatTensor(state).unsqueeze(0).unsqueeze( 0).to(self.device) probs, _ = self.net(in_state) probs = probs.squeeze().cpu().detach().numpy() action = sample(probs, actions) state, turn, reward, done = env.step(action) actions = env.action_space() # print('end p1') if done: break probs = np.ones(actions.shape[0]) action = sample(probs, actions) state, turn, reward, done = env.step(action) actions = env.action_space() # print('end p2') if done: break # print(reward) tot_rew += reward if reward > 0: tot_wins += 1 # elif reward == 0: # tot_wins += 1 
tot_rew /= runs # print('Avg reward over {} runs: {}'.format(runs, tot_rew)) # print('Wins: {}/{}: {}'.format(tot_wins, runs, tot_wins/runs)) return tot_wins / runs def read_in(self, itr=None): train_info = {} train_info = torch.load(self.train_info_path) if itr is None: itr = train_info['iter'] self.plosses = train_info['plosses'] self.vlosses = train_info['vlosses'] self.avg_wins = train_info['avg_wins'] self.optim = train_info['optimizer'] self.net.load_state_dict( torch.load(str(self.policy_path + '_' + str(itr) + '.pt'))) print('loaded: ' + str(self.policy_path + '_' + str(itr) + '.pt')) self.epochs += itr return itr def write_out(self, itr): train_info = {} train_info['iter'] = itr train_info['plosses'] = self.plosses train_info['vlosses'] = self.vlosses train_info['avg_wins'] = self.avg_wins train_info['optimizer'] = self.optim torch.save(train_info, self.train_info_path) torch.save(self.net.state_dict(), str(self.policy_path + '_' + str(itr) + '.pt')) if itr > 2: plt.plot(self.vlosses, label='value loss') plt.plot(self.plosses, label='policy loss') plt.legend() plt.xlabel('epochs') plt.ylabel('loss') plt.savefig(str(self.graph_path + '_loss.png')) plt.clf() plt.plot(self.avg_wins, label='avg wins') plt.legend() plt.xlabel('epochs') plt.ylabel('rewards') plt.savefig(str(self.graph_path + '_wins.png')) plt.clf() def run(self, cont=False): # check to see if we should continue from an existing checkpoint # otherwise start from scratch if cont: itr = self.read_in() print('continuing') self.train(itr) else: self.train()
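
# Example config layout consumed by Trainer.__init__ above. This is an
# illustrative sketch only: the key names come from the lookups in the
# constructor, but every value and path shown here is a made-up placeholder,
# not a setting shipped with this repo.
#
# {
#     "train": {
#         "epochs": 100,
#         "policy_epochs": 4,
#         "env_samples": 64,
#         "epsilon": 0.2,
#         "lr": 1e-3,
#         "weight_decay": 1e-4
#     },
#     "test": {
#         "iters": 20
#     },
#     "model": {
#         "layers": 5,
#         "conv_size": 64,
#         "logheat": 1.0,
#         "write_interval": 10,
#         "trainer_save_path": "checkpoints/trainer.info",
#         "policy_save_path": "checkpoints/policy.pt",
#         "graph_save_path": "graphs/training.png"
#     }
# }
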
class RLTrainer(): """docstring for RLTrainer.""" def __init__(self, config): with open(config, 'r') as f: config = json.load(f) self.epochs = config['train']['epochs'] self.value_epochs = config['train']['value_epochs'] self.policy_epochs = config['train']['policy_epochs'] self.policy_batch_size = config['train']['policy_batch_size'] state_size = config['model']['state_size'] action_size = config['model']['action_size'] self.action_size = action_size self.policy_net = Policy1D(state_size, action_size) self.value_loss = nn.MSELoss() epsilon = config['train']['epsilon'] self.ppoloss = PPOLoss(epsilon) self.ppo_low_bnd = 1 - epsilon self.ppo_up_bnd = 1 + epsilon betas = (config['train']['betas1'], config['train']['betas2']) weight_decay = config['train']['weight_decay'] lr = config['train']['lr'] # params = chain(self.policy_net.parameters(), self.value_net.parameters()) self.optim = optim.Adam(self.policy_net.parameters(), lr=lr, betas=betas, weight_decay=weight_decay) self.plosses = [] self.vlosses = [] self.avg_rewards = [] self.stand_time = [] if torch.cuda.is_available(): self.policy_net.cuda() self.value_net.cuda() self.device = torch.device("cuda") print("Using GPU") else: self.device = torch.device("cpu") print("No GPU detected") env = gym.make(config['model']['gym']) env_samples = config['train']['env_samples'] episode_length = config['train']['episode_length'] gamma = config['train']['gamma'] self.rollFact = RolloutFactory(env, config['model']['gym'], self.policy_net, env_samples, episode_length, gamma, cutearly=config['train']['cutearly']) self.write_interval = config['model']['write_interval'] self.train_info_path = config['model']['trainer_save_path'] self.policy_path = config['model']['policy_save_path'].split('.pt')[0] self.value_path = config['model']['value_save_path'].split('.pt')[0] self.graph_path = config['model']['graph_save_path'].split('.png')[0] def train(self, itr=0): loop = tqdm(total=self.epochs, position=0, leave=False) for i in range(self.epochs): avg_r = 0 avg_policy_loss = 0 avg_val_loss = 0 rollouts = self.rollFact.get_rollouts() for r1 in rollouts: for r2 in r1: avg_r += r2[-2] avg_r /= len(rollouts) # Update the policy experience_dataset = ExperienceDataset(rollouts) data_loader = DataLoader(experience_dataset, batch_size=self.policy_batch_size, shuffle=True, pin_memory=True) for _ in range(self.policy_epochs): avg_policy_loss = 0 avg_val_loss = 0 for state, aprob, action, reward, value in data_loader: state = _prepare_tensor_batch(state, self.device) aprob = _prepare_tensor_batch(aprob, self.device) action = _prepare_tensor_batch(action, self.device) value = _prepare_tensor_batch(value, self.device).unsqueeze(1) # Calculate the ratio term pdist, pval = self.policy_net(state, False) clik = multinomial_likelihood(pdist, action) olik = multinomial_likelihood(aprob, action) ratio = (clik / olik) # Calculate the value loss val_loss = self.value_loss(pval, value) # Calculate the policy loss advantage = value - pval.detach() lhs = ratio * advantage rhs = torch.clamp(ratio, self.ppo_low_bnd, self.ppo_up_bnd) * advantage policy_loss = -torch.mean(torch.min(lhs, rhs)) # For logging avg_val_loss += val_loss.item() avg_policy_loss += policy_loss.item() # Backpropagate self.optim.zero_grad() loss = policy_loss + val_loss loss.backward() self.optim.step() # Log info avg_val_loss /= len(data_loader) avg_policy_loss /= len(data_loader) self.vlosses.append(avg_val_loss) self.plosses.append(avg_policy_loss) self.avg_rewards.append(avg_r) loop.set_description( 'avg reward: % 
6.2f, value loss: % 6.2f, policy loss: % 6.2f' \ % (avg_r, avg_val_loss, avg_policy_loss)) loop.update(1) if (itr + i) % self.write_interval == 0: self.write_out(itr + i) def read_in(self, itr=None): train_info = {} train_info = torch.load(self.train_info_path) if itr is None: itr = train_info['iter'] self.plosses = train_info['plosses'] self.vlosses = train_info['vlosses'] self.avg_rewards = train_info['avg_reward'] self.optim = train_info['optimizer'] self.policy_net.load_state_dict( torch.load(str(self.policy_path + '_' + str(itr) + '.pt'))) self.epochs += itr return itr def write_out(self, itr): train_info = {} train_info['iter'] = itr train_info['plosses'] = self.plosses train_info['vlosses'] = self.vlosses train_info['avg_reward'] = self.avg_rewards train_info['optimizer'] = self.optim # train_info['policy_optimizer'] = self.policy_optim # train_info['value_optimizer'] = self.value_optim torch.save(train_info, self.train_info_path) torch.save(self.policy_net.state_dict(), str(self.policy_path + '_' + str(itr) + '.pt')) if itr > 2: plt.plot(self.vlosses[2:], label='value loss') plt.plot(self.plosses[2:], label='policy loss') plt.legend() plt.xlabel('epochs') plt.ylabel('loss') plt.savefig(str(self.graph_path + '_loss.png')) plt.clf() plt.plot(self.avg_rewards[2:], label='rewards') plt.legend() plt.xlabel('epochs') plt.ylabel('rewards') plt.savefig(str(self.graph_path + '_reward.png')) plt.clf() def run(self, cont=False): # check to see if we should continue from an existing checkpoint # otherwise start from scratch if cont: itr = self.read_in() print('continuing') self.train(itr) else: self.train()
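
# The clipped-surrogate computation inlined in RLTrainer.train() above can be
# read as a standalone function. This is an illustrative sketch only: the name
# `clipped_ppo_loss` is hypothetical (the class keeps the logic inline), and it
# assumes `torch` is already imported at module level, as the classes above do.
def clipped_ppo_loss(ratio, advantage, low_bnd, up_bnd):
    """PPO clipped surrogate: -mean(min(r * A, clip(r, 1-eps, 1+eps) * A)).

    `ratio` is the likelihood ratio between the current and old policies and
    `advantage` is computed above as value - pval.detach().
    """
    lhs = ratio * advantage
    rhs = torch.clamp(ratio, low_bnd, up_bnd) * advantage
    return -torch.mean(torch.min(lhs, rhs))
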
class RLTrainer(): """docstring for RLTrainer.""" def __init__(self, config): with open(config, 'r') as f: config = json.load(f) self.epochs = config['train']['epochs'] self.env_samples = config['train']['env_samples'] self.episode_length = config['train']['episode_length'] self.gamma = config['train']['gamma'] self.value_epochs = config['train']['value_epochs'] self.policy_epochs = config['train']['policy_epochs'] self.batch_size = config['train']['batch_size'] self.policy_batch_size = config['train']['policy_batch_size'] epsilon = config['train']['epsilon'] self.env = gym.make(config['model']['gym']) state_size = config['model']['state_size'] action_size = config['model']['action_size'] hidden_size = config['model']['hidden_size'] layer_size = config['model']['hidden_layers'] logheat = config['model']['logheat'] self.action_size = action_size self.policy_net = Policy1D(state_size, action_size, hidden_size=hidden_size, layers=layer_size, logheat=logheat) self.value_net = Value1D(state_size, hidden_size=hidden_size, layers=layer_size) self.value_loss = nn.MSELoss() self.ppoloss = PPOLoss(epsilon) betas = (config['train']['betas1'], config['train']['betas2']) weight_decay = config['train']['weight_decay'] lr = config['train']['lr'] params = chain(self.policy_net.parameters(), self.value_net.parameters()) self.optim = optim.Adam(params, lr=lr, betas=betas, weight_decay=weight_decay) self.plosses = [] self.vlosses = [] self.avg_reward = [] if torch.cuda.is_available(): self.policy_net.cuda() self.value_net.cuda() self.device = torch.device("cuda") print("Using GPU") else: self.device = torch.device("cpu") print("No GPU detected") self.rollFact = RolloutFactory(self.env, config['model']['gym'], self.policy_net, self.env_samples, self.episode_length, self.gamma, cutearly=config['train']['cutearly']) self.write_interval = config['model']['write_interval'] self.train_info_path = config['model']['trainer_save_path'] self.policy_path = config['model']['policy_save_path'].split('.pt')[0] self.value_path = config['model']['value_save_path'].split('.pt')[0] self.gif_path = config['model']['gif_save_path'].split('.gif')[0] self.graph_path = config['model']['graph_save_path'].split('.png')[0] def train(self, itr=0): for i in range(self.epochs): # generate rollouts rollouts = self.rollFact.get_rollouts() # Learn a policy vlosses = [] plosses = [] dataset = RLDataset(rollouts) dataloader = DataLoader(dataset, batch_size=self.policy_batch_size, shuffle=True, pin_memory=True) for _ in range(self.policy_epochs): # train policy network for state, aprob, action, reward, value in dataloader: state, aprob = state.to(self.device), aprob.to(self.device) action, value = action.to(self.device), value.to( self.device) pdist = self.policy_net(state) clik = self.multinomial_likelihood(pdist, action) olik = self.multinomial_likelihood(aprob, action) ratio = (clik / olik) pval = self.value_net(state) vloss = self.value_loss(pval, value) vlosses.append(vloss.cpu().item()) advantage = value - pval.detach() ploss = self.ppoloss(ratio, advantage) plosses.append(ploss.cpu().item()) self.optim.zero_grad() loss = ploss + vloss loss.backward() self.optim.step() gc.collect() self.vlosses.append(np.mean(vlosses)) self.plosses.append(np.mean(plosses)) if (itr + i) % self.write_interval == 0: self.avg_reward = self.rollFact.avg_reward print( 'iter: {}, avg reward: {}, vloss: {}, ploss: {}, avg_len: {}' .format(itr + i, self.avg_reward[-1], vloss, ploss, len(rollouts[-1]))) self.write_out(itr + i) # 
print(torch.cuda.memory_allocated(0) / 1e9) def multinomial_likelihood(self, dist, idx): return dist[range(dist.shape[0]), idx.long()[:, 0]].unsqueeze(1) def read_in(self, itr=None): train_info = {} train_info = torch.load(self.train_info_path) if itr is None: itr = train_info['iter'] self.plosses = train_info['plosses'] self.vlosses = train_info['vlosses'] self.avg_reward = train_info['avg_reward'] self.optim = train_info['optimizer'] # self.policy_optim = train_info['policy_optimizer'] # self.value_optim = train_info['value_optimizer'] self.policy_net.load_state_dict( torch.load(str(self.policy_path + '_' + str(itr) + '.pt'))) self.value_net.load_state_dict( torch.load(str(self.value_path + '_' + str(itr) + '.pt'))) self.epochs += itr return itr def write_out(self, itr): train_info = {} train_info['iter'] = itr train_info['plosses'] = self.plosses train_info['vlosses'] = self.vlosses train_info['avg_reward'] = self.avg_reward train_info['optimizer'] = self.optim # train_info['policy_optimizer'] = self.policy_optim # train_info['value_optimizer'] = self.value_optim torch.save(train_info, self.train_info_path) torch.save(self.policy_net.state_dict(), str(self.policy_path + '_' + str(itr) + '.pt')) torch.save(self.value_net.state_dict(), str(self.value_path + '_' + str(itr) + '.pt')) if itr > 2: plt.plot(self.vlosses[2:], label='value loss') plt.plot(self.plosses[2:], label='policy loss') plt.legend() plt.xlabel('epochs') plt.ylabel('loss') plt.savefig(str(self.graph_path + '_loss.png')) plt.clf() plt.plot(self.avg_reward[2:], label='rewards') plt.legend() plt.xlabel('epochs') plt.ylabel('rewards') plt.savefig(str(self.graph_path + '_reward.png')) plt.clf() def run(self, cont=False): # check to see if we should continue from an existing checkpoint # otherwise start from scratch if cont: itr = self.read_in() print('continuing') self.train(itr) else: self.train()
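
# Minimal launch sketch, assuming this module is run directly with a JSON
# config like the one sketched after the Trainer class. The argparse flags and
# the idea of a command-line entry point are illustrative assumptions, not
# part of the original repo.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train a PPO agent')
    parser.add_argument('config', help='path to a JSON config file')
    parser.add_argument('--cont', action='store_true',
                        help='resume from the checkpoint in trainer_save_path')
    args = parser.parse_args()

    trainer = RLTrainer(args.config)
    trainer.run(cont=args.cont)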