Example #1
class Trainer():  # class line assumed from Example #2, which shows the same __init__
    def __init__(self, config):
        with open(config, 'r') as f:
            config = json.load(f)

        self.epochs = config['train']['epochs']
        self.policy_epochs = config['train']['policy_epochs']
        self.test_iters = config['test']['iters']

        layers = config['model']['layers']
        conv_size = config['model']['conv_size']
        logheat = config['model']['logheat']
        self.net = ReversiNet(hidden_size=conv_size,
                              layers=layers,
                              logheat=logheat)

        env_samples = config['train']['env_samples']
        self.factory = RolloutFactory(self.net, env_samples)

        self.value_loss = nn.MSELoss()

        epsilon = config['train']['epsilon']
        self.ppo_low_bnd = 1 - epsilon
        self.ppo_up_bnd = 1 + epsilon

        lr = config['train']['lr']
        weight_decay = config['train']['weight_decay']
        self.optim = optim.Adam(self.net.parameters(),
                                lr=lr,
                                weight_decay=weight_decay)

        self.plosses = []
        self.vlosses = []
        self.avg_wins = []
        self.stand_time = []

        if torch.cuda.is_available():
            torch.cuda.set_device(1)
            self.net.cuda()
            self.device = torch.device("cuda")
            print("Using GPU")
        else:
            self.device = torch.device("cpu")
            print("No GPU detected")

        self.write_interval = config['model']['write_interval']
        self.train_info_path = config['model']['trainer_save_path']
        self.policy_path = config['model']['policy_save_path'].split('.pt')[0]
        self.graph_path = config['model']['graph_save_path'].split('.png')[0]
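
The constructor above reads only a handful of keys from the JSON file it is handed. A minimal sketch of a matching config, written from Python so the nesting is explicit; all values are placeholders rather than tuned settings, and the file name is hypothetical.

import json

# Placeholder config covering exactly the keys read by the __init__ above.
config = {
    "train": {"epochs": 100, "policy_epochs": 4, "env_samples": 64,
              "epsilon": 0.2, "lr": 1e-3, "weight_decay": 1e-4},
    "test": {"iters": 20},
    "model": {"layers": 4, "conv_size": 64, "logheat": 1.0,
              "write_interval": 10,
              "trainer_save_path": "checkpoints/trainer_info.pt",
              "policy_save_path": "checkpoints/policy.pt",
              "graph_save_path": "checkpoints/graph.png"},
}

with open("reversi_config.json", "w") as f:
    json.dump(config, f, indent=2)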
Example #2
class Trainer():
    """docstring for Trainer."""
    def __init__(self, config):
        with open(config, 'r') as f:
            config = json.load(f)

        self.epochs = config['train']['epochs']
        self.policy_epochs = config['train']['policy_epochs']
        self.test_iters = config['test']['iters']

        layers = config['model']['layers']
        conv_size = config['model']['conv_size']
        logheat = config['model']['logheat']
        self.net = ReversiNet(hidden_size=conv_size,
                              layers=layers,
                              logheat=logheat)

        env_samples = config['train']['env_samples']
        self.factory = RolloutFactory(self.net, env_samples)

        self.value_loss = nn.MSELoss()

        epsilon = config['train']['epsilon']
        self.ppo_low_bnd = 1 - epsilon
        self.ppo_up_bnd = 1 + epsilon

        lr = config['train']['lr']
        weight_decay = config['train']['weight_decay']
        self.optim = optim.Adam(self.net.parameters(),
                                lr=lr,
                                weight_decay=weight_decay)

        self.plosses = []
        self.vlosses = []
        self.avg_wins = []
        self.stand_time = []

        if torch.cuda.is_available():
            torch.cuda.set_device(1)
            self.net.cuda()
            self.device = torch.device("cuda")
            print("Using GPU")
        else:
            self.device = torch.device("cpu")
            print("No GPU detected")

        self.write_interval = config['model']['write_interval']
        self.train_info_path = config['model']['trainer_save_path']
        self.policy_path = config['model']['policy_save_path'].split('.pt')[0]
        self.graph_path = config['model']['graph_save_path'].split('.png')[0]

    def train(self, itr=0):
        acc = self.test()
        for i in range(self.epochs):
            avg_policy_loss = 0
            avg_val_loss = 0

            rollouts = self.factory.get_rollouts()

            # Update the policy
            experience_dataset = ExperienceDataset(rollouts)
            data_loader = DataLoader(experience_dataset,
                                     batch_size=256,
                                     shuffle=True,
                                     pin_memory=True)
            self.net.train()
            for _ in range(self.policy_epochs):
                avg_policy_loss = 0
                avg_val_loss = 0
                for state, aprob, value in data_loader:
                    state = _prepare_tensor_batch(state,
                                                  self.device).unsqueeze(1)
                    aprob = _prepare_tensor_batch(aprob, self.device)
                    value = _prepare_tensor_batch(value,
                                                  self.device).unsqueeze(1)

                    # Calculate the ratio term
                    pdist, pval = self.net(state)
                    policy_loss = loss_pi(aprob, pdist)
                    val_loss = loss_v(value, pval)

                    # For logging
                    avg_val_loss += val_loss.item()
                    avg_policy_loss += policy_loss.item()

                    # Backpropagate
                    self.optim.zero_grad()
                    loss = policy_loss + val_loss
                    loss.backward()
                    self.optim.step()

            # Log info
            avg_val_loss /= len(data_loader)
            avg_val_loss /= self.policy_epochs
            avg_policy_loss /= len(data_loader)
            avg_policy_loss /= self.policy_epochs
            self.vlosses.append(avg_val_loss)
            self.plosses.append(avg_policy_loss)

            if (itr + i) % self.write_interval == 0:
                acc = self.test()
                self.avg_wins.append(acc)
                print('itr: % i, avg wins: % 6.2f, value loss: % 6.2f, policy loss: % 6.2f'
                      % ((itr + i), acc, avg_val_loss, avg_policy_loss))
                self.write_out(itr + i)

    def test(self):
        self.net.eval()
        env = ReversiEnv()
        rounds = env.length() // 2
        tot_rew = 0
        tot_wins = 0
        runs = self.test_iters

        for _ in range(runs):
            state, turn = env.reset()
            actions = env.action_space()
            done = False
            for i in range(rounds):
                in_state = (torch.FloatTensor(state)
                            .unsqueeze(0).unsqueeze(0).to(self.device))
                probs, _ = self.net(in_state)
                probs = probs.squeeze().cpu().detach().numpy()
                action = sample(probs, actions)
                state, turn, reward, done = env.step(action)
                actions = env.action_space()
                # print('end p1')
                if done:
                    break

                probs = np.ones(actions.shape[0])
                action = sample(probs, actions)
                state, turn, reward, done = env.step(action)
                actions = env.action_space()
                # print('end p2')
                if done:
                    break

            # print(reward)
            tot_rew += reward
            if reward > 0:
                tot_wins += 1
            # elif reward == 0:
            #   tot_wins += 1
        tot_rew /= runs
        # print('Avg reward over {} runs: {}'.format(runs, tot_rew))
        # print('Wins: {}/{}: {}'.format(tot_wins, runs, tot_wins/runs))
        return tot_wins / runs

    def read_in(self, itr=None):
        train_info = {}
        train_info = torch.load(self.train_info_path)
        if itr is None:
            itr = train_info['iter']
        self.plosses = train_info['plosses']
        self.vlosses = train_info['vlosses']
        self.avg_wins = train_info['avg_wins']
        self.optim = train_info['optimizer']

        self.net.load_state_dict(
            torch.load(str(self.policy_path + '_' + str(itr) + '.pt')))
        print('loaded: ' + str(self.policy_path + '_' + str(itr) + '.pt'))

        self.epochs += itr
        return itr

    def write_out(self, itr):
        train_info = {}
        train_info['iter'] = itr
        train_info['plosses'] = self.plosses
        train_info['vlosses'] = self.vlosses
        train_info['avg_wins'] = self.avg_wins
        train_info['optimizer'] = self.optim
        torch.save(train_info, self.train_info_path)

        torch.save(self.net.state_dict(),
                   str(self.policy_path + '_' + str(itr) + '.pt'))

        if itr > 2:
            plt.plot(self.vlosses, label='value loss')
            plt.plot(self.plosses, label='policy loss')
            plt.legend()
            plt.xlabel('epochs')
            plt.ylabel('loss')
            plt.savefig(str(self.graph_path + '_loss.png'))
            plt.clf()

            plt.plot(self.avg_wins, label='avg wins')
            plt.legend()
            plt.xlabel('epochs')
            plt.ylabel('rewards')
            plt.savefig(str(self.graph_path + '_wins.png'))
            plt.clf()

    def run(self, cont=False):
        # check to see if we should continue from an existing checkpoint
        # otherwise start from scratch
        if cont:
            itr = self.read_in()
            print('continuing')
            self.train(itr)
        else:
            self.train()
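
A minimal usage sketch for the Trainer above, assuming a config file like the one sketched after Example #1 (the path and the cont flag value are hypothetical).

# Train from scratch; pass cont=True instead to resume from the checkpoint
# stored at trainer_save_path together with the matching policy weights.
trainer = Trainer('reversi_config.json')
trainer.run(cont=False)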
Example #3
class RLTrainer():
    """docstring for RLTrainer."""
    def __init__(self, config):
        with open(config, 'r') as f:
            config = json.load(f)

        self.epochs = config['train']['epochs']
        self.value_epochs = config['train']['value_epochs']
        self.policy_epochs = config['train']['policy_epochs']
        self.policy_batch_size = config['train']['policy_batch_size']

        state_size = config['model']['state_size']
        action_size = config['model']['action_size']
        self.action_size = action_size
        self.policy_net = Policy1D(state_size, action_size)

        self.value_loss = nn.MSELoss()

        epsilon = config['train']['epsilon']
        self.ppoloss = PPOLoss(epsilon)
        self.ppo_low_bnd = 1 - epsilon
        self.ppo_up_bnd = 1 + epsilon

        betas = (config['train']['betas1'], config['train']['betas2'])
        weight_decay = config['train']['weight_decay']
        lr = config['train']['lr']
        # params = chain(self.policy_net.parameters(), self.value_net.parameters())
        self.optim = optim.Adam(self.policy_net.parameters(),
                                lr=lr,
                                betas=betas,
                                weight_decay=weight_decay)

        self.plosses = []
        self.vlosses = []
        self.avg_rewards = []
        self.stand_time = []

        if torch.cuda.is_available():
            self.policy_net.cuda()
            # no separate value network in this class: Policy1D also predicts values
            self.device = torch.device("cuda")
            print("Using GPU")
        else:
            self.device = torch.device("cpu")
            print("No GPU detected")

        env = gym.make(config['model']['gym'])
        env_samples = config['train']['env_samples']
        episode_length = config['train']['episode_length']
        gamma = config['train']['gamma']
        self.rollFact = RolloutFactory(env,
                                       config['model']['gym'],
                                       self.policy_net,
                                       env_samples,
                                       episode_length,
                                       gamma,
                                       cutearly=config['train']['cutearly'])

        self.write_interval = config['model']['write_interval']
        self.train_info_path = config['model']['trainer_save_path']
        self.policy_path = config['model']['policy_save_path'].split('.pt')[0]
        self.value_path = config['model']['value_save_path'].split('.pt')[0]
        self.graph_path = config['model']['graph_save_path'].split('.png')[0]

    def train(self, itr=0):

        loop = tqdm(total=self.epochs, position=0, leave=False)

        for i in range(self.epochs):
            avg_r = 0
            avg_policy_loss = 0
            avg_val_loss = 0

            rollouts = self.rollFact.get_rollouts()
            for r1 in rollouts:
                for r2 in r1:
                    avg_r += r2[-2]
            avg_r /= len(rollouts)

            # Update the policy
            experience_dataset = ExperienceDataset(rollouts)
            data_loader = DataLoader(experience_dataset,
                                     batch_size=self.policy_batch_size,
                                     shuffle=True,
                                     pin_memory=True)
            for _ in range(self.policy_epochs):
                avg_policy_loss = 0
                avg_val_loss = 0
                for state, aprob, action, reward, value in data_loader:
                    state = _prepare_tensor_batch(state, self.device)
                    aprob = _prepare_tensor_batch(aprob, self.device)
                    action = _prepare_tensor_batch(action, self.device)
                    value = _prepare_tensor_batch(value,
                                                  self.device).unsqueeze(1)

                    # Calculate the ratio term
                    pdist, pval = self.policy_net(state, False)
                    clik = multinomial_likelihood(pdist, action)
                    olik = multinomial_likelihood(aprob, action)
                    ratio = (clik / olik)

                    # Calculate the value loss
                    val_loss = self.value_loss(pval, value)

                    # Calculate the policy loss
                    advantage = value - pval.detach()
                    lhs = ratio * advantage
                    rhs = torch.clamp(ratio, self.ppo_low_bnd,
                                      self.ppo_up_bnd) * advantage
                    policy_loss = -torch.mean(torch.min(lhs, rhs))

                    # For logging
                    avg_val_loss += val_loss.item()
                    avg_policy_loss += policy_loss.item()

                    # Backpropagate
                    self.optim.zero_grad()
                    loss = policy_loss + val_loss
                    loss.backward()
                    self.optim.step()

                # Log info
                avg_val_loss /= len(data_loader)
                avg_policy_loss /= len(data_loader)
                self.vlosses.append(avg_val_loss)
                self.plosses.append(avg_policy_loss)
                self.avg_rewards.append(avg_r)

                loop.set_description(
                    'avg reward: % 6.2f, value loss: % 6.2f, policy loss: % 6.2f'
                    % (avg_r, avg_val_loss, avg_policy_loss))
                loop.update(1)

                if (itr + i) % self.write_interval == 0:
                    self.write_out(itr + i)

    def read_in(self, itr=None):
        train_info = {}
        train_info = torch.load(self.train_info_path)
        if itr is None:
            itr = train_info['iter']
        self.plosses = train_info['plosses']
        self.vlosses = train_info['vlosses']
        self.avg_rewards = train_info['avg_reward']
        self.optim = train_info['optimizer']

        self.policy_net.load_state_dict(
            torch.load(str(self.policy_path + '_' + str(itr) + '.pt')))

        self.epochs += itr
        return itr

    def write_out(self, itr):
        train_info = {}
        train_info['iter'] = itr
        train_info['plosses'] = self.plosses
        train_info['vlosses'] = self.vlosses
        train_info['avg_reward'] = self.avg_rewards
        train_info['optimizer'] = self.optim
        # train_info['policy_optimizer'] = self.policy_optim
        # train_info['value_optimizer'] = self.value_optim
        torch.save(train_info, self.train_info_path)

        torch.save(self.policy_net.state_dict(),
                   str(self.policy_path + '_' + str(itr) + '.pt'))

        if itr > 2:
            plt.plot(self.vlosses[2:], label='value loss')
            plt.plot(self.plosses[2:], label='policy loss')
            plt.legend()
            plt.xlabel('epochs')
            plt.ylabel('loss')
            plt.savefig(str(self.graph_path + '_loss.png'))
            plt.clf()

            plt.plot(self.avg_rewards[2:], label='rewards')
            plt.legend()
            plt.xlabel('epochs')
            plt.ylabel('rewards')
            plt.savefig(str(self.graph_path + '_reward.png'))
            plt.clf()

    def run(self, cont=False):
        # check to see if we should continue from an existing checkpoint
        # otherwise start from scratch
        if cont:
            itr = self.read_in()
            print('continuing')
            self.train(itr)
        else:
            self.train()
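
The ratio term in the policy update above depends on multinomial_likelihood, which is not defined in this example; Example #5 below defines it as dist[range(dist.shape[0]), idx.long()[:, 0]].unsqueeze(1). A small self-contained sketch of how the ratio is formed, using toy tensors whose shapes are purely illustrative.

import torch

def multinomial_likelihood(dist, idx):
    # Probability of the chosen action under a categorical distribution
    # (same definition as the method in Example #5).
    return dist[range(dist.shape[0]), idx.long()[:, 0]].unsqueeze(1)

# Toy batch: 3 states, 4 discrete actions.
pdist = torch.softmax(torch.randn(3, 4), dim=1)   # current policy output
aprob = torch.softmax(torch.randn(3, 4), dim=1)   # policy that generated the rollout
action = torch.tensor([[0], [2], [1]])

clik = multinomial_likelihood(pdist, action)
olik = multinomial_likelihood(aprob, action)
ratio = clik / olik  # shape (3, 1); this is what gets clamped to [1 - epsilon, 1 + epsilon]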
Example #4
class RLTrainer():  # class line assumed from Example #3, which shows the same __init__
    def __init__(self, config):
        with open(config, 'r') as f:
            config = json.load(f)

        self.epochs = config['train']['epochs']
        self.value_epochs = config['train']['value_epochs']
        self.policy_epochs = config['train']['policy_epochs']
        self.policy_batch_size = config['train']['policy_batch_size']

        state_size = config['model']['state_size']
        action_size = config['model']['action_size']
        self.action_size = action_size
        self.policy_net = Policy1D(state_size, action_size)

        self.value_loss = nn.MSELoss()

        epsilon = config['train']['epsilon']
        self.ppoloss = PPOLoss(epsilon)
        self.ppo_low_bnd = 1 - epsilon
        self.ppo_up_bnd = 1 + epsilon

        betas = (config['train']['betas1'], config['train']['betas2'])
        weight_decay = config['train']['weight_decay']
        lr = config['train']['lr']
        # params = chain(self.policy_net.parameters(), self.value_net.parameters())
        self.optim = optim.Adam(self.policy_net.parameters(),
                                lr=lr,
                                betas=betas,
                                weight_decay=weight_decay)

        self.plosses = []
        self.vlosses = []
        self.avg_rewards = []
        self.stand_time = []

        if torch.cuda.is_available():
            self.policy_net.cuda()
            # no separate value network in this class: Policy1D also predicts values
            self.device = torch.device("cuda")
            print("Using GPU")
        else:
            self.device = torch.device("cpu")
            print("No GPU detected")

        env = gym.make(config['model']['gym'])
        env_samples = config['train']['env_samples']
        episode_length = config['train']['episode_length']
        gamma = config['train']['gamma']
        self.rollFact = RolloutFactory(env,
                                       config['model']['gym'],
                                       self.policy_net,
                                       env_samples,
                                       episode_length,
                                       gamma,
                                       cutearly=config['train']['cutearly'])

        self.write_interval = config['model']['write_interval']
        self.train_info_path = config['model']['trainer_save_path']
        self.policy_path = config['model']['policy_save_path'].split('.pt')[0]
        self.value_path = config['model']['value_save_path'].split('.pt')[0]
        self.graph_path = config['model']['graph_save_path'].split('.png')[0]
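
As in Example #1, this constructor is driven entirely by the config keys it reads. A sketch of a matching config for the gym-based trainer; every value is a placeholder and the gym id is only illustrative.

import json

# Placeholder config covering the keys read by the __init__ above.
config = {
    "train": {"epochs": 200, "value_epochs": 1, "policy_epochs": 4,
              "policy_batch_size": 256, "epsilon": 0.2,
              "betas1": 0.9, "betas2": 0.999, "weight_decay": 1e-4,
              "lr": 3e-4, "env_samples": 32, "episode_length": 200,
              "gamma": 0.99, "cutearly": True},
    "model": {"state_size": 4, "action_size": 2, "gym": "CartPole-v1",
              "write_interval": 10,
              "trainer_save_path": "checkpoints/trainer_info.pt",
              "policy_save_path": "checkpoints/policy.pt",
              "value_save_path": "checkpoints/value.pt",
              "graph_save_path": "checkpoints/graph.png"},
}

with open("gym_config.json", "w") as f:
    json.dump(config, f, indent=2)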
Example #5
class RLTrainer():
    """docstring for RLTrainer."""
    def __init__(self, config):
        with open(config, 'r') as f:
            config = json.load(f)

        self.epochs = config['train']['epochs']
        self.env_samples = config['train']['env_samples']
        self.episode_length = config['train']['episode_length']
        self.gamma = config['train']['gamma']
        self.value_epochs = config['train']['value_epochs']
        self.policy_epochs = config['train']['policy_epochs']
        self.batch_size = config['train']['batch_size']
        self.policy_batch_size = config['train']['policy_batch_size']
        epsilon = config['train']['epsilon']

        self.env = gym.make(config['model']['gym'])

        state_size = config['model']['state_size']
        action_size = config['model']['action_size']
        hidden_size = config['model']['hidden_size']
        layer_size = config['model']['hidden_layers']
        logheat = config['model']['logheat']
        self.action_size = action_size
        self.policy_net = Policy1D(state_size,
                                   action_size,
                                   hidden_size=hidden_size,
                                   layers=layer_size,
                                   logheat=logheat)
        self.value_net = Value1D(state_size,
                                 hidden_size=hidden_size,
                                 layers=layer_size)

        self.value_loss = nn.MSELoss()
        self.ppoloss = PPOLoss(epsilon)

        betas = (config['train']['betas1'], config['train']['betas2'])
        weight_decay = config['train']['weight_decay']
        lr = config['train']['lr']
        params = chain(self.policy_net.parameters(),
                       self.value_net.parameters())
        self.optim = optim.Adam(params,
                                lr=lr,
                                betas=betas,
                                weight_decay=weight_decay)

        self.plosses = []
        self.vlosses = []
        self.avg_reward = []

        if torch.cuda.is_available():
            self.policy_net.cuda()
            self.value_net.cuda()
            self.device = torch.device("cuda")
            print("Using GPU")
        else:
            self.device = torch.device("cpu")
            print("No GPU detected")

        self.rollFact = RolloutFactory(self.env,
                                       config['model']['gym'],
                                       self.policy_net,
                                       self.env_samples,
                                       self.episode_length,
                                       self.gamma,
                                       cutearly=config['train']['cutearly'])

        self.write_interval = config['model']['write_interval']
        self.train_info_path = config['model']['trainer_save_path']
        self.policy_path = config['model']['policy_save_path'].split('.pt')[0]
        self.value_path = config['model']['value_save_path'].split('.pt')[0]
        self.gif_path = config['model']['gif_save_path'].split('.gif')[0]
        self.graph_path = config['model']['graph_save_path'].split('.png')[0]

    def train(self, itr=0):
        for i in range(self.epochs):
            # generate rollouts
            rollouts = self.rollFact.get_rollouts()

            # Learn a policy
            vlosses = []
            plosses = []
            dataset = RLDataset(rollouts)
            dataloader = DataLoader(dataset,
                                    batch_size=self.policy_batch_size,
                                    shuffle=True,
                                    pin_memory=True)
            for _ in range(self.policy_epochs):
                # train policy network
                for state, aprob, action, reward, value in dataloader:
                    state, aprob = state.to(self.device), aprob.to(self.device)
                    action, value = action.to(self.device), value.to(
                        self.device)

                    pdist = self.policy_net(state)
                    clik = self.multinomial_likelihood(pdist, action)
                    olik = self.multinomial_likelihood(aprob, action)
                    ratio = (clik / olik)

                    pval = self.value_net(state)
                    vloss = self.value_loss(pval, value)
                    vlosses.append(vloss.cpu().item())

                    advantage = value - pval.detach()
                    ploss = self.ppoloss(ratio, advantage)
                    plosses.append(ploss.cpu().item())

                    self.optim.zero_grad()
                    loss = ploss + vloss
                    loss.backward()
                    self.optim.step()
                    gc.collect()
            self.vlosses.append(np.mean(vlosses))
            self.plosses.append(np.mean(plosses))

            if (itr + i) % self.write_interval == 0:
                self.avg_reward = self.rollFact.avg_reward
                print(
                    'iter: {}, avg reward: {}, vloss: {}, ploss: {}, avg_len: {}'
                    .format(itr + i, self.avg_reward[-1], vloss, ploss,
                            len(rollouts[-1])))
                self.write_out(itr + i)

            # print(torch.cuda.memory_allocated(0) / 1e9)

    def multinomial_likelihood(self, dist, idx):
        return dist[range(dist.shape[0]), idx.long()[:, 0]].unsqueeze(1)

    def read_in(self, itr=None):
        train_info = {}
        train_info = torch.load(self.train_info_path)
        if itr is None:
            itr = train_info['iter']
        self.plosses = train_info['plosses']
        self.vlosses = train_info['vlosses']
        self.avg_reward = train_info['avg_reward']
        self.optim = train_info['optimizer']
        # self.policy_optim = train_info['policy_optimizer']
        # self.value_optim = train_info['value_optimizer']

        self.policy_net.load_state_dict(
            torch.load(str(self.policy_path + '_' + str(itr) + '.pt')))

        self.value_net.load_state_dict(
            torch.load(str(self.value_path + '_' + str(itr) + '.pt')))

        self.epochs += itr
        return itr

    def write_out(self, itr):
        train_info = {}
        train_info['iter'] = itr
        train_info['plosses'] = self.plosses
        train_info['vlosses'] = self.vlosses
        train_info['avg_reward'] = self.avg_reward
        train_info['optimizer'] = self.optim
        # train_info['policy_optimizer'] = self.policy_optim
        # train_info['value_optimizer'] = self.value_optim
        torch.save(train_info, self.train_info_path)

        torch.save(self.policy_net.state_dict(),
                   str(self.policy_path + '_' + str(itr) + '.pt'))

        torch.save(self.value_net.state_dict(),
                   str(self.value_path + '_' + str(itr) + '.pt'))

        if itr > 2:
            plt.plot(self.vlosses[2:], label='value loss')
            plt.plot(self.plosses[2:], label='policy loss')
            plt.legend()
            plt.xlabel('epochs')
            plt.ylabel('loss')
            plt.savefig(str(self.graph_path + '_loss.png'))
            plt.clf()

            plt.plot(self.avg_reward[2:], label='rewards')
            plt.legend()
            plt.xlabel('epochs')
            plt.ylabel('rewards')
            plt.savefig(str(self.graph_path + '_reward.png'))
            plt.clf()

    def run(self, cont=False):
        # check to see if we should continue from an existing checkpoint
        # otherwise start from scratch
        if cont:
            itr = self.read_in()
            print('continuing')
            self.train(itr)
        else:
            self.train()
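
PPOLoss is constructed with epsilon and called as self.ppoloss(ratio, advantage), but its definition does not appear on this page. A minimal sketch of what it could look like, assuming it mirrors the inline clipped objective from Example #3; this is a reconstruction, not the project's actual module.

import torch
import torch.nn as nn

class PPOLoss(nn.Module):
    """Hypothetical reconstruction of the clipped PPO surrogate loss,
    matching the call signature self.ppoloss(ratio, advantage) used above."""

    def __init__(self, epsilon):
        super().__init__()
        self.low = 1 - epsilon
        self.high = 1 + epsilon

    def forward(self, ratio, advantage):
        lhs = ratio * advantage
        rhs = torch.clamp(ratio, self.low, self.high) * advantage
        return -torch.mean(torch.min(lhs, rhs))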