Example #1
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms

# Assumes `args` (parsed command-line arguments), the `Net` model class and the
# `train`/`test` helper functions are defined elsewhere in the script.


def main():
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    torch.manual_seed(args.seed)

    # Fetch the MNIST data and build the train/test loaders,
    # normalizing with the standard MNIST mean and std.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True, transform=transform),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transform),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    # Decay the learning rate by a factor of `gamma` after every epoch.
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
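The example above leaves out the `Net` module and the `train`/`test` helpers it calls. A minimal sketch of what they could look like for MNIST is given below; these are hypothetical definitions written for illustration, not the ones from the original script.

import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small CNN for 28x28 grayscale MNIST digits (hypothetical architecture)."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3)         # 28x28 -> 26x26
        self.conv2 = nn.Conv2d(32, 64, 3)        # 26x26 -> 24x24
        self.fc1 = nn.Linear(64 * 12 * 12, 128)  # after 2x2 max pooling
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return F.log_softmax(self.fc2(x), dim=1)


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print(f"epoch {epoch} batch {batch_idx} loss {loss.item():.4f}")


def test(args, model, device, test_loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            pred = model(data).argmax(dim=1)
            correct += (pred == target).sum().item()
    print(f"test accuracy: {correct / len(test_loader.dataset):.4f}")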
Example #2
import numpy as np
import torch
from torch.distributions import Categorical

# Assumes `Net` is a policy network (state -> action probabilities) defined elsewhere.


class PolicyGradient:
    def __init__(
        self,
        s_dim,
        a_num,
        device,
        hidden,
        lr,
        gamma,
    ):
        # parameter initialization
        self.s_dim = s_dim
        self.a_num = a_num
        self.device = device
        self.hidden = hidden
        self.lr = lr
        self.gamma = gamma

        # network initialization
        self.net = Net(s_dim, hidden, a_num).to(self.device)
        self.opt = torch.optim.Adam(self.net.parameters(), lr=lr)

        # REINFORCE is on-policy, so the memory only needs to hold one trajectory
        self.memory_s = []
        self.memory_a = []
        self.memory_r = []

    def get_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        prob_weights = self.net(s)
        # sample an action from the policy's action probabilities
        dist = Categorical(prob_weights)
        action = dist.sample().item()
        return action

    def store_transition(self, s, a, r):
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_r.append(r)

    def learn(self):
        discounted_r = self._discounted_r(self.memory_r)
        s = torch.FloatTensor(np.array(self.memory_s)).to(self.device)
        a = torch.LongTensor(self.memory_a).to(self.device)
        r = torch.FloatTensor(discounted_r).to(self.device)
        # policy-gradient loss: maximize the sum of log pi(a|s) weighted by the return
        prob = self.net(s)
        dist = Categorical(prob)
        loss = -torch.sum(dist.log_prob(a) * r)
        # train on the whole episode
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        # empty the episode memory
        self.memory_s = []
        self.memory_a = []
        self.memory_r = []

    def _discounted_r(self, r):
        # compute the discounted return for every step of the episode, back to front
        length = len(r)
        discounted_r = np.zeros(length)
        running_add = 0
        for t in range(length - 1, -1, -1):
            running_add = r[t] + running_add * self.gamma
            discounted_r[t] = running_add
        # normalize episode returns (epsilon avoids division by zero)
        discounted_r -= np.mean(discounted_r)
        discounted_r /= np.std(discounted_r) + 1e-8
        return discounted_r
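The class assumes a `Net` policy network that is not shown. Below is a minimal, hypothetical sketch of such a network together with a training loop on CartPole via gymnasium; the network architecture and the environment choice are assumptions added for illustration, not part of the original example.

import gymnasium as gym
import torch.nn as nn


class Net(nn.Module):
    """Hypothetical policy network: maps a state to a vector of action probabilities."""

    def __init__(self, s_dim, hidden, a_num):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(s_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, a_num),
            nn.Softmax(dim=-1),
        )

    def forward(self, s):
        return self.layers(s)


env = gym.make("CartPole-v1")
agent = PolicyGradient(
    s_dim=env.observation_space.shape[0],
    a_num=env.action_space.n,
    device="cpu",
    hidden=64,
    lr=1e-3,
    gamma=0.99,
)

for episode in range(200):
    s, _ = env.reset()
    done = False
    while not done:
        a = agent.get_action(s)
        s_next, r, terminated, truncated, _ = env.step(a)
        done = terminated or truncated
        agent.store_transition(s, a, r)
        s = s_next
    # one gradient update per finished episode (REINFORCE is on-policy)
    agent.learn()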