def prepare_train_data_method(self, window_data: Optional[List] = None,
                              word2index: Optional[dict] = None,
                              weighting_dic: Optional[dict] = None,
                              X_ik: Optional[dict] = None):
    # Use None defaults instead of mutable defaults ([] / {}), which would be
    # shared across calls.
    window_data = window_data if window_data is not None else []
    word2index = word2index if word2index is not None else {}
    weighting_dic = weighting_dic if weighting_dic is not None else {}
    X_ik = X_ik if X_ik is not None else {}
    u_p = []
    v_p = []
    co_p = []
    weight_p = []
    # Reference: http://pytorch.org/docs/master/tensors.html#torch.Tensor.view
    for pair in window_data:
        u_p.append(prepare_word(pair[0], word2index).view(1, -1))
        v_p.append(prepare_word(pair[1], word2index).view(1, -1))
        # Default to a co-occurrence count of 1 for pairs missing from X_ik
        # (log(1) = 0), instead of catching a bare exception.
        cooc = X_ik.get(pair, 1)
        co_p.append(torch.log(Variable(FloatTensor([cooc]))).view(1, -1))
        weight_p.append(
            Variable(FloatTensor([weighting_dic[pair]])).view(1, -1))
    train_data = list(zip(u_p, v_p, co_p, weight_p))
    return train_data
def GAE(reward, value, mask, gamma, lam):
    # Generalized Advantage Estimation over a (T, ...) rollout: walk the
    # trajectory backwards, accumulating discounted TD residuals.
    # Allocate the buffers on the same device as the running values below.
    adv = torch.empty(reward.shape, device=device)
    delta = torch.empty(reward.shape, device=device)
    pre_value = torch.zeros(reward.shape[1:], device=device)
    pre_adv = torch.zeros(reward.shape[1:], device=device)
    for i in reversed(range(reward.shape[0])):
        delta[i] = reward[i] + gamma * pre_value * mask[i] - value[i]
        adv[i] = delta[i] + gamma * lam * pre_adv * mask[i]
        pre_adv = adv[i, ...]
        pre_value = value[i, ...]
    returns = value + adv
    # Normalize advantages to zero mean and unit variance.
    adv = (adv - adv.mean()) / adv.std()
    return adv, returns
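# A hedged usage sketch for GAE above; it assumes the module-level `device`
# global the function relies on (e.g. device = torch.device('cpu')). Shapes
# are illustrative: T steps of one rollout, with mask = 0 on the terminal
# step so the recursion does not bootstrap across the episode boundary.
T = 5
reward_demo = torch.randn(T, 1)
value_demo = torch.randn(T, 1)
mask_demo = torch.ones(T, 1)
mask_demo[-1] = 0  # terminal transition
adv_demo, returns_demo = GAE(reward_demo, value_demo, mask_demo,
                             gamma=0.99, lam=0.95)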
def get_net_log_prob(self, net_input_state, net_input_discrete_action,
                     net_input_continuous_action):
    # `net_name` and `action_name` are expected to be bound in the enclosing
    # scope (this method is generated for each net/action pair).
    net = getattr(self, net_name)
    n_action_dim = getattr(self, 'n_' + action_name)
    discrete_action_dim = getattr(self, 'discrete_' + action_name + '_dim')
    sections = getattr(self, 'discrete_' + action_name + '_sections')
    continuous_action_log_std = getattr(
        self, net_name + '_' + action_name + '_std')

    # The net's output packs discrete-action probabilities and continuous-
    # action means along the last dimension.
    discrete_action_probs_with_continuous_mean = net(net_input_state)
    discrete_actions_log_prob = 0
    continuous_actions_log_prob = 0
    if discrete_action_dim != 0:
        dist = MultiOneHotCategorical(
            discrete_action_probs_with_continuous_mean[
                ..., :discrete_action_dim], sections)
        discrete_actions_log_prob = dist.log_prob(net_input_discrete_action)
    if n_action_dim - discrete_action_dim != 0:
        continuous_actions_mean = discrete_action_probs_with_continuous_mean[
            ..., discrete_action_dim:]
        continuous_log_std = continuous_action_log_std.expand_as(
            continuous_actions_mean)
        continuous_actions_std = torch.exp(continuous_log_std)
        # Diagonal Gaussian: pass the stds via scale_tril. (The original code
        # passed diag(std) positionally as a covariance matrix, which makes
        # the effective variance sqrt of what was intended.)
        continuous_dist = MultivariateNormal(
            continuous_actions_mean,
            scale_tril=torch.diag_embed(continuous_actions_std))
        continuous_actions_log_prob = continuous_dist.log_prob(
            net_input_continuous_action)
    return FloatTensor(discrete_actions_log_prob +
                       continuous_actions_log_prob).unsqueeze(-1)
def get_net_action(self, state, size=1):
    # `net_name` and `action_name` are expected to be bound in the enclosing
    # scope, as in get_net_log_prob above.
    net = getattr(self, net_name)
    n_action_dim = getattr(self, 'n_' + action_name)
    discrete_action_dim = getattr(self, 'discrete_' + action_name + '_dim')
    sections = getattr(self, 'discrete_' + action_name + '_sections')
    continuous_action_log_std = getattr(
        self, net_name + '_' + action_name + '_std')

    discrete_action_probs_with_continuous_mean = net(state)
    discrete_actions = torch.empty((size, 0), device=self.device)
    continuous_actions = torch.empty((size, 0), device=self.device)
    discrete_actions_log_prob = 0
    continuous_actions_log_prob = 0
    if discrete_action_dim != 0:
        dist = MultiOneHotCategorical(
            discrete_action_probs_with_continuous_mean[
                ..., :discrete_action_dim], sections)
        discrete_actions = dist.sample()
        discrete_actions_log_prob = dist.log_prob(discrete_actions)
    if n_action_dim - discrete_action_dim != 0:
        continuous_actions_mean = discrete_action_probs_with_continuous_mean[
            ..., discrete_action_dim:]
        continuous_log_std = continuous_action_log_std.expand_as(
            continuous_actions_mean)
        continuous_actions_std = torch.exp(continuous_log_std)
        # Diagonal Gaussian via scale_tril (stds on the diagonal).
        continuous_dist = MultivariateNormal(
            continuous_actions_mean,
            scale_tril=torch.diag_embed(continuous_actions_std))
        continuous_actions = continuous_dist.sample()
        continuous_actions_log_prob = continuous_dist.log_prob(
            continuous_actions)
    return discrete_actions, continuous_actions, FloatTensor(
        discrete_actions_log_prob +
        continuous_actions_log_prob).unsqueeze(-1)
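# `MultiOneHotCategorical` is used above but not defined in this file. The
# following is a minimal sketch of the behavior its call sites imply: split
# the input into per-action sections, treat each as a categorical over
# one-hot actions, and sum per-section log-probs. The real class may differ;
# treat the name, the probs-vs-logits choice, and the interface here as
# assumptions (the variable names above suggest already-normalized probs).
import torch
import torch.nn.functional as F


class MultiOneHotCategoricalSketch:
    def __init__(self, probs, sections):
        # One Categorical per contiguous section of the probability vector.
        self.sections = list(sections)
        self.dists = [
            torch.distributions.Categorical(probs=part)
            for part in torch.split(probs, self.sections, dim=-1)
        ]

    def sample(self):
        # Concatenate one-hot samples from each per-section distribution.
        return torch.cat([
            F.one_hot(d.sample(), num_classes=n).float()
            for d, n in zip(self.dists, self.sections)
        ], dim=-1)

    def log_prob(self, value):
        # Sum per-section log-probs of the one-hot encoded actions.
        parts = torch.split(value, self.sections, dim=-1)
        return sum(d.log_prob(p.argmax(dim=-1))
                   for d, p in zip(self.dists, parts))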
def get_policy_net_log_prob(self, net_input_state, net_input_discrete_action,
                            net_input_continuous_action):
    # Specialization of get_net_log_prob for the policy net; `args` and
    # `discrete_action_sections` are module-level globals.
    net = self.policy
    n_action_dim = args.n_continuous_action + args.n_discrete_action
    discrete_action_dim = args.n_discrete_action
    sections = discrete_action_sections
    continuous_action_log_std = self.policy_net_action_std

    discrete_action_probs_with_continuous_mean = net(net_input_state)
    discrete_actions_log_prob = 0
    continuous_actions_log_prob = 0
    if discrete_action_dim != 0:
        dist = MultiOneHotCategorical(
            discrete_action_probs_with_continuous_mean[
                ..., :discrete_action_dim], sections)
        discrete_actions_log_prob = dist.log_prob(net_input_discrete_action)
    if n_action_dim - discrete_action_dim != 0:
        continuous_actions_mean = discrete_action_probs_with_continuous_mean[
            ..., discrete_action_dim:]
        continuous_log_std = continuous_action_log_std.expand_as(
            continuous_actions_mean)
        continuous_actions_std = torch.exp(continuous_log_std)
        # Diagonal Gaussian via scale_tril (stds on the diagonal).
        continuous_dist = MultivariateNormal(
            continuous_actions_mean,
            scale_tril=torch.diag_embed(continuous_actions_std))
        continuous_actions_log_prob = continuous_dist.log_prob(
            net_input_continuous_action)
    return FloatTensor(discrete_actions_log_prob +
                       continuous_actions_log_prob).unsqueeze(-1)
def get_action(self, state, num_trajs=1):
    # Specialization of get_net_action for the policy net.
    net = self.policy
    n_action_dim = args.n_continuous_action + args.n_discrete_action
    discrete_action_dim = args.n_discrete_action
    sections = discrete_action_sections
    continuous_action_log_std = self.policy_net_action_std

    discrete_action_probs_with_continuous_mean = net(state)
    discrete_actions = torch.empty((num_trajs, 0), device=device)
    continuous_actions = torch.empty((num_trajs, 0), device=device)
    discrete_actions_log_prob = 0
    continuous_actions_log_prob = 0
    if discrete_action_dim != 0:
        dist = MultiOneHotCategorical(
            discrete_action_probs_with_continuous_mean[
                ..., :discrete_action_dim], sections)
        discrete_actions = dist.sample()
        discrete_actions_log_prob = dist.log_prob(discrete_actions)
    if n_action_dim - discrete_action_dim != 0:
        continuous_actions_mean = discrete_action_probs_with_continuous_mean[
            ..., discrete_action_dim:]
        continuous_log_std = continuous_action_log_std.expand_as(
            continuous_actions_mean)
        continuous_actions_std = torch.exp(continuous_log_std)
        # Diagonal Gaussian via scale_tril (stds on the diagonal).
        continuous_dist = MultivariateNormal(
            continuous_actions_mean,
            scale_tril=torch.diag_embed(continuous_actions_std))
        continuous_actions = continuous_dist.sample()
        continuous_actions_log_prob = continuous_dist.log_prob(
            continuous_actions)
    return discrete_actions, continuous_actions, FloatTensor(
        discrete_actions_log_prob +
        continuous_actions_log_prob).unsqueeze(-1)
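# A hedged sketch of how get_action and get_policy_net_log_prob above fit
# together in a rollout/update loop. `agent` and the state width of 10 are
# hypothetical stand-ins, and `device` is the module-level global assumed
# throughout; the log-prob recomputed for the sampled actions should match
# the one returned at sampling time up to floating-point error.
state_demo = torch.randn(1, 10, device=device)  # hypothetical state dim = 10
d_act, c_act, logp_sampled = agent.get_action(state_demo, num_trajs=1)
logp_recomputed = agent.get_policy_net_log_prob(state_demo, d_act, c_act)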
def get_probs(logging_policy, x, clip_threshold, tao):
    # Forward pass on the logging policy.
    logging_output = logging_policy(
        Variable(FloatTensor([normalize(x)]))).data[0]
    # Temperature softmax: `tao` scales the logits (acting as an inverse
    # temperature) before the softmax turns them into propensities.
    logging_output = logging_output * tao
    pi_o = F.softmax(logging_output, dim=0)
    pi_o, num_z = clip_and_renorm(pi_o, clip_threshold)
    return pi_o, num_z
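# `normalize` and `clip_and_renorm` are not defined in this file. A plausible
# sketch of `clip_and_renorm`, inferred only from its name and its
# (probs, count) return pattern: floor each propensity at the threshold,
# renormalize to a valid distribution, and report how many entries were
# clipped. Treat this as an assumption, not the author's implementation.
def clip_and_renorm_sketch(pi, clip_threshold):
    num_clipped = (pi < clip_threshold).sum().item()  # entries below floor
    pi = pi.clamp(min=clip_threshold)                 # floor small propensities
    return pi / pi.sum(), num_clipped                 # renormalize to sum to 1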
def sample_image(args, generator, n_row, batches_done):
    """Saves a grid of generated digits ranging from 0 to n_classes."""
    # Sample noise
    z = Variable(
        FloatTensor(np.random.normal(0, 1, (n_row**2, args.latent_dim))))
    # Get labels 0..n_row - 1, repeated once per row
    labels = np.array([num for _ in range(n_row) for num in range(n_row)])
    labels = Variable(LongTensor(labels))
    gen_imgs = generator(z, labels)
    save_image(gen_imgs.data,
               "images/%d.png" % batches_done,
               nrow=n_row,
               normalize=True)
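# The functions above lean on module-level `FloatTensor`/`LongTensor` aliases,
# `Variable`, and `save_image`. A typical setup (an assumption; the actual
# definitions are not shown in this file) looks like:
import numpy as np
import torch
from torch.autograd import Variable
from torchvision.utils import save_image

cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if cuda else torch.LongTensor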
def get_net_action(self, state, num_trajs=1):
    # `net_name` and `action_name` are expected to be bound in the enclosing
    # scope. The net's output packs, in order: one-hot action probabilities,
    # multi-hot action logits, and continuous-action means.
    net = getattr(self, net_name)
    n_action_dim = getattr(self, 'n_' + action_name)
    onehot_action_dim = getattr(self, 'onehot_' + action_name + '_dim')
    multihot_action_dim = getattr(self, 'multihot_' + action_name + '_dim')
    sections = getattr(self, 'onehot_' + action_name + '_sections')
    continuous_action_log_std = getattr(
        self, net_name + '_' + action_name + '_std')

    onehot_action_probs_with_continuous_mean = net(state)
    onehot_actions = torch.empty((num_trajs, 0), device=self.device)
    multihot_actions = torch.empty((num_trajs, 0), device=self.device)
    continuous_actions = torch.empty((num_trajs, 0), device=self.device)
    onehot_actions_log_prob = 0
    multihot_actions_log_prob = 0
    continuous_actions_log_prob = 0
    if onehot_action_dim != 0:
        dist = MultiOneHotCategorical(
            onehot_action_probs_with_continuous_mean[
                ..., :onehot_action_dim], sections)
        onehot_actions = dist.sample()
        onehot_actions_log_prob = dist.log_prob(onehot_actions)
    if multihot_action_dim != 0:
        # Independent Bernoulli per multi-hot bit; sum their log-probs.
        multihot_actions_prob = torch.sigmoid(
            onehot_action_probs_with_continuous_mean[
                ..., onehot_action_dim:onehot_action_dim +
                multihot_action_dim])
        dist = torch.distributions.bernoulli.Bernoulli(
            probs=multihot_actions_prob)
        multihot_actions = dist.sample()
        multihot_actions_log_prob = dist.log_prob(multihot_actions).sum(
            dim=1)
    if n_action_dim - onehot_action_dim - multihot_action_dim != 0:
        continuous_actions_mean = onehot_action_probs_with_continuous_mean[
            ..., onehot_action_dim + multihot_action_dim:]
        continuous_log_std = continuous_action_log_std.expand_as(
            continuous_actions_mean)
        continuous_actions_std = torch.exp(continuous_log_std)
        # Diagonal Gaussian via scale_tril (stds on the diagonal).
        continuous_dist = MultivariateNormal(
            continuous_actions_mean,
            scale_tril=torch.diag_embed(continuous_actions_std))
        continuous_actions = continuous_dist.sample()
        continuous_actions_log_prob = continuous_dist.log_prob(
            continuous_actions)
    return onehot_actions, multihot_actions, continuous_actions, FloatTensor(
        onehot_actions_log_prob + multihot_actions_log_prob +
        continuous_actions_log_prob).unsqueeze(-1)
def get_net_log_prob(self, net_input_state, net_input_onehot_action,
                     net_input_multihot_action, net_input_continuous_action):
    # `net_name` and `action_name` are expected to be bound in the enclosing
    # scope, as in get_net_action above.
    net = getattr(self, net_name)
    n_action_dim = getattr(self, 'n_' + action_name)
    onehot_action_dim = getattr(self, 'onehot_' + action_name + '_dim')
    multihot_action_dim = getattr(self, 'multihot_' + action_name + '_dim')
    sections = getattr(self, 'onehot_' + action_name + '_sections')
    continuous_action_log_std = getattr(
        self, net_name + '_' + action_name + '_std')

    onehot_action_probs_with_continuous_mean = net(net_input_state)
    onehot_actions_log_prob = 0
    multihot_actions_log_prob = 0
    continuous_actions_log_prob = 0
    if onehot_action_dim != 0:
        dist = MultiOneHotCategorical(
            onehot_action_probs_with_continuous_mean[
                ..., :onehot_action_dim], sections)
        onehot_actions_log_prob = dist.log_prob(net_input_onehot_action)
    if multihot_action_dim != 0:
        # Independent Bernoulli per multi-hot bit; sum their log-probs.
        multihot_actions_prob = torch.sigmoid(
            onehot_action_probs_with_continuous_mean[
                ..., onehot_action_dim:onehot_action_dim +
                multihot_action_dim])
        dist = torch.distributions.bernoulli.Bernoulli(
            probs=multihot_actions_prob)
        multihot_actions_log_prob = dist.log_prob(
            net_input_multihot_action).sum(dim=1)
    if n_action_dim - onehot_action_dim - multihot_action_dim != 0:
        continuous_actions_mean = onehot_action_probs_with_continuous_mean[
            ..., onehot_action_dim + multihot_action_dim:]
        continuous_log_std = continuous_action_log_std.expand_as(
            continuous_actions_mean)
        continuous_actions_std = torch.exp(continuous_log_std)
        # Diagonal Gaussian via scale_tril (stds on the diagonal).
        continuous_dist = MultivariateNormal(
            continuous_actions_mean,
            scale_tril=torch.diag_embed(continuous_actions_std))
        continuous_actions_log_prob = continuous_dist.log_prob(
            net_input_continuous_action)
    return FloatTensor(onehot_actions_log_prob + multihot_actions_log_prob +
                       continuous_actions_log_prob).unsqueeze(-1)
def GAE(reward, value, mask, gamma, lam):
    # Batched GAE: reshape flat rollouts into (num_trajs, traj_len, 1) and
    # run the backward recursion along each trajectory's time dimension.
    reward = reward.reshape(-1, args.sample_traj_length, 1)
    value = value.reshape(-1, args.sample_traj_length, 1)
    mask = mask.reshape(-1, args.sample_traj_length, 1)
    adv = torch.empty(reward.shape, device=device)
    delta = torch.empty(reward.shape, device=device)
    pre_value = torch.zeros((reward.shape[0], 1), device=device)
    pre_adv = torch.zeros((reward.shape[0], 1), device=device)
    for i in reversed(range(reward.shape[1])):
        delta[:, i] = (reward[:, i] + gamma * pre_value * mask[:, i] -
                       value[:, i])
        adv[:, i] = delta[:, i] + gamma * lam * pre_adv * mask[:, i]
        pre_adv = adv[:, i, ...]
        pre_value = value[:, i, ...]
    returns = value + adv
    # Normalize advantages, then flatten both outputs back to (N, 1).
    adv = (adv - adv.mean()) / adv.std()
    returns = returns.reshape(-1, 1)
    adv = adv.reshape(-1, 1)
    return adv, returns
def train(generator, discriminator, dataloader, args, cuda, adversarial_loss,
          auxiliary_loss):
    optimizer_G = torch.optim.Adam(generator.parameters(),
                                   lr=args.lr,
                                   betas=(args.b1, args.b2))
    optimizer_D = torch.optim.Adam(discriminator.parameters(),
                                   lr=args.lr,
                                   betas=(args.b1, args.b2))
    for epoch in range(args.n_epochs):
        for i, (imgs, labels) in enumerate(dataloader):
            batch_size = imgs.shape[0]

            # Adversarial ground truths
            valid = Variable(FloatTensor(batch_size, 1).fill_(1.0),
                             requires_grad=False)
            fake = Variable(FloatTensor(batch_size, 1).fill_(0.0),
                            requires_grad=False)

            # Configure input
            real_imgs = Variable(imgs.type(FloatTensor))
            labels = Variable(labels.type(LongTensor))

            # -----------------
            #  Train Generator
            # -----------------
            optimizer_G.zero_grad()

            # Sample noise and labels as generator input
            z = Variable(
                FloatTensor(
                    np.random.normal(0, 1, (batch_size, args.latent_dim))))
            gen_labels = Variable(
                LongTensor(np.random.randint(0, args.n_classes, batch_size)))

            # Generate a batch of images
            gen_imgs = generator(z, gen_labels)

            # Loss measures the generator's ability to fool the discriminator
            validity, pred_label = discriminator(gen_imgs)
            g_loss = 0.5 * adversarial_loss(validity, valid) + auxiliary_loss(
                pred_label, gen_labels)

            g_loss.backward()
            optimizer_G.step()

            # ---------------------
            #  Train Discriminator
            # ---------------------
            optimizer_D.zero_grad()

            # Loss for real images
            real_pred, real_aux = discriminator(real_imgs)
            d_real_loss = (adversarial_loss(real_pred, valid) +
                           auxiliary_loss(real_aux, labels)) / 2

            # Loss for fake images
            fake_pred, fake_aux = discriminator(gen_imgs.detach())
            d_fake_loss = (adversarial_loss(fake_pred, fake) +
                           auxiliary_loss(fake_aux, gen_labels)) / 2

            # Total discriminator loss
            d_loss = (d_real_loss + d_fake_loss) / 2

            # Calculate discriminator accuracy on the auxiliary classifier
            pred = np.concatenate(
                [real_aux.data.cpu().numpy(),
                 fake_aux.data.cpu().numpy()],
                axis=0)
            gt = np.concatenate(
                [labels.data.cpu().numpy(),
                 gen_labels.data.cpu().numpy()],
                axis=0)
            d_acc = np.mean(np.argmax(pred, axis=1) == gt)

            d_loss.backward()
            optimizer_D.step()

            print(
                "[Epoch %d/%d] [Batch %d/%d] [D loss: %f, acc: %d%%] [G loss: %f]"
                % (epoch, args.n_epochs, i, len(dataloader), d_loss.item(),
                   100 * d_acc, g_loss.item()))

            batches_done = epoch * len(dataloader) + i
            if batches_done % args.sample_interval == 0:
                sample_image(args,
                             generator,
                             n_row=10,
                             batches_done=batches_done)
def train(game,
          num_steps=60000000,
          lr=0.00025,
          gamma=0.99,
          C=20000,
          batch_size=32):
    env = wrappers.wrap(gym.make(GAMES[game]))
    num_actions = env.action_space.n

    # Online network Q1 and target network Q2, synced every C steps.
    Q1 = QNetwork(num_actions)
    Q2 = QNetwork(num_actions)
    Q2.load_state_dict(Q1.state_dict())
    if torch.cuda.is_available():
        Q1.cuda()
        Q2.cuda()

    epsilon = Epsilon(1, 0.05, 1000000)
    optimizer = torch.optim.Adam(Q1.parameters(), lr=lr)
    optimizer.zero_grad()

    state1 = env.reset()
    t, last_t, loss, episode, score = 0, 0, 0, 0, 0
    last_ts, scores = datetime.now(), collections.deque(maxlen=100)
    while t < num_steps:
        # ε-greedy action selection.
        qvalues = Q1(state1)
        if random() < epsilon(t):
            action = env.action_space.sample()
        else:
            action = qvalues.data.max(dim=1)[1][0]
        q = qvalues[0][action]

        state2, reward, done, _info = env.step(action)
        score += reward
        if not done:
            # Bootstrapped one-step target from the frozen target network.
            y = gamma * Q2(state2).detach().max(dim=1)[0][0] + reward
            state1 = state2
        else:
            reward = FloatTensor([reward])
            y = torch.autograd.Variable(reward, requires_grad=False)
            state1 = env.reset()
            scores.append(score)
            score = 0
            episode += 1

        # Accumulate the Huber loss; step every batch_size transitions.
        loss += torch.nn.functional.smooth_l1_loss(q, y)
        t += 1
        if done or t % batch_size == 0:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss = 0

        # Periodically refresh the target network and checkpoint.
        if t % C == 0:
            Q2.load_state_dict(Q1.state_dict())
            torch.save(Q1.state_dict(), 'qlearning_{}.pt'.format(game))

        if t % 1000 == 0:
            ts = datetime.now()
            datestr = ts.strftime('%Y-%m-%dT%H:%M:%S.%f')
            avg = mean(scores) if scores else float('nan')
            steps_per_sec = (t - last_t) / (ts - last_ts).total_seconds()
            l = ('{} step {} episode {} avg last 100 scores: {:.2f} '
                 'ε: {:.2f}, steps/s: {:.0f}')
            print(l.format(datestr, t, episode, avg, epsilon(t),
                           steps_per_sec))
            last_t, last_ts = t, ts
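# `Epsilon` above is called as a schedule object: constructed with
# (start, end, steps) and evaluated at a step count. A minimal sketch under
# that assumption (the real class is not in this file) is a linear decay
# clamped at its final value:
class EpsilonSketch:
    def __init__(self, start, end, steps):
        self.start, self.end, self.steps = start, end, steps

    def __call__(self, t):
        # Linear interpolation from start to end, then hold at end.
        frac = min(t / self.steps, 1.0)
        return self.start + frac * (self.end - self.start)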
def observation(self, observation):
    # Swap axes 0 and 2: an (H, W, C) frame becomes a (C, W, H) tensor,
    # then gain a leading batch dimension.
    x = FloatTensor(np.swapaxes(observation, 2, 0))
    return torch.autograd.Variable(x, requires_grad=False).unsqueeze(0)
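# A hedged usage sketch for the wrapper method above, assuming the same
# `FloatTensor` alias as the rest of this file: an (H, W, C) stacked frame
# becomes a (1, C, W, H) batch for the Q-network. The 84x84x4 shape is
# illustrative (typical for Atari frame stacks), not taken from this file.
frame_demo = np.zeros((84, 84, 4), dtype=np.float32)   # H, W, C
x_demo = FloatTensor(np.swapaxes(frame_demo, 2, 0))    # C, W, H
batch_demo = torch.autograd.Variable(x_demo, requires_grad=False).unsqueeze(0)
assert tuple(batch_demo.shape) == (1, 4, 84, 84)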