def sampler(P_op, P_skip, num_samples):
    cat_op = categorical.Categorical(P_op)
    cat_sk = bernoulli.Bernoulli(P_skip)
    ops, sks = cat_op.sample([num_samples]), cat_sk.sample([num_samples])
    # print(ops.shape)
    # print(sks.shape)
    return CM.ChildModelBatch(ops, sks)
def edge_decision(self, type, alphas, selected_idxs, candidate_flags, probs_history, epoch):
    """Calculate the decision for each edge.

    :param type: the type of cell
    :type type: str ('normal' or 'reduce')
    """
    mat = F.softmax(torch.stack(alphas, dim=0), dim=-1).detach()
    logging.info('alpha: {}'.format(mat))
    importance = torch.sum(mat[:, 1:], dim=-1)
    logging.info(type + " importance {}".format(importance))

    probs = mat[:, 1:] / importance[:, None]
    logging.info(type + " probs {}".format(probs))
    entropy = cate.Categorical(probs=probs).entropy() / math.log(probs.shape[1])
    logging.info(type + " entropy {}".format(entropy))

    if self.use_history:  # SGAS Cri.2
        logging.info(type + " probs history {}".format(probs_history))
        histogram_inter = self.histogram_average(probs_history, probs)
        logging.info(type + " histogram intersection average {}".format(histogram_inter))
        probs_history.append(probs)
        if len(probs_history) > self.history_size:
            probs_history.pop(0)
        score = self.normalize(importance) * self.normalize(1 - entropy) * self.normalize(histogram_inter)
        logging.info(type + " score {}".format(score))
    else:  # SGAS Cri.1
        score = self.normalize(importance) * self.normalize(1 - entropy)
        logging.info(type + " score {}".format(score))

    if torch.sum(candidate_flags.int()) > 0 and epoch >= self.warmup_dec_epoch and \
            (epoch - self.warmup_dec_epoch) % self.decision_freq == 0:
        masked_score = torch.min(score, (2 * candidate_flags.float() - 1) * np.inf)
        selected_edge_idx = torch.argmax(masked_score)
        selected_op_idx = torch.argmax(probs[selected_edge_idx]) + 1  # add 1 since none op
        selected_idxs[selected_edge_idx] = selected_op_idx

        candidate_flags[selected_edge_idx] = False
        alphas[selected_edge_idx].requires_grad = False
        if type == 'normal':
            reduction = False
        elif type == 'reduce':
            reduction = True
        else:
            raise Exception('Unknown Cell Type')
        candidate_flags, selected_idxs = self.check_edges(candidate_flags, selected_idxs, reduction=reduction)
        logging.info("#" * 30 + " Decision Epoch " + "#" * 30)
        logging.info("epoch {}, {}_selected_idxs {}, added edge {} with op idx {}".format(
            epoch, type, selected_idxs, selected_edge_idx, selected_op_idx))
        logging.info(type + "_candidate_flags {}".format(candidate_flags))
        return True, selected_idxs, candidate_flags
    else:
        logging.info("#" * 30 + " Not a Decision Epoch " + "#" * 30)
        logging.info("epoch {}, {}_selected_idxs {}".format(epoch, type, selected_idxs))
        logging.info(type + "_candidate_flags {}".format(candidate_flags))
        return False, selected_idxs, candidate_flags
def sample(self, num_samples=100000, binomial_n=None, visualize=True):
    if binomial_n is None:
        binomial_n = self.num_workers
    # Stub: everything below is unreachable until pdf_at_s / s_hat are defined.
    raise NotImplementedError
    dist = categorical.Categorical(probs=ch.tensor(pdf_at_s))
    sample = dist.sample((num_samples, ))
    new_s_stars = sample.float() / (len(pdf_at_s) - 1)
    bin_dist = binomial.Binomial(total_count=binomial_n, probs=new_s_stars)
    samples = bin_dist.sample().numpy()
    if visualize:
        xs = np.arange(self.num_workers + 1)

        def make_freqs(ys):
            counts = np.array([(ys == x).sum() for x in xs])
            counts = counts / counts.sum()
            return counts

        plt.bar(xs, make_freqs(samples), label='samples', color='red', alpha=0.5)
        plt.bar(xs, make_freqs(s_hat), label='empirical dist', alpha=0.5)
        plt.legend()
        plt.show()  # show after the bars are drawn, not before
    return samples
def optimize_model(device, model, optimizer, rewards, actions, states):
    L1, L2 = 0, 0

    # Compute g
    T = len(rewards)
    g = np.zeros(T)
    g[-1] = rewards[-1]
    for i in range(T - 2, -1, -1):
        g[i] = rewards[i] + GAMMA * g[i + 1]
    g = torch.tensor(g, dtype=torch.float, device=device)

    # Compute pi
    states = torch.tensor(states, dtype=torch.float, device=device)
    actions = torch.tensor(actions, dtype=torch.float, device=device)
    pi, v = model(states)
    v = v.squeeze(1)
    actual_log_prob = cat.Categorical(pi)
    actual_log_prob = actual_log_prob.log_prob(actions)

    # Compute L
    for t in range(T):
        L1 += -(GAMMA**t) * (g[t] - v[t].detach()) * actual_log_prob[t]
    L2 = F.smooth_l1_loss(g, v)
    loss = L1 + LOSS2_C * L2

    # Optimize model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
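# A worked sketch of just the discounted-return recursion used above
# (g[t] = r[t] + GAMMA * g[t+1]); the reward list and GAMMA here are
# illustrative values, not taken from the snippet.
import numpy as np

GAMMA = 0.99
rewards = [1.0, 0.0, 2.0]
g = np.zeros(len(rewards))
g[-1] = rewards[-1]
for i in range(len(rewards) - 2, -1, -1):
    g[i] = rewards[i] + GAMMA * g[i + 1]
# g == [2.9602, 1.98, 2.0]: each entry is the reward plus the discounted tail.
print(g)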
def optimize_model(device, model, optimizer, rewards, actions, states):
    # Convert to tensors
    states = torch.tensor(states, dtype=torch.float, device=device)
    actions = torch.tensor(actions, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards, dtype=torch.float, device=device)

    # Compute pi and v
    pi, v = model(states)
    v = v.squeeze(1)
    actual_log_prob = cat.Categorical(pi)
    actual_log_prob = actual_log_prob.log_prob(actions)

    # Compute Losses
    T, R = len(rewards), 0
    if T > MAX_T:
        _, R = model(torch.unsqueeze(states[-1], 0))
    L1, L2 = 0, 0
    for t in range(T - 1, -1, -1):
        R = rewards[t] + GAMMA * R
        L1 -= actual_log_prob[t] * (R - v[t]).detach()
        L2 += F.smooth_l1_loss(R, v[t])
    loss = L1 + LOSS2_C * L2

    # Optimize model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def sample(self, logits):
    """ Sample distribution """
    logits_tensor = torch.tensor(logits)
    logits_tensor_soft = F.softmax(logits_tensor, dim=-1)
    m = cat.Categorical(logits_tensor_soft)
    return m.sample()
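# Side note on the snippet above: Categorical normalizes raw logits itself, so
# Categorical(logits=logits) defines the same distribution as softmax followed by
# Categorical(probs=...). A minimal equivalence check (toy values, illustrative only):
import torch
from torch.distributions import categorical

logits = torch.tensor([2.0, 0.5, -1.0])
m_probs = categorical.Categorical(probs=torch.softmax(logits, dim=-1))
m_logits = categorical.Categorical(logits=logits)
assert torch.allclose(m_probs.probs, m_logits.probs)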
def sample_class_weights(class_weights, n_samples=1):
    """
    Draw a sample from a Categorical variable with probabilities class_weights.
    """
    assert not torch.any(torch.isnan(class_weights))
    cat_rv = categorical.Categorical(probs=class_weights)
    return cat_rv.sample((n_samples, )).detach().squeeze()
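# Shape sketch for the helper above when class_weights is batched: the sample
# shape is prepended to the batch shape. Values are illustrative only.
import torch
from torch.distributions import categorical

class_weights = torch.tensor([[0.2, 0.8],
                              [0.5, 0.5],
                              [0.9, 0.1]])      # batch of 3 two-way categoricals
cat_rv = categorical.Categorical(probs=class_weights)
draws = cat_rv.sample((4,))                     # shape (4, 3): 4 draws per batch row
print(draws.shape)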
def _sample_trajectory_disceret(
        self, initial_states: Tensor,
        previous_action) -> Tuple[Tensor, Tensor, Tensor]:
    """Randomly samples T actions and computes the trajectory.

    :returns: (sequence of states, sequence of actions, costs)
    """
    actions = categorical.Categorical(
        torch.ones(self.num_actions) / self.num_actions).sample(
            sample_shape=(self._num_rollouts, self._time_horizon, 1))
    if previous_action is not None:
        actions[0, :-1, 0] = previous_action[1:self._time_horizon, 0]

    # One more state than the time horizon because of the initial state.
    trajectories = torch.empty((self.no_models, self._num_rollouts,
                                self._time_horizon + 1, self._state_dimen),
                               device=initial_states.device)
    trajectories[:, :, 0, :] = initial_states

    objective_costs = torch.zeros((self.no_models, self._time_horizon, self._num_rollouts),
                                  device=initial_states.device)
    dones = torch.zeros((self.no_models, self._num_rollouts),
                        device=initial_states.device)

    for t in range(self._time_horizon):
        for d, dynamic in enumerate(self._dynamics):
            next_states, costs, done = dynamic.step(
                trajectories[d, :, t, :], actions[:, t, 0])
            # assert_shape(next_states, (self._num_rollouts, self._state_dimen))
            # assert_shape(costs, (self._num_rollouts,))
            trajectories[d, :, t + 1, :] = next_states
            # print(dones)
            dones[d, :] = torch.maximum(done, dones[d, :])
            objective_costs[d, t, :] = (gamma)**t * costs * (1 - dones[d, :])
    # print(dones.max())

    if self.mountain_car:
        for d in range(self.no_models):
            objective_costs[d, :, :] = objective_costs[d, :, :] - \
                0.01 * torch.max(trajectories[d, :, :, 0], 1)[0]
    # print(objective_costs.sum(1).min(1)[0])

    objective_costs = torch.mean(objective_costs, 0)
    objective_costs = torch.sum(objective_costs, 0)
    # print(objective_costs.min())
    return trajectories, actions, objective_costs
def select_action(state, model, device, steps_done):
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) \
        * max((1 - steps_done / EPS_DECAY), 0)
    if sample > eps_threshold:
        with torch.no_grad():
            state = torch.tensor([state], dtype=torch.float, device=device)
            pi, _ = model(state)
            return cat.Categorical(pi).sample().item()
    else:
        return random.randrange(5)
def get_negative_sampler(self, smooth_par=0.75):
    """Build a Categorical negative sampler over node degrees raised to smooth_par."""
    node_idx, node_degrees = np.unique(self.edge_index[0, :], return_counts=True)
    # there may be isolated nodes that are not present in edge_index
    all_degrees = np.zeros(self.n_x)
    all_degrees[node_idx] = node_degrees
    Pn = all_degrees**smooth_par
    Pn = Pn / np.sum(Pn)
    self.neg_sampler = categorical.Categorical(torch.from_numpy(Pn))
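# A standalone sketch of how this degree-smoothed negative sampler behaves,
# using a toy edge_index; nodes absent from edge_index[0] get probability 0
# and are never drawn. All values here are made up for illustration.
import numpy as np
import torch
from torch.distributions import categorical

n_x = 5
edge_index = np.array([[0, 0, 1, 2, 2, 2],
                       [1, 2, 0, 0, 1, 3]])
node_idx, node_degrees = np.unique(edge_index[0, :], return_counts=True)
all_degrees = np.zeros(n_x)
all_degrees[node_idx] = node_degrees
Pn = all_degrees ** 0.75
Pn = Pn / Pn.sum()
neg_sampler = categorical.Categorical(torch.from_numpy(Pn))
print(neg_sampler.sample((10,)))   # 10 negative node ids; nodes 3 and 4 never appear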
def __init__(self, in_channels, kernel_size):
    super(Shift, self).__init__()
    self.channels = in_channels
    self.kernel_size = kernel_size
    if kernel_size == 3:
        p = torch.Tensor([0.3, 0.4, 0.3])
    elif kernel_size == 5:
        p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1])
    elif kernel_size == 7:
        p = torch.Tensor([0.075, 0.1, 0.175, 0.3, 0.175, 0.1, 0.075])
    elif kernel_size == 9:
        p = torch.Tensor([0.05, 0.075, 0.1, 0.175, 0.2, 0.175, 0.1, 0.075, 0.05])
    else:
        raise RuntimeError('Unsupported kernel size')

    shift_t = categorical.Categorical(p).sample((in_channels, 2)) - (kernel_size // 2)
    self.register_buffer('shift_t', shift_t.int())
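# Quick sketch of what the shift buffer above contains, assuming kernel_size=3:
# each channel gets an integer (row, col) offset drawn from {-1, 0, 1}.
# Values below are illustrative, not tied to any particular model.
import torch
from torch.distributions import categorical

kernel_size, in_channels = 3, 8
p = torch.Tensor([0.3, 0.4, 0.3])
shift_t = categorical.Categorical(p).sample((in_channels, 2)) - (kernel_size // 2)
print(shift_t.shape, shift_t.min().item(), shift_t.max().item())  # (8, 2), offsets in [-1, 1]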
def sample_action(self, policy_parameters):
    if self.discrete:
        sy_logits_na = policy_parameters
        sy_sampled_ac = Cat.Categorical(logits=sy_logits_na)
    else:
        if policy_parameters.dim() == 1:
            sy_mean, sy_logstd = policy_parameters[:self.ac_dim], policy_parameters[self.ac_dim:]
        else:
            sy_mean, sy_logstd = policy_parameters[:, :self.ac_dim], policy_parameters[:, self.ac_dim:]
        # print(sy_mean)
        sy_sampled_ac = Norm.Normal(loc=sy_mean, scale=torch.exp(sy_logstd))
    return sy_sampled_ac.sample()
def get_log_prob(self, policy_parameters, sy_ac_na):
    if self.discrete:
        sy_logits_na = policy_parameters
        sy_sampled_ac = Cat.Categorical(logits=sy_logits_na)
    else:
        if policy_parameters.dim() == 1:
            sy_mean, sy_logstd = policy_parameters[:self.ac_dim], policy_parameters[self.ac_dim:]
        else:
            sy_mean, sy_logstd = policy_parameters[:, :self.ac_dim], policy_parameters[:, self.ac_dim:]
        sy_sampled_ac = Norm.Normal(loc=sy_mean.view(self.ac_dim, -1),
                                    scale=torch.exp(sy_logstd.view(self.ac_dim, -1)))
    return sy_sampled_ac.log_prob(sy_ac_na)
def sample_episode(self):
    self.total_rewards = 0
    self.losses = []
    done = False
    state = self.state
    while not done:
        probs = self.model((torch.from_numpy(state).unsqueeze(0)).float().to(device))
        m = c.Categorical(probs)
        action = m.sample()
        next_state, reward, done, _ = self.env.step(action.item())
        self.episode.append([state, action, reward])
        self.losses.append(-(m.log_prob(action)))
        self.env.render()
        state = next_state
        self.total_rewards += reward

    # Discounted return from each step of the episode
    self.G = []
    for i in range(len(self.episode)):
        self.G.append(0)
        for j in range(i, len(self.episode)):
            _, _, r = self.episode[j]
            self.G[-1] += (GAMMA**(j - i)) * r
def train_loop(self):
    probs, v = self.model(
        torch.from_numpy(self.state).unsqueeze(0).float().to(device))
    probs = torch.squeeze(probs)
    m = c.Categorical(probs)
    action = m.sample()
    next_state, reward, done, _ = self.env.step(action.item())
    self.env.render()

    _, v_prime = self.model(
        torch.from_numpy(next_state).unsqueeze(0).float().to(device))
    if done:
        v_prime[0] = 0

    td_error = reward + GAMMA * v_prime[0] - v[0]
    loss_a = -m.log_prob(action) * td_error
    loss_obj = nn.MSELoss()
    loss_c = loss_obj(
        torch.tensor([reward], device=device).float() + GAMMA * v_prime[0].float(),
        v[0].float())
    loss = loss_a + loss_c

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # update next state
    if done:
        self.state = self.env.reset()
    else:
        self.state = next_state
    return done, reward
def act(self, state, deterministic=False):
    x, v = self(state)
    if self.is_continuous:
        if deterministic:
            action = x
            action_log_prob = None
            entropy = None
        else:
            c = normal.Normal(x, self.pi.log_std.exp())
            action = c.sample()
            action_log_prob = c.log_prob(action).mean()
            entropy = c.entropy()
    else:  # discrete
        if deterministic:
            action = torch.max(F.log_softmax(x, dim=1), dim=1)[1]
            action_log_prob = None
            entropy = None
        else:
            c = categorical.Categorical(logits=F.log_softmax(x, dim=1))
            action = c.sample()
            action_log_prob = c.log_prob(action)
            entropy = c.entropy()
    return action, action_log_prob, v, entropy
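# Note on the discrete branch above: because Categorical re-normalizes its logits,
# Categorical(logits=F.log_softmax(x, dim=1)) and Categorical(logits=x) describe the
# same distribution; the log_softmax mainly makes the intent explicit.
# Toy check (random tensor, illustrative only):
import torch
import torch.nn.functional as F
from torch.distributions import categorical

x = torch.randn(2, 5)
c1 = categorical.Categorical(logits=F.log_softmax(x, dim=1))
c2 = categorical.Categorical(logits=x)
assert torch.allclose(c1.probs, c2.probs, atol=1e-6)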
def edge_decision(type, alphas, selected_idxs, candidate_flags, probs_history, epoch, model, args):
    mat = F.softmax(torch.stack(alphas, dim=0), dim=-1).detach()
    print(mat)
    importance = torch.sum(mat[:, 1:], dim=-1)
    # logging.info(type + " importance {}".format(importance))

    probs = mat[:, 1:] / importance[:, None]
    # print(type + " probs", probs)
    entropy = cate.Categorical(probs=probs).entropy() / math.log(probs.size()[1])
    # logging.info(type + " entropy {}".format(entropy))

    if args.use_history:  # SGAS Cri.2
        # logging.info(type + " probs history {}".format(probs_history))
        histogram_inter = histogram_average(probs_history, probs)
        # logging.info(type + " histogram intersection average {}".format(histogram_inter))
        probs_history.append(probs)
        if len(probs_history) > args.history_size:
            probs_history.pop(0)
        score = utils.normalize(importance) * utils.normalize(1 - entropy) * utils.normalize(histogram_inter)
        # logging.info(type + " score {}".format(score))
    else:  # SGAS Cri.1
        score = utils.normalize(importance) * utils.normalize(1 - entropy)
        # logging.info(type + " score {}".format(score))

    if torch.sum(candidate_flags.int()) > 0 and \
            epoch >= args.warmup_dec_epoch and \
            (epoch - args.warmup_dec_epoch) % args.decision_freq == 0:
        masked_score = torch.min(score, (2 * candidate_flags.float() - 1) * np.inf)
        selected_edge_idx = torch.argmax(masked_score)
        selected_op_idx = torch.argmax(probs[selected_edge_idx]) + 1  # add 1 since none op
        selected_idxs[selected_edge_idx] = selected_op_idx

        candidate_flags[selected_edge_idx] = False
        alphas[selected_edge_idx].requires_grad = False
        if type == 'normal':
            reduction = False
        elif type == 'reduce':
            reduction = True
        else:
            raise Exception('Unknown Cell Type')
        candidate_flags, selected_idxs = model.check_edges(candidate_flags, selected_idxs)
        logging.info("#" * 30 + " Decision Epoch " + "#" * 30)
        logging.info("epoch {}, {}_selected_idxs {}, added edge {} with op idx {}".format(
            epoch, type, selected_idxs, selected_edge_idx, selected_op_idx))
        print(type + "_candidate_flags {}".format(candidate_flags))
        score_image(type, score, epoch)
        return True, selected_idxs, candidate_flags
    else:
        logging.info("#" * 30 + " Not a Decision Epoch " + "#" * 30)
        logging.info("epoch {}, {}_selected_idxs {}".format(epoch, type, selected_idxs))
        print(type + "_candidate_flags {}".format(candidate_flags))
        score_image(type, score, epoch)
        return False, selected_idxs, candidate_flags
        ]].values
        # self.y = y.values

    def __getitem__(self, ind):
        return torch.FloatTensor(self.con[ind, :]), \
            self.uniq2[ind, :], \
            self.uniq3[ind, :], \
            self.uniq4[ind, :]

    def __len__(self):
        return self.uniq2.shape[0]


testdf = pd.read_csv(r'test.csv')
testdf.shape
test = HealthDatasetPred(testdf)
test_ldr = dataloader.DataLoader(test, batch_size=testdf.shape[0])
tst_ldr = iter(test_ldr)
con, x2, x3, x4 = next(tst_ldr)

model = torch.load('cat_embed.pkl')
pred = model(con, x2, x3, x4)

from torch.distributions import categorical

cat = categorical.Categorical(pred)
res = cat.sample()

testdf.index
testdf['class'] = res
testdf['class'].to_csv('cat_embed.csv')
def actor_critic(device="cpu"):
    discount_factor = 0.7
    lr = 1e-3
    random_chance = 0.05
    save_path = "actor_critic/"
    train = True

    fenv = FceuxNesEmulatorEnvironment()
    policy_estimator = PolicyEstimator()
    value_estimator = ValueEstimator()
    policy_optimizer = torch.optim.Adam(policy_estimator.parameters(), lr=lr)
    value_optimizer = torch.optim.Adam(value_estimator.parameters(), lr=lr)

    if os.path.isfile(save_path + "policy_estimator"):
        policy_estimator.load_state_dict(torch.load(save_path + "policy_estimator"))
        print("Policy estimator loaded")
    if os.path.isfile(save_path + "value_estimator"):
        value_estimator.load_state_dict(torch.load(save_path + "value_estimator"))
        print("Value estimator loaded")

    if train:
        avg_reward = 0
        avg_length = 0
        for i in range(200000):
            state = fenv.reset()
            episode_reward = 0.0
            episode_length = 0
            rewards = []
            states = []
            actions = []
            for t in itertools.count():
                action_probs = policy_estimator(state)
                # print(action_probs)
                if np.random.uniform() < random_chance:
                    action = torch.FloatTensor(1).random_(0, 255).detach()[0]
                else:
                    action = cat.Categorical(action_probs).sample().detach()
                # print(action)
                true_act = toAction(action)
                next_state, reward, done, _ = fenv.step(true_act)
                rewards.append(reward)
                states.append(state)
                actions.append(action)
                episode_reward += reward
                episode_length = t

                next_value = value_estimator(next_state)
                target_value = reward + discount_factor * next_value
                predict_value = value_estimator(state)
                advance = target_value.detach() - predict_value

                value_loss = (target_value.detach() - predict_value)**2
                value_optimizer.zero_grad()
                value_loss.backward()
                value_optimizer.step()

                m = cat.Categorical(action_probs)
                # action_prob = action_probs[action]
                policy_loss = -m.log_prob(action) * advance.detach()
                # print(policy_loss)
                policy_optimizer.zero_grad()
                policy_loss.backward()
                policy_optimizer.step()

                if done:
                    break
                state = next_state

            # print("Episode reward: {}".format(episode_reward))
            # print("Episode length: {}".format(episode_length))
            avg_reward += episode_reward
            avg_length += episode_length
            # print("Average reward: {}".format(avg_reward/(i+1)))
            avg_reward = 0
            avg_length = 0

            print("Saving model...")
            torch.save(policy_estimator.state_dict(), save_path + "policy_estimator")
            torch.save(value_estimator.state_dict(), save_path + "value_estimator")
    obs_history[i] = np.vstack((obs_t_minus_0[i], obs_t_minus_1[i], obs_t_minus_2[i],
                                obs_t_minus_3[i], obs_t_minus_4[i], obs_t_minus_5[i]))

if isinstance(obs_history, np.ndarray):
    obs_history = th.from_numpy(obs_history).float()

length = 0
for t in range(MAX_STEPS):
    obs_history = obs_history.type(FloatTensor)
    action_probs = maddpg.select_action(obs_history, pose).data.cpu()
    action_probs_valid = np.copy(action_probs)
    action = []
    for i, probs in enumerate(action_probs):
        rbt = world.robots[i]
        for j, frt in enumerate(rbt.get_frontiers()):
            if len(frt) == 0:
                action_probs_valid[i][j] = 0
        action.append(categorical.Categorical(probs=th.tensor(action_probs_valid[i])).sample())
    action = th.tensor(onehot_from_action(action))
    acts = np.argmax(action, axis=1)

    obs_, reward, done, _, next_pose = world.step(acts)
    length = length + np.sum(world.path_length)
    next_pose = th.tensor(next_pose)
    reward = th.FloatTensor(reward).type(FloatTensor)
    obs_ = np.stack(obs_)
    obs_ = th.from_numpy(obs_).float()

    obs_t_minus_5 = copy(obs_t_minus_4)
    obs_t_minus_4 = copy(obs_t_minus_3)
    obs_t_minus_3 = copy(obs_t_minus_2)
    obs_t_minus_2 = copy(obs_t_minus_1)
    obs_t_minus_1 = copy(obs_t_minus_0)
def sample_next_char_id(predicted_logits):
    next_char_id = categorical.Categorical(logits=predicted_logits).sample()
    return next_char_id
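# Minimal usage sketch for the helper above; the vocabulary size and logits are
# made-up values, and the function is repeated here so the sketch runs standalone.
import torch
from torch.distributions import categorical

def sample_next_char_id(predicted_logits):
    return categorical.Categorical(logits=predicted_logits).sample()

vocab_size = 65                                  # hypothetical character vocabulary
predicted_logits = torch.randn(vocab_size)
next_char_id = sample_next_char_id(predicted_logits)
print(next_char_id.item())                       # an index in [0, vocab_size)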