def save_function(self):
    # separate_emb, img_encoder, txt_encoder, f_emb, and savename come
    # from the enclosing scope.
    if separate_emb:
        ModelIO.save(img_encoder, savename + "_Img")
        ModelIO.save(txt_encoder, savename + "_Txt")
    ModelIO.save(f_emb, savename)
    print("Similarity Embedding function(s) saved while training")
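# ModelIO itself is not defined in these snippets. Below is a minimal sketch
# of a pickle-based implementation compatible with the static
# ModelIO.save/ModelIO.load calls used here; the .pkl suffix and flat file
# layout are assumptions, and note that the RL agents further below use a
# separate, instance-based ModelIO for torch models.
import pickle


class ModelIO(object):
    @staticmethod
    def save(obj, name):
        # Serialize any picklable object under the given name.
        with open(name + '.pkl', 'wb') as f:
            pickle.dump(obj, f)

    @staticmethod
    def load(name):
        # Raises IOError/FileNotFoundError when nothing was saved under
        # `name`, which the try/except caching pattern below relies on.
        with open(name + '.pkl', 'rb') as f:
            return pickle.load(f)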
# Shared imports for the agent classes below. Policy, StateValueFunction,
# Actor, Critic, Q, ReplayBuffer, Buffer, EpisodeStats, ModelIO, tt (a
# numpy-to-tensor helper) and `device` are project-local and assumed to be
# defined elsewhere.
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class REINFORCE:
    def __init__(self, state_dim, action_dim, gamma=0.99, hidden_dim=64,
                 policy_lr=0.001, baseline_lr=0.001):
        self._V = StateValueFunction(state_dim, hidden_dim=hidden_dim)
        self._pi = Policy(state_dim, action_dim, hidden_dim=hidden_dim)
        # self._V.cuda()
        # self._pi.cuda()
        self._gamma = gamma
        self._loss_function = nn.MSELoss()
        self._V_optimizer = optim.Adam(self._V.parameters(), lr=baseline_lr)
        self._pi_optimizer = optim.Adam(self._pi.parameters(), lr=policy_lr)
        self._action_dim = action_dim
        # --- ModelIO ---
        self._modelio = ModelIO(model_path=Path(__file__).resolve().parent / 'models')

    def get_action(self, s):
        mu_action = self._pi(tt(s))
        # Sample from a Gaussian centred on the policy mean and score the
        # sample under that same Gaussian, so the log-probability matches the
        # action actually taken. (The original computed
        # torch.log(mu_action + noise), which is not a log-probability and
        # can produce NaN.)
        dist = torch.distributions.Normal(mu_action, 0.1)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        action_sampled = np.clip(action.numpy(), a_min=-1.0, a_max=1.0)
        return action_sampled, log_prob

    def train(self, env, episodes, time_steps):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))
        for i_episode in range(1, episodes + 1):
            # Generate an episode.
            # An episode is an array of (state, action, log_prob, reward) tuples.
            episode = []
            s = env.reset()
            for t in range(time_steps):
                a, log_prob_a = self.get_action(s)
                ns, r, d, _ = env.step(a)
                stats.episode_rewards[i_episode - 1] += r
                stats.episode_lengths[i_episode - 1] = t
                episode.append((s, a, log_prob_a, r))
                if d:
                    break
                s = ns

            # Walk the episode backwards, accumulating the return G_t.
            T = len(episode)
            G = 0.0
            for t in reversed(range(T)):
                s, a, log_prob, r = episode[t]
                G = self._gamma * G + r
                baseline = self._V(tt(s))
                # Detach the baseline so the policy update does not
                # backpropagate into the value network.
                advantage = G - baseline.detach()
                self._train_baseline(G, baseline)
                self._train_policy(advantage, t, log_prob)

            print("\r{} Steps in Episode {}/{}. Reward {}".format(
                len(episode), i_episode, episodes,
                sum(e[3] for e in episode)))
        return stats

    def _train_baseline(self, G, baseline):
        self._V_optimizer.zero_grad()
        loss = self._loss_function(tt(np.array([G])), baseline)
        loss.backward(retain_graph=True)
        self._V_optimizer.step()

    def _train_policy(self, error, t, log_prob_a):
        self._pi_optimizer.zero_grad()
        neg_log_prob_a = -log_prob_a
        target = np.power(self._gamma, t) * error * neg_log_prob_a
        target.backward(retain_graph=True)
        self._pi_optimizer.step()

    def save_models(self, model_name):
        self._modelio.save(model=self._pi, model_name=f'r_c_policy_{model_name}.pt')
        self._modelio.save(model=self._V, model_name=f'r_c_baseline_{model_name}.pt')

    def load_models(self, model_name):
        self._modelio.load(model=self._pi, model_name=f'r_c_policy_{model_name}.pt')
        self._modelio.load(model=self._V, model_name=f'r_c_baseline_{model_name}.pt')
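# A minimal usage sketch, not part of the original code. It assumes an
# old-style gym environment (4-tuple step) with a 1-d continuous action, and
# that clipping actions to [-1, 1] is acceptable for it; "Pendulum-v1" and
# the episode counts are illustrative choices.
import gym

env = gym.make("Pendulum-v1")
agent = REINFORCE(state_dim=env.observation_space.shape[0],
                  action_dim=env.action_space.shape[0])
stats = agent.train(env, episodes=500, time_steps=200)
agent.save_models(model_name="pendulum")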
if n_sbu:
    dataset_name += "+sbu%d" % n_sbu

# global vectorizer
vect_name = 'tokenizer_%s' % dataset_name
mlb_name = 'mlb_%s' % dataset_name
try:
    if mlb_name:
        mlb = ModelIO.load(mlb_name)
        print("MLB loaded from file")
    vect = ModelIO.load(vect_name)
    # vect = ModelIO.load('tokenizer_reddit')
    print("Tokenizer loaded from file.")
except Exception:
    if mlb_name:
        vect, mlb = prepVect(n_sbu=n_sbu, n_captions=1, multilabel=True)
        ModelIO.save(vect, vect_name)
        ModelIO.save(mlb, mlb_name)
        print("Saved %s, %s for future use." % (vect_name, mlb_name))
    else:
        vect = prepVect(n_sbu=n_sbu, n_captions=1)
        ModelIO.save(vect, vect_name)
        print("Saved %s for future use." % vect_name)


class DataETL():

    @staticmethod
    def getFinalStream(X, Y, sources, sources_k, batch_size=128,
                       embedding_dim=300, shuffle=False):
        """
        Returns
        -------
        """
class DDPG:
    def __init__(self, state_dim, action_dim, gamma=0.99, noise_std=0.02,
                 hidden_dim=64, actor_lr=0.001, critic_lr=0.001, verbose=False):
        self.gamma = gamma
        # self.tau = 0.01
        self.tau = 0.001
        self.actor = Actor(state_dim, noise_std=noise_std, hidden_dim=hidden_dim)
        self.actor_target = Actor(state_dim, noise_std=noise_std, hidden_dim=hidden_dim)
        self.critic = Critic(state_dim, action_dim, hidden_dim=hidden_dim)
        self.critic_target = Critic(state_dim, action_dim, hidden_dim=hidden_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.buffer = ReplayBuffer(max_size=1e5)
        self.logging_period = 10 if verbose else 100
        # --- ModelIO ---
        self.modelio = ModelIO(model_path=Path(__file__).resolve().parent / 'models')

    def update_target(self, target, source):
        # Polyak averaging: the target network slowly tracks the online one.
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data * self.tau +
                                    target_param.data * (1.0 - self.tau))

    def get_action(self, state, is_testing=False):
        """Used at test time (not training)."""
        action = self.actor(state, is_testing).detach().numpy()
        return action

    def train(self, env, episodes, timesteps):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))
        for i_episode in range(1, episodes + 1):
            # env.step below returns five values (gymnasium-style API), so
            # reset() correspondingly returns an (observation, info) pair.
            state, _ = env.reset()
            for t in range(timesteps):
                # --- choose action
                action = self.actor(state).detach()
                env_action = torch.clamp(action, min=-2.0, max=2.0).detach().numpy()
                next_state, reward, done, _, _ = env.step(env_action)

                # --- saving stats
                stats.episode_rewards[i_episode - 1] += reward
                stats.episode_lengths[i_episode - 1] = t

                # --- save the transition
                self.buffer.add_transition(
                    state=torch.from_numpy(state).float().to(device),
                    action=action,
                    next_state=torch.from_numpy(next_state).float().to(device),
                    reward=reward,
                    done=done)

                # --- sample a batch of transitions
                batch = self.buffer.next_random_batch(batch_size=32)

                # --- train
                self.train_batch(batch)

                # --- update target networks
                self.update_target(target=self.actor_target, source=self.actor)
                self.update_target(target=self.critic_target, source=self.critic)

                if done:
                    break
                state = next_state

            # logging
            if i_episode % self.logging_period == 0:
                print(f"{int(stats.episode_lengths[i_episode - 1])} Steps in "
                      f"Episode {i_episode}/{episodes}. "
                      f"Reward {stats.episode_rewards[i_episode - 1]}")

            # snapshot instants
            snaps_moments = np.array([400, 800, 1200, 1600])
            for snap in snaps_moments:
                if i_episode == snap:
                    self.save_models(model_name=snap)
        return stats

    def train_batch(self, batch):
        states, actions, next_states, rewards, dones = batch
        batch_rewards = torch.FloatTensor(rewards).to(device)
        batch_dones = torch.FloatTensor(dones).to(device)
        batch_states = torch.stack(states).to(device)
        batch_actions = torch.stack(actions).to(device)
        batch_next_states = torch.stack(next_states).to(device)

        # Bootstrapped TD target; terminal transitions must not bootstrap,
        # hence the (1 - done) mask (the original ignored `dones`).
        batch_na = self.actor_target(batch_next_states)
        batch_q_ns_na = self.critic_target(batch_next_states,
                                           batch_na.detach().view(-1, 1))
        update_targets = (batch_rewards.view(-1, 1) +
                          self.gamma * (1.0 - batch_dones.view(-1, 1)) *
                          batch_q_ns_na).detach()

        batch_q_s_a = self.critic(batch_states, batch_actions.view(-1, 1))
        critic_loss = F.mse_loss(batch_q_s_a, update_targets)
        # Gradient ascent on Q(s, pi(s)), i.e. minimize its negative.
        actor_loss = -self.critic(batch_states,
                                  self.actor(batch_states).view(-1, 1)).mean()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def save_models(self, model_name):
        self.modelio.save(model=self.actor, model_name=f'ddpg_c_actor_{model_name}.pt')
        self.modelio.save(model=self.critic, model_name=f'ddpg_c_critic_{model_name}.pt')

    def load_models(self, model_name):
        self.modelio.load(model=self.actor, model_name=f'ddpg_c_actor_{model_name}.pt')
        self.modelio.load(model=self.actor_target, model_name=f'ddpg_c_actor_{model_name}.pt')
        self.modelio.load(model=self.critic, model_name=f'ddpg_c_critic_{model_name}.pt')
        self.modelio.load(model=self.critic_target, model_name=f'ddpg_c_critic_{model_name}.pt')
class ActorCritic:
    def __init__(self, state_dim, action_dim, gamma, d2c=None):
        self._V = StateValueFunction(state_dim)
        self._pi = Policy(state_dim, action_dim)
        self.d2c = d2c  # discrete to continuous actions
        # self._V.cuda()
        # self._pi.cuda()
        self._gamma = gamma
        self._loss_function = nn.MSELoss()
        self._V_optimizer = optim.Adam(self._V.parameters(), lr=0.001)
        self._pi_optimizer = optim.Adam(self._pi.parameters(), lr=0.0001)
        self._action_dim = action_dim
        # --- ModelIO ---
        self._modelio = ModelIO(model_path=Path(__file__).resolve().parent / 'models')
        self._baseline_model_name = 'ac_baseline.pt'
        self._policy_model_name = 'ac_policy.pt'

    def get_action(self, s):
        probs = self._pi(tt(s))
        action = np.random.choice(a=self._action_dim,
                                  p=np.squeeze(probs.detach().numpy()))
        log_prob = torch.log(probs.squeeze(0)[action])
        # converting the discrete action [0, 1, 2, ...] to an action in the
        # continuous range (actionspace.low <--> actionspace.high)
        if self.d2c:
            action = self.d2c(action)
        return action, log_prob

    def train(self, env, episodes, time_steps):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))
        for i_episode in range(1, episodes + 1):
            # Generate an episode of (state, action, reward) transitions.
            s = env.reset()
            compounded_decay = 1
            for t in range(time_steps):
                a, log_prob_a = self.get_action(s)
                ns, r, d, _ = env.step(a)
                stats.episode_rewards[i_episode - 1] += r
                stats.episode_lengths[i_episode - 1] = t

                # One-step TD target: r + gamma * V(s') for non-terminal steps.
                target = r
                if not d:
                    target = target + self._gamma * self._V(tt(ns)).cpu().detach()
                baseline = self._V(tt(s))
                # Detach the baseline so the policy update does not
                # backpropagate into the value network.
                advantage = target - baseline.detach()
                compounded_decay *= self._gamma
                self._train_baseline(target, baseline)
                self._train_policy(advantage, compounded_decay, log_prob_a)
                if d:
                    break
                s = ns

            print(f"{stats.episode_lengths[i_episode - 1]} Steps in Episode "
                  f"{i_episode}/{episodes}. "
                  f"Reward {stats.episode_rewards[i_episode - 1]}")
        return stats

    def _train_policy(self, advantage, comp_decay, log_prob_a):
        self._pi_optimizer.zero_grad()
        neg_log_prob_a = -log_prob_a
        target_objective = comp_decay * advantage * neg_log_prob_a
        target_objective.backward()
        self._pi_optimizer.step()

    def _train_baseline(self, target, baseline):
        self._V_optimizer.zero_grad()
        loss = self._loss_function(tt(np.array([target])), baseline)
        loss.backward(retain_graph=True)
        self._V_optimizer.step()

    def save_models(self):
        self._modelio.save(model=self._pi, model_name=self._policy_model_name)
        self._modelio.save(model=self._V, model_name=self._baseline_model_name)

    def load_models(self):
        self._modelio.load(model=self._pi, model_name=self._policy_model_name)
        self._modelio.load(model=self._V, model_name=self._baseline_model_name)
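# A minimal usage sketch, not part of the original code, showing one way to
# build the d2c hook: the softmax policy picks one of n_discrete indices and
# d2c maps that index onto an evenly spaced grid over the continuous action
# range. The environment name and grid size are illustrative assumptions.
import gym

env = gym.make("Pendulum-v1")
n_discrete = 5
grid = np.linspace(env.action_space.low[0], env.action_space.high[0], n_discrete)
agent = ActorCritic(state_dim=env.observation_space.shape[0],
                    action_dim=n_discrete,
                    gamma=0.99,
                    d2c=lambda a: np.array([grid[a]]))
stats = agent.train(env, episodes=1000, time_steps=200)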
class DQN:
    def __init__(self, state_dim, action_dim, gamma, d2c=None):
        self._q = Q(state_dim, action_dim)
        self._q_target = Q(state_dim, action_dim)
        # self._q.cuda()
        # self._q_target.cuda()
        self._gamma = gamma
        self._loss_function = nn.MSELoss()
        self._q_optimizer = optim.Adam(self._q.parameters(), lr=0.0001)
        self._action_dim = action_dim
        self._replay_buffer = ReplayBuffer(5000)
        self._d2c = d2c
        # --- ModelIO ---
        self._modelio = ModelIO(model_path=Path(__file__).resolve().parent / 'models')
        self._q_model_name = 'q.pt'
        self._target_model_name = 'target.pt'

    def get_action(self, x, epsilon):
        # epsilon-greedy: act greedily w.r.t. Q, explore with probability epsilon.
        u = np.argmax(self._q(tt(x)).cpu().detach().numpy())
        if np.random.uniform() < epsilon:
            u = np.random.randint(self._action_dim)
        if self._d2c:
            u = self._d2c(u)
        return u

    def train(self, env, episodes, time_steps, epsilon):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))
        for i_episode in range(1, episodes + 1):
            state = env.reset()
            for t in range(time_steps):
                action = self.get_action(state, epsilon)
                next_state, reward, done, _ = env.step(action)
                stats.episode_rewards[i_episode - 1] += reward
                stats.episode_lengths[i_episode - 1] = t

                # calculate priority of the transition (td-error)
                q_s_a = self._q(tt(state)).cpu().detach().numpy()[int(np.squeeze(action))]
                target = reward + self._gamma * np.max(
                    self._q_target(tt(next_state)).cpu().detach().numpy())
                priority = -abs(target - q_s_a) + np.random.randn() * 1e-2

                # add the experience into the buffer
                self._replay_buffer.add_transition(state=state,
                                                   action=action,
                                                   next_state=next_state,
                                                   reward=reward,
                                                   done=done,
                                                   priority=priority)

                # sample a batch of experiences
                samples = self._replay_buffer.next_batch(batch_size=64)

                # update q network parameters
                self._train_batch(samples)

                # update target network slowly (Polyak averaging)
                soft_update(target=self._q_target, source=self._q, tau=0.01)

                if done:
                    break
                state = next_state

            print(f"{int(stats.episode_lengths[i_episode - 1])} Steps in Episode "
                  f"{i_episode}/{episodes}. "
                  f"Reward {stats.episode_rewards[i_episode - 1]}")
        return stats

    def _train_batch(self, batch):
        states, actions, next_states, rewards, dones = batch
        batch_size = len(rewards)

        # calculating q(s, a)
        batch_actions = np.array(actions).squeeze()
        batch_qs = self._q(tt(np.array(states)))
        batch_qs = batch_qs[np.arange(batch_size), batch_actions]

        # calculating r + gamma * max_a' q_target(s', a'); only non-terminal
        # transitions bootstrap, so negate `dones` (the original indexed with
        # `dones` directly, which inverts the mask).
        targets = tt(np.array(rewards))
        non_terminal_idx = ~np.array(dones, dtype=bool)
        batch_next_qs = self._q_target(tt(np.array(next_states)))
        batch_max_next_qs, _ = batch_next_qs.max(1)
        targets[non_terminal_idx] = (targets[non_terminal_idx] +
                                     self._gamma * batch_max_next_qs[non_terminal_idx])

        self._q_optimizer.zero_grad()
        loss = self._loss_function(batch_qs, targets)
        loss.backward()
        self._q_optimizer.step()

    def save_models(self):
        self._modelio.save(model=self._q, model_name=self._q_model_name)
        self._modelio.save(model=self._q_target, model_name=self._target_model_name)

    def load_models(self):
        self._modelio.load(model=self._q, model_name=self._q_model_name)
        self._modelio.load(model=self._q_target, model_name=self._target_model_name)
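# soft_update is called by DQN.train but not defined in this snippet. A
# sketch consistent with the Polyak averaging already used by
# DDPG.update_target above:
def soft_update(target, source, tau):
    # target <- tau * source + (1 - tau) * target, parameter by parameter.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)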
class PPO:
    def __init__(self, state_dim, action_dim, action_std=0.1, gamma=0.99,
                 hidden_dim=64, actor_lr=0.001, critic_lr=0.001, K_epochs=5,
                 eps_clip=0.2, entropy_coeff=0.02, verbose=False):
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.entropy_coeff = entropy_coeff
        self.verbose = verbose
        self.critic = Critic(state_dim, hidden_dim=hidden_dim).to(device)
        self.actor = Actor(state_dim, action_dim, action_std=action_std,
                           hidden_dim=hidden_dim).to(device)
        self.actor_old = Actor(state_dim, action_dim, action_std=action_std,
                               hidden_dim=hidden_dim).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.actor_old.load_state_dict(self.actor.state_dict())
        self.buffer = Buffer()
        # --- ModelIO ---
        self.modelio = ModelIO(model_path=Path(__file__).resolve().parent / 'models')

    def get_action(self, state):
        """Used at test time (not training)."""
        action, _ = self.actor_old(state)
        action = torch.clamp(action, min=-1.0, max=1.0).detach().numpy()
        return action

    def train(self, env, episodes, timesteps, update_timestep):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))
        timestep = 0
        for i_episode in range(1, episodes + 1):
            state = env.reset()
            for t in range(timesteps):
                timestep += 1

                # Running policy_old:
                action, action_logprob = self.actor_old(state)
                env_action = torch.clamp(action, min=-1.0, max=1.0).detach().numpy()
                next_state, reward, done, _ = env.step(env_action)

                # saving stats
                stats.episode_rewards[i_episode - 1] += reward
                stats.episode_lengths[i_episode - 1] = t

                # Saving the experience in buffer:
                self.buffer.states.append(torch.from_numpy(state).float().to(device))
                self.buffer.actions.append(action)
                self.buffer.logprobs.append(action_logprob)
                self.buffer.rewards.append(reward)
                self.buffer.is_terminals.append(done)

                # update if it is time
                if timestep % update_timestep == 0:
                    self.update()
                    self.buffer.clear_buffer()
                    timestep = 0

                if done:
                    break
                state = next_state

            # logging
            logging_period = 10 if self.verbose else 1000
            if i_episode % logging_period == 0:
                print(f"{int(stats.episode_lengths[i_episode - 1])} Steps in "
                      f"Episode {i_episode}/{episodes}. "
                      f"Reward {stats.episode_rewards[i_episode - 1]}")
        return stats

    def update(self):
        # Monte Carlo estimate of the return over all buffered steps
        # (possibly across episodes):
        rewards = np.zeros_like(self.buffer.rewards, dtype=np.float32)
        discounted_reward = 0
        for i, (reward, is_terminal) in enumerate(
                zip(reversed(self.buffer.rewards),
                    reversed(self.buffer.is_terminals))):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards[-(i + 1)] = discounted_reward

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states = torch.stack(self.buffer.states).to(device).detach()
        old_actions = torch.stack(self.buffer.actions).to(device).detach()
        old_logprobs = torch.stack(self.buffer.logprobs).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, dist_entropy = self.actor.evaluate(old_states, old_actions)

            # Getting the state values from the critic; squeeze so the shape
            # matches `rewards` and the advantage stays one-dimensional.
            state_values = torch.squeeze(self.critic(old_states))

            # Finding the ratio (pi_theta / pi_theta_old):
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding the clipped surrogate loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages
            actor_loss = -torch.min(surr1, surr2) - self.entropy_coeff * dist_entropy
            critic_loss = 0.5 * F.mse_loss(state_values, rewards)

            # take gradient step (actor)
            self.actor_optimizer.zero_grad()
            actor_loss.mean().backward()
            self.actor_optimizer.step()

            # take gradient step (critic)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

        # Copy new weights into old policy:
        self.actor_old.load_state_dict(self.actor.state_dict())

    def save_models(self, model_name):
        self.modelio.save(model=self.actor, model_name=f'ppo_c_actor_{model_name}.pt')
        self.modelio.save(model=self.critic, model_name=f'ppo_c_critic_{model_name}.pt')

    def load_models(self, model_name):
        self.modelio.load(model=self.actor, model_name=f'ppo_c_actor_{model_name}.pt')
        self.modelio.load(model=self.actor_old, model_name=f'ppo_c_actor_{model_name}.pt')
        self.modelio.load(model=self.critic, model_name=f'ppo_c_critic_{model_name}.pt')