def main():
    # Create CartPole environment and network
    env = gym.make('CartPole-v0').unwrapped
    if not os.path.exists(model_path):
        raise Exception("You should train the DQN first!")
    net = DQN(n_state=env.observation_space.shape[0],
              n_action=env.action_space.n,
              epsilon=epsilon,
              batch_size=batch_size,
              model_path=model_path)
    net.load()
    net.cuda()

    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        while True:
            # env.render()

            # Select action and obtain the reward
            a = net.chooseAction(s)
            s_, r, finish, _ = env.step(a)
            total_reward += r
            if finish:
                print("Episode: %d \t Total reward: %d \t Eps: %f" %
                      (i, total_reward, net.epsilon))
                reward_list.append(total_reward)
                break
            s = s_
    env.close()
    print("Testing average reward: ", np.mean(reward_list))
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max,
                                      args.atoms)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space)
        if args.model and os.path.isfile(args.model):
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)
        if args.cuda:
            self.online_net.cuda()
            self.target_net.cuda()
            self.support = self.support.cuda()

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        return (self.online_net(state.unsqueeze(0)).data *
                self.support).sum(2).max(1)[1][0]

    # Acts with an ε-greedy policy
    def act_e_greedy(self, state, epsilon=0.001):
        return random.randrange(self.action_space) if random.random(
        ) < epsilon else self.act(state)

    def learn(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(
            self.batch_size)

        # Calculate current state probabilities
        self.online_net.reset_noise()  # Sample new noise for online network
        ps = self.online_net(states)  # Probabilities p(s_t, ·; θonline)
        ps_a = ps[range(self.batch_size), actions]  # p(s_t, a_t; θonline)

        # Calculate nth next state probabilities
        self.online_net.reset_noise()  # Sample new noise for action selection
        pns = self.online_net(next_states).data  # Probabilities p(s_t+n, ·; θonline)
        dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
        # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
        argmax_indices_ns = dns.sum(2).max(1)[1]
        self.target_net.reset_noise()  # Sample new target net noise
        pns = self.target_net(next_states).data  # Probabilities p(s_t+n, ·; θtarget)
        # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)
        pns_a = pns[range(self.batch_size), argmax_indices_ns]

        # Compute Tz (Bellman operator T applied to z)
        Tz = returns.unsqueeze(1) + nonterminals * (
            self.discount**self.n) * self.support.unsqueeze(
                0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
        Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
        # Compute L2 projection of Tz onto fixed support z
        b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
        l, u = b.floor().long(), b.ceil().long()
        # Fix disappearing probability mass when l = b = u (b is int)
        l[(u > 0) * (l == u)] -= 1
        u[(l < (self.atoms - 1)) * (l == u)] += 1

        # Distribute probability of Tz
        m = states.data.new(self.batch_size, self.atoms).zero_()
        offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                self.batch_size).unsqueeze(1).expand(
                                    self.batch_size,
                                    self.atoms).type_as(actions)
        m.view(-1).index_add_(
            0, (l + offset).view(-1),
            (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
        m.view(-1).index_add_(
            0, (u + offset).view(-1),
            (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        ps_a = ps_a.clamp(min=1e-3)  # Clamp for numerical stability in log
        loss = -torch.sum(
            Variable(m) * ps_a.log(),
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        (weights * loss).mean().backward()  # Importance weight losses
        self.optimiser.step()

        mem.update_priorities(idxs, loss.data)  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def save(self, path):
        torch.save(self.online_net.state_dict(), os.path.join(path, 'model.pth'))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        return (self.online_net(state.unsqueeze(0)).data *
                self.support).sum(2).max(1)[0][0]

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
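# For reference, the distributional projection inside learn() above can be read as a
# standalone step. The helper below is an illustrative, self-contained restatement of
# that logic (the name project_distribution and the argument layout are assumptions,
# not part of the original agent).
import torch


def project_distribution(pns_a, returns, nonterminals, support, Vmin, Vmax, discount, n):
    """Project the n-step Bellman-updated support Tz back onto the fixed support z.

    pns_a:        (batch, atoms) next-state probabilities for the selected actions
    returns:      (batch,) n-step returns
    nonterminals: (batch, 1) 1.0 for non-terminal nth next states, 0.0 otherwise
    """
    batch_size, atoms = pns_a.size()
    delta_z = (Vmax - Vmin) / (atoms - 1)
    Tz = returns.unsqueeze(1) + nonterminals * (discount ** n) * support.unsqueeze(0)
    Tz = Tz.clamp(min=Vmin, max=Vmax)
    b = (Tz - Vmin) / delta_z
    l, u = b.floor().long(), b.ceil().long()
    # Avoid losing probability mass when b lands exactly on an atom (l == u)
    l[(u > 0) * (l == u)] -= 1
    u[(l < (atoms - 1)) * (l == u)] += 1
    m = pns_a.new_zeros(batch_size, atoms)
    offset = torch.arange(0, batch_size * atoms, atoms, device=pns_a.device).unsqueeze(1)
    m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))
    m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))
    return m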
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, args.atoms)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.priority_exponent = args.priority_exponent
        self.max_gradient_norm = args.max_gradient_norm

        self.policy_net = DQN(args, self.action_space)
        if args.model and os.path.isfile(args.model):
            self.policy_net.load_state_dict(torch.load(args.model))
        self.policy_net.train()

        self.target_net = DQN(args, self.action_space)
        self.update_target_net()
        self.target_net.eval()

        self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps)
        if args.cuda:
            self.policy_net.cuda()
            self.target_net.cuda()
            self.support = self.support.cuda()

    # Resets noisy weights in all linear layers (of policy and target nets)
    def reset_noise(self):
        self.policy_net.reset_noise()
        self.target_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[1][0]

    def learn(self, mem):
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size)
        batch_size = len(idxs)  # May return less than specified if invalid transitions sampled

        # Calculate current state probabilities
        ps = self.policy_net(states)  # Probabilities p(s_t, ·; θpolicy)
        ps_a = ps[range(batch_size), actions]  # p(s_t, a_t; θpolicy)

        # Calculate nth next state probabilities
        pns = self.policy_net(next_states).data  # Probabilities p(s_t+n, ·; θpolicy)
        dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θpolicy))
        # Perform argmax action selection using policy network: argmax_a[(z, p(s_t+n, a; θpolicy))]
        argmax_indices_ns = dns.sum(2).max(1)[1]
        pns = self.target_net(next_states).data  # Probabilities p(s_t+n, ·; θtarget)
        # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θpolicy))]; θtarget)
        pns_a = pns[range(batch_size), argmax_indices_ns]
        pns_a *= nonterminals  # Set p = 0 for terminal nth next states as all possible expected returns = expected reward at final transition

        # Compute Tz (Bellman operator T applied to z)
        Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
        Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
        # Compute L2 projection of Tz onto fixed support z
        b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
        l, u = b.floor().long(), b.ceil().long()

        # Distribute probability of Tz
        m = states.data.new(batch_size, self.atoms).zero_()
        offset = torch.linspace(0, ((batch_size - 1) * self.atoms),
                                batch_size).long().unsqueeze(1).expand(
                                    batch_size, self.atoms).type_as(actions)
        m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
        m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(Variable(m) * ps_a.log(), 1)  # Cross-entropy loss (minimises Kullback-Leibler divergence)
        self.policy_net.zero_grad()
        (weights * loss).mean().backward()  # Importance weight losses
        nn.utils.clip_grad_norm(self.policy_net.parameters(), self.max_gradient_norm)  # Clip gradients (normalising by max value of gradient L2 norm)
        self.optimiser.step()
        mem.update_priorities(idxs, loss.data.abs().pow(self.priority_exponent))  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save(self, path):
        torch.save(self.policy_net.state_dict(), os.path.join(path, 'model.pth'))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[0][0]

    def train(self):
        self.policy_net.train()

    def eval(self):
        self.policy_net.eval()
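# Compared with the first variant, this agent clips the gradient norm before the
# optimiser step and feeds |loss|^priority_exponent back as the new priorities.
# A minimal, self-contained illustration of the clipping call (using the current
# clip_grad_norm_ spelling; the layer and the values here are arbitrary):
import torch
import torch.nn as nn

layer = nn.Linear(4, 2)
out = layer(torch.randn(8, 4)).pow(2).mean()
out.backward()
total_norm = nn.utils.clip_grad_norm_(layer.parameters(), max_norm=10.0)
print(total_norm)  # L2 norm of all gradients before clipping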
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
dlongtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
duinttype = torch.cuda.ByteTensor if torch.cuda.is_available() else torch.ByteTensor

Qt = DQN(in_channels=5, num_actions=18).type(dtype)
Qt_t = DQN(in_channels=5, num_actions=18).type(dtype)
Qt_t.load_state_dict(Qt.state_dict())
Qt_t.eval()
for param in Qt_t.parameters():
    param.requires_grad = False

if torch.cuda.device_count() > 0:
    Qt.cuda()
    Qt = nn.DataParallel(Qt).to(device0)
    Qt_t = nn.DataParallel(Qt_t).to(device0)
    batch_size = BATCH_SIZE * torch.cuda.device_count()
else:
    batch_size = BATCH_SIZE

# optimizer
optimizer = optim.RMSprop(Qt.parameters(), lr=LEARNING_RATE, alpha=ALPHA, eps=EPS)

# training parameters
# Create environment
import gym
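# Qt / Qt_t above form the usual online / target pair for (double) DQN. The helper
# below is a generic sketch of the target computation such a pair typically feeds;
# it is illustrative and not taken from this snippet.
import torch


def double_dqn_targets(q_online_next, q_target_next, rewards, not_done, gamma=0.99):
    """Double-DQN targets: actions chosen by the online net, evaluated by the target net.

    q_online_next, q_target_next: (batch, num_actions) Q-values for the next states
    rewards:  (batch,) immediate rewards
    not_done: (batch,) 1.0 for non-terminal next states, 0.0 for terminal ones
    """
    next_actions = q_online_next.argmax(dim=1, keepdim=True)
    next_q = q_target_next.gather(1, next_actions).squeeze(1)
    return rewards + gamma * not_done * next_q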
def train():
    # global args
    # args = parser.parse_args()
    Learner = DQN().to(device)
    env = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1')
    # env = retro.make(game='Airstriker-Genesis', state='Level1')
    criterion = L2_loss(0.99).to(device)
    if is_cuda:
        Learner = Learner.cuda()
        criterion = criterion.cuda()
    optimizer = optim.SGD(Learner.parameters(), lr=0.01)
    eps_threshold = 0.8
    RM = ReplayMemory(1000)
    A_agent = ActorAgent(Learner, args)

    print("Start Episodes")
    for i_episode in range(50000):
        env.reset()
        A_agent.reset(Learner, args)
        last_state = get_screen(env)
        current_state = get_screen(env)
        state = current_state - last_state
        # state_var = torch.autograd.Variable(state)
        state_var = state.to(device)
        total_reward = 0
        if i_episode % 50 == 0:
            eps_threshold = 0.9

        for t in count():
            if t == 0:
                print("episode begin")
            eps_threshold -= 0.000019
            action_q = A_agent.act(state_var, eps_threshold)
            """
            if is_cuda:
                action_q = action_q.cpu()
                _, action = action_q.data.max(2)
            else:
                _, action = action_q.data.max(2)
            """
            _, action = action_q.data.max(2)
            action_numpy = action.squeeze(0).numpy()
            # print(list(action_numpy))
            for i in range(4):
                _, reward, done, _ = env.step(action_numpy)
                total_reward += reward

            last_state = current_state
            current_state = get_screen(env)
            state = current_state - last_state
            # state_var = torch.autograd.Variable(state)
            state_var = state.to(device)

            # Store the post-action state in the local buffer
            A_agent.add_to_buffer(reward, action_q, state_var)

            # Push the transition into ReplayMemory
            if len(A_agent.localbuffer) > 10:
                p, error = calc_priority_TDerror(Learner, criterion, A_agent, 10)
                RM.push(p, error)

            if done:
                break

            # Optimize Learner model
            # if t%100==0 and len(A_agent.localbuffer)>80 and len(RM)>=30:
            for i in range(4):
                error_batch = RM.priority_sample(30)
                optimizer.zero_grad()
                # error_batch.backward(retain_graph=True)
                error_batch.backward()
                # clamp gradients before the (single) optimizer step
                for param in Learner.parameters():
                    param.grad.data.clamp_(-1, 1)
                optimizer.step()
                print("{0}\t{1}\tLoss:{2}\tTotal:{3}\tReward:{4}".format(
                    i_episode,
                    t,
                    float(error_batch),
                    total_reward,
                    reward,
                ))
            RM.reset()
            # env.render()

        with open("total_reward.txt", "a") as f:
            f.write("{0}\t{1}".format(i_episode, total_reward))
            f.write("\n")
class Agent:
    def __init__(self):
        self.mode = "train"
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)
        print(self.config)
        self.load_config()

        self.online_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.target_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.online_net.train()
        self.target_net.train()
        self.update_target_net()
        for param in self.target_net.parameters():
            param.requires_grad = False

        if self.use_cuda:
            self.online_net.cuda()
            self.target_net.cuda()

        self.naozi = ObservationPool(capacity=self.naozi_capacity)
        # optimizer
        self.optimizer = torch.optim.Adam(
            self.online_net.parameters(),
            lr=self.config['training']['optimizer']['learning_rate'])
        self.clip_grad_norm = self.config['training']['optimizer']['clip_grad_norm']

    def load_config(self):
        # word vocab
        with open("vocabularies/word_vocab.txt") as f:
            self.word_vocab = f.read().split("\n")
        self.word2id = {}
        for i, w in enumerate(self.word_vocab):
            self.word2id[w] = i
        # char vocab
        with open("vocabularies/char_vocab.txt") as f:
            self.char_vocab = f.read().split("\n")
        self.char2id = {}
        for i, w in enumerate(self.char_vocab):
            self.char2id[w] = i
        self.EOS_id = self.word2id["</s>"]

        self.train_data_size = self.config['general']['train_data_size']
        self.question_type = self.config['general']['question_type']
        self.random_map = self.config['general']['random_map']
        self.testset_path = self.config['general']['testset_path']
        self.naozi_capacity = self.config['general']['naozi_capacity']
        self.eval_folder = pjoin(
            self.testset_path, self.question_type,
            ("random_map" if self.random_map else "fixed_map"))
        self.eval_data_path = pjoin(self.testset_path, "data.json")

        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training']['max_nb_steps_per_episode']
        self.max_episode = self.config['training']['max_episode']
        self.target_net_update_frequency = self.config['training']['target_net_update_frequency']
        self.learn_start_from_this_episode = self.config['training']['learn_start_from_this_episode']

        self.run_eval = self.config['evaluate']['run_eval']
        self.eval_batch_size = self.config['evaluate']['batch_size']
        self.eval_max_nb_steps_per_episode = self.config['evaluate']['max_nb_steps_per_episode']

        # Set the random seed manually for reproducibility.
        self.random_seed = self.config['general']['random_seed']
        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        if torch.cuda.is_available():
            if not self.config['general']['use_cuda']:
                print("WARNING: CUDA device detected but 'use_cuda: false' found in config.yaml")
                self.use_cuda = False
            else:
                torch.backends.cudnn.deterministic = True
                torch.cuda.manual_seed(self.random_seed)
                self.use_cuda = True
        else:
            self.use_cuda = False

        if self.question_type == "location":
            self.answer_type = "pointing"
        elif self.question_type in ["attribute", "existence"]:
            self.answer_type = "2 way"
        else:
            raise NotImplementedError

        self.save_checkpoint = self.config['checkpoint']['save_checkpoint']
        self.experiment_tag = self.config['checkpoint']['experiment_tag']
        self.save_frequency = self.config['checkpoint']['save_frequency']
        self.load_pretrained = self.config['checkpoint']['load_pretrained']
        self.load_from_tag = self.config['checkpoint']['load_from_tag']

        self.qa_loss_lambda = self.config['training']['qa_loss_lambda']
        self.interaction_loss_lambda = self.config['training']['interaction_loss_lambda']

        # replay buffer and updates
        self.discount_gamma = self.config['replay']['discount_gamma']
        self.replay_batch_size = self.config['replay']['replay_batch_size']
        self.command_generation_replay_memory = command_generation_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'],
            priority_fraction=self.config['replay']['replay_memory_priority_fraction'],
            discount_gamma=self.discount_gamma)
        self.qa_replay_memory = qa_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'],
            priority_fraction=0.0)
        self.update_per_k_game_steps = self.config['replay']['update_per_k_game_steps']
        self.multi_step = self.config['replay']['multi_step']

        # distributional RL
        self.use_distributional = self.config['distributional']['enable']
        self.atoms = self.config['distributional']['atoms']
        self.v_min = self.config['distributional']['v_min']
        self.v_max = self.config['distributional']['v_max']
        self.support = torch.linspace(self.v_min, self.v_max, self.atoms)  # Support (range) of z
        if self.use_cuda:
            self.support = self.support.cuda()
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)

        # dueling networks
        self.dueling_networks = self.config['dueling_networks']

        # double dqn
        self.double_dqn = self.config['double_dqn']

        # counting reward
        self.revisit_counting_lambda_anneal_episodes = self.config['episodic_counting_bonus']['revisit_counting_lambda_anneal_episodes']
        self.revisit_counting_lambda_anneal_from = self.config['episodic_counting_bonus']['revisit_counting_lambda_anneal_from']
        self.revisit_counting_lambda_anneal_to = self.config['episodic_counting_bonus']['revisit_counting_lambda_anneal_to']
        self.revisit_counting_lambda = self.revisit_counting_lambda_anneal_from

        # valid command bonus
        self.valid_command_bonus_lambda = self.config['valid_command_bonus_lambda']

        # epsilon greedy
        self.epsilon_anneal_episodes = self.config['epsilon_greedy']['epsilon_anneal_episodes']
        self.epsilon_anneal_from = self.config['epsilon_greedy']['epsilon_anneal_from']
        self.epsilon_anneal_to = self.config['epsilon_greedy']['epsilon_anneal_to']
        self.epsilon = self.epsilon_anneal_from
        self.noisy_net = self.config['epsilon_greedy']['noisy_net']
        if self.noisy_net:
            # disable epsilon greedy
            self.epsilon_anneal_episodes = -1
            self.epsilon = 0.0

        self.nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
        self.single_word_verbs = set(["inventory", "look", "wait"])
        self.two_word_verbs = set(["go"])

    def train(self):
""" Tell the agent that it's training phase. """ self.mode = "train" self.online_net.train() def eval(self): """ Tell the agent that it's evaluation phase. """ self.mode = "eval" self.online_net.eval() def update_target_net(self): self.target_net.load_state_dict(self.online_net.state_dict()) def reset_noise(self): if self.noisy_net: # Resets noisy weights in all linear layers (of online net only) self.online_net.reset_noise() def zero_noise(self): if self.noisy_net: self.online_net.zero_noise() self.target_net.zero_noise() def load_pretrained_model(self, load_from): """ Load pretrained checkpoint from file. Arguments: load_from: File name of the pretrained model checkpoint. """ print("loading model from %s\n" % (load_from)) try: if self.use_cuda: state_dict = torch.load(load_from) else: state_dict = torch.load(load_from, map_location='cpu') self.online_net.load_state_dict(state_dict) except: print("Failed to load checkpoint...") def save_model_to_path(self, save_to): torch.save(self.online_net.state_dict(), save_to) print("Saved checkpoint to %s..." % (save_to)) def init(self, obs, infos): """ Prepare the agent for the upcoming games. Arguments: obs: Previous command's feedback for each game. infos: Additional information for each game. """ # reset agent, get vocabulary masks for verbs / adjectives / nouns batch_size = len(obs) self.reset_binarized_counter(batch_size) self.not_finished_yet = np.ones((batch_size, ), dtype="float32") self.prev_actions = [["" for _ in range(batch_size)]] self.prev_step_is_still_interacting = np.ones( (batch_size, ), dtype="float32" ) # 1s and starts to be 0 when previous action is "wait" self.naozi.reset(batch_size=batch_size) def get_agent_inputs(self, string_list): sentence_token_list = [item.split() for item in string_list] sentence_id_list = [ _words_to_ids(tokens, self.word2id) for tokens in sentence_token_list ] input_sentence_char = list_of_token_list_to_char_input( sentence_token_list, self.char2id) input_sentence = pad_sequences( sentence_id_list, maxlen=max_len(sentence_id_list)).astype('int32') input_sentence = to_pt(input_sentence, self.use_cuda) input_sentence_char = to_pt(input_sentence_char, self.use_cuda) return input_sentence, input_sentence_char, sentence_id_list def get_game_info_at_certain_step(self, obs, infos): """ Get all needed info from game engine for training. Arguments: obs: Previous command's feedback for each game. infos: Additional information for each game. 
""" batch_size = len(obs) feedback_strings = [preproc(item, tokenizer=self.nlp) for item in obs] description_strings = [ preproc(item, tokenizer=self.nlp) for item in infos["description"] ] observation_strings = [ d + " <|> " + fb if fb != d else d + " <|> hello" for fb, d in zip(feedback_strings, description_strings) ] inventory_strings = [ preproc(item, tokenizer=self.nlp) for item in infos["inventory"] ] local_word_list = [ obs.split() + inv.split() for obs, inv in zip(observation_strings, inventory_strings) ] directions = ["east", "west", "north", "south"] if self.question_type in ["location", "existence"]: # agents observes the env, but do not change them possible_verbs = [["go", "inventory", "wait", "open", "examine"] for _ in range(batch_size)] else: possible_verbs = [ list(set(item) - set(["", "look"])) for item in infos["verbs"] ] possible_adjs, possible_nouns = [], [] for i in range(batch_size): object_nouns = [ item.split()[-1] for item in infos["object_nouns"][i] ] object_adjs = [ w for item in infos["object_adjs"][i] for w in item.split() ] possible_nouns.append( list(set(object_nouns) & set(local_word_list[i]) - set([""])) + directions) possible_adjs.append( list(set(object_adjs) & set(local_word_list[i]) - set([""])) + ["</s>"]) return observation_strings, [ possible_verbs, possible_adjs, possible_nouns ] def get_state_strings(self, infos): description_strings = infos["description"] inventory_strings = infos["inventory"] observation_strings = [ _d + _i for (_d, _i) in zip(description_strings, inventory_strings) ] return observation_strings def get_local_word_masks(self, possible_words): possible_verbs, possible_adjs, possible_nouns = possible_words batch_size = len(possible_verbs) verb_mask = np.zeros((batch_size, len(self.word_vocab)), dtype="float32") noun_mask = np.zeros((batch_size, len(self.word_vocab)), dtype="float32") adj_mask = np.zeros((batch_size, len(self.word_vocab)), dtype="float32") for i in range(batch_size): for w in possible_verbs[i]: if w in self.word2id: verb_mask[i][self.word2id[w]] = 1.0 for w in possible_adjs[i]: if w in self.word2id: adj_mask[i][self.word2id[w]] = 1.0 for w in possible_nouns[i]: if w in self.word2id: noun_mask[i][self.word2id[w]] = 1.0 adj_mask[:, self.EOS_id] = 1.0 return [verb_mask, adj_mask, noun_mask] def get_match_representations(self, input_observation, input_observation_char, input_quest, input_quest_char, use_model="online"): model = self.online_net if use_model == "online" else self.target_net description_representation_sequence, description_mask = model.representation_generator( input_observation, input_observation_char) quest_representation_sequence, quest_mask = model.representation_generator( input_quest, input_quest_char) match_representation_sequence = model.get_match_representations( description_representation_sequence, description_mask, quest_representation_sequence, quest_mask) match_representation_sequence = match_representation_sequence * description_mask.unsqueeze( -1) return match_representation_sequence def get_ranks(self, input_observation, input_observation_char, input_quest, input_quest_char, word_masks, use_model="online"): """ Given input observation and question tensors, to get Q values of words. 
""" model = self.online_net if use_model == "online" else self.target_net match_representation_sequence = self.get_match_representations( input_observation, input_observation_char, input_quest, input_quest_char, use_model=use_model) action_ranks = model.action_scorer(match_representation_sequence, word_masks) # list of 3 tensors return action_ranks def choose_maxQ_command(self, action_ranks, word_mask=None): """ Generate a command by maximum q values, for epsilon greedy. """ if self.use_distributional: action_ranks = [ (item * self.support).sum(2) for item in action_ranks ] # list of batch x n_vocab action_indices = [] for i in range(len(action_ranks)): ar = action_ranks[i] ar = ar - torch.min( ar, -1, keepdim=True )[0] + 1e-2 # minus the min value, so that all values are non-negative if word_mask is not None: assert word_mask[i].size() == ar.size(), ( word_mask[i].size().shape, ar.size()) ar = ar * word_mask[i] action_indices.append(torch.argmax(ar, -1)) # batch return action_indices def choose_random_command(self, batch_size, action_space_size, possible_words=None): """ Generate a command randomly, for epsilon greedy. """ action_indices = [] for i in range(3): if possible_words is None: indices = np.random.choice(action_space_size, batch_size) else: indices = [] for j in range(batch_size): mask_ids = [] for w in possible_words[i][j]: if w in self.word2id: mask_ids.append(self.word2id[w]) indices.append(np.random.choice(mask_ids)) indices = np.array(indices) action_indices.append(to_pt(indices, self.use_cuda)) # batch return action_indices def get_chosen_strings(self, chosen_indices): """ Turns list of word indices into actual command strings. chosen_indices: Word indices chosen by model. """ chosen_indices_np = [to_np(item) for item in chosen_indices] res_str = [] batch_size = chosen_indices_np[0].shape[0] for i in range(batch_size): verb, adj, noun = chosen_indices_np[0][i], chosen_indices_np[1][ i], chosen_indices_np[2][i] res_str.append(self.word_ids_to_commands(verb, adj, noun)) return res_str def word_ids_to_commands(self, verb, adj, noun): """ Turn the 3 indices into actual command strings. 
        Arguments:
            verb: Index of the guessing verb in vocabulary
            adj: Index of the guessing adjective in vocabulary
            noun: Index of the guessing noun in vocabulary
        """
        # turns 3 indices into actual command strings
        if self.word_vocab[verb] in self.single_word_verbs:
            return self.word_vocab[verb]
        if self.word_vocab[verb] in self.two_word_verbs:
            return " ".join([self.word_vocab[verb], self.word_vocab[noun]])
        if adj == self.EOS_id:
            return " ".join([self.word_vocab[verb], self.word_vocab[noun]])
        else:
            return " ".join([
                self.word_vocab[verb], self.word_vocab[adj],
                self.word_vocab[noun]
            ])

    def act_random(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        with torch.no_grad():
            batch_size = len(obs)
            word_indices_random = self.choose_random_command(
                batch_size, len(self.word_vocab), possible_words)
            chosen_indices = word_indices_random
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # previous step is still interacting, this is because DQN requires one step extra computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def act_greedy(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
        """
        with torch.no_grad():
            batch_size = len(obs)
            local_word_masks_np = self.get_local_word_masks(possible_words)
            local_word_masks = [
                to_pt(item, self.use_cuda, type="float")
                for item in local_word_masks_np
            ]

            # generate commands for one game step, epsilon greedy is applied, i.e.,
            # there is epsilon of chance to generate random commands
            action_ranks = self.get_ranks(
                input_observation,
                input_observation_char,
                input_quest,
                input_quest_char,
                local_word_masks,
                use_model="online")  # list of batch x vocab
            word_indices_maxq = self.choose_maxQ_command(action_ranks, local_word_masks)
            chosen_indices = word_indices_maxq
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # previous step is still interacting, this is because DQN requires one step extra computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def act(self, obs, infos, input_observation, input_observation_char,
            input_quest, input_quest_char, possible_words, random=False):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
""" with torch.no_grad(): if self.mode == "eval": return self.act_greedy(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words) if random: return self.act_random(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words) batch_size = len(obs) local_word_masks_np = self.get_local_word_masks(possible_words) local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in local_word_masks_np ] # generate commands for one game step, epsilon greedy is applied, i.e., # there is epsilon of chance to generate random commands action_ranks = self.get_ranks( input_observation, input_observation_char, input_quest, input_quest_char, local_word_masks, use_model="online") # list of batch x vocab word_indices_maxq = self.choose_maxQ_command( action_ranks, local_word_masks) word_indices_random = self.choose_random_command( batch_size, len(self.word_vocab), possible_words) # random number for epsilon greedy rand_num = np.random.uniform(low=0.0, high=1.0, size=(batch_size, )) less_than_epsilon = (rand_num < self.epsilon).astype( "float32") # batch greater_than_epsilon = 1.0 - less_than_epsilon less_than_epsilon = to_pt(less_than_epsilon, self.use_cuda, type='long') greater_than_epsilon = to_pt(greater_than_epsilon, self.use_cuda, type='long') chosen_indices = [ less_than_epsilon * idx_random + greater_than_epsilon * idx_maxq for idx_random, idx_maxq in zip(word_indices_random, word_indices_maxq) ] chosen_strings = self.get_chosen_strings(chosen_indices) for i in range(batch_size): if chosen_strings[i] == "wait": self.not_finished_yet[i] = 0.0 # info for replay memory for i in range(batch_size): if self.prev_actions[-1][i] == "wait": self.prev_step_is_still_interacting[i] = 0.0 # previous step is still interacting, this is because DQN requires one step extra computation replay_info = [ chosen_indices, to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float") ] # cache new info in current game step into caches self.prev_actions.append(chosen_strings) return chosen_strings, replay_info def get_dqn_loss(self): """ Update neural model in agent. In this example we follow algorithm of updating model in dqn with replay memory. 
""" if len(self.command_generation_replay_memory) < self.replay_batch_size: return None data = self.command_generation_replay_memory.get_batch( self.replay_batch_size, self.multi_step) if data is None: return None obs_list, quest_list, possible_words_list, chosen_indices, rewards, next_obs_list, next_possible_words_list, actual_n_list = data batch_size = len(actual_n_list) input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list) input_observation, input_observation_char, _ = self.get_agent_inputs( obs_list) next_input_observation, next_input_observation_char, _ = self.get_agent_inputs( next_obs_list) possible_words, next_possible_words = [], [] for i in range(3): possible_words.append([item[i] for item in possible_words_list]) next_possible_words.append( [item[i] for item in next_possible_words_list]) local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in self.get_local_word_masks(possible_words) ] next_local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in self.get_local_word_masks(next_possible_words) ] action_ranks = self.get_ranks( input_observation, input_observation_char, input_quest, input_quest_char, local_word_masks, use_model="online" ) # list of batch x vocab or list of batch x vocab x atoms # ps_a word_qvalues = [ ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for w_rank, idx in zip(action_ranks, chosen_indices) ] # list of batch or list of batch x atoms q_value = torch.mean(torch.stack(word_qvalues, -1), -1) # batch or batch x atoms # log_ps_a log_q_value = torch.log(q_value) # batch or batch x atoms with torch.no_grad(): if self.noisy_net: self.target_net.reset_noise() # Sample new target net noise if self.double_dqn: # pns Probabilities p(s_t+n, ·; θonline) next_action_ranks = self.get_ranks(next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_local_word_masks, use_model="online") # list of batch x vocab or list of batch x vocab x atoms # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] next_word_indices = self.choose_maxQ_command( next_action_ranks, next_local_word_masks) # list of batch x 1 # pns # Probabilities p(s_t+n, ·; θtarget) next_action_ranks = self.get_ranks( next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_local_word_masks, use_model="target" ) # batch x vocab or list of batch x vocab x atoms # pns_a # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) next_word_qvalues = [ ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for w_rank, idx in zip(next_action_ranks, next_word_indices) ] # list of batch or list of batch x atoms else: # pns Probabilities p(s_t+n, ·; θonline) next_action_ranks = self.get_ranks(next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_local_word_masks, use_model="target") # list of batch x vocab or list of batch x vocab x atoms next_word_indices = self.choose_maxQ_command( next_action_ranks, next_local_word_masks) # list of batch x 1 next_word_qvalues = [ ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for w_rank, idx in zip(next_action_ranks, next_word_indices) ] # list of batch or list of batch x atoms next_q_value = torch.mean(torch.stack(next_word_qvalues, -1), -1) # batch or batch x atoms # Compute Tz (Bellman operator T applied to z) discount = to_pt((np.ones_like(actual_n_list) * self.discount_gamma)**actual_n_list, self.use_cuda, type="float") if not self.use_distributional: rewards = 
rewards + next_q_value * discount # batch loss = F.smooth_l1_loss(q_value, rewards) return loss with torch.no_grad(): Tz = rewards.unsqueeze( -1) + discount.unsqueeze(-1) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.v_min, max=self.v_max) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.v_min) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64) # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = torch.zeros(batch_size, self.atoms).float() if self.use_cuda: m = m.cuda() offset = torch.linspace(0, ((batch_size - 1) * self.atoms), batch_size).unsqueeze(1).expand( batch_size, self.atoms).long() if self.use_cuda: offset = offset.cuda() m.view(-1).index_add_( 0, (l + offset).view(-1), (next_q_value * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (next_q_value * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) loss = -torch.sum( m * log_q_value, 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) loss = torch.mean(loss) return loss def update_interaction(self): # update neural model by replaying snapshots in replay memory interaction_loss = self.get_dqn_loss() if interaction_loss is None: return None loss = interaction_loss * self.interaction_loss_lambda # Backpropagate self.online_net.zero_grad() self.optimizer.zero_grad() loss.backward() # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. torch.nn.utils.clip_grad_norm_(self.online_net.parameters(), self.clip_grad_norm) self.optimizer.step() # apply gradients return to_np(torch.mean(interaction_loss)) def answer_question(self, input_observation, input_observation_char, observation_id_list, input_quest, input_quest_char, use_model="online"): # first pad answerer_input, and get the mask model = self.online_net if use_model == "online" else self.target_net batch_size = len(observation_id_list) max_length = input_observation.size(1) mask = compute_mask(input_observation) # batch x obs_len # noun mask for location question if self.question_type in ["location"]: location_mask = [] for i in range(batch_size): m = [1 for item in observation_id_list[i]] location_mask.append(m) location_mask = pad_sequences(location_mask, maxlen=max_length, dtype="float32") location_mask = to_pt(location_mask, enable_cuda=self.use_cuda, type='float') assert mask.size() == location_mask.size() mask = mask * location_mask match_representation_sequence = self.get_match_representations( input_observation, input_observation_char, input_quest, input_quest_char, use_model=use_model) pred = model.answer_question(match_representation_sequence, mask) # batch x vocab or batch x 2 # attention sum: # sometimes certain word appears multiple times in the observation, # thus we need to merge them together before doing further computations # ------- but # if answer type is not pointing, we just use a pre-defined mapping # that maps 0/1 to their positions in vocab if self.answer_type == "2 way": observation_id_list = [] max_length = 2 for i in range(batch_size): observation_id_list.append( [self.word2id["0"], self.word2id["1"]]) observation = to_pt( pad_sequences(observation_id_list, maxlen=max_length).astype('int32'), self.use_cuda) vocab_distribution = np.zeros( (batch_size, len(self.word_vocab))) # batch x vocab 
        vocab_distribution = to_pt(vocab_distribution, self.use_cuda, type='float')
        vocab_distribution = vocab_distribution.scatter_add_(1, observation, pred)  # batch x vocab
        non_zero_words = []
        for i in range(batch_size):
            non_zero_words.append(list(set(observation_id_list[i])))
        vocab_mask = torch.ne(vocab_distribution, 0).float()
        return vocab_distribution, non_zero_words, vocab_mask

    def point_maxq_position(self, vocab_distribution, mask):
        """
        Generate a command by maximum q values, for epsilon greedy.

        Arguments:
            vocab_distribution: Q values for each position (mapped to vocab).
            mask: vocab masks.
        """
        # subtract the min value so that all values are non-negative
        vocab_distribution = vocab_distribution - torch.min(
            vocab_distribution, -1, keepdim=True)[0] + 1e-2
        vocab_distribution = vocab_distribution * mask  # batch x vocab
        indices = torch.argmax(vocab_distribution, -1)  # batch
        return indices

    def answer_question_act_greedy(self, input_observation,
                                   input_observation_char, observation_id_list,
                                   input_quest, input_quest_char):
        with torch.no_grad():
            vocab_distribution, _, vocab_mask = self.answer_question(
                input_observation,
                input_observation_char,
                observation_id_list,
                input_quest,
                input_quest_char,
                use_model="online")  # batch x time
            positions_maxq = self.point_maxq_position(vocab_distribution, vocab_mask)
            return positions_maxq  # batch

    def get_qa_loss(self):
        """
        Update neural model in agent. In this example we follow algorithm
        of updating model in dqn with replay memory.
        """
        if len(self.qa_replay_memory) < self.replay_batch_size:
            return None
        transitions = self.qa_replay_memory.sample(self.replay_batch_size)
        batch = qa_memory.qa_Transition(*zip(*transitions))

        observation_list = batch.observation_list
        quest_list = batch.quest_list
        answer_strings = batch.answer_strings
        answer_position = np.array(_words_to_ids(answer_strings, self.word2id))
        groundtruth = to_pt(answer_position, self.use_cuda)  # batch

        input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
        input_observation, input_observation_char, observation_id_list = self.get_agent_inputs(observation_list)

        answer_distribution, _, _ = self.answer_question(
            input_observation,
            input_observation_char,
            observation_id_list,
            input_quest,
            input_quest_char,
            use_model="online")  # batch x vocab
        batch_loss = NegativeLogLoss(answer_distribution, groundtruth)  # batch
        return torch.mean(batch_loss)

    def update_qa(self):
        # update neural model by replaying snapshots in replay memory
        qa_loss = self.get_qa_loss()
        if qa_loss is None:
            return None
        loss = qa_loss * self.qa_loss_lambda
        # Backpropagate
        self.online_net.zero_grad()
        self.optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(), self.clip_grad_norm)
        self.optimizer.step()  # apply gradients
        return to_np(torch.mean(qa_loss))

    def finish_of_episode(self, episode_no, batch_size):
        # Update target network
        if (episode_no + batch_size) % self.target_net_update_frequency <= \
                episode_no % self.target_net_update_frequency:
            self.update_target_net()

        # decay lambdas
        if episode_no < self.learn_start_from_this_episode:
            return
        if episode_no < self.epsilon_anneal_episodes + self.learn_start_from_this_episode:
            self.epsilon -= (self.epsilon_anneal_from -
                             self.epsilon_anneal_to) / float(self.epsilon_anneal_episodes)
            self.epsilon = max(self.epsilon, 0.0)
        if episode_no < self.revisit_counting_lambda_anneal_episodes + self.learn_start_from_this_episode:
            self.revisit_counting_lambda -= (
                self.revisit_counting_lambda_anneal_from -
                self.revisit_counting_lambda_anneal_to) / float(
                    self.revisit_counting_lambda_anneal_episodes)
            self.revisit_counting_lambda = max(self.revisit_counting_lambda, 0.0)

    def reset_binarized_counter(self, batch_size):
        self.binarized_counter_dict = [{} for _ in range(batch_size)]

    def get_binarized_count(self, observation_strings, update=True):
        count_rewards = []
        batch_size = len(observation_strings)
        for i in range(batch_size):
            key = observation_strings[i]
            if key not in self.binarized_counter_dict[i]:
                self.binarized_counter_dict[i][key] = 0.0
            if update:
                self.binarized_counter_dict[i][key] += 1.0
            r = self.binarized_counter_dict[i][key]
            r = float(r == 1.0)
            count_rewards.append(r)
        return count_rewards
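# finish_of_episode above anneals both epsilon and the counting-bonus lambda with the
# same linear schedule. The helper below is an equivalent single annealing step; the
# function name is an assumption, not part of the class.
def linear_anneal_step(value, anneal_from, anneal_to, anneal_episodes, floor=0.0):
    """One step of the linear decay schedule used in finish_of_episode."""
    if anneal_episodes <= 0:
        return value  # annealing disabled (e.g. when noisy nets replace epsilon-greedy)
    value -= (anneal_from - anneal_to) / float(anneal_episodes)
    return max(value, floor)

# e.g. annealing from 1.0 to 0.1 over 1000 episodes lowers the value by 0.0009 per call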
class Actor:
    def __init__(self, learner, param_server, actor_idx, epsilon,
                 num_channels=3, num_actions=19):
        # environment initialization
        import gym
        import minerl
        self.actor_idx = actor_idx
        self.env = gym.make(ENV_NAME)
        self.port_number = int("12340") + actor_idx
        print("actor environment %d initialize successfully" % self.actor_idx)
        self.env.make_interactive(port=self.port_number, realtime=False)

        self.learner_state_dict = ray.get(learner.get_state_dict.remote())
        print("getting learner state dict finished...")

        # network initialization
        self.actor_network = DQN(num_channels, num_actions).cuda()
        self.actor_target_network = DQN(num_channels, num_actions).cuda()
        self.actor_network.load_state_dict(self.learner_state_dict)
        self.actor_target_network.load_state_dict(self.learner_state_dict)
        print("actor network %d initialize successfully" % self.actor_idx)

        self.param_server = param_server
        self.epi_counter = 0
        self.max_epi = 100
        self.n_step = 4
        self.update_period = 4
        self.gamma = 0.99

        # exploring info
        self.epsilon = epsilon
        self.endEpsilon = 0.01
        self.stepDrop = (self.epsilon - self.endEpsilon) / self.max_epi

        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)
        self.writer = SummaryWriter(f'runs/apex/actor{self.actor_idx}')

        # 1. copy the network parameters
        # 2. explore the environment (reset, act)
        # 3. store transitions in the local buffer
        # 4. compute priorities (TD error)
        # 5. push to the global buffer
        # 6. periodically update the networks

    def get_epi_counter(self):
        return self.epi_counter

    def update_params(self, learner):
        ray.get(self.param_server.pull_from_learner.remote(learner))
        policy_params, target_params = ray.get(
            self.param_server.push_to_actor.remote())
        self.actor_network.load_state_dict(policy_params)
        self.actor_target_network.load_state_dict(target_params)

    def append_sample(self, memory, state, action, reward, next_state, done,
                      n_rewards=None):
        # Calculating Priority (TD Error)
        target = self.actor_network(state).data
        old_val = target[0][action].cpu()
        target_val = self.actor_target_network(next_state).data.cpu()
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + 0.99 * torch.max(target_val)

        error = abs(old_val - target[0][action])
        error = error.cpu()
        state_ = state.cpu()
        next_state_ = next_state.cpu()

        if isinstance(memory, Memory):
            if n_rewards is None:
                memory.add(error, [state_, action, reward, next_state_, done])
            else:
                memory.add(error, (state_, action, reward, next_state_, done, n_rewards))
        else:
            if n_rewards is None:
                memory.add.remote(error, [state_, action, reward, next_state_, done])
            else:
                memory.add.remote(error, (state_, action, reward, next_state_, done, n_rewards))

    def explore(self, learner, memory):
        for num_epi in range(self.max_epi):
            obs = self.env.reset()
            state = converter(ENV_NAME, obs).cuda()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0
            if self.epsilon > self.endEpsilon:
                self.epsilon -= self.stepDrop

            # initialize local_buffer
            n_step = self.n_step
            n_step_state_buffer = deque(maxlen=n_step)
            n_step_action_buffer = deque(maxlen=n_step)
            n_step_reward_buffer = deque(maxlen=n_step)
            n_step_n_rewards_buffer = deque(maxlen=n_step)
            n_step_next_state_buffer = deque(maxlen=n_step)
            n_step_done_buffer = deque(maxlen=n_step)
            gamma_list = [self.gamma**i for i in range(n_step)]

            while not done:
                steps += 1
                total_steps += 1
                a_out = self.actor_network.sample_action(state, self.epsilon)
                action_index = a_out
                action = make_19action(self.env, action_index)
                obs_prime, reward, done, info = self.env.step(action)
                total_reward += reward
                state_prime = converter(ENV_NAME, obs_prime).cuda()

                # put transition in local buffer
                n_step_state_buffer.append(state)
                n_step_action_buffer.append(action_index)
                n_step_reward_buffer.append(reward)
                n_step_next_state_buffer.append(state_prime)
                n_step_done_buffer.append(done)
                n_rewards = sum([
                    gamma * reward
                    for gamma, reward in zip(gamma_list, n_step_reward_buffer)
                ])
                n_step_n_rewards_buffer.append(n_rewards)

                if len(n_step_state_buffer) >= n_step:
                    # Compute Priorities
                    for i in range(n_step):
                        self.append_sample(memory,
                                           n_step_state_buffer[i],
                                           n_step_action_buffer[i],
                                           n_step_reward_buffer[i],
                                           n_step_next_state_buffer[i],
                                           n_step_done_buffer[i],
                                           n_step_n_rewards_buffer[i])
                        if n_step_done_buffer[i]:
                            break

                state = state_prime
                self.actor_network.cuda()
                self.actor_target_network.cuda()

                if done:
                    print("%d episode is done" % num_epi)
                    print("total rewards : %d " % total_reward)
                    self.writer.add_scalar('Rewards/train', total_reward, num_epi)
                    self.epi_counter += 1
                    if num_epi % self.update_period == 0:
                        self.update_params(learner)
                    break
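# The gamma_list sum in explore() is an n-step discounted return over the reward
# buffer. A minimal equivalent helper (the name n_step_return is illustrative):
def n_step_return(rewards, gamma=0.99):
    """Discounted sum of up to n rewards, matching the gamma_list computation above."""
    return sum((gamma ** i) * r for i, r in enumerate(rewards))

# e.g. n_step_return([1.0, 0.0, 2.0], gamma=0.99) == 1.0 + 0.99 ** 2 * 2.0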
def train(args):
    model = DQN(game=args.game)
    if args.use_pretrained:
        pretrained_weight = torch.load(
            sorted(glob(os.path.join('ckpt', args.tag, '*.pth')))[-1])
        model.load_state_dict(pretrained_weight)
    else:
        os.makedirs(os.path.join('ckpt', args.tag), exist_ok=True)
        model.apply(init_weights)
    model = model.cuda()

    start = time.time()
    episode = 0
    iteration = 0
    epsilon = args.epsilon
    decayed = args.decayed

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # instantiate game
    game = Game(game=args.game)
    high_score = 0

    # initialize replay memory
    D = deque()
    elapsed_time = 0

    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    score = game.reward
    terminal = game.game_over()
    image_data = game.get_torch_image().cuda()
    state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)

    start = time.time()
    while iteration < args.iteration:
        output = model(state)[0]
        action = torch.zeros([model.number_of_actions], dtype=torch.float32)

        # epsilon greedy exploration
        eps = epsilon - iteration * (epsilon - decayed) / args.iteration
        random_action = random.random() <= eps

        # Pick action --> random or index of maximum q value
        action_index = [
            torch.randint(model.number_of_actions, torch.Size([]), dtype=torch.int)
            if random_action else torch.argmax(output)
        ][0]
        action[action_index] = 1
        elapsed_time = time.time() - start

        # get next state and reward
        reward = game.act(action_index)
        terminal = game.game_over()
        image_data_1 = game.get_torch_image().cuda()
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0).cuda()
        action = action.unsqueeze(0).cuda()
        reward = torch.from_numpy(np.array([reward], dtype=np.float32)).unsqueeze(0).cuda()

        # save transition to replay memory
        D.append((state.cpu(), action.cpu(), reward.cpu(), state_1.cpu(), terminal))

        # if replay memory is full, remove the oldest transition
        if len(D) > args.replayMemorySize:
            D.popleft()

        # sample random minibatch
        minibatch = random.sample(D, min(len(D), args.minibatchSize))
        state_batch = torch.cat(tuple(d[0] for d in minibatch)).cuda()
        action_batch = torch.cat(tuple(d[1] for d in minibatch)).cuda()
        reward_batch = torch.cat(tuple(d[2] for d in minibatch)).cuda()
        state_1_batch = torch.cat(tuple(d[3] for d in minibatch)).cuda()

        # get output for the next state
        output_1_batch = model(state_1_batch)

        # TD targets: reward for terminal transitions, otherwise reward + gamma * max Q(s', ·)
        y_batch = torch.cat(
            tuple(reward_batch[i] if minibatch[i][4] else reward_batch[i] +
                  args.gamma * torch.max(output_1_batch[i])
                  for i in range(len(minibatch))))

        # Q-values of the actions actually taken
        q_value = torch.sum(model(state_batch) * action_batch, dim=1)

        # LR warmup
        if iteration < 20000:
            for g in optimizer.param_groups:
                g['lr'] = args.lr * iteration / 20000

        optimizer.zero_grad()
        y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()

        state = state_1
        iteration += 1
        score += game.reward

        args.writer.add_scalar('Train/lr', optimizer.param_groups[0]['lr'], iteration)
        args.writer.add_scalar('Train/epsilon', eps, iteration)
        args.writer.add_scalar('Train/loss', loss, iteration)
        args.writer.add_scalar('Train/replay_memory', len(D), iteration)

        if terminal:
            score = score - game.reward_terminal
            args.writer.add_scalar('Episode/elapsed_time', elapsed_time, episode)
            args.writer.add_scalar('Episode/episode', episode, episode)
            args.writer.add_scalar('Episode/score', score, episode)
            game.reset_game()
            episode += 1
            start = time.time()
            print('Episode {} (Iteration {}): Agent passed {} pipes!, Time: {:.3f}'
                  .format(episode, iteration, score, elapsed_time))
            if score > high_score:
                print('Weight Saved!')
                high_score = score
                torch.save(
                    model,
                    os.path.join('ckpt', args.tag,
                                 'E{:07d}_S{:03d}.pth'.format(episode, int(score))))
            score = 0

    print("Saving final model")
    torch.save(
        model,
        os.path.join('ckpt', args.tag,
                     'E_{:07d}_S{:03d}.pth'.format(episode, int(high_score))))
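# The state in train() is a sliding window of the last four frames: state_1 drops the
# oldest frame and appends the newest. The same operation as a standalone helper
# (name and shapes are assumptions matching the tensors above):
import torch


def push_frame(state, new_frame):
    """Slide a (1, 4, H, W) stacked state forward by one (1, H, W) frame."""
    return torch.cat((state.squeeze(0)[1:, :, :], new_frame)).unsqueeze(0)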
def main():
    parser = argparse.ArgumentParser(description='DQN Breakout Script')
    parser.add_argument('--use-cuda', action='store_true', default=False,
                        help='whether to use CUDA (default: False)')
    parser.add_argument('--batch-size', type=int, default=128, metavar='M',
                        help='batch size (default: 128)')
    parser.add_argument('--gamma', type=float, default=0.999, metavar='M',
                        help='gamma (default: 0.999)')
    parser.add_argument('--eps-start', type=float, default=0.9, metavar='M',
                        help='eps start (default: 0.9)')
    parser.add_argument('--eps-end', type=float, default=0.05, metavar='M',
                        help='eps end (default: 0.05)')
    parser.add_argument('--eps-decay', type=int, default=200, metavar='M',
                        help='eps decay (default: 200)')
    parser.add_argument('--num-obs-in-state', type=int, default=4, metavar='M',
                        help='num observations in state (default: 4)')
    parser.add_argument('--replay-memory-capacity', type=int, default=10000, metavar='M',
                        help='replay memory capacity (default: 10000)')
    parser.add_argument('--num-episodes', type=int, default=10, metavar='M',
                        help='num of episodes (default: 10)')
    parser.add_argument('--reset-period', type=int, default=5, metavar='M',
                        help='period to reset target network (default: 5)')
    parser.add_argument('--atari-env', type=str, default='Breakout-v0', metavar='M',
                        help='Atari environment to use (default: Breakout-v0)')
    args = parser.parse_args()

    env = gym.envs.make(args.atari_env)
    # Discrete Atari action spaces expose the number of actions as action_space.n
    model = DQN(args.num_obs_in_state, (84, 84), env.action_space.n)
    model_target = DQN(args.num_obs_in_state, (84, 84), env.action_space.n)
    if args.use_cuda:
        model.cuda()
        model_target.cuda()
    optimizer = optim.RMSprop(model.parameters())
    memory = ReplayMemory(args.replay_memory_capacity)

    epsilons = np.linspace(args.eps_start, args.eps_end, args.eps_decay)
    step_idx = 1
    reset_idx = 1
    tfs = get_transforms()
    episode_reward = 0.
    episode_length = 0

    for i_episode in range(args.num_episodes):
        # Initialize the environment and state
        obs = env.reset()
        state_processor = StateProcessor(args.num_obs_in_state, tfs, obs)
        state = state_processor.get_state()
        while True:
            episode_length += 1
            if step_idx < args.eps_decay:
                eps = epsilons[step_idx]
            else:
                eps = args.eps_end
            action = select_action(model, state, env.action_space.n, eps, args.use_cuda)
            # print('%d %d' % (episode_length, action[0,0]))
            next_obs, reward, done, info = env.step(action[0, 0])
            episode_reward += reward
            reward = torch.Tensor([reward])
            if args.use_cuda:
                reward = reward.cuda()

            if not done:
                state_processor.push_obs(next_obs)
                next_state = state_processor.get_state()
            else:
                next_state = None  # None next_state marks done

            memory.push(state, action, next_state, reward)
            if not done:
                state = next_state  # advance to the next state

            # optimize
            optimize_model(optimizer, memory, model, model_target,
                           args.batch_size, args.gamma, args.use_cuda)
            step_idx += 1
            reset_idx += 1
            if reset_idx == args.reset_period:
                reset_idx = 1
                model_target.load_state_dict(model.state_dict())
            if done:
                break
        print(episode_reward)
        print(episode_length)
        episode_reward = 0.
        episode_length = 0
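# select_action is imported from elsewhere in the repo. Based on how it is called above
# (it must return a 1x1 LongTensor so that action[0, 0] indexes the chosen action), a
# plausible epsilon-greedy implementation could look like this sketch; it is an
# assumption, not the repo's actual code.
import random

import torch


def select_action(model, state, num_actions, eps, use_cuda):
    # With probability eps take a random action, otherwise the greedy one.
    if random.random() < eps:
        action = torch.tensor([[random.randrange(num_actions)]], dtype=torch.long)
    else:
        with torch.no_grad():
            action = model(state).max(1)[1].view(1, 1)
    return action.cuda() if use_cuda else action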
class QAgent(Agent): def __init__(self): self.fex = Extractor() self.net = DQN() try: self.net.load_state_dict(torch.load('model.pth', map_location=torch.device('cpu'))) except: print("Starting with new weights") raise Exception("Weights not found") self.net.eval() self.criterion = torch.nn.MSELoss() self.optimizer = torch.optim.Adam(self.net.parameters()) self.memory = ReplayMemory() self.training = False self.s = None self.a = None self.score = None def registerInitialState(self, state): self.s = None self.a = None self.score = None def getAction(self, game_state): legal = game_state.getLegalPacmanActions() if Directions.STOP in legal: legal.remove(Directions.STOP) state = self.fex(game_state) if self.training: state = state.cuda() with torch.no_grad(): scores = self.net(state) scores = list(zip(ACTIONS, scores)) legal_scores = [p for p in scores if p[0] in legal] action = max(legal_scores, key = lambda p: p[1])[0] if self.training: if random.random() < EPSILON: action = random.choice(legal) if self.s is not None: reward = game_state.getScore() - self.score reward = process_reward(self.s, state, reward) next_legals = game_state.getLegalActions() if Directions.STOP in next_legals: next_legals.remove(Directions.STOP) next_legals = (ACTION_MAP[d] for d in next_legals) self.memory.push(self.s, self.a, reward, state, next_legals) self.s = state self.a = ACTION_MAP[action] self.score = game_state.getScore() return action def final(self, state): if self.training: reward = state.getScore() - self.score reward = -10 self.memory.push(self.s, self.a, reward, None, []) def train(self): global EPSILON self.training = True self.net.cuda() runners, names = load_runners() for epoch in range(EPOCHS): for t in self.net.parameters(): print(t.data) if epoch <= 4: EPSILON = [0.8, 0.5, 0.3, 0.1, 0.01][epoch] print('Epoch {} | EPSILON {}'.format(epoch, EPSILON)) g_dict = {} for runner, name in zip(runners, names): games = [] for game_idx in range(GAMES_PER_EPOCH): game = runner.run_game(self) games.append(game) for _ in range(SAMPLES_PER_GAME): self.training_iteration() avg = np.mean([game.state.getScore() for game in games]) wins = sum([game.state.isWin() for game in games]) #print(f'{name}: {avg:0.2f} | {wins}/{GAMES_PER_EPOCH}') print('{}: {} | {}/{}'.format(name,avg, wins, GAMES_PER_EPOCH)) print() torch.save(self.net.state_dict(), 'model.pth') def training_iteration(self): # sample mini-batch sarsl = self.memory.sample() if sarsl is None: return else: states, actions, rewards, next_states, next_state_legals = sarsl # replace deaths (None) with zeros for i, s in enumerate(next_states): if s is None: next_states[i] = self.fex.empty() next_states = torch.stack(next_states) # get max Q(s',a'); deaths get value 0 with torch.no_grad(): next_actions_values = self.net(next_states) best_actions_values = [] for next_legals, action_vals in zip(next_state_legals, next_actions_values): legal_vals = [v for (idx,v) in enumerate(action_vals) if idx in next_legals] if legal_vals == []: legal_vals = [0] best_actions_values.append(max(legal_vals)) best_actions_values = torch.tensor(best_actions_values).cuda() # compute target values targets = rewards + GAMMA*best_actions_values # compute current action values actions = actions.reshape(len(actions),1) self.net.train() action_values = self.net(states).gather(1,actions).reshape(32) self.net.eval() # compute loss and backpropagate it loss = self.criterion(targets, action_values) self.optimizer.zero_grad() loss.backward() self.optimizer.step() def play(self, path): runner = 
            layout_path=path,
            random_ghosts=True,
            show_window=True,
            zoom_window=1.0,
            frame_time=0.1,
            timeout=-1000)
        game = runner.run_game(self)
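# The ReplayMemory used by QAgent is not shown above. A minimal buffer that is
# compatible with how the agent calls it (push(s, a, r, s', legal_next_actions),
# and sample() returning None until enough transitions are stored) could look
# roughly like this; the capacity, batch size and device handling are my
# assumptions, not taken from the original code:
import random
from collections import deque

import torch


class ReplayMemory:
    def __init__(self, capacity=10000, batch_size=32):
        self.buffer = deque(maxlen=capacity)
        self.batch_size = batch_size

    def push(self, state, action, reward, next_state, next_legals):
        # next_state may be None for terminal transitions; next_legals lists legal action ids
        self.buffer.append((state, action, reward, next_state, list(next_legals)))

    def sample(self):
        if len(self.buffer) < self.batch_size:
            return None
        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, next_legals = zip(*batch)
        states = torch.stack(states).cuda()
        actions = torch.tensor(actions, dtype=torch.long).cuda()
        rewards = torch.tensor(rewards, dtype=torch.float).cuda()
        return states, actions, rewards, list(next_states), list(next_legals)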
# For training
var_batch_phi = autograd.Variable(torch.Tensor(batch_size, 4, 84, 84)).cuda()
var_batch_a = autograd.Variable(torch.LongTensor(batch_size, 1), requires_grad=False).cuda()
var_batch_r = autograd.Variable(torch.Tensor(batch_size, 1)).cuda()
var_batch_phi_next = autograd.Variable(torch.Tensor(batch_size, 4, 84, 84)).cuda()
var_batch_r_mask = autograd.Variable(torch.Tensor(batch_size, 1), requires_grad=False).cuda()

MP = MemoryReplay(memory_size, batch_size)

dqn = DQN()
target_dqn = DQN()
target_dqn.load_state_dict(dqn.state_dict())
dqn.cuda()
target_dqn.cuda()

optimz = optim.RMSprop(dqn.parameters(), lr=0.0025, alpha=0.9, eps=1e-02, momentum=0.0)

pong = Pong()

# Pre-fill the replay memory by stepping the environment with uniformly random actions
for i in range(memory_size):
    phi = pong.current_phi
    act_index = random.randrange(3)
    phi_next, r, done = pong.step(VALID_ACTION[act_index])
    pong.display()
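# The pre-fill loop above steps the environment but the snippet is cut off before the
# transitions are written into MP. Assuming MemoryReplay exposes a
# push(phi, action_index, reward, phi_next, done) method (a hypothetical signature,
# not shown in the original code), the loop body would typically also record each
# transition:
for i in range(memory_size):
    phi = pong.current_phi
    act_index = random.randrange(3)
    phi_next, r, done = pong.step(VALID_ACTION[act_index])
    MP.push(phi, act_index, r, phi_next, done)  # hypothetical push signature
    pong.display()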
class Agent:
    def __init__(self, time_step, split, lr):
        self.dataset = Dataset(T=time_step, split_ratio=split,
                               binary_file=config.BINARY_DATASET)
        self.policy_net_encoder = AttnEncoder(
            input_size=self.dataset.get_num_features(),
            hidden_size=config.ENCODER_HIDDEN_SIZE,
            time_step=time_step)
        self.policy_net_decoder = AttnDecoder(
            code_hidden_size=config.ENCODER_HIDDEN_SIZE,
            hidden_size=config.DECODER_HIDDEN_SIZE,
            time_step=time_step)
        self.policy_net = DQN(self.policy_net_encoder, self.policy_net_decoder)
        self.target_net_encoder = AttnEncoder(
            input_size=self.dataset.get_num_features(),
            hidden_size=config.ENCODER_HIDDEN_SIZE,
            time_step=time_step)
        self.target_net_decoder = AttnDecoder(
            code_hidden_size=config.ENCODER_HIDDEN_SIZE,
            hidden_size=config.DECODER_HIDDEN_SIZE,
            time_step=time_step)
        self.target_net = DQN(self.target_net_encoder, self.target_net_decoder)
        if torch.cuda.is_available():
            self.policy_net_encoder = self.policy_net_encoder.cuda()
            self.policy_net_decoder = self.policy_net_decoder.cuda()
            self.target_net_encoder = self.target_net_encoder.cuda()
            self.target_net_decoder = self.target_net_decoder.cuda()
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()
        self.memory = ReplayMemory(config.MEMORY_CAPACITY)
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)

    def select_action(self, state, test=False):
        global steps_done
        sample = random.random()
        eps_threshold = config.EPS_END + (config.EPS_START - config.EPS_END) * math.exp(
            -1. * steps_done / config.EPS_DECAY)
        steps_done += 1
        if sample > eps_threshold or test:
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            # random exploratory action drawn from the three discrete actions
            if torch.cuda.is_available():
                return torch.tensor([[random.randrange(3)]], dtype=torch.long).cuda()
            else:
                return torch.tensor([[random.randrange(3)]], dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < config.BATCH_SIZE:
            return
        transitions = self.memory.sample(config.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        state_batch = tuple([
            torch.cat(tuple([batch.state[i][j] for i in range(config.BATCH_SIZE)]))
            for j in range(3)
        ])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = tuple([
            torch.cat(tuple([batch.next_state[i][j] for i in range(config.BATCH_SIZE)]))
            for j in range(3)
        ])
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        next_state_values = self.target_net(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values * config.GAMMA) + reward_batch
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def load_model(self, encoder_path=None, decoder_path=None, DQN_path=None):
        if DQN_path is not None:
            self.policy_net.load_state_dict(
                torch.load(DQN_path, map_location=lambda storage, loc: storage))
            self.target_net.load_state_dict(self.policy_net.state_dict())
        else:
            self.policy_net_encoder.load_state_dict(
                torch.load(encoder_path, map_location=lambda storage, loc: storage))
            self.policy_net_decoder.load_state_dict(
                torch.load(decoder_path, map_location=lambda storage, loc: storage))
            self.policy_net = DQN(self.policy_net_encoder, self.policy_net_decoder)
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def train(self, num_epochs, interval):
        env = Environment(np.array([0.5, 0.5]))
        episode = 0
        for epoch in range(num_epochs):
            env.reset()
            state = (env.x[env.current_step].unsqueeze(0),
                     env.y_seq[env.current_step].unsqueeze(0),
                     env.position.unsqueeze(0))
            while True:
                action = self.select_action(state)
                _, next_state, reward = env.step(action.item())
                if next_state is None:
                    break
                self.memory.push(state, action, next_state, reward)
                state = next_state
                self.optimize_model()
            episode += 1
            if episode % config.TARGET_UPDATE == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            print(env.wealth, action, env.position)
            if (epoch + 1) % interval == 0 or epoch + 1 == num_epochs:
                torch.save(self.policy_net.state_dict(),
                           'models/DQN' + str(epoch + 1) + '.model')

    def test(self, num_epochs):
        env = Environment(test=True)
        state = (env.x[env.current_step], env.y_seq[env.current_step], env.position)
        while True:
            action = self.select_action(state, test=True)
            _, next_state, _ = env.step(action.item())
            if next_state is None:
                break
            state = next_state
        print(env.wealth)
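# optimize_model above follows the standard PyTorch DQN tutorial pattern of
# unpacking batches with Transition(*zip(*transitions)). Under that assumption,
# the Transition namedtuple and ReplayMemory it relies on presumably look roughly
# like the usual ring-buffer version (a sketch, not the original code):
import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Grow until capacity is reached, then overwrite the oldest entries
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)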
state_var = torch.autograd.Variable(state)
target_var = torch.autograd.Variable(target)
target_var.unsqueeze_(0)

import copy

learner = DQN()
actor = DQN()
# the actor is a frozen copy of the learner, used only for inference
for param in actor.parameters():
    param.requires_grad = False

cuda = False
if torch.cuda.is_available():
    cuda = True
    learner = learner.cuda(0)
    reward = reward.cuda(0)

optimizer = torch.optim.SGD(learner.parameters(), lr=0.01)
criterion = L2_loss(0.999)
learner.train()

for k in range(100):
    if cuda:
        x = learner(d_state_var.cuda(0))
    else:
        x = learner(d_state_var)
    actor.load_state_dict(learner.state_dict())
    # print(x)
    for i in range(10):
        # state_var = torch.autograd.Variable(torch.randn(1, 3, 40, 40))
        y = actor(state_var)
def main():
    global args, move_list, i_step
    args = parser.parse_args()
    #move_list = [x.__name__ for x in movement.__dict__.values()
    #             if inspect.isfunction(x)]
    #move_list.remove('focus')
    move_list = ['f_roll', 'idle', 'r_roll', 'l_roll', 'b_roll',
                 'light_atk', 'drink_estus']

    m = PyMouse(display=':0')
    k = PyKeyboard(display=':0')
    sct = mss(display=':0')
    env = DarkSoulsEnv(sct=sct, m=m, k=k)

    if not args.pretrain:
        args.pretrain = None
    model1 = DQN(action=len(move_list), variables=3, pretrained=args.pretrain)
    if use_cuda:
        model1.cuda()

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model1.parameters()])))

    optimizer = optim.Adam(model1.parameters(), lr=args.lr)
    #optimizer = optim.RMSprop(model.parameters())
    #optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)

    i_step = 0
    args.start_episode = 0
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_episode = checkpoint['episode']
            i_step = checkpoint['step']
            args.name = checkpoint['name']
            model1.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (episode {})"
                  .format(args.resume, checkpoint['episode']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    model2 = copy.deepcopy(model1)
    train(model=model1, model2=model2, env=env, optimizer=optimizer)
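# The resume block above expects a checkpoint dict with 'episode', 'step', 'name'
# and 'state_dict' keys. A matching save helper, grounded only in those keys
# (the function name and filename argument are my own, not from the original),
# would be:
import torch


def save_checkpoint(model, episode, step, name, filename='checkpoint.pth.tar'):
    torch.save({
        'episode': episode,            # restored into args.start_episode
        'step': step,                  # restored into i_step
        'name': name,                  # restored into args.name
        'state_dict': model.state_dict(),
    }, filename)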
class DDQNAgent:
    def __init__(self, config: Config, training=True):
        self.config = config
        self.is_training = training
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        self.model.cuda()
        self.target_model.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learn(self, t):
        s, a, r, s2, done = self.buffer.sample(self.config.batch_size)

        s = torch.tensor(s, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        s2 = torch.tensor(s2, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        s = s.cuda()
        a = a.cuda()
        r = r.cuda()
        s2 = s2.cuda()
        done = done.cuda()

        q_values = self.model(s)
        next_q_values = self.model(s2)
        next_q_state_values = self.target_model(s2)

        # Double DQN: the online net picks the next action, the target net evaluates it
        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        if t % self.config.update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_checkpoint(self):
        os.makedirs('ckpt', exist_ok=True)
        torch.save(self.model.state_dict(), 'ckpt/model.pt')

    def load_checkpoint(self):
        # load_state_dict expects a state dict, not a file path, so load the file first
        state_dict = torch.load('ckpt/model.pt')
        self.model.load_state_dict(state_dict)
        self.target_model.load_state_dict(state_dict)
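# The key line in learn() above is the Double-DQN target: action selection comes from
# the online network, action evaluation from the target network. As a standalone
# illustration of that single step (the tensor names here are placeholders, not taken
# from the code above):
import torch


def double_dqn_target(reward, done, gamma, online_q_next, target_q_next):
    # online_q_next, target_q_next: (batch, n_actions) Q-values for the next states
    best_actions = online_q_next.max(1)[1]                                    # argmax from the online net
    next_q = target_q_next.gather(1, best_actions.unsqueeze(1)).squeeze(1)    # evaluated by the target net
    return reward + gamma * next_q * (1 - done)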
model_path = 'dqn3.pth'

if __name__ == '__main__':
    # Create cartpole environment and network
    env = gym.make('CartPole-v0').unwrapped
    net = DQN(n_state=env.observation_space.shape[0],
              n_action=env.action_space.n,
              memory_size=memory_size,
              lr=lr,
              epsilon=epsilon,
              epsilon_decay=epsilon_decay,
              update_iter=update_iter,
              batch_size=batch_size,
              gamma=gamma,
              model_path=model_path)
    net.cuda()
    net.load()
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        while True:
            # env.render()
            # Select action and obtain the reward
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)
            # Record the total reward
            total_reward += r
            # Revised the reward
class DQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)
        # keep the model on CPU here; it is moved to the GPU only when use_cuda is set
        self.model = DQN(self.config.state_dim, self.config.action_dim)
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()

        # the network outputs are already on the same device as the inputs
        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_value = next_q_values.max(1)[0]

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # Notice that we detach the expected_q_value
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        return loss.item()

    def cuda(self):
        self.model.cuda()

    def load_weights(self, model_path):
        if model_path is None:
            return
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, output, tag=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, tag))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")
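# save_config above relies on a get_class_attr_val helper that is not shown in this
# snippet. A plausible implementation (an assumption of mine, not the original helper)
# simply collects the non-callable, non-dunder attributes of the config object:
def get_class_attr_val(obj):
    return {
        k: getattr(obj, k)
        for k in dir(obj)
        if not k.startswith('__') and not callable(getattr(obj, k))
    }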
class Agent():
    def __init__(self, n_actions, eps_start, eps_end, eps_steps, gamma, train, cuda, batch_size):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_steps = eps_steps
        self.gamma = gamma
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.steps_done = 0
        self.policy_net = DQN(n_actions)  # CHANGE THESE TWO LINES FOR TESTING ON CART POLE
        self.target_net = DQN(n_actions)  # CHANGE THESE TWO LINES FOR TESTING ON CART POLE
        if not train:
            self.policy_net.load_state_dict(torch.load('NetParameters.txt'))
        self.update_target_net()
        if cuda:
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()
        self.criterion = nn.MSELoss()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        #self.optimizer = optim.Adam(self.policy_net.parameters(), 0.001)

    def take_action(self, state):
        r = random.random()
        # linear epsilon schedule from eps_start down to eps_end over eps_steps steps
        epsilon = self.eps_start - ((self.eps_start - self.eps_end) / self.eps_steps) * self.steps_done
        #epsilon = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if epsilon < self.eps_end:
            epsilon = self.eps_end
        if r < epsilon:
            return random.randint(0, self.n_actions - 1)
        else:
            # without [0] it was a long tensor of size 1, but env.step() takes a number
            return self.policy_net(Variable(state.cuda(), volatile=True)).data.max(1)[1][0]

    def optimize_model(self, memory):
        if len(memory.memory) < self.batch_size:
            return
        transitions = memory.sample(self.batch_size)
        batch_state, batch_action, batch_next_state, batch_reward = zip(*transitions)

        batch_state = Variable(torch.cat(batch_state)).cuda()
        batch_action = Variable(torch.cat(batch_action)).cuda()
        batch_reward = Variable(torch.cat(batch_reward)).cuda()
        batch_next_state = Variable(torch.cat(batch_next_state)).cuda()

        # action was 1-dimensional, dimensions need to match batch_state;
        # squeeze back to (batch_size,) so the loss compares tensors of the same shape
        current_q_values = self.policy_net(batch_state).gather(
            1, batch_action.unsqueeze(1)).squeeze(1)
        max_next_q_values = self.target_net(batch_next_state).detach().max(1)[0]
        expected_q_values = batch_reward + (self.gamma * max_next_q_values)

        loss = self.criterion(current_q_values, expected_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.steps_done % 400 == 0:
            # update target net
            self.update_target_net()

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save(self):
        print("Saving model")
        torch.save(self.policy_net.state_dict(), 'NetParameters.txt')
        print("Model saved")
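# take_action above switches between two exploration schedules: the active linear decay
# and a commented-out exponential one. As a quick standalone sketch of the linear
# schedule with example values eps_start=0.9, eps_end=0.05, eps_steps=1000 (illustrative
# numbers, not taken from the code above):
def linear_epsilon(step, eps_start=0.9, eps_end=0.05, eps_steps=1000):
    eps = eps_start - (eps_start - eps_end) / eps_steps * step
    return max(eps, eps_end)

# linear_epsilon(0) == 0.9, linear_epsilon(500) == 0.475, linear_epsilon(2000) == 0.05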