import os
import random

import torch
import torch.nn.functional as F
from torch.autograd import Variable

import simulator


def _generator_run(self, input_):
    self.game_no = self.game_no + 1
    self.init_fn(input_)
    self.engine = simulator.Simulator(feature_name='unit_test1',
                                      actionspace_name='lattice1',
                                      canvas=self.canvas)
    self.reset()
    while self.engine.get_time() < 200:
        self.i = self.i + 1
        dire_state = self.engine.get_state_tup("Dire", 0)
        dire_predefine_step = self.engine.predefined_step("Dire", 0)
        predefine_move = torch.LongTensor([dire_predefine_step[1]])
        is_end = dire_state[2]
        if is_end:
            break
        self.predefined_steps.append(predefine_move)
        state_now = dire_state[0]
        self.states.append(state_now)
        action_out, value_out = self.a3c_model(state_now)
        prob = F.softmax(action_out, dim=1)
        self.raw_probs.append(prob)
        log_prob = F.log_softmax(action_out, dim=1)
        self.raw_log_probs.append(log_prob)
        entropy = -(log_prob * prob).sum(1, keepdim=True)
        self.entropies.append(entropy)
        if self.rank != 0:
            # Non-zero ranks follow the scripted (predefined) move.
            action = predefine_move.view(1, -1).data
        else:
            # Rank 0 acts greedily on the policy output.
            # action = prob.multinomial(num_samples=1).data
            action = torch.argmax(log_prob, 1).data.view(-1, 1)
        self.actions.append(action)
        log_prob = log_prob.gather(1, Variable(action))
        self.engine.set_order("Dire", 0, (1, action))
        self.engine.loop()
        reward = dire_state[1]
        self.rewards.append(reward)
        self.values.append(value_out)
        self.log_probs.append(log_prob)
        yield
    print("rank %d os.pid %d" % (self.rank, os.getpid()))
    if self.rank != 0:
        self.train()
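# The rollout above clears its per-game buffers via self.reset() before each
# episode. A minimal sketch of such a helper, assuming only the attribute
# names used in _generator_run (hypothetical, not the project's actual code):
def reset(self):
    # Start every game with empty trajectory buffers.
    self.predefined_steps = []
    self.states = []
    self.raw_probs = []
    self.raw_log_probs = []
    self.entropies = []
    self.actions = []
    self.rewards = []
    self.values = []
    self.log_probs = []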
def _generator_run(self, input_):
    self.init_fn(input_)
    self.engine = simulator.Simulator(feature_name='unit_test1',
                                      actionspace_name='lattice1',
                                      canvas=self.canvas)
    while True:
        # Pure scripted rollout: feed the predefined step back into the engine.
        dire_predefine_step = self.engine.predefined_step("Dire", 0)
        self.engine.loop()
        self.engine.set_order("Dire", 0, dire_predefine_step)
        yield
        if self.stop_cond_fn(self):
            break
    self.cleanup_fn()
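# For context: _generator_run yields once per simulator tick, so a caller has
# to pull the generator to drive the episode. A hypothetical driver (the
# `worker` object and the run_to_completion name are assumptions, not from
# the source) could look like:
def run_to_completion(worker, input_):
    gen = worker._generator_run(input_)
    for _ in gen:
        # Each iteration advances the simulation by one step; per-tick work
        # such as logging or UI refresh could be interleaved here.
        pass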
def _generator_run(self, input_):
    self.init_fn(input_)
    self.engine = simulator.Simulator(feature_name='unit_test1',
                                      actionspace_name='lattice1',
                                      canvas=self.canvas)
    self.reset()
    while self.engine.get_time() < 200:
        self.i = self.i + 1
        dire_state = self.engine.get_state_tup("Dire", 0)
        dire_predefine_step = self.engine.predefined_step("Dire", 0)
        predefine_move = torch.LongTensor([dire_predefine_step[1]])
        is_end = dire_state[2]
        if is_end:
            break
        self.predefined_steps.append(predefine_move)
        action_out, value_out = self.a3c_model(dire_state[0])
        prob = F.softmax(action_out, dim=1)
        log_prob = F.log_softmax(action_out, dim=1)
        self.raw_log_probs.append(log_prob)
        entropy = -(log_prob * prob).sum(1, keepdim=True)
        self.entropies.append(entropy)
        # Sample the action from the policy distribution.
        action = prob.multinomial(num_samples=1).data
        log_prob = log_prob.gather(1, Variable(action))
        self.engine.set_order("Dire", 0, (1, action))
        self.engine.loop()
        reward = 0  # no reward signal is used in this variant
        self.rewards.append(reward)
        self.values.append(value_out)
        self.log_probs.append(log_prob)
        yield
    self.train()
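# self.train() is not shown here. A hypothetical A3C-style update that
# consumes the buffers filled above (gamma, entropy_coef, value_coef and
# self.optimizer are assumptions, not taken from the source):
def train(self, gamma=0.99, entropy_coef=0.01, value_coef=0.5):
    R = torch.zeros(1, 1)
    policy_loss = 0
    value_loss = 0
    # Walk the episode backwards to accumulate discounted returns.
    for t in reversed(range(len(self.rewards))):
        R = self.rewards[t] + gamma * R
        advantage = R - self.values[t]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        policy_loss = (policy_loss
                       - self.log_probs[t] * advantage.detach()
                       - entropy_coef * self.entropies[t])
    self.optimizer.zero_grad()
    (policy_loss + value_coef * value_loss).backward()
    self.optimizer.step()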
def _generator_run(self, input_):
    self.init_fn(input_)
    self.engine = simulator.Simulator(feature_name='unit_test1',
                                      canvas=self.canvas)
    while True:
        self.i = self.i + 1
        dire_predefine_step = self.engine.predefined_step("Dire", 0)
        self.engine.loop()
        # Re-query the scripted move after the tick and use it as the target.
        dire_predefine_step = self.engine.predefined_step("Dire", 0)
        predefine_move = torch.FloatTensor(dire_predefine_step[1])
        dire_state = self.engine.get_state_tup("Dire", 0)
        is_end = dire_state[2]
        out = self.lstm_module(dire_state[0])
        # Imitation objective: regress the network output onto the scripted move.
        loss = torch.mean((out - predefine_move) ** 2)
        self.losses.append(loss)
        self.engine.set_order("Dire", 0, (1, tuple(out.detach().numpy()[0])))
        if self.i % self.batch_size == 0:
            # Keep it simple: shuffle the accumulated per-step losses, average
            # one batch, take a gradient step, then clear the buffer.
            self.lstm_module.zero_grad()
            random.shuffle(self.losses)
            self.losses = self.losses[:self.buffer_size]
            buf = self.losses[:self.batch_size]
            avg_loss = sum(buf) / self.batch_size
            print(avg_loss.float(), out, predefine_move)
            avg_loss.backward()
            self.optimizer.step()
            self.losses = []
        yield
        if is_end:
            break
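# The loop above relies on self.lstm_module, self.optimizer, self.batch_size,
# self.buffer_size and self.losses being set up elsewhere. A minimal sketch of
# that setup; the module choice, optimizer, and hyperparameter values are
# assumptions, not the project's actual configuration:
import torch.nn as nn


class _ImitationWorkerStub:
    def __init__(self, canvas=None, batch_size=32, buffer_size=256):
        self.canvas = canvas
        self.batch_size = batch_size    # gradient step every batch_size ticks
        self.buffer_size = buffer_size  # cap on retained per-step losses
        self.losses = []
        self.i = 0
        # Any module mapping the state tensor to a move vector works here; a
        # single nn.Linear stands in for the real self.lstm_module.
        self.lstm_module = nn.Linear(16, 2)
        self.optimizer = torch.optim.Adam(self.lstm_module.parameters(), lr=1e-3)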
def _generator_run(self, input_):
    self.game_no = self.game_no + 1
    self.init_fn(input_)
    self.engine = simulator.Simulator(feature_name='Lattice1',
                                      actionspace_name='lattice1',
                                      canvas=self.canvas)
    state_pkg = StatePkg()
    while self.engine.get_time() < 30:
        self.i = self.i + 1
        dire_state = self.engine.get_state_tup("Dire", 0)
        dire_predefine_step = self.engine.predefined_step("Dire", 0)
        predefine_move = torch.LongTensor([dire_predefine_step[1]])
        is_end = dire_state[2]
        if is_end:
            break
        state_pkg.predefined_steps.append(predefine_move)
        state_now = dire_state[0]
        state_pkg.states.append(state_now)
        action_out, value_out = self.a3c_model(state_now)
        prob = F.softmax(action_out, dim=1)
        state_pkg.raw_probs.append(prob)
        log_prob = F.log_softmax(action_out, dim=1)
        state_pkg.raw_log_probs.append(log_prob)
        entropy = -(log_prob * prob).sum(1, keepdim=True)
        state_pkg.entropies.append(entropy)
        max_prob = torch.max(prob).data
        if max_prob > 0.9:
            # The policy is confident: act greedily.
            action = torch.argmax(log_prob, 1).data.view(-1, 1)
        else:
            # Otherwise sample to keep exploring.
            action = prob.multinomial(num_samples=1).data
        state_pkg.actions.append(action)
        log_prob = log_prob.gather(1, Variable(action))
        self.engine.set_order("Dire", 0, (1, action))
        self.engine.loop()
        reward = dire_state[1]
        state_pkg.rewards.append(reward)
        state_pkg.values.append(value_out)
        state_pkg.log_probs.append(log_prob)
        yield
    print("rank %d os.pid %d" % (self.rank, os.getpid()))
    self.state_buffer.append(state_pkg)
    self.train(self.state_buffer)
    # Keep only the two most recent games for the next update.
    self.state_buffer = self.state_buffer[-2:]
    torch.save(self.a3c_model.state_dict(),
               "./tmp/model_%d_%d" % (self.game_no, os.getpid()))
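# StatePkg is used above as a plain per-game container. A minimal definition
# consistent with the fields accessed in _generator_run (the docstring and
# anything beyond these fields are assumptions):
class StatePkg:
    """Trajectory buffers collected during one game."""

    def __init__(self):
        self.predefined_steps = []
        self.states = []
        self.raw_probs = []
        self.raw_log_probs = []
        self.entropies = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.log_probs = []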