def main(args):
    test_x, test_y = load_image(args.image_path)
    test_inp = to_tensor(test_x.astype(np.float32))
    test_target = to_tensor(test_y.astype(np.float32))

    generator = Generator().to("cuda")

    start_t = time.time()
    pretrain_model = flow.load(args.model_path)
    generator.load_state_dict(pretrain_model)
    end_t = time.time()
    print("load params time : {}".format(end_t - start_t))

    start_t = time.time()
    generator.eval()
    with flow.no_grad():
        gout = to_numpy(generator(test_inp), False)
    end_t = time.time()
    print("infer time : {}".format(end_t - start_t))

    # save images
    save_images(
        gout,
        test_inp.numpy(),
        test_target.numpy(),
        path=os.path.join("./testimage.png"),
        plot_size=1,
    )
def forward_actor(self, state, goal=None):
    if not isinstance(state, torch.Tensor):
        state = to_tensor(state)
    if goal is not None:
        if not isinstance(goal, torch.Tensor):
            goal = to_tensor(goal)
        x = torch.cat((state, goal), -1)
    else:
        x = state
    return self.actor(x)
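# Most of the PyTorch snippets in this section lean on a small `to_tensor` helper that
# converts numpy arrays (or other array-likes) to torch tensors; the OneFlow snippets use
# an analogous flow-based helper. A minimal sketch of such a helper is given below purely
# for illustration -- the exact signature varies between the original code bases (some
# call sites pass a second positional flag), so treat this as an assumption, not their API.
import numpy as np
import torch


def to_tensor(x, device="cpu", dtype=torch.float32):
    """Convert an array-like input to a torch tensor on the requested device."""
    if isinstance(x, torch.Tensor):
        return x.to(device=device, dtype=dtype)
    return torch.as_tensor(np.asarray(x), dtype=dtype, device=device)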
def __getitem__(self, idx):
    image_path = self.image_files[idx]
    mask_path = self.masks_files[idx]
    img = cv2.imread(image_path)
    mask = cv2.imread(mask_path)
    augmented = self.transforms(image=img, mask=mask)
    img = augmented['image']
    mask = augmented['mask']
    return to_tensor(img), to_tensor(mask)
def forward_critic(self, state, action, goal=None):
    if not isinstance(state, torch.Tensor):
        state = to_tensor(state)
    if not isinstance(action, torch.Tensor):
        action = to_tensor(action)
    if goal is not None:
        if not isinstance(goal, torch.Tensor):
            goal = to_tensor(goal)
        x = torch.cat((state, action, goal), 1)
    else:
        x = torch.cat((state, action), 1)
    return self.critic(x)
def behavior():
    '''
    Obsolete. Draw the action probability of the agent at different observations.
    '''
    actor = Actor(3, 5, args).to(device)
    critic = Critic(3, args).to(device)
    saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
    ckpt = torch.load(saved_ckpt_path)
    actor.load_state_dict(ckpt['actor'])
    critic.load_state_dict(ckpt['critic'])

    actionstr = {0: 'left', 1: 'right', 2: 'down', 3: 'up'}

    for enemyBaseHealth in [100, 50, 1]:
        allinput = []
        for posX in range(51):
            for posY in range(51):
                allinput.append([posX, posY, enemyBaseHealth])
        allinput = np.array(allinput)

        normalized = []
        for i in range(0, 2592, 10):
            normalized.append(running_state(allinput[i:i + 10, :]))
        ending = running_state(allinput[2591:2601, :])
        ending2d = np.empty((1, 3))
        ending2d[0, :] = ending[-1, :]
        normalized.append(ending2d)
        allNormalized = np.concatenate(normalized, axis=0)

        with torch.no_grad():
            mu = actor(to_tensor(allNormalized))
        mu = torch.cat([to_tensor(allNormalized), mu], dim=1)

        for action in range(4):
            fig, ax = plt.subplots(figsize=(7, 7))
            value = np.empty((51, 51))
            for row in range(51):
                for col in range(51):
                    value[row, col] = mu[51 * col + 50 - row, 3 + action].item()  # (x, y) = (col, 50-row)
            ax.set_xlabel('X')
            ax.set_ylabel('Y')
            plt.imshow(value, cmap='Greens', interpolation='spline36', vmin=0.05, vmax=0.95)
            plt.colorbar()
            ax.plot(30, 30, '*r', markersize=10)
            plt.title('Health %d Action-%s' % (enemyBaseHealth, actionstr[action]))
            plt.tight_layout()
            plt.savefig('Health%dAction%dat.png' % (enemyBaseHealth, action))
            plt.close()
def forward(self, x, goal):
    if not isinstance(x, torch.Tensor):
        x = to_tensor(x)
    if not isinstance(goal, torch.Tensor):
        goal = to_tensor(goal)
    x = torch.cat((x, goal), -1)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    x = self.action_out(x)
    actions = self.max_action * torch.tanh(x)
    return (actions, x)
def process_memory(net, batch, args):
    states = to_tensor(batch.state, False)
    actions = to_tensor(batch.action, False)
    rewards = to_tensor(batch.reward, False)
    masks = to_tensor(batch.mask, False)

    netOutput = net(states)  # (value, action, moveX, moveY, target)
    values = netOutput[0]

    old_policy = log_density(actions, netOutput)
    old_values = values.clone()

    returns, advants = getGA(rewards, masks, values, args)

    return states, actions, returns, advants, old_policy, old_values
def forward(self, x, actions, goals):
    if not isinstance(x, torch.Tensor):
        x = to_tensor(x)
    if not isinstance(actions, torch.Tensor):
        actions = to_tensor(actions)
    if not isinstance(goals, torch.Tensor):
        goals = to_tensor(goals)
    x = torch.cat([x, goals, actions / self.max_action], dim=1)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    q_value = self.q_out(x)
    return q_value
def process_memory(actor, critic, batch, args):
    states = to_tensor(batch.state)
    actions = to_tensor(batch.action)
    rewards = to_tensor(batch.reward)
    masks = to_tensor(batch.mask)
    values = critic(states)

    # ----------------------------
    # step 1: get returns and GAEs and log probability of old policy
    returns, advants = get_gae(rewards, masks, values, args)
    mu, std, logstd = actor(states)
    old_policy = log_density(actions, mu, std, logstd)
    old_values = values.clone()

    return states, actions, returns, advants, old_policy, old_values
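# `get_gae` is referenced above but not defined in this section. The sketch below shows a
# standard generalized advantage estimation helper with the same call shape
# (rewards, masks, values, args), given purely for illustration; the original
# implementation may differ in details such as advantage normalization or the exact
# names args.gamma / args.lamda, which are assumptions here.
import torch


def get_gae_sketch(rewards, masks, values, args):
    """Compute discounted returns and GAE advantages over one trajectory."""
    returns = torch.zeros_like(rewards)
    advants = torch.zeros_like(rewards)

    running_return = 0.0
    previous_value = 0.0
    running_advant = 0.0

    for t in reversed(range(len(rewards))):
        # masks[t] is 0 at terminal steps, cutting the bootstrap across episode ends.
        running_return = rewards[t] + args.gamma * running_return * masks[t]
        td_error = rewards[t] + args.gamma * previous_value * masks[t] - values.data[t]
        running_advant = td_error + args.gamma * args.lamda * running_advant * masks[t]

        returns[t] = running_return
        previous_value = values.data[t]
        advants[t] = running_advant

    # normalize advantages for more stable policy updates.
    advants = (advants - advants.mean()) / (advants.std() + 1e-8)
    return returns, advants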
def fit(self, features, adj, labels, idx_train, idx_val=None, train_iters=200,
        initialize=True, verbose=False, normalize=True, patience=500, load_path=None):
    '''
    Train the gcn model. When idx_val is not None, pick the best model
    according to the validation loss.
    '''
    self.device = self.layers[0].weight.device
    if initialize:
        self.initialize()

    if type(adj) is not torch.Tensor:
        features, adj, labels = utils.to_tensor(features, adj, labels, device=self.device)
    else:
        features = features.to(self.device)
        adj = adj.to(self.device)
        labels = labels.to(self.device)

    if normalize:
        if utils.is_sparse_tensor(adj):
            adj_norm = utils.normalize_adj_tensor(adj, sparse=True)
        else:
            adj_norm = utils.normalize_adj_tensor(adj)
    else:
        adj_norm = adj

    self.adj_norm = adj_norm
    self.features = features
    self.labels = labels

    self._train_with_val(labels, idx_train, idx_val, train_iters, verbose, load_path)
def predict(self, features=None, adj=None):
    '''By default, inputs are unnormalized data.'''
    self.eval()
    if features is None and adj is None:
        return self.forward(self.features, self.adj_norm)
    else:
        if type(adj) is not torch.Tensor:
            features, adj = utils.to_tensor(features, adj, device=self.device)

        self.features = features
        # normalize the new adjacency before the forward pass; returning earlier
        # with the raw adj would leave the code below unreachable.
        if utils.is_sparse_tensor(adj):
            self.adj_norm = utils.normalize_adj_tensor(adj, sparse=True)
        else:
            self.adj_norm = utils.normalize_adj_tensor(adj)
        return self.forward(self.features, self.adj_norm)
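# A minimal usage sketch for the fit/predict methods above. It assumes a
# deeprobust-style GCN class exposing this interface; the import path, constructor
# arguments, and the toy random graph below are illustrative assumptions, not taken
# from the snippet itself.
import numpy as np
import scipy.sparse as sp
from deeprobust.graph.defense import GCN  # assumed origin of this fit/predict interface

n_nodes, n_feats, n_classes = 100, 16, 3
features = sp.csr_matrix(np.random.rand(n_nodes, n_feats).astype(np.float32))
adj = sp.csr_matrix(np.random.binomial(1, 0.05, (n_nodes, n_nodes)).astype(np.float32))
labels = np.random.randint(0, n_classes, n_nodes)
idx_train, idx_val = np.arange(60), np.arange(60, 80)

model = GCN(nfeat=n_feats, nhid=16, nclass=n_classes, device='cpu')
model.fit(features, adj, labels, idx_train, idx_val, train_iters=50, verbose=False)
output = model.predict(features, adj)   # forward pass on the (re-)normalized inputs
preds = output.argmax(dim=1)            # predicted class per node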
def forward(self, state, action=None):
    if not isinstance(state, torch.Tensor):
        state = to_tensor(state)
    value = self.critic(state)
    mu = self.actor(state)
    std = self.log_std
    dist = torch.distributions.Normal(mu, F.softplus(std))
    if action is None:
        action = dist.sample()
    log_prob = dist.log_prob(action).sum(-1).unsqueeze(-1)
    entropy = dist.entropy().sum(-1).unsqueeze(-1)
    return {
        'actions': action.unsqueeze(0),
        'log_prob': log_prob,
        'entropy': entropy,
        'mean': mu,
        'values': value
    }
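# The forward() above packs sampling, log-probability, and entropy into one call.
# Below is a self-contained sketch of the same distribution logic, detached from any
# network; the tensor shapes are arbitrary stand-ins for self.actor(state) and the
# learned self.log_std parameter.
import torch
import torch.nn.functional as F

mu = torch.zeros(4, 2)      # stand-in for self.actor(state): batch of 4, action dim 2
log_std = torch.zeros(2)    # stand-in for the self.log_std parameter
dist = torch.distributions.Normal(mu, F.softplus(log_std))

action = dist.sample()                                    # shape (4, 2)
log_prob = dist.log_prob(action).sum(-1).unsqueeze(-1)    # shape (4, 1)
entropy = dist.entropy().sum(-1).unsqueeze(-1)            # shape (4, 1)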
def _update(self):
    experiences = self.replay_buffer.sample(self.config.batch_size)
    states, goals = self._preproc_og(experiences['obs'], experiences['g'])
    next_states, next_goals = self._preproc_og(experiences['next_obs'], experiences['g'])
    actions = experiences['actions']
    rewards = experiences['r']

    states = self.o_norm.normalize(states)
    goals = self.g_norm.normalize(goals)
    next_states = self.o_norm.normalize(next_states)
    next_goals = self.g_norm.normalize(next_goals)

    with torch.no_grad():
        next_actions = self.target_actor(next_states, next_goals)
        target_value = self.target_critic(next_states, next_actions[0], next_goals)
        expected_value = (to_tensor(rewards) + self.config.discount * target_value).detach()
        clip_return = 1 / (1 - self.config.discount)
        expected_value = torch.clamp(expected_value, -clip_return, 0)

    # ====== Value loss ========
    value_criterion = nn.MSELoss()
    value = self.critic(states, actions, goals)
    value_loss = value_criterion(expected_value, value)

    # ====== Policy loss =======
    actions_ = self.actor(states, goals)
    policy_loss = -(self.critic(states, actions_[0], goals)).mean()
    policy_loss += self.config.action_l2 * (actions_[0]).pow(2).mean()

    # ====== Policy update =======
    self.actor_optimizer.zero_grad()
    policy_loss.backward()
    self.actor_optimizer.step()

    # ====== Value update ========
    self.critic_optimizer.zero_grad()
    value_loss.backward()
    self.critic_optimizer.step()
def train(self):
    # init dataset
    x, y = load_facades()
    # flow.Tensor() bug in here
    x, y = np.ascontiguousarray(x), np.ascontiguousarray(y)
    self.fixed_inp = to_tensor(x[:self.batch_size].astype(np.float32))
    self.fixed_target = to_tensor(y[:self.batch_size].astype(np.float32))

    batch_num = len(x) // self.batch_size
    label1 = to_tensor(np.ones((self.batch_size, 1, 30, 30)), dtype=flow.float32)
    label0 = to_tensor(np.zeros((self.batch_size, 1, 30, 30)), dtype=flow.float32)

    for epoch_idx in range(self.n_epochs):
        self.netG.train()
        self.netD.train()
        start = time.time()

        # run every epoch to shuffle
        for batch_idx in range(batch_num):
            inp = to_tensor(
                x[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size].astype(np.float32))
            target = to_tensor(
                y[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size].astype(np.float32))

            # update D
            d_fake_loss, d_real_loss, d_loss = self.train_discriminator(
                inp, target, label0, label1)

            # update G
            g_gan_loss, g_image_loss, g_total_loss, g_out = self.train_generator(
                inp, target, label1)

            self.G_GAN_loss.append(g_gan_loss)
            self.G_image_loss.append(g_image_loss)
            self.G_total_loss.append(g_total_loss)
            self.D_loss.append(d_loss)

            if (batch_idx + 1) % self.eval_interval == 0:
                self.logger.info(
                    "{}th epoch, {}th batch, d_fakeloss:{:>8.4f}, d_realloss:{:>8.4f}, "
                    "ggan_loss:{:>8.4f}, gl1_loss:{:>8.4f}".format(
                        epoch_idx + 1,
                        batch_idx + 1,
                        d_fake_loss,
                        d_real_loss,
                        g_gan_loss,
                        g_image_loss,
                    ))

        self.logger.info("Time for epoch {} is {} sec.".format(
            epoch_idx + 1, time.time() - start))

        if (epoch_idx + 1) % 2 * self.eval_interval == 0:
            # save .train() images
            # save .eval() images
            self._eval_generator_and_save_images(epoch_idx)

    if self.save:
        flow.save(
            self.netG.state_dict(),
            os.path.join(self.checkpoint_path,
                         "pix2pix_g_{}".format(epoch_idx + 1)),
        )
        flow.save(
            self.netD.state_dict(),
            os.path.join(self.checkpoint_path,
                         "pix2pix_d_{}".format(epoch_idx + 1)),
        )

        # save train loss and val error to plot
        np.save(
            os.path.join(self.path, "G_image_loss_{}.npy".format(self.n_epochs)),
            self.G_image_loss,
        )
        np.save(
            os.path.join(self.path, "G_GAN_loss_{}.npy".format(self.n_epochs)),
            self.G_GAN_loss,
        )
        np.save(
            os.path.join(self.path, "G_total_loss_{}.npy".format(self.n_epochs)),
            self.G_total_loss,
        )
        np.save(
            os.path.join(self.path, "D_loss_{}.npy".format(self.n_epochs)),
            self.D_loss,
        )

    self.logger.info("*************** Train done ***************** ")
def train():
    numAgent = 10  # multiple agents are running synchronously.
    # each agent has a different type with different properties.
    # Only one network is created; different agents get their
    # own behavior according to the embedding input.
    numGame = 20  # multiple games running simultaneously.
    print('agent count:', numAgent)
    print('Env num:', numGame)

    env = {}
    for game in range(numGame):
        env[game] = miniDotaEnv(args, numAgent)

    # initialize the neural networks.
    # use a single network to share the knowledge.
    net = ac(args)
    if not args.cpuSimulation:
        net = net.to(device)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        net.load_state_dict(ckpt['net'])

    observations, lastDone = {}, {}
    for game in range(numGame):
        observations[game] = env[game].reset(0)['observations']  # get initial state.
        lastDone[game] = [False] * 10  # record whether the game was done at the previous step.

    optimizer = optim.Adam(net.parameters(), lr=args.lr)

    for iteration in range(args.max_iter):  # playing-training iteration.
        start = time.time()
        print()
        print('Start iteration %d ..' % iteration)
        if args.cpuSimulation:
            net = net.cpu()
        net.eval()  # switch to evaluation mode.

        memory = []
        for i in range(numGame):
            memory.append([Memory() for j in range(numAgent)])
            # memory is cleared at every iter so only the current iteration's samples are used in training.
            # the separation of memory according to game is necessary as they
            # need to be processed separately for each game.

        steps = 0
        teamscore = 0  # only for game 0.
        record = []  # record the states for visualization.
        gameEnd = np.zeros(numGame).astype(bool)

        while steps <= args.time_horizon:  # loop for one game.
            if np.all(gameEnd):
                break
            steps += 1

            stateList = []
            for game in range(numGame):
                for agent in range(numAgent):
                    stateList.append(np.expand_dims(observations[game][agent], axis=0))
            stateCombined = np.concatenate(stateList, axis=0)
            # concatenate the states of all games and process them by the network together.
            with torch.no_grad():
                actionDistr = net(to_tensor(stateCombined, args.cpuSimulation))
            actions = get_action(actionDistr)

            for game in range(numGame):
                if not gameEnd[game]:
                    # the following random action cannot work, because random action has too small
                    # prob density value, leading to strange bugs.
                    # sample = random.random()
                    # if sample > args.randomActionRatio * (1 - min(1, iteration/1000)):
                    #     thisGameAction = actions[10*game:10*(game+1), :]  # contain actions from all agents.
                    #     check(thisGameAction)
                    # else:
                    #     actionmove = np.random.randint(0, 3, size=(10, 3))
                    #     target = np.random.randint(0, 12, size=(10, 1))
                    #     thisGameAction = np.concatenate([actionmove, target], axis=1)

                    thisGameAction = actions[10 * game:10 * (game + 1), :]
                    # select the actions from all agents of this env.
                    envInfo = env[game].step(thisGameAction)
                    # environment runs one step given the action.
                    nextObs = envInfo['observations']  # get the next state.
                    if game == 0:
                        record.append(
                            np.concatenate([
                                env[game].getState(),
                                actions[0:10, :].reshape(-1)
                            ]))
                    rewards = envInfo['rewards']
                    dones = envInfo['local_done']
                    # masks = list(~dones)  # cut the return calculation at the done point.
                    masks = [True] * numAgent
                    # no need to mask out the last state-action pair,
                    # because the last reward is useful to us.

                    for i in range(numAgent):
                        if not lastDone[game][i]:
                            memory[game][i].push(observations[game][i],
                                                 thisGameAction[i], rewards[i],
                                                 masks[i])
                    lastDone[game] = dones
                    if game == 0:
                        teamscore += sum([rewards[x] for x in env[game].getTeam0()])
                    observations[game] = nextObs

                    gameEnd[game] = np.all(dones)
                    if gameEnd[game]:
                        if game == 0:
                            print('Game 0 score: %f' % teamscore)
                            # recordMat = np.stack(record)  # stack will expand the dimension before concatenate.
                            # draw(recordMat, iteration, env[game].getUnitRange(), 10)
                        observations[game] = env[game].reset(iteration + 1)['observations']
                        lastDone[game] = [False] * 10

        simEnd = time.time()
        print('Simulation time: %.f' % (simEnd - start))

        net.train()  # switch to training mode.
        net = net.cuda()

        sts, ats, returns, advants, old_policy, old_value = [], [], [], [], [], []
        for game in range(numGame):
            for i in range(numAgent):
                batch = memory[game][i].sample()
                st, at, rt, adv, old_p, old_v = process_memory(net, batch, args)
                sts.append(st)
                ats.append(at)
                returns.append(rt)
                advants.append(adv)
                old_policy.append(old_p)
                old_value.append(old_v)

        sts = torch.cat(sts)
        ats = torch.cat(ats)
        returns = torch.cat(returns)
        advants = torch.cat(advants)
        old_policy = torch.cat(old_policy)
        old_value = torch.cat(old_value)

        train_model(net, optimizer, sts, ats, returns, advants, old_policy,
                    old_value, args)
        # training is based on the state-action pairs from all games of the current iteration.

        trainEnd = time.time()
        print('Training time: %.f' % (trainEnd - simEnd))

        if iteration % 10 == 0:
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            ckpt_path = os.path.join(model_path, 'ckpt_%.3f.pth.tar' % teamscore)
            save_checkpoint(
                {
                    'net': net.state_dict(),
                    'args': args,
                    'score': teamscore
                },
                filename=ckpt_path)
                          weight_decay=args.l2_rate)  # (fragment: tail of an optimizer construction)

scores = []
score_avg = 0

for iter in range(args.max_iter):
    actor.eval(), critic.eval()
    memory = [Memory() for _ in range(num_agent)]

    steps = 0
    score = 0
    while steps < args.time_horizon:
        steps += 1
        mu, std, _ = actor(to_tensor(states))
        actions = get_action(mu, std)
        env_info = env.step(actions)[default_brain]

        next_states = running_state(env_info.vector_observations)
        rewards = env_info.rewards
        dones = env_info.local_done
        masks = list(~(np.array(dones)))

        for i in range(num_agent):
            memory[i].push(states[i], actions[i], rewards[i], masks[i])

        score += rewards[0]
        states = next_states

        if dones[0]:
def test(interval, runs):
    print('Testing..')
    numAgent = 10
    numGame = 1
    assert numGame == 1  # needed.
    env = {0: miniDotaEnv(args, numAgent)}

    net = ac(args)
    if not args.cpuSimulation:
        net = net.to(device)

    saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
    ckpt = torch.load(saved_ckpt_path)
    net.load_state_dict(ckpt['net'])
    net.eval()

    observations = {0: env[0].reset(0)['observations']}

    for iteration in range(runs):
        start = time.time()
        print()
        print('Start iteration %d ..' % iteration)
        if args.cpuSimulation:
            net = net.cpu()

        steps = 0
        teamscore = 0
        gameEnd = np.zeros(numGame).astype(bool)
        record = []
        teamLabel = env[0].getState().reshape((12, 4))[:10, 0]

        while steps <= args.time_horizon:  # loop for one round of games.
            if np.all(gameEnd):
                break
            steps += 1

            stateList = []
            for game in range(numGame):
                for agent in range(numAgent):
                    stateList.append(np.expand_dims(observations[game][agent], axis=0))
            stateCombined = np.concatenate(stateList, axis=0)
            with torch.no_grad():
                actionDistr = net(to_tensor(stateCombined, args.cpuSimulation))
                # calculate all envs together.
            actions = get_action(actionDistr)

            for game in range(numGame):
                if not gameEnd[game]:
                    thisGameAction = actions[10 * game:10 * (game + 1), :]  # contains actions from all agents.

                    # for player in range(10):
                    #     if teamLabel[player] == 0 and steps < 100:
                    #         thisGameAction[player] = [0, 1, 1, 0]  # ablation test.

                    envInfo = env[game].step(thisGameAction)
                    # environment runs one step given the action.
                    nextObs = envInfo['observations']  # get the next state.

                    allAction = np.concatenate([actionDistr[x] for x in range(1, 5)], axis=1)
                    record.append(
                        np.concatenate([
                            env[0].getState(),
                            actions[0:10, :].reshape(-1),
                            allAction.reshape(-1)
                        ]))

                    rewards = envInfo['rewards']
                    dones = envInfo['local_done']
                    teamscore += sum([rewards[x] for x in env[0].getTeam0()])
                    observations[game] = nextObs

                    gameEnd[game] = np.all(dones)
                    if gameEnd[game]:
                        print('Team 0 score: %f' % teamscore)
                        simEnd = time.time()
                        print('Simulation time: %.f' % (simEnd - start))
                        recordMat = np.stack(record)
                        # stack will expand the dimension before concatenate.
                        draw(recordMat, iteration, env[game].getUnitRange(), interval)
                        observations[game] = env[game].reset(iteration + 1)['observations']
                        drawEnd = time.time()
                        print('Drawing time: %.f' % (drawEnd - simEnd))