import os

import numpy as np
import torch
import torch.optim as optim

# Project-local imports assumed from the surrounding repo: BatchKGEnvironment,
# ACDataLoader, ActorCritic, DQN, USER, logger, and the episode/eval constants
# STEP and TEST used by the DQN trainer below.


def train(args):
    env = BatchKGEnvironment(args.dataset, args.max_acts, max_path_len=args.max_path_len,
                             state_history=args.state_history)
    uids = list(env.kg(USER).keys())
    dataloader = ACDataLoader(uids, args.batch_size)
    model = ActorCritic(env.state_dim, env.act_dim, gamma=args.gamma,
                        hidden_sizes=args.hidden).to(args.device)
    logger.info('Parameters:' + str([i[0] for i in model.named_parameters()]))
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    total_losses, total_plosses, total_vlosses, total_entropy, total_rewards = [], [], [], [], []
    step = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        ### Start epoch ###
        dataloader.reset()
        while dataloader.has_next():
            batch_uids = dataloader.get_batch()

            ### Start batch episodes ###
            batch_state = env.reset(batch_uids)  # numpy array of [bs, state_dim]
            done = False
            while not done:
                batch_act_mask = env.batch_action_mask(dropout=args.act_dropout)  # numpy array of size [bs, act_dim]
                batch_act_idx = model.select_action(batch_state, batch_act_mask, args.device)  # list of action indices, one per uid
                batch_state, batch_reward, done = env.batch_step(batch_act_idx)
                model.rewards.append(batch_reward)
            ### End of episodes ###

            # Anneal the learning rate linearly over the expected number of updates.
            lr = args.lr * max(1e-4, 1.0 - float(step) / (args.epochs * len(uids) / args.batch_size))
            for pg in optimizer.param_groups:
                pg['lr'] = lr

            # Update policy
            total_rewards.append(np.sum(model.rewards))
            loss, ploss, vloss, eloss = model.update(optimizer, args.device, args.ent_weight)
            total_losses.append(loss)
            total_plosses.append(ploss)
            total_vlosses.append(vloss)
            total_entropy.append(eloss)
            step += 1

            # Report performance every 100 updates.
            if step > 0 and step % 100 == 0:
                avg_reward = np.mean(total_rewards) / args.batch_size
                avg_loss = np.mean(total_losses)
                avg_ploss = np.mean(total_plosses)
                avg_vloss = np.mean(total_vlosses)
                avg_entropy = np.mean(total_entropy)
                total_losses, total_plosses, total_vlosses, total_entropy, total_rewards = [], [], [], [], []
                logger.info(
                        'epoch/step={:d}/{:d}'.format(epoch, step) +
                        ' | loss={:.5f}'.format(avg_loss) +
                        ' | ploss={:.5f}'.format(avg_ploss) +
                        ' | vloss={:.5f}'.format(avg_vloss) +
                        ' | entropy={:.5f}'.format(avg_entropy) +
                        ' | reward={:.5f}'.format(avg_reward))
        ### END of epoch ###

        policy_file = '{}/policy_model_epoch_{}.ckpt'.format(args.log_dir, epoch)
        logger.info("Save model to " + policy_file)
        torch.save(model.state_dict(), policy_file)
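# The loop above assumes an ActorCritic module exposing select_action(),
# update(), and a per-step `rewards` buffer. A minimal sketch of that assumed
# interface follows (REINFORCE with a value baseline and an entropy bonus);
# the hidden-layer wiring and the lack of advantage normalization are
# simplifying assumptions, not the original implementation.
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCriticSketch(nn.Module):
    def __init__(self, state_dim, act_dim, gamma=0.99, hidden_sizes=(512, 256)):
        super().__init__()
        self.gamma = gamma
        self.l1 = nn.Linear(state_dim, hidden_sizes[0])
        self.l2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.actor = nn.Linear(hidden_sizes[1], act_dim)
        self.critic = nn.Linear(hidden_sizes[1], 1)
        self.saved_actions = []  # (log_prob, value, entropy) per step
        self.rewards = []        # numpy array of [bs] rewards per step

    def forward(self, state, act_mask):
        x = F.relu(self.l2(F.relu(self.l1(state))))
        # Mask invalid actions; the env guarantees at least one valid action per row.
        logits = self.actor(x).masked_fill(~act_mask, float('-inf'))
        return F.softmax(logits, dim=-1), self.critic(x).squeeze(-1)

    def select_action(self, batch_state, batch_act_mask, device):
        state = torch.as_tensor(batch_state, dtype=torch.float32, device=device)
        mask = torch.as_tensor(batch_act_mask, dtype=torch.bool, device=device)
        probs, value = self(state, mask)
        dist = Categorical(probs)
        acts = dist.sample()
        self.saved_actions.append((dist.log_prob(acts), value, dist.entropy()))
        return acts.cpu().numpy().tolist()

    def update(self, optimizer, device, ent_weight):
        # Discounted returns, computed backwards over the finished episode.
        R = 0.0
        returns = []
        for r in self.rewards[::-1]:
            R = r + self.gamma * R
            returns.insert(0, R)
        returns = torch.as_tensor(np.stack(returns), dtype=torch.float32, device=device)  # [T, bs]
        ploss, vloss, eloss = 0.0, 0.0, 0.0
        for (log_prob, value, entropy), ret in zip(self.saved_actions, returns):
            advantage = ret - value.detach()
            ploss = ploss - (log_prob * advantage).mean()  # policy gradient
            vloss = vloss + F.mse_loss(value, ret)         # value baseline
            eloss = eloss - entropy.mean()                 # entropy bonus
        loss = ploss + vloss + ent_weight * eloss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        del self.rewards[:]
        del self.saved_actions[:]
        return loss.item(), ploss.item(), vloss.item(), eloss.item()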
def train_dqn(args):
    """DQN variant of the trainer (renamed from train() so it does not shadow
    the actor-critic trainer above)."""
    # Initialize the batched KG environment and the DQN agent.
    env = BatchKGEnvironment(args.dataset, args.max_acts, max_path_len=args.max_path_len,
                             state_history=args.state_history)
    uids = list(env.kg(USER).keys())
    agent = DQN(env, env.state_dim, env.act_dim, gamma=args.gamma, hidden_sizes=args.hidden)
    dataloader = ACDataLoader(uids, args.batch_size)
    logger.info('Parameters:' + str([i[0] for i in agent.named_parameters()]))
    optimizer = optim.Adam(agent.parameters(), lr=args.lr)

    episode = 1
    for epoch in range(args.epochs):
        ### Start epoch ###
        if epoch % 10 == 0:
            print('epoch:', epoch)
        dataloader.reset()
        while dataloader.has_next():
            batch_uids = dataloader.get_batch()

            ### Start batch episodes ###
            batch_state1 = env.reset(batch_uids)  # numpy array of [bs, state_dim]
            for step in range(STEP):
                batch_act_mask = env.batch_action_mask(dropout=args.act_dropout)  # numpy array of size [bs, act_dim]
                # Select actions epsilon-greedily over the masked action space.
                batch_act_idx = agent.egreedy_action(batch_state1, batch_act_mask)
                batch_state2, batch_reward, done = env.batch_step(batch_act_idx)
                batch_state2 = batch_state2.reshape((-1, env.state_dim))  # flatten to [bs, state_dim]
                # Store the transitions and train the Q-network.
                agent.perceive(batch_state1, batch_act_idx, batch_reward, batch_state2, done, optimizer)
                agent.rewards.append(batch_reward)
                batch_state1 = batch_state2
                if done:
                    break

        # Evaluate every 100 epochs with the greedy policy, reusing the last
        # training batch of users.
        if epoch % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                batch_state1 = env.reset(batch_uids)  # numpy array of [bs, state_dim]
                for j in range(STEP):
                    batch_act_mask = env.batch_action_mask(dropout=args.act_dropout)
                    action = agent.action(batch_state1, batch_act_mask)  # greedy action for test
                    batch_state2, batch_reward, done = env.batch_step(action)
                    batch_state1 = batch_state2.reshape((-1, env.state_dim))
                    total_reward += batch_reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            if episode % 100 == 0:
                print('episode:', episode, 'Evaluation Average Reward:', sum(ave_reward) / len(ave_reward))
        episode += 1
        # agent.update_target_q_network(epoch)  # target-network sync, currently disabled

        if epoch % 10 == 0:
            policy_file = '{}/dqn3_model_epoch_{}.ckpt'.format(args.log_dir, epoch)
            logger.info("Save model to " + policy_file)
            print('epoch:', epoch, ', episode:', episode)
            torch.save(agent.state_dict(), policy_file)
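# A minimal command-line entry point (a sketch: the flag names and default
# values below are illustrative assumptions, not taken from the original script).
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='beauty', help='dataset name')
    parser.add_argument('--max_acts', type=int, default=250, help='max number of candidate actions')
    parser.add_argument('--max_path_len', type=int, default=3, help='max KG path length')
    parser.add_argument('--state_history', type=int, default=1, help='state history length')
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--gamma', type=float, default=0.99, help='reward discount factor')
    parser.add_argument('--hidden', type=int, nargs='*', default=[512, 256], help='hidden layer sizes')
    parser.add_argument('--act_dropout', type=float, default=0.5, help='action-mask dropout rate')
    parser.add_argument('--ent_weight', type=float, default=1e-3, help='entropy loss weight')
    parser.add_argument('--log_dir', type=str, default='./tmp')
    parser.add_argument('--agent', type=str, default='ac', choices=['ac', 'dqn'],
                        help='which trainer to run')
    args = parser.parse_args()

    args.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    os.makedirs(args.log_dir, exist_ok=True)
    if args.agent == 'dqn':
        train_dqn(args)
    else:
        train(args)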