Example #1
def train(args):
    env = BatchKGEnvironment(args.dataset, args.max_acts, max_path_len=args.max_path_len, state_history=args.state_history)
    uids = list(env.kg(USER).keys())
    dataloader = ACDataLoader(uids, args.batch_size)
    model = ActorCritic(env.state_dim, env.act_dim, gamma=args.gamma, hidden_sizes=args.hidden).to(args.device)
    logger.info('Parameters:' + str([i[0] for i in model.named_parameters()]))
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    total_losses, total_plosses, total_vlosses, total_entropy, total_rewards = [], [], [], [], []
    step = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        ### Start epoch ###
        dataloader.reset()
        while dataloader.has_next():
            batch_uids = dataloader.get_batch()
            ### Start batch episodes ###
            batch_state = env.reset(batch_uids)  # numpy array of [bs, state_dim]
            done = False
            while not done:
                batch_act_mask = env.batch_action_mask(dropout=args.act_dropout)  # numpy array of size [bs, act_dim]
                batch_act_idx = model.select_action(batch_state, batch_act_mask, args.device)  # list of action indices, one per user in the batch
                batch_state, batch_reward, done = env.batch_step(batch_act_idx)
                model.rewards.append(batch_reward)
            ### End of episodes ###

            lr = args.lr * max(1e-4, 1.0 - float(step) / (args.epochs * len(uids) / args.batch_size))
            for pg in optimizer.param_groups:
                pg['lr'] = lr

            # Update policy
            total_rewards.append(np.sum(model.rewards))
            loss, ploss, vloss, eloss = model.update(optimizer, args.device, args.ent_weight)
            total_losses.append(loss)
            total_plosses.append(ploss)
            total_vlosses.append(vloss)
            total_entropy.append(eloss)
            step += 1

            # Report performance
            if step > 0 and step % 100 == 0:
                avg_reward = np.mean(total_rewards) / args.batch_size
                avg_loss = np.mean(total_losses)
                avg_ploss = np.mean(total_plosses)
                avg_vloss = np.mean(total_vlosses)
                avg_entropy = np.mean(total_entropy)
                total_losses, total_plosses, total_vlosses, total_entropy, total_rewards = [], [], [], [], []
                logger.info(
                        'epoch/step={:d}/{:d}'.format(epoch, step) +
                        ' | loss={:.5f}'.format(avg_loss) +
                        ' | ploss={:.5f}'.format(avg_ploss) +
                        ' | vloss={:.5f}'.format(avg_vloss) +
                        ' | entropy={:.5f}'.format(avg_entropy) +
                        ' | reward={:.5f}'.format(avg_reward))
        ### END of epoch ###

        policy_file = '{}/policy_model_epoch_{}.ckpt'.format(args.log_dir, epoch)
        logger.info("Save model to " + policy_file)
        torch.save(model.state_dict(), policy_file)
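
A minimal sketch of how train() might be invoked from a script entry point; the argument names mirror the attributes used above, while the default values and dataset name are assumptions.

if __name__ == '__main__':
    import argparse
    import torch

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='beauty')        # assumed dataset name
    parser.add_argument('--max_acts', type=int, default=250)            # assumed default
    parser.add_argument('--max_path_len', type=int, default=3)          # assumed default
    parser.add_argument('--state_history', type=int, default=1)         # assumed default
    parser.add_argument('--gamma', type=float, default=0.99)            # assumed default
    parser.add_argument('--hidden', type=int, nargs='*', default=[512, 256])  # assumed default
    parser.add_argument('--lr', type=float, default=1e-4)               # assumed default
    parser.add_argument('--batch_size', type=int, default=32)           # assumed default
    parser.add_argument('--epochs', type=int, default=50)               # assumed default
    parser.add_argument('--act_dropout', type=float, default=0.5)       # assumed default
    parser.add_argument('--ent_weight', type=float, default=1e-3)       # assumed default
    parser.add_argument('--log_dir', type=str, default='./tmp')         # assumed default
    args = parser.parse_args()
    args.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    train(args)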
Example #2
def predict_paths(policy_file, path_file, args):
    print("Predicting paths...")
    env = BatchKGEnvironment(
        args.dataset,
        args.max_acts,
        max_path_len=args.max_path_len,
        state_history=args.state_history,
    )
    pretrain_sd = torch.load(policy_file)
    model = ActorCritic(env.state_dim,
                        env.act_dim,
                        gamma=args.gamma,
                        hidden_sizes=args.hidden).to(args.device)
    model_sd = model.state_dict()
    model_sd.update(pretrain_sd)
    model.load_state_dict(model_sd)

    test_labels = load_labels(args.dataset, "test")
    test_uids = list(test_labels.keys())

    batch_size = 16
    start_idx = 0
    all_paths, all_probs = [], []
    pbar = tqdm(total=len(test_uids))
    while start_idx < len(test_uids):
        end_idx = min(start_idx + batch_size, len(test_uids))
        batch_uids = test_uids[start_idx:end_idx]
        paths, probs = batch_beam_search(env,
                                         model,
                                         batch_uids,
                                         args.device,
                                         topk=args.topk)
        all_paths.extend(paths)
        all_probs.extend(probs)
        start_idx = end_idx
        pbar.update(end_idx - start_idx)  # the last batch may be smaller than batch_size
    pbar.close()
    predicts = {"paths": all_paths, "probs": all_probs}
    with open(path_file, "wb") as f:
        pickle.dump(predicts, f)
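
For reference, a small sketch of how the dumped predictions might be read back; the 'paths' and 'probs' keys come from the dict above, while the helper name is illustrative.

def load_predicted_paths(path_file):
    # Load the predictions written by predict_paths().
    with open(path_file, 'rb') as f:
        predicts = pickle.load(f)
    # 'paths' and 'probs' are parallel lists, one entry per beam-searched path.
    assert len(predicts['paths']) == len(predicts['probs'])
    return predicts['paths'], predicts['probs']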
Example #3
def pretrain(args):
    env = BatchKGEnvironment(args.dataset,
                             args.max_acts,
                             max_path_len=args.max_path_len,
                             state_history=args.state_history)

    pretrain_path_file = '{}/uid_path_idx.pkl'.format(args.log_dir)
    uid_path_idx = pk.load(open(pretrain_path_file,
                                'rb'))  # [(uid, [path]), ...]
    logger.info("Load pre train path from " + pretrain_path_file)

    dataloader = ACDataLoader(uid_path_idx, args.batch_size)
    model = PreActorCritic(env.state_dim,
                           env.act_dim,
                           gamma=args.gamma,
                           hidden_sizes=args.hidden).to(args.device)
    logger.info('Parameters:' + str([i[0] for i in model.named_parameters()]))
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    ### Pretrain Actor ###
    total_losses, total_plosses, total_rewards = [], [], []
    step = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        ### Start epoch ###
        dataloader.reset()
        while dataloader.has_next():
            batch_uid_path_idx = dataloader.get_batch()
            batch_uids = [b[0] for b in batch_uid_path_idx]
            batch_path_idx = np.array([b[1] for b in batch_uid_path_idx])
            ### Start batch episodes ###
            batch_state = env.reset(
                batch_uids)  # numpy array of [bs, state_dim]
            act_num = 0
            done = False
            while not done:
                true_batch_act_idx = batch_path_idx[:, act_num]
                batch_act_mask = env.batch_action_mask(
                    dropout=args.act_dropout
                )  # numpy array of size [bs, act_dim]
                batch_act_idx = model.select_true_action(
                    batch_state, batch_act_mask, true_batch_act_idx,
                    args.device)
                batch_state, batch_reward, done = env.batch_step(batch_act_idx)
                model.rewards.append(batch_reward)
                act_num += 1
            ### End of episodes ###

            lr = args.lr * max(
                1e-4, 1.0 - float(step) /
                (args.epochs * len(uid_path_idx) / args.batch_size)
            )  # anneal over the expected total number of update steps, as in train()
            for pg in optimizer.param_groups:
                pg['lr'] = lr

            # Update policy
            total_rewards.append(np.sum(model.rewards))
            loss, ploss = model.update(optimizer, args.device, args.ent_weight)
            total_losses.append(loss)
            total_plosses.append(ploss)
            step += 1

            # Report performance
            if step > 0 and step % 100 == 0:
                avg_reward = np.mean(total_rewards) / args.batch_size
                avg_loss = np.mean(total_losses)
                avg_ploss = np.mean(total_plosses)
                total_losses, total_plosses, total_rewards = [], [], []
                logger.info('epoch/step={:d}/{:d}'.format(epoch, step) +
                            ' | loss={:.5f}'.format(avg_loss) +
                            ' | ploss={:.5f}'.format(avg_ploss) +
                            ' | reward={:.5f}'.format(avg_reward))
        ### END of epoch ###

        policy_file = '{}/pre_model_epoch_{}.ckpt'.format(args.log_dir, epoch)
        logger.info("Save pre train model to " + policy_file)
        torch.save(model.state_dict(), policy_file)
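
Both train() and pretrain() anneal the learning rate linearly over the expected total number of update steps, floored at 1e-4 of the initial rate. A standalone sketch of that schedule (the helper name is illustrative):

def linear_lr(base_lr, step, epochs, num_examples, batch_size, floor=1e-4):
    # Expected total number of optimizer updates over the whole run.
    total_steps = epochs * num_examples / batch_size
    # Decay linearly from base_lr down to floor * base_lr.
    return base_lr * max(floor, 1.0 - float(step) / total_steps)

# e.g. inside the training loop:
#     pg['lr'] = linear_lr(args.lr, step, args.epochs, len(uids), args.batch_size)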
Example #4
def train(args):
    # Initialize the knowledge-graph environment and the DQN agent.
    env = BatchKGEnvironment(args.dataset, args.max_acts, max_path_len=args.max_path_len, state_history=args.state_history)
    uids = list(env.kg(USER).keys())
    print('num uids:', len(uids))
    uids = np.arange(19488).tolist()  # hard-coded user count; presumably equal to len(uids) for this dataset
    agent = DQN(env, env.state_dim, env.act_dim, gamma=args.gamma, hidden_sizes=args.hidden)
    dataloader = ACDataLoader(uids, args.batch_size)
    logger.info('Parameters:' + str([i[0] for i in agent.named_parameters()]))
    optimizer = optim.Adam(agent.parameters(), lr=args.lr)
    episode = 1
    for epoch in range(0, args.epochs):
        ### Start epoch ###
        if epoch % 10 == 0:
            print('epoch:', epoch)
        dataloader.reset()
        while dataloader.has_next():
            batch_uids = dataloader.get_batch()
            ### Start batch episodes ###
            batch_state1 = env.reset(batch_uids)  # numpy array of [bs, state_dim]
            for step in range(STEP):  # roll out at most STEP actions per episode
                batch_act_mask = env.batch_action_mask(dropout=args.act_dropout)  # numpy array of size [bs, act_dim]
                # Select actions epsilon-greedily from the current Q network.
                batch_act_idx = agent.egreedy_action(batch_state1, batch_act_mask)
                batch_state2, batch_reward, done = env.batch_step(batch_act_idx)
                batch_state2 = batch_state2.reshape((-1, 400))  # flatten to [bs, state_dim]; 400 is hard-coded here
                # Store the transition and train the Q network from experience replay.
                agent.perceive(batch_state1, batch_act_idx, batch_reward, batch_state2, done, optimizer)
                agent.rewards.append(batch_reward)
                batch_state1 = batch_state2
                if done:
                    break
            # Evaluate the greedy policy every 100 epochs (once per batch within those epochs).
            if epoch % 100 == 0:
                total_reward = 0
                for i in range(TEST):
                    batch_state1 = env.reset(batch_uids)  # numpy array of [bs, state_dim]
                    for j in range(STEP):
                        # env.render() is not needed here: this environment has no on-screen rendering.
                        batch_act_mask = env.batch_action_mask(dropout=args.act_dropout)
                        action = agent.action(batch_state1, batch_act_mask)  # greedy action for test
                        batch_state2, batch_reward, done = env.batch_step(action)
                        batch_state1 = batch_state2
                        total_reward += batch_reward
                        if done:
                            break
                ave_reward = total_reward / TEST
                if episode % 100 == 0:
                    print('episode:', episode, 'Evaluation Average Reward:', sum(ave_reward) / len(ave_reward))
                episode = episode + 1
        #agent.update_target_q_network(epoch)
        
        if epoch % 10 == 0:
            policy_file = '{}/dqn3_model_epoch_{}.ckpt'.format(args.log_dir, epoch)
            logger.info("Save model to " + policy_file)
            print('epoch:', epoch, ', episode:', episode)
            torch.save(agent.state_dict(), policy_file)
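
The inline evaluation loop above could be factored into a helper. A sketch under the assumption that agent.action(), env.reset(), env.batch_action_mask() and env.batch_step() behave exactly as used in that loop; the function name is illustrative.

def evaluate(agent, env, batch_uids, args, num_trials=TEST, max_steps=STEP):
    # Average greedy-policy reward over num_trials rollouts of the same user batch.
    total_reward = 0
    for _ in range(num_trials):
        batch_state = env.reset(batch_uids)
        for _ in range(max_steps):
            batch_act_mask = env.batch_action_mask(dropout=args.act_dropout)
            action = agent.action(batch_state, batch_act_mask)  # greedy action for test
            batch_state, batch_reward, done = env.batch_step(action)
            total_reward += batch_reward
            if done:
                break
    ave_reward = total_reward / num_trials
    return sum(ave_reward) / len(ave_reward)  # mean over the batch dimension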