import argparse
import os

import numpy as np
import torch
import torch.multiprocessing as mp

# Project-local imports (exact module paths are not shown in this file):
# ner, AdaptiveActorCritic, SharedAdam, train_adaptive, test_adaptive,
# get_index2word, get_index2label, minibatch_of_one_de


def main():
    rnd_seed = None
    if rnd_seed:
        torch.manual_seed(rnd_seed)
        np.random.seed(rnd_seed)

    # ---------------------------------------
    # DATA LOADING
    # ---------------------------------------
    #result_path = "../result_lrn_0p001_rl/"
    dict_file = "../dataset/CCGbank/dict_word"
    entity_file = "../dataset/CCGbank/dict_tag"
    index2word = get_index2word(dict_file)
    index2label = get_index2label(entity_file)
    vocab_size = len(index2word)
    label_size = len(index2label)
    #train_X, train_Y = minibatch_of_one_de('train')
    val_X, val_Y = minibatch_of_one_de('val')
    test_X, test_Y = minibatch_of_one_de('test')

    # ---------------------------------------
    # HYPER PARAMETERS
    # ---------------------------------------
    # Using word2vec pre-trained embedding
    word_embedding_dim = 300
    hidden_dim = 512
    label_embedding_dim = 512
    max_epoch = 30
    # 0.001 is a good value
    ner_learning_rate = 0.001
    pretrained = None

    # ---------------------------------------
    # GPU OR NOT?
    # ---------------------------------------
    gpu = True
    if gpu and rnd_seed:
        torch.cuda.manual_seed(rnd_seed)

    # ---------------------------------------
    # MODEL INSTANTIATION
    # ---------------------------------------
    #attention = None
    attention = "fixed"
    load_model_dir = "../result_ccg_lrn_0p001_atten/"
    load_model_filename = os.path.join(load_model_dir, "ckpt_11.pth")
    batch_size = 1
    machine = ner(word_embedding_dim, hidden_dim, label_embedding_dim,
                  vocab_size, label_size,
                  learning_rate=ner_learning_rate, minibatch_size=batch_size,
                  max_epoch=max_epoch,
                  train_X=None, train_Y=None,
                  val_X=val_X, val_Y=val_Y,
                  test_X=test_X, test_Y=test_Y,
                  attention=attention, gpu=gpu, pretrained=pretrained,
                  load_model_filename=load_model_filename)
    if gpu:
        machine = machine.cuda()

    initial_beam_size = 1
    # max_beam_size need never exceed the label vocabulary size
    max_beam_size = 10

    # ============ INIT RL =====================
    os.environ['OMP_NUM_THREADS'] = '4'
    #os.environ['CUDA_VISIBLE_DEVICES'] = ""

    parser = argparse.ArgumentParser(description='A3C')
    parser.add_argument('--logdir',
                        default='../result_ccg_atten_ckpt_11_rl_lrn_0p001_reward_0p02_beam_1_gpu',
                        help='name of logging directory')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='learning rate (default: 0.001)')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor for rewards (default: 0.99)')
    parser.add_argument('--n_epochs', type=int, default=100,
                        help='number of epochs for training agent (default: 100)')
    parser.add_argument('--entropy-coef', type=float, default=0.01,
                        help='entropy term coefficient (default: 0.01)')
    parser.add_argument('--num-processes', type=int, default=1,
                        help='how many training processes to use (default: 1)')
    parser.add_argument('--num-steps', type=int, default=20,
                        help='number of forward steps in A3C (default: 20)')
    parser.add_argument('--tau', type=float, default=1.00,
                        help='parameter for GAE (default: 1.00)')
    parser.add_argument('--value-loss-coef', type=float, default=0.5,
                        help='value loss coefficient (default: 0.5)')
    parser.add_argument('--max-grad-norm', type=float, default=5,
                        help='max norm for gradient clipping (default: 5)')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--max-episode-length', type=int, default=1000000,
                        help='maximum length of an episode (default: 1000000)')
    parser.add_argument('--name', default='train',
                        help='name of the process')
    parser.add_argument('--no-shared', action='store_true', default=False,
                        help='use an optimizer without shared momentum')
    args = parser.parse_args()
    if not os.path.exists(args.logdir):
        os.mkdir(args.logdir)

    shared_model = AdaptiveActorCritic(max_beam_size=max_beam_size,
                                       action_space=3)
    shared_model.share_memory()

    if args.no_shared:  # default here is False
        shared_optimizer = None
    else:
        shared_optimizer = SharedAdam(params=shared_model.parameters(),
                                      lr=args.lr)
        # shared_optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
        shared_optimizer.share_memory()

    # --------------------------------------------
    # RL TRAINING
    # --------------------------------------------
    # For German dataset, f_score_index_begin = 5 (because O_INDEX = 4)
    # For toy dataset, f_score_index_begin = 4 (because {0: '<s>', 1: '<e>', 2: '<p>', 3: '<u>', ...})
    # For CCG dataset, f_score_index_begin = 2 (because {0: _PAD, 1: _SOS, ...})
    f_score_index_begin = 2

    # RL reward coefficients
    reward_coef_fscore = 1
    reward_coef_beam_size = 0.02

    train_adaptive(0, machine, max_beam_size,
                   shared_model, shared_optimizer,
                   val_X, val_Y, index2word, index2label,
                   "val", "adaptive", initial_beam_size,
                   reward_coef_fscore, reward_coef_beam_size,
                   f_score_index_begin, args)
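
# ---------------------------------------------------------------------------
# Hedged sketch, not code from this repo: f_score_index_begin marks where the
# real tag indices start, so entries below it (_PAD, _SOS, 'O', ...) are
# excluded from scoring. Assuming predictions and gold labels arrive as flat
# lists of label indices, a micro-F1 restricted to real tags could look like
# the hypothetical helper below; the actual scoring lives inside
# train_adaptive / test_adaptive.
# ---------------------------------------------------------------------------
def f1_from_indices(pred, gold, f_score_index_begin):
    """Illustrative micro-F1 over label indices >= f_score_index_begin."""
    # True positives: positions where prediction matches gold on a real tag
    tp = sum(1 for p, g in zip(pred, gold)
             if p == g and g >= f_score_index_begin)
    n_pred = sum(1 for p in pred if p >= f_score_index_begin)
    n_gold = sum(1 for g in gold if g >= f_score_index_begin)
    if tp == 0 or n_pred == 0 or n_gold == 0:
        return 0.0
    precision, recall = tp / n_pred, tp / n_gold
    return 2 * precision * recall / (precision + recall)


# The main() below is the German-dataset variant of the same driver; if both
# definitions live in one module, it shadows the CCG variant above.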
def main():
    rnd_seed = None
    if rnd_seed:
        torch.manual_seed(rnd_seed)
        np.random.seed(rnd_seed)

    # ---------------------------------------
    # DATA LOADING
    # ---------------------------------------
    result_path = "./result/"
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    dict_file = "../../dataset/German/vocab1.de"
    entity_file = "../../dataset/German/vocab1.en"
    index2word = get_index2word(dict_file)
    index2label = get_index2label(entity_file)
    vocab_size = len(index2word)
    label_size = len(index2label)
    train_X, train_Y = minibatch_of_one_de('train')
    val_X, val_Y = minibatch_of_one_de('valid')
    test_X, test_Y = minibatch_of_one_de('test')

    # ---------------------------------------
    # HYPER PARAMETERS
    # ---------------------------------------
    # Using word2vec pre-trained embedding
    word_embedding_dim = 300
    hidden_dim = 64
    label_embedding_dim = 8
    max_epoch = 100
    # 0.001 is a good value
    learning_rate = 0.001
    pretrained = 'de64'
    if pretrained == 'de64':
        word_embedding_dim = 64

    # ---------------------------------------
    # GPU OR NOT?
    # ---------------------------------------
    gpu = False
    if gpu and rnd_seed:
        torch.cuda.manual_seed(rnd_seed)

    # ---------------------------------------
    # MODEL INSTANTIATION
    # ---------------------------------------
    attention = "fixed"
    attn_string = '_attention' if attention else ''
    load_model_filename = os.path.join(result_path,
                                       "ckpt" + attn_string + ".pth")
    batch_size = 1
    machine = ner(word_embedding_dim, hidden_dim, label_embedding_dim,
                  vocab_size, label_size,
                  learning_rate=learning_rate, minibatch_size=batch_size,
                  max_epoch=max_epoch,
                  train_X=None, train_Y=None,
                  val_X=val_X, val_Y=val_Y,
                  test_X=test_X, test_Y=test_Y,
                  attention=attention, gpu=gpu, pretrained=pretrained,
                  load_model_filename=load_model_filename,
                  load_map_location="cpu")
    if gpu:
        machine = machine.cuda()

    initial_beam_size = 10
    # max_beam_size need never exceed the label vocabulary size,
    # so cap it at label_size
    max_beam_size = label_size

    # ============ INIT RL =====================
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    parser = argparse.ArgumentParser(description='A3C')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor for rewards (default: 0.99)')
    parser.add_argument('--tau', type=float, default=1.00,
                        help='parameter for GAE (default: 1.00)')
    parser.add_argument('--entropy-coef', type=float, default=0.01,
                        help='entropy term coefficient (default: 0.01)')
    parser.add_argument('--value-loss-coef', type=float, default=0.5,
                        help='value loss coefficient (default: 0.5)')
    parser.add_argument('--max-grad-norm', type=float, default=5,
                        help='max norm for gradient clipping (default: 5)')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--n_epochs', type=int, default=30,
                        help='number of epochs for training agent (default: 30)')
    parser.add_argument('--num-processes', type=int, default=4,
                        help='how many training processes to use (default: 4)')
    parser.add_argument('--num-steps', type=int, default=20,
                        help='number of forward steps in A3C (default: 20)')
    parser.add_argument('--max-episode-length', type=int, default=1000000,
                        help='maximum length of an episode (default: 1000000)')
    parser.add_argument('--name', default='train',
                        help='name of the process')
    parser.add_argument('--logdir', default='log',
                        help='name of logging directory')
    parser.add_argument('--no-shared', action='store_true', default=False,
                        help='use an optimizer without shared momentum')
    args = parser.parse_args()
    if not os.path.exists(args.logdir):
        os.mkdir(args.logdir)

    shared_model = AdaptiveActorCritic(max_beam_size=max_beam_size,
                                       action_space=3)
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = SharedAdam(params=shared_model.parameters(),
                               lr=learning_rate)
        # optimizer = optim.Adam(shared_model.parameters(), lr=learning_rate)
        optimizer.share_memory()

    # --------------------------------------------
    # RL TRAINING
    # --------------------------------------------
    # For German dataset, f_score_index_begin = 5 (because O_INDEX = 4)
    # For toy dataset, f_score_index_begin = 4 (because {0: '<s>', 1: '<e>', 2: '<p>', 3: '<u>', ...})
    f_score_index_begin = 5

    # RL reward coefficients
    reward_coef_fscore = 1
    reward_coef_beam_size = 0.1

    processes = []
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    # Evaluate on val/test alongside the RL training processes
    args.name = "val"
    p_val = mp.Process(target=test_adaptive,
                       args=(args.num_processes,
                             machine, max_beam_size, learning_rate,
                             shared_model, counter,
                             val_X, val_Y, index2word, index2label,
                             "val", "log_", "adaptive", initial_beam_size,
                             reward_coef_fscore, reward_coef_beam_size,
                             f_score_index_begin, args))
    p_val.start()
    processes.append(p_val)

    args.name = "test"
    p_test = mp.Process(target=test_adaptive,
                        args=(args.num_processes + 1,
                              machine, max_beam_size, learning_rate,
                              shared_model, counter,
                              test_X, test_Y, index2word, index2label,
                              "test", "log_", "adaptive", initial_beam_size,
                              reward_coef_fscore, reward_coef_beam_size,
                              f_score_index_begin, args))
    p_test.start()
    processes.append(p_test)

    args.name = "train"
    for rank in range(0, args.num_processes):
        p = mp.Process(target=train_adaptive,
                       args=(rank, machine, max_beam_size, learning_rate,
                             shared_model, counter, lock, optimizer,
                             train_X, train_Y, index2word, index2label,
                             "train", "log_", "adaptive", initial_beam_size,
                             reward_coef_fscore, reward_coef_beam_size,
                             f_score_index_begin, args))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # =====================================
    print("TESTING w SHARED MODEL")
    processes = []
    counter = mp.Value('i', 0)
    # test for only 1 epoch
    args.n_epochs = 1
    args.name = "final_test"
    p = mp.Process(target=test_adaptive,
                   args=(args.num_processes + 2,
                         machine, max_beam_size, learning_rate,
                         shared_model, counter,
                         test_X, test_Y, index2word, index2label,
                         "test", args.name, "adaptive", initial_beam_size,
                         reward_coef_fscore, reward_coef_beam_size,
                         f_score_index_begin, args))
    p.start()
    processes.append(p)
    for p in processes:
        p.join()
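
# Hedged note (an assumption about the train_adaptive/test_adaptive internals,
# not taken from this file): the two coefficients plausibly combine into a
# per-step reward such as
#     reward = reward_coef_fscore * f1 - reward_coef_beam_size * beam_size
# so the larger beam coefficient here (0.1, vs. 0.02 in the CCG driver)
# penalizes wide beams more heavily.

# Hypothetical entry point, assuming this module is meant to be run directly.
# With gpu=False and CUDA_VISIBLE_DEVICES cleared above, the default 'fork'
# start method of torch.multiprocessing is assumed safe for the workers.
if __name__ == "__main__":
    main()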