def main():
    start_a = time.time()

    # Command-line arguments: data/model paths, model dimensions, and optimization hyper-parameters
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--word_embedding', default='glove', help='Embedding for words')
    args_parser.add_argument('--word_embedding_file', default=main_path + 'Data/NeuralRST/glove.6B.200d.txt.gz')
    args_parser.add_argument('--train', default=main_path + 'Data/NeuralRST/rst.train312')
    args_parser.add_argument('--test', default=main_path + 'Data/NeuralRST/rst.test38')
    args_parser.add_argument('--dev', default=main_path + 'Data/NeuralRST/rst.dev35')
    args_parser.add_argument('--train_syn_feat', default=main_path + 'Data/NeuralRST/SyntaxBiaffine/train.conll.dump.results')
    args_parser.add_argument('--test_syn_feat', default=main_path + 'Data/NeuralRST/SyntaxBiaffine/test.conll.dump.results')
    args_parser.add_argument('--dev_syn_feat', default=main_path + 'Data/NeuralRST/SyntaxBiaffine/dev.conll.dump.results')
    args_parser.add_argument('--model_path', default=main_path + 'Workspace/NeuralRST/experiment')
    args_parser.add_argument('--experiment', help='Name of your experiment', required=True)
    args_parser.add_argument('--model_name', default='network.pt')
    args_parser.add_argument('--max_iter', type=int, default=1000, help='maximum epoch')
    args_parser.add_argument('--word_dim', type=int, default=200, help='Dimension of word embeddings')
    args_parser.add_argument('--tag_dim', type=int, default=200, help='Dimension of POS tag embeddings')
    args_parser.add_argument('--etype_dim', type=int, default=100, help='Dimension of Etype embeddings')
    args_parser.add_argument('--syntax_dim', type=int, default=1200, help='Dimension of syntax embeddings')
    args_parser.add_argument('--freeze', default=True, help='freeze the word embedding (disable fine-tuning)')
    args_parser.add_argument('--max_sent_size', type=int, default=20, help='maximum number of words in one EDU')
    args_parser.add_argument('--max_edu_size', type=int, default=120, help='maximum number of EDUs')
    args_parser.add_argument('--max_state_size', type=int, default=1024, help='maximum decoding steps')
    args_parser.add_argument('--hidden_size', type=int, default=200, help='')
    args_parser.add_argument('--drop_prob', type=float, default=0.2, help='default drop_prob')
    args_parser.add_argument('--num_layers', type=int, default=1, help='number of RNN layers')
    args_parser.add_argument('--batch_size', type=int, default=8, help='Number of sentences in each batch')
    args_parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')
    args_parser.add_argument('--ada_eps', type=float, default=1e-8, help='epsilon for adam or adamax')
    args_parser.add_argument('--opt', default='adam', help='Optimization, choose between adam, sgd, and adamax')
    args_parser.add_argument('--start_decay', type=int, default=0, help='')
    args_parser.add_argument('--beta1', type=float, default=0.9, help='beta1 for adam')
    args_parser.add_argument('--beta2', type=float, default=0.999, help='beta2 for adam')
    args_parser.add_argument('--gamma', type=float, default=2e-6, help='weight for regularization')
    args_parser.add_argument('--clip', type=float, default=10.0, help='gradient clipping')
    args_parser.add_argument('--decay', type=int, default=0, help='')
    args_parser.add_argument('--oracle_prob', type=float, default=0.66666, help='')
    args_parser.add_argument('--start_dynamic_oracle', type=int, default=20, help='')
    args_parser.add_argument('--use_dynamic_oracle', type=int, default=0, help='')
    args_parser.add_argument('--early_stopping', type=int, default=50, help='')

    args = args_parser.parse_args()
    config = Config(args)
    # Reproducibility and output directory
    torch.manual_seed(123)
    if config.use_gpu:
        torch.cuda.manual_seed_all(999)
    if not os.path.exists(config.model_path):
        os.makedirs(config.model_path)

    logger = get_logger("RSTParser", config.use_dynamic_oracle, config.model_path)
    if config.use_dynamic_oracle:
        logger.info("This is using DYNAMIC oracle, and will be activated at Epoch %d" % (config.start_dynamic_oracle))
        model_name = 'dynamic_' + config.model_name
    else:
        logger.info("This is using STATIC oracle")
        model_name = 'static_' + config.model_name

    logger.info("Load word embedding")
    pretrained_embed, word_dim = load_embedding_dict(config.word_embedding, config.word_embedding_file)
    assert word_dim == config.word_dim

    logger.info("Reading Train start")
    reader = Reader(config.train_path, config.train_syn_feat_path)
    train_instances = reader.read_data()
    logger.info('Finish reading training instances: ' + str(len(train_instances)))
    # config.max_edu_size, config.max_sent_size, config.max_state_size = get_max_parameter(train_instances)
    logger.info('Max edu size: ' + str(config.max_edu_size))
    logger.info('Max sentence size: ' + str(config.max_sent_size))
    logger.info('Max gold action / state size: ' + str(config.max_state_size))

    # Build vocabularies (alphabets) and embedding tables from the training data
    logger.info('Creating Alphabet....')
    config.model_name = os.path.join(config.model_path, config.model_name)
    word_alpha, tag_alpha, gold_action_alpha, action_label_alpha, etype_alpha = create_alphabet(
        train_instances, config.alphabet_path, logger)
    vocab = Vocab(word_alpha, tag_alpha, etype_alpha, gold_action_alpha, action_label_alpha)
    set_label_action(action_label_alpha.alpha2id, train_instances)

    logger.info('Checking Gold Actions....')
    validate_gold_actions(train_instances, config.max_state_size)
    word_table = construct_embedding_table(word_alpha, config.word_dim, config.freeze, pretrained_embed)
    tag_table = construct_embedding_table(tag_alpha, config.tag_dim, config.freeze)
    etype_table = construct_embedding_table(etype_alpha, config.etype_dim, config.freeze)
    logger.info("Finished reading train data in: " + str(time.time() - start_a) + " sec")

    # DEV data processing
    reader = Reader(config.dev_path, config.dev_syn_feat_path)
    dev_instances = reader.read_data()
    logger.info('Finish reading dev instances')

    # TEST data processing
    reader = Reader(config.test_path, config.test_syn_feat_path)
    test_instances = reader.read_data()
    logger.info('Finish reading test instances')

    torch.set_num_threads(4)
    network = MainArchitecture(vocab, config, word_table, tag_table, etype_table)
    if config.freeze:
        network.word_embedd.freeze()
    if config.use_gpu:
        network.cuda()

    # Set up the optimizer
    def generate_optimizer(config, params):
        params = filter(lambda param: param.requires_grad, params)
        if config.opt == 'adam':
            return Adam(params, lr=config.lr, betas=config.betas, weight_decay=config.gamma, eps=config.ada_eps)
        elif config.opt == 'sgd':
            return SGD(params, lr=config.lr, momentum=config.momentum, weight_decay=config.start_decay, nesterov=True)
        elif config.opt == 'adamax':
            return Adamax(params, lr=config.lr, betas=config.betas, weight_decay=config.start_decay, eps=config.ada_eps)
        else:
            raise ValueError('Unknown optimization algorithm: %s' % config.opt)

    optim = generate_optimizer(config, network.parameters())
    opt_info = 'opt: %s, ' % config.opt
    if config.opt == 'adam':
        opt_info += 'betas=%s, eps=%.1e, lr=%.2f, weight_decay=%.1e' % (config.betas, config.ada_eps, config.lr, config.gamma)
    elif config.opt == 'sgd':
        opt_info += 'momentum=%.2f' % config.momentum
    elif config.opt == 'adamax':
        opt_info += 'betas=%s, eps=%.1e, lr=%f' % (config.betas, config.ada_eps, config.lr)
    logger.info(opt_info)

    def get_subtrees(data, indices):
        subtrees = []
        for i in indices:
            subtrees.append(data[i].result)
        return subtrees

    # START TRAINING
    config.save()
    batch_size = config.batch_size
    logger.info('Start doing training....')
    total_data = len(train_instances)
    logger.info('Batch size: %d' % batch_size)
    num_batch = total_data // batch_size + 1

    es_counter = 0
    best_S = 0
    best_N = 0
    best_R = 0
    best_F = 0
    iteration = -1

    for epoch in range(0, config.max_iter):
        logger.info('Epoch %d ' % (epoch))
        logger.info("Current learning rate: %.4f" % (config.lr))
        if epoch == config.start_dynamic_oracle and config.use_dynamic_oracle:
            logger.info("In this epoch, dynamic oracle is activated!")
            config.flag_oracle = True

        permutation = torch.randperm(total_data).long()
        network.metric.reset()
        time_start = datetime.now()
        for i in range(0, total_data, batch_size):
            network.train()
            network.training = True
            indices = permutation[i:i + batch_size]
            # subset_data = words_var, tags_var, etypes_var, edu_mask_var, word_mask_var, gold_actions_var, len_edus, word_denominator, syntax
            subset_data = batch_data_variable(train_instances, indices, vocab, config)
            gold_subtrees = get_subtrees(train_instances, indices)

            cost, cost_val = network.loss(subset_data, gold_subtrees)
            cost.backward()
            clip_grad_norm(network.parameters(), config.clip)
            optim.step()
            network.zero_grad()

            time_elapsed = datetime.now() - time_start
            m, s = divmod(time_elapsed.seconds, 60)
            logger.info('Epoch %d, Batch %d, Cost: %.2f, Correct: %.2f, {} mins {} secs'.format(m, s)
                        % (epoch, (i + batch_size) / batch_size, cost_val, network.metric.get_accuracy()))

        logger.info('Batch ends, performing test for DEV set')
        # START EVALUATING DEV:
        network.eval()
        network.training = False
        time_start = datetime.now()
        span = Metric()
        nuclear = Metric()
        relation = Metric()
        full = Metric()
        predictions = []
        total_data_dev = len(dev_instances)
        for i in range(0, total_data_dev, batch_size):
            end_index = i + batch_size
            if end_index > total_data_dev:
                end_index = total_data_dev
            indices = np.array(range(i, end_index))
            subset_data_dev = batch_data_variable(dev_instances, indices, vocab, config)
            prediction_of_subtrees = network.loss(subset_data_dev, None)
            predictions += prediction_of_subtrees
        for i in range(total_data_dev):
            span, nuclear, relation, full = dev_instances[i].evaluate(predictions[i], span, nuclear, relation, full)
        time_elapsed = datetime.now() - time_start
        m, s = divmod(time_elapsed.seconds, 60)
        logger.info('DEV is finished in {} mins {} secs'.format(m, s))
        logger.info("S: " + span.print_metric())
        logger.info("N: " + nuclear.print_metric())
        logger.info("R: " + relation.print_metric())
        logger.info("F: " + full.print_metric())

        # Keep the checkpoint with the best Full F-score on DEV; otherwise count towards early stopping
        if best_F < full.get_f_measure():
            best_S = span.get_f_measure()
            best_N = nuclear.get_f_measure()
            best_R = relation.get_f_measure()
            best_F = full.get_f_measure()
            iteration = epoch
            # save the model
            config.save()
            torch.save(network.state_dict(), config.model_name)
            logger.info('Model is successfully saved')
            es_counter = 0
        else:
            logger.info("NOT exceed best Full F-score: history = %.2f, current = %.2f" % (best_F, full.get_f_measure()))
            logger.info("Best dev performance in Iteration %d with result S: %.4f, N: %.4f, R: %.4f, F: %.4f"
                        % (iteration, best_S, best_N, best_R, best_F))
            if es_counter > config.early_stopping:
                logger.info('Early stopping after getting lower DEV performance in %d consecutive epochs. BYE, Assalamualaikum!'
                            % (es_counter))
                sys.exit()
            es_counter += 1

        # START EVALUATING TEST:
        time_start = datetime.now()
        span = Metric()
        nuclear = Metric()
        relation = Metric()
        full = Metric()
        predictions = []
        total_data_test = len(test_instances)
        for i in range(0, total_data_test, batch_size):
            end_index = i + batch_size
            if end_index > total_data_test:
                end_index = total_data_test
            indices = np.array(range(i, end_index))
            subset_data_test = batch_data_variable(test_instances, indices, vocab, config)
            prediction_of_subtrees = network.loss(subset_data_test, None)
            predictions += prediction_of_subtrees
        for i in range(total_data_test):
            span, nuclear, relation, full = test_instances[i].evaluate(predictions[i], span, nuclear, relation, full)
        time_elapsed = datetime.now() - time_start
        m, s = divmod(time_elapsed.seconds, 60)
        logger.info('TEST is finished in {} mins {} secs'.format(m, s))
        logger.info("S: " + span.print_metric())
        logger.info("N: " + nuclear.print_metric())
        logger.info("R: " + relation.print_metric())
        logger.info("F: " + full.print_metric())
def main():
    start_a = time.time()

    # Command-line arguments: data/model paths, model dimensions, and optimization hyper-parameters
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--word_embedding', default='glove', help='Embedding for words')
    args_parser.add_argument('--word_embedding_file', default=main_path + 'Data/NeuralRST/glove.6B.200d.txt.gz')
    args_parser.add_argument('--train', default=main_path + 'Data/NeuralRST/rst.train312')
    args_parser.add_argument('--test', default=main_path + 'Data/NeuralRST/rst.test38')
    args_parser.add_argument('--dev', default=main_path + 'Data/NeuralRST/rst.dev35')
    args_parser.add_argument('--train_syn_feat', default=main_path + 'Data/NeuralRST/SyntaxBiaffine/train.conll.dump.results')
    args_parser.add_argument('--test_syn_feat', default=main_path + 'Data/NeuralRST/SyntaxBiaffine/test.conll.dump.results')
    args_parser.add_argument('--dev_syn_feat', default=main_path + 'Data/NeuralRST/SyntaxBiaffine/dev.conll.dump.results')
    args_parser.add_argument('--model_path', default=main_path + 'Data/NeuralRST/experiment')
    args_parser.add_argument('--experiment', help='Name of your experiment', required=True)
    args_parser.add_argument('--model_name', default='network.pt')
    args_parser.add_argument('--max_iter', type=int, default=1000, help='maximum epoch')
    args_parser.add_argument('--word_dim', type=int, default=200, help='Dimension of word embeddings')
    args_parser.add_argument('--tag_dim', type=int, default=200, help='Dimension of POS tag embeddings')
    args_parser.add_argument('--etype_dim', type=int, default=100, help='Dimension of Etype embeddings')
    args_parser.add_argument('--syntax_dim', type=int, default=1200, help='Dimension of syntax embeddings')
    args_parser.add_argument('--freeze', default=True, help='freeze the word embedding (disable fine-tuning)')
    args_parser.add_argument('--max_sent_size', type=int, default=20, help='maximum number of words in one EDU')
    # The two args below are not used in top-down discourse parsing
    args_parser.add_argument('--max_edu_size', type=int, default=400, help='maximum number of EDUs')
    args_parser.add_argument('--max_state_size', type=int, default=1024, help='maximum decoding steps')
    args_parser.add_argument('--hidden_size', type=int, default=200, help='')
    args_parser.add_argument('--hidden_size_tagger', type=int, default=100, help='')
    args_parser.add_argument('--drop_prob', type=float, default=0.5, help='default drop_prob')
    args_parser.add_argument('--num_layers', type=int, default=1, help='number of RNN layers')
    args_parser.add_argument('--batch_size', type=int, default=4, help='Number of sentences in each batch')
    args_parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')
    args_parser.add_argument('--ada_eps', type=float, default=1e-6, help='epsilon for adam or adamax')
    args_parser.add_argument('--opt', default='adam', help='Optimization, choose between adam, sgd, and adamax')
    args_parser.add_argument('--start_decay', type=int, default=0, help='')
    args_parser.add_argument('--grad_accum', type=int, default=2, help='gradient accumulation setting')
    args_parser.add_argument('--beta1', type=float, default=0.9, help='beta1 for adam')
    args_parser.add_argument('--beta2', type=float, default=0.999, help='beta2 for adam')
    args_parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    args_parser.add_argument('--clip', type=float, default=10.0, help='gradient clipping')
    args_parser.add_argument('--loss_nuc_rel', type=float, default=1.0, help='weight for nucleus and relation loss')
    args_parser.add_argument('--loss_seg', type=float, default=1.0, help='weight for segmentation loss')
    args_parser.add_argument('--activate_nuc_rel_loss', type=int, default=0, help='set the starting epoch for nuclear loss')
    args_parser.add_argument('--activate_seg_loss', type=int, default=0, help='set the starting epoch for segmentation loss')
    args_parser.add_argument('--decay', type=int, default=0, help='')
    args_parser.add_argument('--oracle_prob', type=float, default=0.66666, help='')
    args_parser.add_argument('--start_dynamic_oracle', type=int, default=20, help='')
    args_parser.add_argument('--use_dynamic_oracle', type=int, default=0, help='')
    args_parser.add_argument('--early_stopping', type=int, default=50, help='')
    args_parser.add_argument('--beam_search', type=int, default=1, help='assign parameter k for beam search')
    args_parser.add_argument('--depth_alpha', type=float, default=0.0, help='multiplier of loss based on the depth of the subtree')
    args_parser.add_argument('--elem_alpha', type=float, default=0.0, help='multiplier of loss based on the number of elements in a subtree')
    args_parser.add_argument('--seed', type=int, default=999, help='random seed')

    args = args_parser.parse_args()
    config = Config(args)

    # Reproducibility and output directory
    torch.manual_seed(config.seed)
    if config.use_gpu:
        torch.cuda.manual_seed_all(config.seed)
    if not os.path.exists(config.model_path):
        os.makedirs(config.model_path)

    logger = get_logger("RSTParser", config.use_dynamic_oracle, config.model_path)
    if config.use_dynamic_oracle:
        logger.info("This is using DYNAMIC oracle, and will be activated at Epoch %d" % (config.start_dynamic_oracle))
        model_name = 'dynamic_' + config.model_name
    else:
        logger.info("This is using STATIC oracle")
        model_name = 'static_' + config.model_name

    logger.info("Load word embedding, will take 2 minutes")
    pretrained_embed, word_dim = load_embedding_dict(config.word_embedding, config.word_embedding_file)
    assert word_dim == config.word_dim

    logger.info("Reading Train start")
    reader = Reader(config.train_path, config.train_syn_feat_path)
    train_instances = reader.read_data()
    logger.info('Finish reading training instances: ' + str(len(train_instances)))
    logger.info('Max sentence size: ' + str(config.max_sent_size))

    # Build vocabularies (alphabets) and embedding tables from the training data
    logger.info('Creating Alphabet....')
    config.model_name = os.path.join(config.model_path, config.model_name)
    word_alpha, tag_alpha, gold_action_alpha, action_label_alpha, relation_alpha, nuclear_alpha, nuclear_relation_alpha, etype_alpha = create_alphabet(
        train_instances, config.alphabet_path, logger)
    vocab = Vocab(word_alpha, tag_alpha, etype_alpha, gold_action_alpha, action_label_alpha,
                  relation_alpha, nuclear_alpha, nuclear_relation_alpha)
    set_label_action(action_label_alpha.alpha2id, train_instances)

    # logger.info('Checking Gold Actions for transition-based parser....')
    # validate_gold_actions(train_instances, config.max_state_size)
    logger.info('Checking Gold Labels for top-down parser....')
    validate_gold_top_down(train_instances)

    word_table = construct_embedding_table(word_alpha, config.word_dim, config.freeze, pretrained_embed)
    tag_table = construct_embedding_table(tag_alpha, config.tag_dim, config.freeze)
    etype_table = construct_embedding_table(etype_alpha, config.etype_dim, config.freeze)
    logger.info("Finished reading train data in: " + str(round(time.time() - start_a, 2)) + " sec")

    # DEV data processing
    reader = Reader(config.dev_path, config.dev_syn_feat_path)
    dev_instances = reader.read_data()
    logger.info('Finish reading dev instances')

    # TEST data processing
    reader = Reader(config.test_path, config.test_syn_feat_path)
    test_instances = reader.read_data()
    logger.info('Finish reading test instances')
    torch.set_num_threads(4)
    network = MainArchitecture(vocab, config, word_table, tag_table, etype_table)
    if config.freeze:
        network.word_embedd.freeze()
    if config.use_gpu:
        network.cuda()

    # Set up the optimizer
    def generate_optimizer(config, params):
        params = filter(lambda param: param.requires_grad, params)
        if config.opt == 'adam':
            return Adam(params, lr=config.lr, betas=config.betas, weight_decay=config.gamma, eps=config.ada_eps)
        elif config.opt == 'sgd':
            return SGD(params, lr=config.lr, momentum=config.momentum, weight_decay=config.start_decay, nesterov=True)
        elif config.opt == 'adamax':
            return Adamax(params, lr=config.lr, betas=config.betas, weight_decay=config.start_decay, eps=config.ada_eps)
        else:
            raise ValueError('Unknown optimization algorithm: %s' % config.opt)

    optim = generate_optimizer(config, network.parameters())
    opt_info = 'opt: %s, ' % config.opt
    if config.opt == 'adam':
        opt_info += 'betas=%s, eps=%.1e, lr=%.5f, weight_decay=%.1e' % (config.betas, config.ada_eps, config.lr, config.gamma)
    elif config.opt == 'sgd':
        opt_info += 'momentum=%.2f' % config.momentum
    elif config.opt == 'adamax':
        opt_info += 'betas=%s, eps=%.1e, lr=%f' % (config.betas, config.ada_eps, config.lr)
    logger.info(opt_info)

    def get_subtrees(data, indices):
        subtrees = []
        for i in indices:
            subtrees.append(data[i].result)
        return subtrees

    # START TRAINING
    config.save()
    batch_size = config.batch_size
    logger.info('Start doing training....')
    total_data = len(train_instances)
    logger.info('Batch size: %d' % batch_size)
    num_batch = total_data // batch_size + 1

    es_counter = 0
    best_S = 0
    best_S_ori = 0
    best_N = 0
    best_N_ori = 0
    best_R = 0
    best_R_ori = 0
    best_F = 0
    best_F_ori = 0
    iteration = -1

    for epoch in range(0, config.max_iter):
        logger.info('Epoch %d ' % (epoch))
        logger.info("Current learning rate: %.5f" % (config.lr))
        if epoch == config.start_dynamic_oracle and config.use_dynamic_oracle:
            logger.info("In this epoch, dynamic oracle is activated!")
            config.flag_oracle = True

        permutation = torch.randperm(total_data).long()
        network.metric_span.reset()
        network.metric_nuclear_relation.reset()
        time_start = datetime.now()
        costs = []
        counter_acc = 0
        for i in range(0, total_data, batch_size):
            network.train()
            network.training = True
            indices = permutation[i:i + batch_size]
            subset_data = batch_data_variable(train_instances, indices, vocab, config)
            gold_subtrees = get_subtrees(train_instances, indices)

            cost, cost_val = network.loss(subset_data, gold_subtrees, epoch=epoch)
            costs.append(cost_val)
            cost.backward()
            counter_acc += 1

            # Step the optimizer every config.grad_accum batches (gradient accumulation)
            if config.grad_accum > 1 and counter_acc == config.grad_accum:
                clip_grad_norm_(network.parameters(), config.clip)
                optim.step()
                network.zero_grad()
                counter_acc = 0
            elif config.grad_accum == 1:
                optim.step()
                network.zero_grad()
                counter_acc = 0

            time_elapsed = datetime.now() - time_start
            m, s = divmod(time_elapsed.seconds, 60)
            logger.info('Epoch %d, Batch %d, AvgCost: %.2f, CorrectSpan: %.2f, CorrectNuclearRelation: %.2f - {} mins {} secs'.format(m, s)
                        % (epoch, (i + batch_size) / batch_size, np.mean(costs),
                           network.metric_span.get_accuracy(), network.metric_nuclear_relation.get_accuracy()))

        # Perform evaluation only if span accuracy is at least 0.8 OR when the dynamic oracle is activated
        if network.metric_span.get_accuracy() < 0.8 and not config.flag_oracle:
            logger.info('We only perform test for DEV and TEST set if the span accuracy >= 0.80')
            continue

        logger.info('Batch ends, performing test for DEV and TEST set')
        # START EVALUATING DEV:
        network.eval()
        network.training = False
        logger.info('Evaluate DEV:')
        span, nuclear, relation, full, span_ori, nuclear_ori, relation_ori, full_ori = \
            predict(network, dev_instances, vocab, config, logger)

        # Keep the checkpoint with the best Full F-score on DEV; otherwise count towards early stopping
        if best_F < full.get_f_measure():
            best_S = span.get_f_measure()
            best_S_ori = span_ori.get_f_measure()
            best_N = nuclear.get_f_measure()
            best_N_ori = nuclear_ori.get_f_measure()
            best_R = relation.get_f_measure()
            best_R_ori = relation_ori.get_f_measure()
            best_F = full.get_f_measure()
            best_F_ori = full_ori.get_f_measure()
            iteration = epoch
            # save the model
            config.save()
            torch.save(network.state_dict(), config.model_name)
            logger.info('Model is successfully saved')
            es_counter = 0
        else:
            logger.info("NOT exceed best Full F-score: history = %.2f, current = %.2f" % (best_F, full.get_f_measure()))
            logger.info("Best dev performance in Iteration %d with result S (rst): %.4f, N (rst): %.4f, R (rst): %.4f, F (rst): %.4f"
                        % (iteration, best_S, best_N, best_R, best_F))
            # logger.info("Best dev performance in Iteration %d with result S (ori): %.4f, N (ori): %.4f, R (ori): %.4f, F (ori): %.4f" % (iteration, best_S_ori, best_N_ori, best_R_ori, best_F_ori))
            if es_counter > config.early_stopping:
                logger.info('Early stopping after getting lower DEV performance in %d consecutive epochs. BYE, Assalamualaikum!'
                            % (es_counter))
                sys.exit()
            es_counter += 1

        # START EVALUATING TEST:
        logger.info('Evaluate TEST:')
        span, nuclear, relation, full, span_ori, nuclear_ori, relation_ori, full_ori = \
            predict(network, test_instances, vocab, config, logger)