def build_vocab(dataset):
    """Scan every paragraph tree in *dataset* and build the vocabularies.

    EDU nodes contribute their words and POS tags; Relation nodes
    contribute their nuclearity and fine-grained relation type.

    Returns:
        (word Vocab, POS Vocab, nuclear Label, relation Label)
    """
    word_counter, pos_counter = Counter(), Counter()
    nuclear_counter, relation_counter = Counter(), Counter()
    # dataset is a collection of paragraph collections; flatten one level
    for paragraph in chain.from_iterable(dataset):
        # visit only EDU and Relation nodes of the discourse tree
        for node in paragraph.iterfind(filter=node_type_filter([EDU, Relation])):
            if isinstance(node, EDU):
                word_counter.update(node.words)
                pos_counter.update(node.tags)
            elif isinstance(node, Relation):
                nuclear_counter[node.nuclear] += 1
                relation_counter[node.ftype] += 1
    return (Vocab("word", word_counter),
            Vocab("part of speech", pos_counter),
            Label("nuclear", nuclear_counter),
            Label("relation", relation_counter))
def build_vocab(instances):
    """Build vocabularies from (words, poses, trans) training instances.

    Each instance carries nested word sequences, nested POS sequences,
    and a flat sequence of transition actions.

    Returns:
        (word Vocab, POS Vocab, transition Label)
    """
    word_counter = Counter()
    pos_counter = Counter()
    transition_counter = Counter()
    for words, poses, trans in instances:
        # words/poses are sequences of sequences; flatten one level
        word_counter.update(chain.from_iterable(words))
        pos_counter.update(chain.from_iterable(poses))
        transition_counter.update(trans)
    return (Vocab("word", word_counter),
            Vocab("part of speech", pos_counter),
            Label("transition", transition_counter))
def build_vocab(trees, trans):
    """Build word/POS vocabularies from *trees* and a transition label set.

    Args:
        trees: iterable of discourse trees exposing ``edus()``.
        trans: iterable of transition-action sequences.

    Returns:
        (word Vocab, POS Vocab, transition Label)
    """
    trans_label = Label("transition", Counter(chain(*trans)))
    words_counter = Counter()
    poses_counter = Counter()
    for tree in trees:
        # fix: getattr(edu, "words") with a constant name is just edu.words;
        # update counters per EDU instead of materializing intermediate lists
        for edu in tree.edus():
            words_counter.update(edu.words)
            poses_counter.update(edu.tags)
    word_vocab = Vocab("word", words_counter)
    pos_vocab = Vocab("part of speech", poses_counter)
    return word_vocab, pos_vocab, trans_label
def main(args):
    """Train an RNN EDU segmenter on CDTB with early stopping.

    Seeds all RNGs, builds vocabularies and numericalized training data,
    trains with Adam + ReduceLROnPlateau, keeps the checkpoint with the
    best validation F1, and reports test scores for that best model.

    Args:
        args: parsed CLI namespace (seed, data/ctb/cache dirs, model
            hyper-parameters, optimizer settings, model_save path, ...).
    """
    # make runs reproducible across python, torch and numpy RNGs
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    np.random.seed(args.seed)
    logger.info("args:" + str(args))

    # load dataset
    cdtb = CDTB(args.data, "TRAIN", "VALIDATE", "TEST",
                ctb_dir=args.ctb_dir, preprocess=True, cache_dir=args.cache_dir)
    word_vocab, pos_vocab = build_vocab(cdtb.train)
    instances, tags = gen_train_instances(cdtb.train)
    tag_label = Label("tag", Counter(chain(*tags)))
    trainset = numericalize(instances, tags, word_vocab, pos_vocab, tag_label)

    # build model
    model = RNNSegmenterModel(hidden_size=args.hidden_size, dropout=args.dropout,
                              rnn_layers=args.rnn_layers,
                              word_vocab=word_vocab, pos_vocab=pos_vocab, tag_label=tag_label,
                              pos_size=args.pos_size, pretrained=args.pretrained,
                              w2v_freeze=args.w2v_freeze, use_gpu=args.use_gpu)
    if args.use_gpu:
        model.cuda()
    logger.info(model)

    # train
    step = 0
    best_model_f1 = 0
    wait_count = 0  # epochs since last validation improvement
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=3)
    for nepoch in range(1, args.epoch + 1):
        batch_iter = gen_batch_iter(trainset, args.batch_size, use_gpu=args.use_gpu)
        for nbatch, (inputs, target) in enumerate(batch_iter, start=1):
            step += 1
            model.train()
            optimizer.zero_grad()
            loss = model.loss(inputs, target)
            loss.backward()
            optimizer.step()
            # fix: dropped dead "nbatch > 0 and" guard — enumerate starts
            # at 1, so nbatch is always positive
            if nbatch % args.log_every == 0:
                logger.info(
                    "step %d, patient %d, lr %f, epoch %d, batch %d, train loss %.4f" %
                    (step, wait_count, get_lr(optimizer), nepoch, nbatch, loss.item()))

        # model selection: validate once per epoch, keep the best-F1 checkpoint
        score = evaluate(cdtb.validate, model)
        f1 = score[-1]
        # NOTE(review): passing the epoch to ReduceLROnPlateau.step is
        # deprecated in newer torch releases; kept for behavior parity
        scheduler.step(f1, nepoch)
        logger.info("evaluation score:")
        logger.info("\n" + gen_edu_report(score))
        if f1 > best_model_f1:
            wait_count = 0
            best_model_f1 = f1
            logger.info("save new best model to %s" % args.model_save)
            with open(args.model_save, "wb+") as model_fd:
                torch.save(model, model_fd)
            logger.info("test on new best model...")
            test_score = evaluate(cdtb.test, model)
            logger.info("test score:")
            logger.info("\n" + gen_edu_report(test_score))
        else:
            wait_count += 1
            if wait_count > args.patient:
                # early stopping: no validation improvement for args.patient epochs
                logger.info("early stopping...")
                break

    # final report on the best checkpoint saved during training
    with open(args.model_save, "rb") as model_fd:
        best_model = torch.load(model_fd)
    test_score = evaluate(cdtb.test, best_model)
    logger.info("test score on final best model:")
    logger.info("\n" + gen_edu_report(test_score))