def test(model_file, test_file, filter_type=True, limit=-1, device=-1):
    """Evaluate a saved model on a test file and log/report its metrics.

    Args:
        model_file: Path to the serialized model (npz) with saved context.
        test_file: Path to the evaluation data file.
        filter_type: Assigned to ``loader.filter_coord`` before loading.
        limit: Maximum number of samples to load; negative means no limit.
        device: GPU device id; negative runs on CPU.
    """
    context = utils.Saver.load_context(model_file)
    logger = logging.getLogger()
    logger.trace('# context: {}'.format(context))
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)

    loader = context.builder.loader
    loader.filter_coord = filter_type
    encoder_input = context.encoder_input
    cont_embed_file_ext = _get_cont_embed_file_ext(encoder_input)
    use_cont_embed = cont_embed_file_ext is not None
    test_dataset = loader.load_with_external_resources(
        test_file,
        train=False,
        bucketing=False,
        size=None if limit < 0 else limit,
        use_external_postags=True,
        use_contextualized_embed=use_cont_embed,
        contextualized_embed_file_ext=cont_embed_file_ext,
        logger=logger)
    logger.info('{} samples loaded for test'.format(len(test_dataset)))

    # Rebuild the model and restore its trained weights.
    model = context.builder.build()
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)

    parser = parsers.build_parser(loader, model)
    evaluator = eval_module.Evaluator(
        parser, logger=logger, report_details=True)
    reporter = training.listeners.Reporter(logger)

    logger.info('Start decoding')
    utils.chainer_train_off()
    evaluator.on_epoch_validate_begin({'epoch': -1})
    pbar = tqdm(total=len(test_dataset))
    for batch in test_dataset.batch(
            context.batch_size, colwise=True, shuffle=False):
        # The last column of a batch holds the targets; the rest are inputs.
        xs, ts = batch[:-1], batch[-1]
        ys = model.forward(*xs)
        loss = model.compute_loss(ys, ts)
        with reporter:
            values = {'loss': float(chainer.cuda.to_cpu(loss.data))}
            model.compute_accuracy(ys, ts)
            for key, val in model.result.items():
                if 'loss' in key:
                    values[key] = float(chainer.cuda.to_cpu(val.data))
                elif 'accuracy' in key:
                    values[key] = val
            reporter.report(values)
        evaluator.on_batch_end({'train': False, 'xs': xs, 'ts': ts})
        pbar.update(len(ts))
    pbar.close()
    # NOTE(review): relies on Reporter's private _output_log — assumed to be
    # the project's intended logging hook; confirm against the Reporter class.
    reporter._output_log("testing", reporter.get_summary(),
                         {'epoch': -1, 'size': len(test_dataset)})
    evaluator.on_epoch_validate_end({'epoch': -1})
def parse(model_file, target_file, contextualized_embed_file=None, n_best=1,
          device=-1):
    """Parse a tagged file with a saved model and print n-best coordinations.

    Args:
        model_file: Path to the serialized model (npz) with saved context.
        target_file: Tagged input file to parse.
        contextualized_embed_file: Pre-computed contextualized-embedding file;
            required iff the model was trained with contextualized embeddings.
        n_best: Number of candidate analyses to print per sentence.
        device: GPU device id; negative runs on CPU.

    Raises:
        ValueError: If ``contextualized_embed_file`` is inconsistent with how
            the model was trained.
    """
    context = utils.Saver.load_context(model_file)
    # FIX: use a logger object as `test`/`train` do; the previous module-level
    # `logging.trace(...)` call is not provided by the stdlib logging module
    # and would raise AttributeError unless the project module patches one in.
    logger = logging.getLogger()
    logger.trace('# context: {}'.format(context))
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)

    loader = context.builder.loader
    encoder_input = context.encoder_input
    use_cont_embed = _get_cont_embed_file_ext(encoder_input) is not None
    if use_cont_embed and contextualized_embed_file is None:
        raise ValueError(
            "contextualized_embed_file must be specified when using "
            "a model trained with contextualized embeddings")
    elif not use_cont_embed and contextualized_embed_file is not None:
        raise ValueError(
            "contextualized_embed_file must not be specified when using "
            "a model trained without contextualized embeddings")
    target_dataset = loader.load_from_tagged_file(
        target_file, contextualized_embed_file)
    logger.info('{} samples loaded for parsing'.format(len(target_dataset)))

    # Rebuild the model and restore its trained weights.
    model = context.builder.build()
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    parser = parsers.build_parser(loader, model)

    logger.info('Start parsing')
    utils.chainer_train_off()
    pbar = tqdm(total=len(target_dataset))
    for batch in target_dataset.batch(
            context.batch_size, colwise=True, shuffle=False):
        # The last three columns carry metadata; the rest are model inputs.
        xs, (words, is_quote, sentence_id) = batch[:-3], batch[-3:]
        parsed = parser.parse(*xs, n_best)
        for results, words_i, is_quote_i, sentence_id_i \
                in zip(parsed, words, is_quote, sentence_id):
            raw_sentence = ' '.join(words_i)
            for best_k, (coords, score) in enumerate(results):
                output = [
                    "#!RAW: {}".format(raw_sentence),
                    "SENTENCE: {}".format(sentence_id_i),
                    "CANDIDATE: #{}".format(best_k),
                    "SCORE: {}".format(score),
                ]
                coords = dataset.post_process(coords, is_quote_i)
                for cc, coord in sorted(coords.items()):
                    output.append("CC: {} {}".format(cc, words_i[cc]))
                    if coord is not None:
                        # Whole coordination span: first token of the first
                        # conjunct through last token of the last conjunct.
                        b, e = coord.conjuncts[0][0], coord.conjuncts[-1][1]
                        output.append("COORD: {} {} {}".format(
                            b, e, ' '.join(words_i[b:e + 1])))
                        for (b, e) in coord.conjuncts:
                            output.append("CONJ: {} {} {}".format(
                                b, e, ' '.join(words_i[b:e + 1])))
                    else:
                        output.append("COORD: None")
                print('\n'.join(output) + '\n')
        pbar.update(len(sentence_id))
    pbar.close()
def train(train_file, test_file=None, format='tree', embed_file=None,
          n_epoch=20, batch_size=20, lr=0.001, limit=-1, l2_lambda=0.0,
          grad_clip=5.0, encoder_input=('char', 'postag'), model_config=None,
          device=-1, save_dir=None, seed=None, cache_dir='',
          refresh_cache=False, bert_model=0, bert_dir=''):
    """Train a coordination-solver model and optionally validate/save it.

    Args:
        train_file: Training data file.
        test_file: Optional validation data file.
        format: Input format; ``'genia'`` toggles GENIA-specific loading.
        embed_file: Optional pre-trained word-embedding file.
        n_epoch: Number of training epochs.
        batch_size: Mini-batch size.
        lr: Learning rate (Adam alpha).
        limit: Max training samples; negative means no limit. Validation uses
            ``limit // 10``.
        l2_lambda: Weight-decay coefficient; 0 disables it.
        grad_clip: Gradient-clipping threshold; 0 disables it.
        encoder_input: Feature inputs for the encoder.
        model_config: Extra keyword config for the model builder.
        device: GPU device id; negative runs on CPU.
        save_dir: Directory for logs/checkpoints; required (os.makedirs).
        seed: Optional RNG seed.
        cache_dir: Directory for the loader cache.
        refresh_cache: Force rebuilding cached datasets.
        bert_model: 0 = none, 1 = plain AdamW setup, 2 = warmup/decay schedule.
        bert_dir: Directory of the BERT model, forwarded via model_config.
    """
    if seed is not None:
        utils.set_random_seed(seed, device)
    logger = logging.getLogger()
    # logger.configure(filename='log.txt', logdir=save_dir)
    assert isinstance(logger, logging.AppLogger)

    if model_config is None:
        model_config = {}
    model_config['bert_model'] = bert_model
    model_config['bert_dir'] = bert_dir
    os.makedirs(save_dir, exist_ok=True)

    read_genia = format == 'genia'
    loader = dataset.DataLoader.build(
        postag_embed_size=model_config.get('postag_embed_size', 50),
        char_embed_size=model_config.get('char_embed_size', 10),
        word_embed_file=embed_file,
        filter_coord=(not read_genia),
        refresh_cache=refresh_cache,
        format=format,
        cache_options=dict(dir=cache_dir, mkdir=True, logger=logger),
        extra_ids=(git.hash(),))
    use_external_postags = not read_genia
    cont_embed_file_ext = _get_cont_embed_file_ext(encoder_input)
    use_cont_embed = cont_embed_file_ext is not None

    train_dataset = loader.load_with_external_resources(
        train_file,
        train=True,
        bucketing=False,
        size=None if limit < 0 else limit,
        refresh_cache=refresh_cache,
        use_external_postags=use_external_postags,
        use_contextualized_embed=use_cont_embed,
        contextualized_embed_file_ext=cont_embed_file_ext)
    # Consistency fix: use the bound logger (as `test` does) rather than
    # module-level logging calls.
    logger.info('{} samples loaded for training'.format(len(train_dataset)))
    test_dataset = None
    if test_file is not None:
        test_dataset = loader.load_with_external_resources(
            test_file,
            train=False,
            bucketing=False,
            size=None if limit < 0 else limit // 10,
            refresh_cache=refresh_cache,
            use_external_postags=use_external_postags,
            use_contextualized_embed=use_cont_embed,
            contextualized_embed_file_ext=cont_embed_file_ext)
        logger.info('{} samples loaded for validation'.format(
            len(test_dataset)))

    builder = models.CoordSolverBuilder(
        loader, inputs=encoder_input, **model_config)
    logger.info("{}".format(builder))
    model = builder.build()
    logger.trace("Model: {}".format(model))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)

    if bert_model == 1:
        optimizer = chainer.optimizers.AdamW(alpha=lr)
        optimizer.setup(model)
        # optimizer.add_hook(chainer.optimizer.GradientClipping(1.))
    else:
        optimizer = chainer.optimizers.AdamW(
            alpha=lr, beta1=0.9, beta2=0.999, eps=1e-08)
        optimizer.setup(model)
        # NOTE(review): hooks reconstructed as belonging to the non-BERT
        # branch (the bert_model==1 branch keeps clipping commented out);
        # confirm against the original formatting.
        if l2_lambda > 0.0:
            optimizer.add_hook(chainer.optimizer.WeightDecay(l2_lambda))
        if grad_clip > 0.0:
            optimizer.add_hook(chainer.optimizer.GradientClipping(grad_clip))

    def _report(y, t):
        # Forward per-batch loss/accuracy values to the training reporter.
        values = {}
        model.compute_accuracy(y, t)
        for k, v in model.result.items():
            if 'loss' in k:
                values[k] = float(chainer.cuda.to_cpu(v.data))
            elif 'accuracy' in k:
                values[k] = v
        training.report(values)

    trainer = training.Trainer(optimizer, model, loss_func=model.compute_loss)
    trainer.configure(utils.training_config)
    trainer.add_listener(
        training.listeners.ProgressBar(lambda n: tqdm(total=n)), priority=200)
    trainer.add_hook(training.BATCH_END,
                     lambda data: _report(data['ys'], data['ts']))
    if test_dataset:
        parser = parsers.build_parser(loader, model)
        # BUG FIX: pass the logger object, not the `logging` module, matching
        # the Evaluator construction in `test`.
        evaluator = eval_module.Evaluator(
            parser, logger=logger, report_details=False)
        trainer.add_listener(evaluator)

    if bert_model == 2:
        num_train_steps = 20000 * 5 / 20
        num_warmup_steps = 10000 / 20
        learning_rate = 2e-5
        # learning rate (eta) scheduling in Adam
        lr_decay_init = learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.add_hook(
            training.BATCH_END,
            extensions.LinearShift(  # decay
                'eta', (lr_decay_init, 0.),
                (num_warmup_steps, num_train_steps), optimizer=optimizer))
        trainer.add_hook(
            training.BATCH_END,
            extensions.WarmupShift(  # warmup
                'eta', 0., num_warmup_steps, learning_rate,
                optimizer=optimizer))

    if save_dir is not None:
        accessid = logger.accessid
        date = logger.accesstime.strftime('%Y%m%d')
        # metric = 'whole' if isinstance(model, models.Teranishi17) else 'inner'
        metric = 'exact'
        trainer.add_listener(
            utils.Saver(
                model,
                basename="{}-{}".format(date, accessid),
                context=dict(App.context, builder=builder),
                directory=save_dir,
                logger=logger,
                save_best=True,
                evaluate=(lambda _: evaluator.get_overall_score(metric))))

    trainer.fit(train_dataset, test_dataset, n_epoch, batch_size)