def run(cls, command=None):
    """Application entry point: parse arguments, install signal handlers,
    and drive one instance through its full lifecycle.

    Presumably decorated as a @classmethod on the enclosing App class
    (decorator not visible in this chunk) -- TODO confirm.

    Args:
        command: Optional command name; forwarded to ``cls._parse_args``,
            which splits argv into (command, command_args, common_args).
    """
    cls.configure()
    command, command_args, common_args \
        = cls._parse_args(command)
    try:
        # Convert SIGINT/SIGTERM into SystemExit so the `finally` clause
        # below still runs and the logger is finalized on interruption.
        def handler(signum, frame):
            raise SystemExit("Signal(%d) received: "
                             "The program %s will be closed"
                             % (signum, __file__))
        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)
        # Clear the umask so created files get the mode the code asks for.
        os.umask(0)
        self = cls._get_instance()
        # Lifecycle hooks run strictly in this order.
        self._initialize(command, command_args, common_args)
        self._preprocess()
        self._process()
        self._postprocess()
        self._finalize()
    except Exception:
        # Log unexpected errors; stack_info only when the app is in debug mode.
        # NOTE: `logging` here is a project module exposing e()/w() helpers,
        # not the stdlib logging package.
        logging.e("Exception occurred during execution:",
                  exc_info=True, stack_info=cls.debug)
    except SystemExit as e:
        # SystemExit is not an Exception subclass, so signal-triggered exits
        # (and explicit sys.exit calls) land here and are logged as warnings.
        logging.w(e)
    finally:
        # Always flush/close the application logger.
        logging.getLogger().finalize()
def test(model_file, test_file, device=-1):
    """Evaluate a saved dependency-parser model on ``test_file``.

    Rebuilds the model from the saved training context, restores its
    weights, runs one forward pass per batch, and feeds predictions to
    the Evaluator.

    Args:
        model_file: Path to a saved npz model; its training context is
            loaded via ``utils.Saver.load_context``.
        test_file: Evaluation data file, loaded with the context's loader.
        device: GPU device id; negative means CPU.
    """
    context = utils.Saver.load_context(model_file)
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)
    test_dataset = context.loader.load(test_file, train=False, bucketing=True)
    # Rebuild the parser with the saved hyperparameters; model_config
    # entries override the base context values.
    kwargs = dict(context)
    if context.model_config is not None:
        kwargs.update(context.model_config)
    # FIX: pass kwargs directly -- `**dict(kwargs)` made a pointless copy
    # of a dict built two lines above.
    model = _build_parser(**kwargs)
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    pbar = training.listeners.ProgressBar(lambda n: tqdm(total=n))
    pbar.init(len(test_dataset))
    evaluator = Evaluator(model, context.loader.rel_map, test_file,
                          logging.getLogger())
    utils.chainer_train_off()
    for batch in test_dataset.batch(context.batch_size, colwise=True,
                                    shuffle=False):
        # Last column of the batch is the target; the rest are inputs.
        xs, ts = batch[:-1], batch[-1]
        ys = model.forward(*xs)
        evaluator.on_batch_end({'train': False, 'xs': xs, 'ys': ys, 'ts': ts})
        pbar.update(len(ts))
    evaluator.on_epoch_validate_end({})
def test(model_file, test_file, device=-1):
    """Evaluate a saved parser on ``test_file`` using its decode method.

    Unlike the forward-pass variant, this calls ``model.parse`` and appends
    (gold tokens, predicted trees) pairs to the Evaluator, then prints a
    summary report.

    Args:
        model_file: Path to a saved npz model with its training context.
        test_file: Evaluation data file.
        device: GPU device id; negative means CPU.
    """
    context = utils.Saver.load_context(model_file)
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)
    test_dataset = context.loader.load(test_file, train=False, bucketing=True)
    model = _build_parser(**dict(context))
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    pbar = training.listeners.ProgressBar(lambda n: tqdm(total=n))
    pbar.init(len(test_dataset))
    # FIX: was `Log.getLogger()` -- every sibling function in this file
    # obtains the logger via `logging.getLogger()`; `Log` looks like a
    # stale/undefined identifier.
    evaluator = Evaluator(model, context.loader.rel_map, test_file,
                          logging.getLogger())
    utils.chainer_train_off()
    for batch in test_dataset.batch(context.batch_size, colwise=True,
                                    shuffle=False):
        xs, ts = batch[:-1], batch[-1]
        parsed = model.parse(*xs)
        # xs[-1] presumably holds token sequences whose first entry is a
        # root placeholder, hence the [1:] slice -- TODO confirm.
        evaluator.append([tokens[1:] for tokens in xs[-1]], parsed)
        pbar.update(len(ts))
    evaluator.report(show_details=False)
def test(model_file, test_file, filter_type=True, limit=-1, device=-1):
    """Evaluate a saved coordination model on ``test_file``.

    Restores the model from its saved context, decodes the test set once,
    reports loss/accuracy through a Reporter, and runs the Evaluator's
    epoch-validation protocol.

    Args:
        model_file: Path to a saved npz model with its training context.
        test_file: Evaluation data file.
        filter_type: Forwarded to ``loader.filter_coord``.
        limit: Maximum number of samples to load; negative loads all.
        device: GPU device id; negative means CPU.
    """
    context = utils.Saver.load_context(model_file)
    logger = logging.getLogger()
    logger.trace('# context: {}'.format(context))
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)
    loader = context.builder.loader
    loader.filter_coord = filter_type
    # Contextualized-embedding files are only used when the encoder input
    # configuration asks for them.
    embed_ext = _get_cont_embed_file_ext(context.encoder_input)
    test_dataset = loader.load_with_external_resources(
        test_file,
        train=False,
        bucketing=False,
        size=limit if limit >= 0 else None,
        use_external_postags=True,
        use_contextualized_embed=embed_ext is not None,
        contextualized_embed_file_ext=embed_ext,
        logger=logger)
    logger.info('{} samples loaded for test'.format(len(test_dataset)))
    model = context.builder.build()
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    parser = parsers.build_parser(loader, model)
    evaluator = eval_module.Evaluator(
        parser, logger=logger, report_details=True)
    reporter = training.listeners.Reporter(logger)
    logger.info('Start decoding')
    utils.chainer_train_off()
    evaluator.on_epoch_validate_begin({'epoch': -1})
    pbar = tqdm(total=len(test_dataset))
    for batch in test_dataset.batch(
            context.batch_size, colwise=True, shuffle=False):
        inputs, targets = batch[:-1], batch[-1]
        outputs = model.forward(*inputs)
        loss = model.compute_loss(outputs, targets)
        with reporter:
            metrics = {'loss': float(chainer.cuda.to_cpu(loss.data))}
            # compute_accuracy populates model.result as a side effect.
            model.compute_accuracy(outputs, targets)
            for name, value in model.result.items():
                if 'loss' in name:
                    metrics[name] = float(chainer.cuda.to_cpu(value.data))
                elif 'accuracy' in name:
                    metrics[name] = value
            reporter.report(metrics)
        evaluator.on_batch_end({'train': False, 'xs': inputs, 'ts': targets})
        pbar.update(len(targets))
    pbar.close()
    reporter._output_log("testing", reporter.get_summary(),
                         {'epoch': -1, 'size': len(test_dataset)})
    evaluator.on_epoch_validate_end({'epoch': -1})
def check_grammar(test_file, limit=-1, grammar_type=1):
    """Sanity-check a coordination CFG by parsing with gold annotations.

    Uses a GoldModel (which scores with the true coordinations) so the CKY
    parser should reproduce the gold trees; prints every mismatch and a
    final correct/total count.

    Args:
        test_file: Data file to load (loaded with train=True to keep gold
            annotations).
        limit: Maximum number of samples; negative loads all.
        grammar_type: 1 or 2, selecting which coordination CFG fragment to
            prepend to the base grammar.

    Raises:
        ValueError: If ``grammar_type`` is neither 1 nor 2.
    """
    logger = logging.getLogger()
    loader = dataset.DataLoader(filter_coord=True)
    test_dataset = loader.load(test_file, train=True, bucketing=False,
                               size=None if limit < 0 else limit)
    word_vocab = loader.get_processor('word').vocab
    from models.gold import GoldModel
    model = GoldModel()
    if grammar_type == 1:
        cfg = parsers.Grammar.CFG_COORD_1 + parsers.Grammar.CFG
    elif grammar_type == 2:
        cfg = parsers.Grammar.CFG_COORD_2 + parsers.Grammar.CFG
    else:
        raise ValueError("Invalid grammar type: {}".format(grammar_type))
    grammar = parsers.Grammar(word_vocab, cfg)
    parser = parsers.CkyParser(model, grammar)
    evaluator = eval_module.Evaluator(
        parser, logger=logger, report_details=False)
    n_corrects = 0
    pbar = tqdm(total=len(test_dataset))
    for batch in test_dataset.batch(size=20, colwise=True, shuffle=False):
        xs, ts = batch[:-1], batch[-1]
        true_coords_batch = ts
        # The gold model needs the answers before parsing.
        model.set_gold(true_coords_batch)
        # FIX: call the local `parser` directly instead of reaching into the
        # evaluator's private `_parser` attribute (same object).
        pred_coords_batch = parser.parse(*xs, n_best=1)
        for i, (pred_coord_entries, true_coords) in \
                enumerate(zip(pred_coords_batch, true_coords_batch)):
            pred_coords, _score = pred_coord_entries[0]
            # Compare only actual coordinations: drop None entries on both
            # sides before checking tree equality.
            true_coords = {ckey: coord for ckey, coord
                           in true_coords.items() if coord is not None}
            for k, v in tuple(pred_coords.items()):
                if v is None:
                    del pred_coords[k]
            if pred_coords == true_coords:
                n_corrects += 1
            else:
                sentence = ' '.join(
                    [word_vocab.lookup(word_id) for word_id in xs[0][i]])
                print("SENTENCE: {}\nPRED: {}\nTRUE: {}\n-"
                      .format(sentence, pred_coords, true_coords))
            evaluator.add(pred_coords, true_coords)
        pbar.update(len(ts))
    pbar.close()
    evaluator.report()
    logger.info("Number of correct tree: {}/{}"
                .format(n_corrects, len(test_dataset)))
def fit(self, data, valid_data=None, epochs=10, batch_size=32,
        valid_batch_size=None):
    """Run the training loop for ``epochs`` epochs over ``data``.

    Args:
        data: A ``Dataset`` or a 2-item tuple/list ``(x, y)`` wrapped into
            one.
        valid_data: Optional validation data in the same two forms; when
            given, a validation pass runs after each training epoch.
        epochs: Number of epochs (1-based in the event payloads).
        batch_size: Training batch size.
        valid_batch_size: Validation batch size; defaults to ``batch_size``.

    Returns:
        list: Always an empty list -- NOTE(review): ``history`` is created
        but never appended to; confirm whether callers rely on it.

    Raises:
        ValueError: If ``data``/``valid_data`` has an unsupported form.
        RuntimeError: If the configured forward target is not callable.
    """
    if isinstance(data, Dataset):
        train_dataset = data
    elif isinstance(data, (tuple, list)) and len(data) == 2:
        train_dataset = Dataset(*data)
    else:
        raise ValueError('invalid data: {}'.format(type(data)))
    # NOTE(review): truthiness test -- an *empty* valid_data container
    # silently disables validation; confirm this is intended.
    if valid_data:
        do_validation = True
        if isinstance(valid_data, Dataset):
            val_dataset = valid_data
        elif isinstance(valid_data, (tuple, list)) \
                and len(valid_data) == 2:
            val_dataset = Dataset(*valid_data)
        else:
            raise ValueError('When passing valid_data, '
                             'it must be dataset or contain '
                             'two (x_val, y_val) items: {}'
                             .format(type(valid_data)))
        if valid_batch_size is None:
            valid_batch_size = batch_size
    else:
        do_validation = False
    # The reporter is both stored on self and registered as a listener so
    # other hooks can report into it.
    self._reporter = listeners.Reporter(logging.getLogger())
    self.add_listener(self._reporter, priority=110)
    if self._acc_func is not None:
        # Report accuracy after every batch; `data` here is the event
        # payload dict (shadows the outer `data` parameter).
        def _report_accuracy(data):
            listeners.report(
                {"accuracy": self._acc_func(data['ys'], data['ts'])})
        self.add_hook(TrainEvent.BATCH_END, _report_accuracy, priority=120)
    # Accept either a callable or an object exposing a `.forward` method.
    forward = self._forward
    if not callable(forward):
        if hasattr(self._forward, 'forward'):
            forward = self._forward.forward
        else:
            raise RuntimeError('`forward` is not callable')
    lossfun = self._loss_func
    # Fall back to identity when no converter is configured.
    convert = (self._converter
               if callable(self._converter) else lambda x: x)
    history = []
    self.notify(TrainEvent.TRAIN_BEGIN)

    def main_loop():
        for epoch in range(1, epochs + 1):
            # Listeners receive a read-only view; _process gets copies so
            # per-phase mutations don't leak across phases.
            epoch_logs = PseudoImmutableMap(
                epoch=epoch,
                size=train_dataset.size,
            )
            self.notify(TrainEvent.EPOCH_BEGIN, epoch_logs)
            self._process(forward, train_dataset, lossfun, convert,
                          batch_size, epoch_logs.copy(), train=True)
            if do_validation:
                self._process(
                    forward, val_dataset, lossfun, convert,
                    valid_batch_size, epoch_logs.copy(), train=False)
            self.notify(TrainEvent.EPOCH_END, epoch_logs)
    # NOTE(review): _reporter was assigned just above, so this branch is
    # always taken unless a listener clears it -- confirm.
    if self._reporter is not None:
        with self._reporter:
            main_loop()
    else:
        main_loop()
    self.notify(TrainEvent.TRAIN_END)
    return history
def train(train_file, test_file=None, format='tree', embed_file=None,
          n_epoch=20, batch_size=20, lr=0.001, limit=-1, l2_lambda=0.0,
          grad_clip=5.0, encoder_input=('char', 'postag'),
          model_config=None, device=-1, save_dir=None, seed=None,
          cache_dir='', refresh_cache=False, bert_model=0, bert_dir=''):
    """Train a coordination solver.

    Builds the data loader and model from ``model_config``, configures an
    AdamW optimizer (with optional BERT-specific learning-rate schedules),
    wires up progress/metric/evaluation listeners and fits the trainer.

    Args:
        train_file: Training data file.
        test_file: Optional validation data file.
        format: Input format; 'genia' disables coordination filtering and
            external postags.
        embed_file: Optional pretrained word-embedding file.
        n_epoch / batch_size / lr: Usual training hyperparameters.
        limit: Max training samples (validation uses limit // 10);
            negative loads all.
        l2_lambda: Weight-decay coefficient (non-BERT branch only).
        grad_clip: Gradient-clipping threshold (non-BERT branch only).
        encoder_input: Encoder input feature names.
        model_config: Extra model hyperparameters; mutated to record
            bert_model/bert_dir.
        device: GPU device id; negative means CPU.
        save_dir: Directory for logs/checkpoints; enables best-model saving.
        seed / cache_dir / refresh_cache: Reproducibility and cache options.
        bert_model: 0 = none, 1 = AdamW without extra hooks,
            2 = warmup + linear-decay eta schedule.
        bert_dir: BERT resource directory, stored in model_config.
    """
    if seed is not None:
        utils.set_random_seed(seed, device)
    logger = logging.getLogger()
    # logger.configure(filename='log.txt', logdir=save_dir)
    assert isinstance(logger, logging.AppLogger)
    if model_config is None:
        model_config = {}
    model_config['bert_model'] = bert_model
    model_config['bert_dir'] = bert_dir
    os.makedirs(save_dir, exist_ok=True)
    read_genia = format == 'genia'
    loader = dataset.DataLoader.build(
        postag_embed_size=model_config.get('postag_embed_size', 50),
        char_embed_size=model_config.get('char_embed_size', 10),
        word_embed_file=embed_file,
        filter_coord=(not read_genia),
        refresh_cache=refresh_cache,
        format=format,
        cache_options=dict(dir=cache_dir, mkdir=True, logger=logger),
        extra_ids=(git.hash(), ))
    use_external_postags = not read_genia
    cont_embed_file_ext = _get_cont_embed_file_ext(encoder_input)
    use_cont_embed = cont_embed_file_ext is not None
    train_dataset = loader.load_with_external_resources(
        train_file, train=True, bucketing=False,
        size=None if limit < 0 else limit,
        refresh_cache=refresh_cache,
        use_external_postags=use_external_postags,
        use_contextualized_embed=use_cont_embed,
        contextualized_embed_file_ext=cont_embed_file_ext)
    # FIX: use the bound `logger` (AppLogger) instead of the `logging`
    # module, consistently with the rest of this function.
    logger.info('{} samples loaded for training'.format(len(train_dataset)))
    test_dataset = None
    if test_file is not None:
        test_dataset = loader.load_with_external_resources(
            test_file, train=False, bucketing=False,
            size=None if limit < 0 else limit // 10,
            refresh_cache=refresh_cache,
            use_external_postags=use_external_postags,
            use_contextualized_embed=use_cont_embed,
            contextualized_embed_file_ext=cont_embed_file_ext)
        logger.info('{} samples loaded for validation'.format(
            len(test_dataset)))
    builder = models.CoordSolverBuilder(loader, inputs=encoder_input,
                                        **model_config)
    logger.info("{}".format(builder))
    model = builder.build()
    logger.trace("Model: {}".format(model))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    if bert_model == 1:
        optimizer = chainer.optimizers.AdamW(alpha=lr)
        optimizer.setup(model)
        # optimizer.add_hook(chainer.optimizer.GradientClipping(1.))
    else:
        optimizer = chainer.optimizers.AdamW(alpha=lr, beta1=0.9,
                                             beta2=0.999, eps=1e-08)
        optimizer.setup(model)
        if l2_lambda > 0.0:
            optimizer.add_hook(chainer.optimizer.WeightDecay(l2_lambda))
        if grad_clip > 0.0:
            optimizer.add_hook(
                chainer.optimizer.GradientClipping(grad_clip))

    def _report(y, t):
        # Collect per-key losses/accuracies that compute_accuracy left in
        # model.result and forward them to the training reporter.
        values = {}
        model.compute_accuracy(y, t)
        for k, v in model.result.items():
            if 'loss' in k:
                values[k] = float(chainer.cuda.to_cpu(v.data))
            elif 'accuracy' in k:
                values[k] = v
        training.report(values)

    trainer = training.Trainer(optimizer, model,
                               loss_func=model.compute_loss)
    trainer.configure(utils.training_config)
    trainer.add_listener(
        training.listeners.ProgressBar(lambda n: tqdm(total=n)),
        priority=200)
    trainer.add_hook(training.BATCH_END,
                     lambda data: _report(data['ys'], data['ts']))
    if test_dataset:
        parser = parsers.build_parser(loader, model)
        # FIX: was `logger=logging` (the module); pass the logger instance
        # as every other Evaluator construction in this file does.
        evaluator = eval_module.Evaluator(parser, logger=logger,
                                          report_details=False)
        trainer.add_listener(evaluator)
    if bert_model == 2:
        num_train_steps = 20000 * 5 / 20
        num_warmup_steps = 10000 / 20
        learning_rate = 2e-5
        # learning rate (eta) scheduling in Adam
        lr_decay_init = learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.add_hook(
            training.BATCH_END,
            extensions.LinearShift(  # decay
                'eta', (lr_decay_init, 0.),
                (num_warmup_steps, num_train_steps),
                optimizer=optimizer))
        trainer.add_hook(
            training.BATCH_END,
            extensions.WarmupShift(  # warmup
                'eta', 0., num_warmup_steps, learning_rate,
                optimizer=optimizer))
    if save_dir is not None:
        # FIX: `logger` is already the result of logging.getLogger();
        # no need to fetch it twice more.
        accessid = logger.accessid
        date = logger.accesstime.strftime('%Y%m%d')
        # metric = 'whole' if isinstance(model, models.Teranishi17) else 'inner'
        metric = 'exact'
        # NOTE(review): `evaluator` only exists when test_dataset was given;
        # saving with save_dir but no test_file would raise at evaluate time.
        trainer.add_listener(
            utils.Saver(
                model,
                basename="{}-{}".format(date, accessid),
                context=dict(App.context, builder=builder),
                directory=save_dir,
                logger=logger,
                save_best=True,
                evaluate=(lambda _: evaluator.get_overall_score(metric))))
    trainer.fit(train_dataset, test_dataset, n_epoch, batch_size)
def parse(model_file, target_file, contextualized_embed_file=None,
          n_best=1, device=-1):
    """Parse ``target_file`` with a saved model and print coordinations.

    For each sentence, prints up to ``n_best`` candidates in a fixed line
    protocol: ``#!RAW:``, ``SENTENCE:``, ``CANDIDATE:``, ``SCORE:``, then
    per conjunction ``CC:`` followed by ``COORD:``/``CONJ:`` spans (or
    ``COORD: None``).

    Args:
        model_file: Path to a saved npz model with its training context.
        target_file: Input file; a '.txt' extension switches the loader to
            the default reader format.
        contextualized_embed_file: Precomputed contextualized embeddings;
            required iff the model was trained with them.
        n_best: Number of candidates to emit per sentence.
        device: GPU device id; negative means CPU.

    Raises:
        ValueError: If ``contextualized_embed_file`` presence does not
            match what the saved model expects.
    """
    context = utils.Saver.load_context(model_file)
    logger = logging.getLogger()
    logger.trace('# context: {}'.format(context))
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)
    loader = context.builder.loader
    encoder_input = context.encoder_input
    use_cont_embed = _get_cont_embed_file_ext(encoder_input) is not None
    # The embedding file must match how the model was trained, both ways.
    if use_cont_embed and contextualized_embed_file is None:
        raise ValueError(
            "contextualized_embed_file must be specified when using "
            "a model trained with contextualized embeddings")
    elif not use_cont_embed and contextualized_embed_file is not None:
        raise ValueError(
            "contextualized_embed_file must not be specified when using "
            "a model trained without contextualized embeddings")
    if target_file.endswith('.txt'):
        loader.init_reader(format='default')
    loader.set_contextualized_embed_file(contextualized_embed_file)
    target_dataset = loader.load_with_external_resources(
        target_file, mode='parse', use_external_postags=True, logger=logger)
    logger.info('{} samples loaded for parsing'.format(len(target_dataset)))
    model = context.builder.build()
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    parser = parsers.build_parser(loader, model)
    logger.info('Start parsing')
    utils.chainer_train_off()
    pbar = tqdm(total=len(target_dataset))
    for batch in target_dataset.batch(
            context.batch_size, colwise=True, shuffle=False):
        # Last three columns carry metadata; the rest are model inputs.
        xs, (words, indices, sentence_id) = batch[:-3], batch[-3:]
        parsed = parser.parse(*xs, n_best)
        for results, words_i, indices_i, sentence_id_i \
                in zip(parsed, words, indices, sentence_id):
            raw_sentence = ' '.join(words_i)
            # One output record per n-best candidate.
            for best_k, (coords, score) in enumerate(results):
                output = [
                    "#!RAW: {}".format(raw_sentence),
                    "SENTENCE: {}".format(sentence_id_i),
                    "CANDIDATE: #{}".format(best_k),
                    "SCORE: {}".format(score),
                ]
                if indices_i is not None:
                    # Map predicted spans back to original token positions.
                    coords = dataset.postprocess(coords, indices_i)
                for cc, coord in sorted(coords.items()):
                    output.append("CC: {} {}".format(cc, words_i[cc]))
                    if coord is not None:
                        # Whole coordination spans from the first conjunct's
                        # start to the last conjunct's end (inclusive).
                        b, e = coord.conjuncts[0][0], coord.conjuncts[-1][1]
                        output.append("COORD: {} {} {}".format(
                            b, e, ' '.join(words_i[b:e + 1])))
                        for (b, e) in coord.conjuncts:
                            output.append("CONJ: {} {} {}".format(
                                b, e, ' '.join(words_i[b:e + 1])))
                    else:
                        output.append("COORD: None")
                print('\n'.join(output) + '\n')
        pbar.update(len(sentence_id))
    pbar.close()
def train(train_file, test_file=None, embed_file=None, n_epoch=20,
          batch_size=5000, lr=2e-3, model_config=None, device=-1,
          save_dir=None, seed=None, cache_dir='', refresh_cache=False):
    """Train a dependency parser.

    Builds the loader and model, configures Adam with gradient clipping and
    exponential learning-rate decay, attaches progress/accuracy/evaluation
    listeners, and runs the trainer.

    Args:
        train_file: Training data file (also keyed into the loader cache).
        test_file: Optional validation data file.
        embed_file: Optional pretrained word-embedding file.
        n_epoch / batch_size / lr: Usual training hyperparameters.
        model_config: Extra keyword arguments for ``_build_parser``.
        device: GPU device id; negative means CPU.
        save_dir: When given, best models are saved there keyed by UAS.
        seed / cache_dir / refresh_cache: Reproducibility and cache options.
    """
    if seed is not None:
        utils.set_random_seed(seed, device)
    logger = logging.getLogger()
    assert isinstance(logger, logging.AppLogger)
    model_config = {} if model_config is None else model_config
    cache_options = dict(dir=cache_dir, mkdir=True, logger=logger)
    loader = dataset.DataLoader.build(input_file=train_file,
                                      word_embed_file=embed_file,
                                      refresh_cache=refresh_cache,
                                      extra_ids=(git.hash(), ),
                                      cache_options=cache_options)
    train_dataset = loader.load(train_file, train=True, bucketing=True,
                                refresh_cache=refresh_cache)
    if test_file is not None:
        test_dataset = loader.load(test_file, train=False, bucketing=True,
                                   refresh_cache=refresh_cache)
    else:
        test_dataset = None
    model = _build_parser(loader, **model_config)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    optimizer = chainer.optimizers.Adam(alpha=lr, beta1=0.9, beta2=0.9,
                                        eps=1e-12)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5.0))
    annealing = optimizers.ExponentialDecayAnnealing(
        initial_lr=lr, decay_rate=0.75, decay_step=5000, lr_key='alpha')
    optimizer.add_hook(annealing)

    def _report_accuracies(ys, ts):
        # Forward arc/label accuracies to the training reporter.
        arc_accuracy, rel_accuracy = model.compute_accuracy(ys, ts)
        training.report({
            'arc_accuracy': arc_accuracy,
            'rel_accuracy': rel_accuracy,
        })

    trainer = training.Trainer(optimizer, model,
                               loss_func=model.compute_loss)
    trainer.configure(utils.training_config)
    progress = training.listeners.ProgressBar(lambda n: tqdm(total=n))
    trainer.add_listener(progress, priority=200)
    trainer.add_hook(
        training.BATCH_END,
        lambda data: _report_accuracies(data['ys'], data['ts']))
    if test_dataset:
        evaluator = Evaluator(model, loader.rel_map, test_file, logger)
        trainer.add_listener(evaluator, priority=128)
    if save_dir is not None:
        saver = utils.Saver(
            model,
            basename="{}-{}".format(logger.accesstime.strftime('%Y%m%d'),
                                    logger.accessid),
            context=dict(App.context, loader=loader),
            directory=save_dir,
            logger=logger,
            save_best=True,
            evaluate=(lambda _: evaluator._parsed['UAS']))
        trainer.add_listener(saver)
    trainer.fit(train_dataset, test_dataset, n_epoch, batch_size)