Example no. 1
 def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=False, logger=None, verbose=True,
         **kwargs):
     self._capture_config(locals())
     self.transform = self.build_transform(**self.config)
     if not save_dir:
         save_dir = tempdir_human()
     if not logger:
         logger = init_logger(name='train', root_dir=save_dir, level=logging.INFO if verbose else logging.WARN)
     logger.info('Hyperparameter:\n' + self.config.to_json())
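     # Build vocabularies from the training data; the return value is the number of training examples.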
     num_examples = self.build_vocab(trn_data, logger)
     # assert num_examples, 'You forgot to return the number of training examples in your build_vocab'
     logger.info('Building...')
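     # Estimate steps per epoch and total training steps from the corpus size (left as None when the size is unknown).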
     train_steps_per_epoch = math.ceil(num_examples / batch_size) if num_examples else None
     self.config.train_steps = train_steps_per_epoch * epochs if num_examples else None
     model, optimizer, loss, metrics = self.build(**merge_dict(self.config, logger=logger, training=True))
     logger.info('Model built:\n' + summary_of_model(self.model))
     self.save_config(save_dir)
     self.save_vocabs(save_dir)
     self.save_meta(save_dir)
     trn_data = self.build_train_dataset(trn_data, batch_size, num_examples)
     dev_data = self.build_valid_dataset(dev_data, batch_size)
     callbacks = self.build_callbacks(save_dir, logger, **self.config)
     # need to know #batches, otherwise progbar crashes
     dev_steps = math.ceil(self.num_samples_in(dev_data) / batch_size)
     checkpoint = get_callback_by_class(callbacks, tf.keras.callbacks.ModelCheckpoint)
     timer = Timer()
     history = None
     try:
         history = self.train_loop(**merge_dict(self.config, trn_data=trn_data, dev_data=dev_data, epochs=epochs,
                                                num_examples=num_examples,
                                                train_steps_per_epoch=train_steps_per_epoch, dev_steps=dev_steps,
                                                callbacks=callbacks, logger=logger, model=model, optimizer=optimizer,
                                                loss=loss,
                                                metrics=metrics, overwrite=True))
     except KeyboardInterrupt:
         print()
         if not checkpoint or checkpoint.best in (np.inf, -np.inf):
             self.save_weights(save_dir)
             logger.info('Aborted with model saved')
         else:
             logger.info(f'Aborted with model saved with best {checkpoint.monitor} = {checkpoint.best:.4f}')
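     # Fall back to the History callback when the training loop returned no history (e.g. on abort).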
     if not history:
         # noinspection PyTypeChecker
         history: tf.keras.callbacks.History = get_callback_by_class(callbacks, tf.keras.callbacks.History)
     delta_time = timer.stop()
     best_epoch_ago = 0
     if history and hasattr(history, 'epoch'):
         trained_epoch = len(history.epoch)
         logger.info('Trained {} epochs in {}, each epoch takes {}'.
                     format(trained_epoch, delta_time, delta_time / trained_epoch if trained_epoch else delta_time))
         io_util.save_json(history.history, io_util.path_join(save_dir, 'history.json'), cls=io_util.NumpyEncoder)
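         # Restore the best checkpoint when the final epoch was not the best one.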
         monitor_history: List = history.history.get(checkpoint.monitor, None) if checkpoint else None
         if monitor_history:
             best_epoch_ago = len(monitor_history) - monitor_history.index(checkpoint.best)
             if checkpoint.best != monitor_history[-1]:
                 logger.info(f'Restored the best model with '
                             f'{checkpoint.monitor} = {checkpoint.best:.4f} '
                             f'saved {best_epoch_ago} epochs ago')
                 self.load_weights(save_dir)  # restore best model
     return history
Example no. 2
    def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=128, logger: logging.Logger = None,
                 callbacks: List[tf.keras.callbacks.Callback] = None, warm_up=True, verbose=True, **kwargs):
        input_path = get_resource(input_path)
        file_prefix, ext = os.path.splitext(input_path)
        name = os.path.basename(file_prefix)
        if not name:
            name = 'evaluate'
        if save_dir and not logger:
            logger = init_logger(name=name, root_dir=save_dir, level=logging.INFO if verbose else logging.WARN,
                                 mode='w')
        tst_data = self.transform.file_to_dataset(input_path, batch_size=batch_size)
        samples = self.num_samples_in(tst_data)
        num_batches = math.ceil(samples / batch_size)
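        # Run a single batch through the model first so graph building does not skew the timing below.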
        if warm_up:
            self.model.predict_on_batch(tst_data.take(1))
        if output:
            assert save_dir, 'Must pass save_dir in order to output'
            if isinstance(output, bool):
                output = os.path.join(save_dir, name) + '.predict' + ext
            elif not isinstance(output, str):
                raise RuntimeError('output ({}) must be of type bool or str'.format(repr(output)))
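        # Time the evaluation pass so throughput (samples/sec) can be reported.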
        timer = Timer()
        eval_outputs = self.evaluate_dataset(tst_data, callbacks, output, num_batches, **kwargs)
        loss, score, output = eval_outputs[0], eval_outputs[1], eval_outputs[2]
        delta_time = timer.stop()
        speed = samples / delta_time.delta_seconds

        if logger:
            f1: IOBES_F1_TF = None
            for metric in self.model.metrics:
                if isinstance(metric, IOBES_F1_TF):
                    f1 = metric
                    break
            extra_report = ''
            if f1:
                overall, by_type, extra_report = f1.state.result(full=True, verbose=False)
                extra_report = ' \n' + extra_report
            logger.info('Evaluation results for {} - '
                        'loss: {:.4f} - {} - speed: {:.2f} sample/sec{}'
                        .format(name + ext, loss,
                                format_scores(score) if isinstance(score, dict) else format_metrics(self.model.metrics),
                                speed, extra_report))
        if output:
            logger.info('Saving output to {}'.format(output))
            with open(output, 'w', encoding='utf-8') as out:
                self.evaluate_output(tst_data, out, num_batches, self.model.metrics)

        return (loss, score, speed) + eval_outputs[3:]
 def evaluate(self,
              tst_path,
              batch_size=None,
              save_dir=None,
              logger=None,
              **kwargs):
     if logger is None:
         logger = init_logger(name='train',
                              root_dir=save_dir,
                              level=logging.INFO)
     if not batch_size:
         batch_size = self.config.get('batch_size', 32)
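     # Build the test dataloader on the first device, then run the evaluation loop with the merged config.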
     dataset = self.build_dataloader(tst_path, batch_size, False,
                                     self.devices[0], **kwargs)
     return self.evaluate_dataloader(data=dataset,
                                     **merge_dict(self.config,
                                                  batch_size=batch_size,
                                                  logger=logger,
                                                  **kwargs))
 def fit(self,
         trn_data,
         dev_data,
         save_dir,
         batch_size,
         epochs,
         devices=None,
         logger=None,
         verbose=True,
         **kwargs):
     # Common initialization steps
     config = self._capture_config(locals())
     if not logger:
         logger = init_logger(
             name='train',
             root_dir=save_dir,
             level=logging.INFO if verbose else logging.WARN)
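     # Resolve CUDA devices and build the train/dev dataloaders on the primary device.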
     devices = cuda_devices(devices)
     trn = self.build_dataloader(trn_data, batch_size, True, devices[0],
                                 logger)
     dev = self.build_dataloader(dev_data, batch_size, False, devices[0],
                                 logger)
     self.save_config(save_dir)
     self.save_vocabs(save_dir)
     self.model = self.build_model(**config)
     self.to(devices, logger)
     criterion = self.build_criterion(**self.config)
     optimizer = self.build_optimizer(**self.config)
     metric = self.build_metric(**self.config)
     return self.run_fit(**merge_dict(config,
                                      trn=trn,
                                      dev=dev,
                                      epochs=epochs,
                                      criterion=criterion,
                                      optimizer=optimizer,
                                      metric=metric,
                                      logger=logger,
                                      save_dir=save_dir,
                                      overwrite=True))
def run(lang, do_train=True, do_eval=True, mbert=True):
    """
    Run training and decoding
    :param lang: Two-letter language code.
    :param do_train: Whether to train the model.
    :param do_eval: Whether to evaluate performance (and generate output).
    :param mbert: Whether to use mBERT instead of a language-specific transformer.
    """
    dataset = f'data/iwpt2020/train-dev-combined/{lang}'
    trnfile = f'{dataset}/train.short.conllu'
    # for idx, sent in enumerate(read_conll(trnfile)):
    #     print(f'\r{idx}', end='')
    devfile = f'{dataset}/dev.short.conllu'
    testfile = f'data/iwpt2020/test-udpipe/{lang}.fixed.short.conllu'
    prefix = 'mbert'
    transformer = 'bert-base-multilingual-cased'
    if not mbert:
        prefix = 'bert'
        if lang == 'sv':
            transformer = "KB/bert-base-swedish-cased"
        elif lang == 'ar':
            transformer = "asafaya/bert-base-arabic"
        elif lang == 'en':
            transformer = 'albert-xxlarge-v2'
        elif lang == 'ru':
            transformer = "DeepPavlov/rubert-base-cased"
        elif lang == 'fi':
            transformer = "TurkuNLP/bert-base-finnish-cased-v1"
        elif lang == 'it':
            transformer = "dbmdz/bert-base-italian-cased"
        elif lang == 'nl':
            transformer = "wietsedv/bert-base-dutch-cased"
        elif lang == 'et':
            transformer = get_resource(
                'http://dl.turkunlp.org/estonian-bert/etwiki-bert/pytorch/etwiki-bert-base-cased.tar.gz'
            )
        elif lang == 'fr':
            transformer = 'camembert-base'
        elif lang == 'pl':
            transformer = "dkleczek/bert-base-polish-uncased-v1"
        elif lang == 'sk' or lang == 'bg' or lang == 'cs':
            transformer = get_resource(
                'http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt.tar.gz'
            )
        else:
            prefix = 'mbert'
    save_dir = f'data/model/iwpt2020/{lang}/{prefix}_dep'
    # if do_train and os.path.isdir(save_dir):
    #     return
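    # Train on all visible GPUs using a mirrored (data-parallel) distribution strategy.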
    strategy = tf.distribute.MirroredStrategy()
    print("Number of devices: {}".format(strategy.num_replicas_in_sync))
    with strategy.scope():
        parser = BiaffineTransformerDependencyParser(strategy=strategy)
        if do_train:
            parser.fit(
                trnfile,
                devfile,
                save_dir,
                transformer,
                batch_size=4096,
                warmup_steps_ratio=.1,
                samples_per_batch=150,
                # max_samples_per_batch=75,
                transformer_dropout=.33,
                learning_rate=2e-3,
                learning_rate_transformer=1e-5,
                # max_seq_length=512,
                # epochs=1
            )
    logger = init_logger(name='test', root_dir=save_dir, mode='w')
    parser.config.tree = 'mst'
    # dep_dev_output = f'{save_dir}/{os.path.basename(devfile.replace(".conllu", ".dep.pred.conllu"))}'
    # if not os.path.isfile(dep_dev_output) or do_eval:
    #     parser.evaluate(devfile, save_dir, warm_up=False, output=dep_dev_output, logger=logger)
    dep_test_output = f'{save_dir}/{os.path.basename(testfile.replace(".conllu", ".dep.pred.conllu"))}'
    if not os.path.isfile(dep_test_output) or do_eval:
        parser.load(save_dir, tree='mst')
        parser.evaluate(testfile,
                        save_dir,
                        warm_up=False,
                        output=dep_test_output,
                        logger=None)
    # score = evaluate(devfile, dep_dev_output)
    # dep_dev_elas = score["ELAS"].f1
    # dep_dev_clas = score["CLAS"].f1
    # logger.info(f'DEP score for {lang}:')
    # logger.info(f'ELAS: {dep_dev_elas * 100:.2f} - CLAS:{dep_dev_clas * 100:.2f}')
    if do_train:
        print(f'Model saved in {save_dir}')

    save_dir = f'data/model/iwpt2020/{lang}/{prefix}_sdp'
    parser = BiaffineTransformerSemanticDependencyParser()
    if do_train and not os.path.isdir(save_dir):
        parser.fit(
            trnfile,
            devfile,
            save_dir,
            transformer,
            batch_size=1000 if lang == 'cs' else 3000,
            warmup_steps_ratio=.1,
            samples_per_batch=150,
            # max_samples_per_batch=150,
            transformer_dropout=.33,
            learning_rate=2e-3,
            learning_rate_transformer=1e-5,
            # max_seq_length=512,
            # epochs=1
        )
    # (sdp_dev_elas, final_sdp_dev_output), (ensemble_dev_elas, final_ensemble_dev_output) = \
    #     eval_sdp_and_ensemble(parser, devfile, dep_dev_output, save_dir, lang, logger)
    (sdp_test_elas, final_sdp_test_output), (ensemble_test_elas, final_ensemble_test_output) = \
        eval_sdp_and_ensemble(parser, testfile, dep_test_output, save_dir, lang, logger, do_eval)
    save_dir = f'data/model/iwpt2020/{lang}/'
    # copyfile(dep_dev_output, save_dir + 'dev.dep.conllu')
    # copyfile(final_sdp_dev_output, save_dir + 'dev.sdp.conllu')
    # copyfile(final_ensemble_dev_output, save_dir + 'dev.ens.conllu')
    # dev_scores = [dep_dev_elas, sdp_dev_elas, ensemble_dev_elas]
    # winner = max(dev_scores)
    # widx = dev_scores.index(winner)
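    # Merge predictions on the split ('short') sentences back into full sentences, then score them against the full test file.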
    dep_test_output = merge_long_sent(dep_test_output)
    evaluate(f'data/iwpt2020/test-udpipe/{lang}.fixed.conllu', dep_test_output)
    dep_test_output = dep_test_output.replace('.conllu', '.fixed.conllu')
    # if widx == 0:
    #     # dep wins, but we don't have output for dep, so let's do it below
    #     best_test_output = dep_test_output
    #     best_task = 'dep'
    # elif widx == 1:
    #     # sdp wins
    #     best_test_output = final_sdp_test_output
    #     best_task = 'sdp'
    # else:
    #     # ensemble wins
    #     best_test_output = final_ensemble_test_output
    #     best_task = 'ens'
    #
    # info = {
    #     'best_task': best_task,
    #     'dev_scores': dict((x, y) for x, y in zip(['dep', 'sdp', 'ens'], dev_scores))
    # }
    # save_json(info, save_dir + 'scores.json')
    # copyfile(best_test_output, save_dir + lang + '.conllu')
    # dev_json = 'data/model/iwpt2020/dev.json'
    # try:
    #     total = load_json(dev_json)
    # except FileNotFoundError:
    #     total = {}
    # total[lang] = info
    # save_json(total, dev_json)

    final_root = f'data/model/iwpt2020/{prefix}'
    dep_root = f'{final_root}/dep'
    sdp_root = f'{final_root}/sdp'
    ens_root = f'{final_root}/ens'
    outputs = [
        dep_test_output, final_sdp_test_output, final_ensemble_test_output
    ]
    folders = [dep_root, sdp_root, ens_root]
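    # Post-process each system output (dep, sdp, ensemble): strip complete edges, restore collapsed edges,
    # apply quick CoNLL-U fixes, and drop the result into the per-system folder.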
    for o, f in zip(outputs, folders):
        os.makedirs(f, exist_ok=True)
        tmp = f'/tmp/{lang}.conllu'
        copyfile(o, tmp)
        remove_complete_edges(tmp, tmp)
        restore_collapse_edges(tmp, tmp)
        conllu_quick_fix(tmp, f'{f}/{lang}.conllu')
Example no. 6
#            save_dir,
#            'albert-xxlarge-v2',
#            batch_size=1024,
#            warmup_steps_ratio=.1,
#            samples_per_batch=150,
#            max_samples_per_batch=75,
#            transformer_dropout=.33,
#            learning_rate=2e-3,
#            learning_rate_transformer=1e-5,
#            # enhanced_only=True,
#            # epochs=1
#            )
parser.load(save_dir)
output = f'{testfile.replace(".conllu", ".pred.conllu")}'
output = f'{save_dir}/{os.path.basename(output)}'
logger = init_logger(name='test', root_dir=save_dir, mode='w')
pkl_path = f'{save_dir}/sdp.pkl'
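# Cache the evaluation scores with pickle so repeated runs can skip the expensive evaluation pass.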
try:
    scores = load_pickle(pkl_path)
except FileNotFoundError:
    scores = parser.evaluate(testfile, save_dir, warm_up=False, output=output, ret_scores=True, logger=logger)[-1]
    save_pickle(scores, pkl_path)

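# Walk over the cached scores sentence by sentence, taking the head structure from the previously fixed dep output.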
with open(output, 'w') as out:
    num = 0
    trees = CoNLLSentence.from_file(
        '/home/hhe43/hanlp/data/model/iwpt2020/en/albert_dep2/en_ewt-ud-dev.enhanced_collapse_empty_nodes.pred.fixed.conllu')
    for arc_scores, rel_scores, mask in scores:
        for a, r, m in zip(arc_scores, rel_scores, mask):
            # tree, graph = mst_then_greedy(a, r, m)
            tree = [0] + [x.head for x in trees[num]]