Example No. 1
 def __init__(self, filepath: str, padding=PAD, name=None, **kwargs):
     import fasttext
     self.padding = padding.encode('utf-8')
     self.filepath = filepath
     filepath = get_resource(filepath)
     assert os.path.isfile(filepath), f'Resolved path {filepath} is not a file'
     existed = global_cache.get(filepath, None)
     if existed:
         logger.debug('Use cached fasttext model [{}].'.format(filepath))
         self.model = existed
     else:
         logger.debug('Loading fasttext model from [{}].'.format(filepath))
         # fastText prints a blank line here, so silence it via the redirect below
         with stdout_redirected(to=os.devnull, stdout=sys.stderr):
             self.model = fasttext.load_model(filepath)
         global_cache[filepath] = self.model
     kwargs.pop('input_dim', None)
     kwargs.pop('output_dim', None)
     kwargs.pop('mask_zero', None)
     if not name:
         name = os.path.splitext(os.path.basename(filepath))[0]
     super().__init__(input_dim=len(self.model.words), output_dim=self.model['king'].size,
                      mask_zero=padding is not None, trainable=False, dtype=tf.string, name=name, **kwargs)
     embed_fn = np.frompyfunc(self.embed, 1, 1)
     # vf = np.vectorize(self.embed, otypes=[np.ndarray])
     self._embed_np = embed_fn
Example No. 2
 def _init():
     with open(get_resource(HANLP_CHAR_TABLE), encoding='utf-8') as src:
         for line in src:
             # each mapping line must be exactly 3 characters: source char, separator, target char
             cells = line.rstrip('\n')
             if len(cells) != 3:
                 continue
             a, _, b = cells
             CharTable.convert[a] = b
Example No. 3
 def load_vocabs(self, save_dir, filename='vocabs.json'):
     save_dir = get_resource(save_dir)
     vocabs = SerializableDict()
     vocabs.load_json(os.path.join(save_dir, filename))
     for key, value in vocabs.items():
         vocab = VocabTF()
         vocab.copy_from(value)
         setattr(self.transform, key, vocab)
Example No. 4
 def file_to_inputs(self, filepath: str, gold=True):
     filepath = get_resource(filepath)
     with open(filepath, encoding='utf-8') as src:
         for line in src:
             sentence = line.strip()
             if not sentence:
                 continue
             yield sentence
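Taken together, the examples on this page follow one pattern: `get_resource` turns a local path, a URL, or an archive member into a path on disk before anything is opened. A minimal sketch of that usage, assuming only the `edparser.utils.io_util` import that appears in later examples and reusing URLs quoted elsewhere on this page (the download-and-cache behavior is inferred, not guaranteed):

from edparser.utils.io_util import get_resource

# A URL is assumed to be downloaded (and unpacked if it is an archive) into a local cache.
sp_home = get_resource('https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip')
# A '#' fragment is assumed to select a single member inside a downloaded archive.
eval_pl = get_resource('https://github.com/elikip/bist-parser/archive/master.zip'
                       '#bmstparser/src/utils/eval.pl')
print(sp_home, eval_pl)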
Example No. 5
 def load(self, save_dir: str, logger=edparser.utils.log_util.logger, **kwargs):
     self.meta['load_path'] = save_dir
     save_dir = get_resource(save_dir)
     self.load_config(save_dir)
     self.load_vocabs(save_dir)
     self.build(**merge_dict(self.config, training=False, logger=logger, **kwargs, overwrite=True, inplace=True))
     self.load_weights(save_dir, **kwargs)
     self.load_meta(save_dir)
Example No. 6
def load_from_meta_file(save_dir: str,
                        meta_filename='meta.json',
                        transform_only=False,
                        load_kwargs=None,
                        **kwargs) -> Component:
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(None, save_dir, k).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' + \
                   f'{sorted(pretrained.ALL.keys())}\n' + \
                   f'Tips: it might be one of {similar_keys}'
        raise FileNotFoundError(
            f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}'
        )
    meta: dict = load_json(metapath)
    cls = meta.get('class_path', None)
    assert cls, f'{meta_filename} doesn\'t contain class_path field'
    try:
        obj: Component = object_from_class_path(cls, **kwargs)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                if load_kwargs is None:
                    load_kwargs = {}
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, **load_kwargs)
                else:
                    obj.load(metapath, **load_kwargs)
            obj.meta['load_path'] = load_path
        return obj
    except Exception as e:
        eprint(f'Failed to load {identifier}. See stack trace below')
        traceback.print_exc()
        model_version = meta.get("hanlp_version", "unknown")
        cur_version = version.__version__
        if model_version != cur_version:
            eprint(
                f'{identifier} was created with hanlp-{model_version}, while you are running {cur_version}. '
                f'Try to upgrade hanlp with\n'
                f'pip install --upgrade hanlp\n'
                f'If the problem persists, please submit an issue to https://github.com/hankcs/HanLP/issues .'
            )
        exit(1)
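A hedged usage sketch for `load_from_meta_file`: pass a save directory (or a pretrained identifier) and get back a `Component` whose `meta['load_path']` records where it came from. The import path and the directory below are assumptions for illustration only:

# Hypothetical import path and save directory; adjust both to the actual package layout.
from edparser.utils.component_util import load_from_meta_file

component = load_from_meta_file('data/model/iwpt2020/en/mbert_dep')
print(type(component).__name__, component.meta.get('load_path'))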
Example No. 7
 def _load_lm(self, filepath):
     filepath = get_resource(filepath)
     lm = RNNLanguageModel()
     lm.load(filepath)
     model: tf.keras.Sequential = lm.model
     for idx, layer in enumerate(model.layers):
         if isinstance(layer, tf.keras.layers.LSTM):
             lm.model = tf.keras.Sequential(
                 model.layers[:idx + 1])  # discard dense layer
             return lm
Example No. 8
 def file_to_samples(self, filepath: str, gold=True):
     """
     Transform file to samples
     Parameters
     ----------
     filepath
     gold
     """
     filepath = get_resource(filepath)
     inputs = self.file_to_inputs(filepath, gold)
     yield from self.inputs_to_samples(inputs, gold)
Example No. 9
def make(train):
    root = get_resource(SIGHAN2005)
    train = os.path.join(root, train.split('#')[-1])
    if not os.path.isfile(train):
        full = train.replace('_90.txt', '.utf8')
        logger.info(f'Splitting {full} into training set and valid set with 9:1 proportion')
        valid = train.replace('90.txt', '10.txt')
        split_file(full, train=0.9, valid=0.1, test=0, names={'train': train, 'valid': valid})
        assert os.path.isfile(train), f'Failed to make {train}'
        assert os.path.isfile(valid), f'Failed to make {valid}'
        logger.info(f'Successfully made {train} {valid}')
Example No. 10
 def load(self, save_dir: str, device=None, **kwargs):
     save_dir = get_resource(save_dir)
     self.load_config(save_dir)
     self.load_vocabs(save_dir)
     self.model = self.build_model(**merge_dict(self.config,
                                                training=False,
                                                logger=logger,
                                                **kwargs,
                                                overwrite=True,
                                                inplace=True))
     self.to(device)
     self.load_weights(save_dir, **kwargs)
Example No. 11
 def load_transform(self, save_dir) -> Transform:
     """
     Try to load transform only. This method might fail due to the fact it avoids building the model.
     If it do fail, then you have to use `load` which might be too heavy but that's the best we can do.
     :param save_dir: The path to load.
     """
     save_dir = get_resource(save_dir)
     self.load_config(save_dir)
     self.load_vocabs(save_dir)
     self.transform.build_config()
     self.transform.lock_vocabs()
     return self.transform
Example No. 12
    def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=128, logger: logging.Logger = None,
                 callbacks: List[tf.keras.callbacks.Callback] = None, warm_up=True, verbose=True, **kwargs):
        input_path = get_resource(input_path)
        file_prefix, ext = os.path.splitext(input_path)
        name = os.path.basename(file_prefix)
        if not name:
            name = 'evaluate'
        if save_dir and not logger:
            logger = init_logger(name=name, root_dir=save_dir, level=logging.INFO if verbose else logging.WARN,
                                 mode='w')
        tst_data = self.transform.file_to_dataset(input_path, batch_size=batch_size)
        samples = self.num_samples_in(tst_data)
        num_batches = math.ceil(samples / batch_size)
        if warm_up:
            self.model.predict_on_batch(tst_data.take(1))
        if output:
            assert save_dir, 'Must pass save_dir in order to output'
            if isinstance(output, bool):
                output = os.path.join(save_dir, name) + '.predict' + ext
            elif not isinstance(output, str):
                raise RuntimeError('output ({}) must be of type bool or str'.format(repr(output)))
        timer = Timer()
        eval_outputs = self.evaluate_dataset(tst_data, callbacks, output, num_batches, **kwargs)
        loss, score, output = eval_outputs[0], eval_outputs[1], eval_outputs[2]
        delta_time = timer.stop()
        speed = samples / delta_time.delta_seconds

        if logger:
            f1: IOBES_F1_TF = None
            for metric in self.model.metrics:
                if isinstance(metric, IOBES_F1_TF):
                    f1 = metric
                    break
            extra_report = ''
            if f1:
                overall, by_type, extra_report = f1.state.result(full=True, verbose=False)
                extra_report = ' \n' + extra_report
            logger.info('Evaluation results for {} - '
                        'loss: {:.4f} - {} - speed: {:.2f} sample/sec{}'
                        .format(name + ext, loss,
                                format_scores(score) if isinstance(score, dict) else format_metrics(self.model.metrics),
                                speed, extra_report))
        if output:
            logger.info('Saving output to {}'.format(output))
            with open(output, 'w', encoding='utf-8') as out:
                self.evaluate_output(tst_data, out, num_batches, self.model.metrics)

        return (loss, score, speed) + eval_outputs[3:]
Example No. 13
def convert_to_stanford_dependency_330(src, dst):
    logger.info(
        f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version 3.3.0. '
        f'It might take a while...')
    sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
    sp_home = get_resource(sp_home)
    # jar_path = get_resource(f'{sp_home}#stanford-parser.jar')
    code, out, err = get_exitcode_stdout_stderr(
        f'java -cp {sp_home}/* edu.stanford.nlp.trees.EnglishGrammaticalStructure -basic -keepPunct -conllx '
        f'-treeFile {src}')
    with open(dst, 'w') as f:
        f.write(out)
    if code:
        raise RuntimeError(
            f'Conversion failed with code {code} for {src}. The error message is:\n{err}\n'
            f'Do you have Java installed? Do you have enough memory?')
Example No. 14
 def export_model_for_serving(self, export_dir=None, version=1, overwrite=False, show_hint=False):
     assert self.model, 'You have to fit or load a model before exporting it'
     if not export_dir:
         assert 'load_path' in self.meta, 'When export_dir is not specified, load_path has to be present'
         export_dir = get_resource(self.meta['load_path'])
     model_path = os.path.join(export_dir, str(version))
     if os.path.isdir(model_path) and not overwrite:
         logger.info(f'{model_path} exists, skip since overwrite = {overwrite}')
         return export_dir
     logger.info(f'Exporting to {export_dir} ...')
     tf.saved_model.save(self.model, model_path)
     logger.info(f'Successfully exported model to {export_dir}')
     if show_hint:
         logger.info(f'You can serve it through \n'
                     f'tensorflow_model_server --model_name={os.path.splitext(os.path.basename(self.meta["load_path"]))[0]} '
                     f'--model_base_path={export_dir} --rest_api_port=8888')
     return export_dir
Example No. 15
def read_conll(filepath,
               underline_to_none=False,
               enhanced_collapse_empty_nodes=None):  # None lets .conllu files enable collapsing below
    sent = []
    filepath: str = get_resource(filepath)
    if filepath.endswith('.conllu') and enhanced_collapse_empty_nodes is None:
        enhanced_collapse_empty_nodes = True
    with open(filepath, encoding='utf-8') as src:
        for idx, line in enumerate(src):
            if line.startswith('#'):
                continue
            line = line.strip()
            cells = line.split('\t')
            if line and cells:
                if enhanced_collapse_empty_nodes and '.' in cells[0]:
                    cells[0] = float(cells[0])
                    cells[6] = None
                else:
                    if '-' in cells[0] or '.' in cells[0]:
                        # sent[-1][1] += cells[1]
                        continue
                    cells[0] = int(cells[0])
                    try:
                        cells[6] = int(cells[6])
                    except ValueError:
                        cells[6] = 0
                        logger.exception(
                            f'Wrong CoNLL format {filepath}:{idx + 1}\n{line}')
                if underline_to_none:
                    for i, x in enumerate(cells):
                        if x == '_':
                            cells[i] = None
                sent.append(cells)
            else:
                if enhanced_collapse_empty_nodes:
                    sent = collapse_enhanced_empty_nodes(sent)
                yield sent
                sent = []
    if sent:
        if enhanced_collapse_empty_nodes:
            sent = collapse_enhanced_empty_nodes(sent)
        yield sent
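A short usage sketch for `read_conll`, using the import path shown in a later example; the file path is a placeholder:

from edparser.components.parsers.conll import read_conll

# Placeholder path to any CoNLL-X/CoNLL-U file.
for sent in read_conll('data/iwpt2020/train-dev-combined/en/train.short.conllu'):
    for cells in sent:
        # cells[0] is the token id, cells[1] the form, cells[6] the head id.
        print(cells[0], cells[1], cells[6])
    break  # only inspect the first sentence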
Example No. 16
def evaluate(gold_file, pred_file):
    """
    Evaluate using official CoNLL-X evaluation script (Yuval Krymolowski)

    Parameters
    ----------
    gold_file : str
                The gold conllx file
    pred_file : str
                The pred conllx file

    Returns
    -------
    uas : float
        unlabeled attachment score
    las : float
        labeled attachment score
    """
    gold_file = get_resource(gold_file)
    fixed_pred_file = tempfile.NamedTemporaryFile().name
    copy_cols(gold_file, pred_file, fixed_pred_file, keep_comments=False)
    if gold_file.endswith('.conllu'):
        fixed_gold_file = tempfile.NamedTemporaryFile().name
        copy_cols(gold_file, gold_file, fixed_gold_file, keep_comments=False)
        gold_file = fixed_gold_file

    exitcode, out, err = get_exitcode_stdout_stderr(
        f'perl {CONLLX_EVAL} -q -b -g {gold_file} -s {fixed_pred_file}')
    if exitcode:
        raise RuntimeError(
            f'eval.pl exited with error code {exitcode}.\nstderr: {err}\nstdout: {out}'
        )
    lines = out.split('\n')[-4:]
    las = int(lines[0].split()[3]) / int(lines[0].split()[5])
    uas = int(lines[1].split()[3]) / int(lines[1].split()[5])
    return uas, las
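A minimal sketch of calling this `evaluate` on a gold/prediction pair, assuming it lives in `edparser.metrics.parsing.conllx_eval` as the imports in later examples suggest; both file paths are placeholders:

from edparser.metrics.parsing.conllx_eval import evaluate

# Placeholder paths to a gold treebank and a system output, both in CoNLL-X format.
uas, las = evaluate('data/ptb/test.conllx', 'data/ptb/test.pred.conllx')
print(f'UAS: {uas:.2%}  LAS: {las:.2%}')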
Example No. 17
 def load_weights(self, save_dir, filename='model.pt', **kwargs):
     save_dir = get_resource(save_dir)
     self.model.load_state_dict(torch.load(os.path.join(save_dir,
                                                        filename)),
                                strict=False)
Example No. 18
 def __init__(self,
              filepath: str = None,
              vocab: VocabTF = None,
              expand_vocab=True,
              lowercase=True,
              input_dim=None,
              output_dim=None,
              unk=None,
              normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None,
              embeddings_constraint=None,
              mask_zero=True,
              input_length=None,
              name=None,
              cpu=True,
              **kwargs):
     filepath = get_resource(filepath)
     word2vec, _output_dim = load_word2vec(filepath)
     if output_dim:
         assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
     output_dim = _output_dim
      # If the `unk` token exists in the pretrained embeddings,
      # replace it with a self-defined one, usually the unk token of the word vocab
     if unk and unk in word2vec:
         word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
     if vocab is None:
         vocab = VocabTF()
         vocab.update(word2vec.keys())
     if expand_vocab and vocab.mutable:
         for word in word2vec:
             vocab.get_idx(word.lower() if lowercase else word)
     if input_dim:
         assert input_dim == len(
             vocab), f'input_dim = {input_dim} does not match {filepath}'
     input_dim = len(vocab)
     # init matrix
     self._embeddings_initializer = embeddings_initializer
     embeddings_initializer = tf.keras.initializers.get(
         embeddings_initializer)
     with tf.device('cpu:0') if cpu else DummyContext():
         pret_embs = embeddings_initializer(
             shape=[input_dim, output_dim]).numpy()
     # insert to pret_embs
     for word, idx in vocab.token_to_idx.items():
         vec = word2vec.get(word, None)
         # Retry lower case
         if vec is None and lowercase:
             vec = word2vec.get(word.lower(), None)
         if vec is not None:
             pret_embs[idx] = vec
     if normalize:
         pret_embs /= np.std(pret_embs)
     if not name:
         name = os.path.splitext(os.path.basename(filepath))[0]
     super().__init__(input_dim,
                      output_dim,
                      tf.keras.initializers.Constant(pret_embs),
                      embeddings_regularizer,
                      activity_regularizer,
                      embeddings_constraint,
                      mask_zero,
                      input_length,
                      name=name,
                      **kwargs)
     self.filepath = filepath
     self.expand_vocab = expand_vocab
     self.lowercase = lowercase
Example No. 19
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-25 16:04

import os
import tempfile
from typing import List

from edparser.metrics.parsing.conllx_eval import copy_cols

from edparser.common.structure import SerializableDict
from edparser.metrics.parsing import iwpt20_xud_eval
from edparser.metrics.parsing.iwpt20_xud_eval import load_conllu_file
from edparser.utils.io_util import get_resource, get_exitcode_stdout_stderr

UD_TOOLS_ROOT = get_resource(
    'https://github.com/UniversalDependencies/tools/archive/1650bd354bd158c75836cff6650ea35cc9928fc8.zip')

ENHANCED_COLLAPSE_EMPTY_NODES = os.path.join(UD_TOOLS_ROOT, 'enhanced_collapse_empty_nodes.pl')
CONLLU_QUICK_FIX = os.path.join(UD_TOOLS_ROOT, 'conllu-quick-fix.pl')


def run_perl(script, src, dst=None):
    if not dst:
        dst = tempfile.NamedTemporaryFile().name
    exitcode, out, err = get_exitcode_stdout_stderr(
        f'perl -I{os.path.expanduser("~/.local/lib/perl5")} {script} {src}')
    if exitcode:
        # cpanm -l ~/.local namespace::autoclean
        # cpanm -l ~/.local Moose
        # cpanm -l ~/.local MooseX::SemiAffordanceAccessor module
        raise RuntimeError(err)
    # Persist the converted output so callers can read it back from dst.
    with open(dst, 'w', encoding='utf-8') as ofile:
        ofile.write(out)
    return dst
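A hedged usage sketch for `run_perl`, applying one of the UD tool scripts resolved above to a CoNLL-U file; the input path is a placeholder and the returned path relies on the completed body above:

# Placeholder input file; the Perl modules listed in the comments above must be installed.
collapsed = run_perl(ENHANCED_COLLAPSE_EMPTY_NODES, 'data/iwpt2020/test-udpipe/en.fixed.conllu')
print('Collapsed output written to', collapsed)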
Example No. 20
 def load_meta(self, save_dir, filename='meta.json'):
     save_dir = get_resource(save_dir)
     metapath = os.path.join(save_dir, filename)
     if os.path.isfile(metapath):
         self.meta.update(load_json(metapath))
Example No. 21
 def load_file(self, filepath):
     filepath = get_resource(filepath)
     for words, tags in generator_words_tags_from_tsv(filepath,
                                                      lower=False):
         yield {'word': words, 'tag': tags}
Example No. 22
 def load_weights(self, save_dir, filename='model.h5', **kwargs):
     assert self.model.built or self.model.weights, 'You must call self.model.build() in build_model() ' \
                                                    'in order to load it'
     save_dir = get_resource(save_dir)
     self.model.load_weights(os.path.join(save_dir, filename))
Example No. 23
    '‹': "'",
    '›': "'",
}


def convert_to_stanford_dependency_330(src, dst):
    logger.info(
        f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version 3.3.0. '
        f'It might take a while...')
    sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
    sp_home = get_resource(sp_home)
    # jar_path = get_resource(f'{sp_home}#stanford-parser.jar')
    code, out, err = get_exitcode_stdout_stderr(
        f'java -cp {sp_home}/* edu.stanford.nlp.trees.EnglishGrammaticalStructure -basic -keepPunct -conllx '
        f'-treeFile {src}')
    with open(dst, 'w') as f:
        f.write(out)
    if code:
        raise RuntimeError(
            f'Conversion failed with code {code} for {src}. The error message is:\n{err}\n'
            f'Do you have Java installed? Do you have enough memory?')


for s, d in zip([PTB_TRAIN, PTB_VALID, PTB_TEST],
                [PTB_SD330_TRAIN, PTB_SD330_VALID, PTB_SD330_TEST]):
    s = get_resource(s)
    home = os.path.dirname(s)
    d = os.path.join(home, d.split('/')[-1])
    if not os.path.isfile(d):
        convert_to_stanford_dependency_330(s, d)
Example No. 24
 def load_config(self, save_dir, filename='config.json'):
     save_dir = get_resource(save_dir)
     self.config.load_json(os.path.join(save_dir, filename))
Example No. 25
def run(lang, do_train=True, do_eval=True, mbert=True):
    """
    Run training and decoding
    :param lang: Language code, 2 letters.
    :param do_train: Train model or not.
    :param do_eval: Evaluate performance (generating output) or not.
    :param mbert: Use mbert or language specific transformers.
    """
    dataset = f'data/iwpt2020/train-dev-combined/{lang}'
    trnfile = f'{dataset}/train.short.conllu'
    # for idx, sent in enumerate(read_conll(trnfile)):
    #     print(f'\r{idx}', end='')
    devfile = f'{dataset}/dev.short.conllu'
    testfile = f'data/iwpt2020/test-udpipe/{lang}.fixed.short.conllu'
    prefix = 'mbert'
    transformer = 'bert-base-multilingual-cased'
    if not mbert:
        prefix = 'bert'
        if lang == 'sv':
            transformer = "KB/bert-base-swedish-cased"
        elif lang == 'ar':
            transformer = "asafaya/bert-base-arabic"
        elif lang == 'en':
            transformer = 'albert-xxlarge-v2'
        elif lang == 'ru':
            transformer = "DeepPavlov/rubert-base-cased"
        elif lang == 'fi':
            transformer = "TurkuNLP/bert-base-finnish-cased-v1"
        elif lang == 'it':
            transformer = "dbmdz/bert-base-italian-cased"
        elif lang == 'nl':
            transformer = "wietsedv/bert-base-dutch-cased"
        elif lang == 'et':
            transformer = get_resource(
                'http://dl.turkunlp.org/estonian-bert/etwiki-bert/pytorch/etwiki-bert-base-cased.tar.gz'
            )
        elif lang == 'fr':
            transformer = 'camembert-base'
        elif lang == 'pl':
            transformer = "dkleczek/bert-base-polish-uncased-v1"
        elif lang == 'sk' or lang == 'bg' or lang == 'cs':
            transformer = get_resource(
                'http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt.tar.gz'
            )
        else:
            prefix = 'mbert'
    save_dir = f'data/model/iwpt2020/{lang}/{prefix}_dep'
    # if do_train and os.path.isdir(save_dir):
    #     return
    strategy = tf.distribute.MirroredStrategy()
    print("Number of devices: {}".format(strategy.num_replicas_in_sync))
    with strategy.scope():
        parser = BiaffineTransformerDependencyParser(strategy=strategy)
        if do_train:
            parser.fit(
                trnfile,
                devfile,
                save_dir,
                transformer,
                batch_size=4096,
                warmup_steps_ratio=.1,
                samples_per_batch=150,
                # max_samples_per_batch=75,
                transformer_dropout=.33,
                learning_rate=2e-3,
                learning_rate_transformer=1e-5,
                # max_seq_length=512,
                # epochs=1
            )
    logger = init_logger(name='test', root_dir=save_dir, mode='w')
    parser.config.tree = 'mst'
    # dep_dev_output = f'{save_dir}/{os.path.basename(devfile.replace(".conllu", ".dep.pred.conllu"))}'
    # if not os.path.isfile(dep_dev_output) or do_eval:
    #     parser.evaluate(devfile, save_dir, warm_up=False, output=dep_dev_output, logger=logger)
    dep_test_output = f'{save_dir}/{os.path.basename(testfile.replace(".conllu", ".dep.pred.conllu"))}'
    if not os.path.isfile(dep_test_output) or do_eval:
        parser.load(save_dir, tree='mst')
        parser.evaluate(testfile,
                        save_dir,
                        warm_up=False,
                        output=dep_test_output,
                        logger=None)
    # score = evaluate(devfile, dep_dev_output)
    # dep_dev_elas = score["ELAS"].f1
    # dep_dev_clas = score["CLAS"].f1
    # logger.info(f'DEP score for {lang}:')
    # logger.info(f'ELAS: {dep_dev_elas * 100:.2f} - CLAS:{dep_dev_clas * 100:.2f}')
    if do_train:
        print(f'Model saved in {save_dir}')

    save_dir = f'data/model/iwpt2020/{lang}/{prefix}_sdp'
    parser = BiaffineTransformerSemanticDependencyParser()
    if do_train and not os.path.isdir(save_dir):
        parser.fit(
            trnfile,
            devfile,
            save_dir,
            transformer,
            batch_size=1000 if lang == 'cs' else 3000,
            warmup_steps_ratio=.1,
            samples_per_batch=150,
            # max_samples_per_batch=150,
            transformer_dropout=.33,
            learning_rate=2e-3,
            learning_rate_transformer=1e-5,
            # max_seq_length=512,
            # epochs=1
        )
    # (sdp_dev_elas, final_sdp_dev_output), (ensemble_dev_elas, final_ensemble_dev_output) = \
    #     eval_sdp_and_ensemble(parser, devfile, dep_dev_output, save_dir, lang, logger)
    (sdp_test_elas, final_sdp_test_output), (ensemble_test_elas, final_ensemble_test_output) = \
        eval_sdp_and_ensemble(parser, testfile, dep_test_output, save_dir, lang, logger, do_eval)
    save_dir = f'data/model/iwpt2020/{lang}/'
    # copyfile(dep_dev_output, save_dir + 'dev.dep.conllu')
    # copyfile(final_sdp_dev_output, save_dir + 'dev.sdp.conllu')
    # copyfile(final_ensemble_dev_output, save_dir + 'dev.ens.conllu')
    # dev_scores = [dep_dev_elas, sdp_dev_elas, ensemble_dev_elas]
    # winner = max(dev_scores)
    # widx = dev_scores.index(winner)
    dep_test_output = merge_long_sent(dep_test_output)
    evaluate(f'data/iwpt2020/test-udpipe/{lang}.fixed.conllu', dep_test_output)
    dep_test_output = dep_test_output.replace('.conllu', '.fixed.conllu')
    # if widx == 0:
    #     # dep wins, but we don't have output for dep, so let's do it below
    #     best_test_output = dep_test_output
    #     best_task = 'dep'
    # elif widx == 1:
    #     # sdp wins
    #     best_test_output = final_sdp_test_output
    #     best_task = 'sdp'
    # else:
    #     # ensemble wins
    #     best_test_output = final_ensemble_test_output
    #     best_task = 'ens'
    #
    # info = {
    #     'best_task': best_task,
    #     'dev_scores': dict((x, y) for x, y in zip(['dep', 'sdp', 'ens'], dev_scores))
    # }
    # save_json(info, save_dir + 'scores.json')
    # copyfile(best_test_output, save_dir + lang + '.conllu')
    # dev_json = 'data/model/iwpt2020/dev.json'
    # try:
    #     total = load_json(dev_json)
    # except FileNotFoundError:
    #     total = {}
    # total[lang] = info
    # save_json(total, dev_json)

    final_root = f'data/model/iwpt2020/{prefix}'
    dep_root = f'{final_root}/dep'
    sdp_root = f'{final_root}/sdp'
    ens_root = f'{final_root}/ens'
    outputs = [
        dep_test_output, final_sdp_test_output, final_ensemble_test_output
    ]
    folders = [dep_root, sdp_root, ens_root]
    for o, f in zip(outputs, folders):
        os.makedirs(f, exist_ok=True)
        tmp = f'/tmp/{lang}.conllu'
        copyfile(o, tmp)
        remove_complete_edges(tmp, tmp)
        restore_collapse_edges(tmp, tmp)
        conllu_quick_fix(tmp, f'{f}/{lang}.conllu')
Example No. 26
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 13:22
import glob

from edparser.utils.io_util import get_resource
import os
import shutil

from iwpt2020 import cdroot

cdroot()
iwpt_data = 'data/iwpt2020'
downloaded_iwpt_data = get_resource(
    'https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3238/iwpt2020stdata.tgz?sequence=1&isAllowed=y')
if os.path.isdir(iwpt_data):
    shutil.rmtree(iwpt_data, ignore_errors=True)
train_dev = f'{iwpt_data}/train-dev'
os.makedirs(train_dev)
for treebank in glob.glob(f'{downloaded_iwpt_data}/UD_*'):
    shutil.copytree(treebank, f'{train_dev}/{os.path.basename(treebank)}')
shutil.copytree(f'{downloaded_iwpt_data}/test-blind', f'{iwpt_data}/test-blind')

from iwpt2020 import preprocess_testset
preprocess_testset.main()

from iwpt2020 import enhanced_collapse_empty_nodes_for_all
enhanced_collapse_empty_nodes_for_all.main()

from iwpt2020 import combine_treebanks
combine_treebanks.main()
Example No. 27
def build_transformer(transformer, max_seq_length=None, num_labels=None, tagging=True, tokenizer_only=False):
    spm_model_file = None
    if transformer in zh_albert_models_google:
        from bert.tokenization.albert_tokenization import FullTokenizer
        model_url = zh_albert_models_google[transformer]
        albert = True
    elif transformer in albert_models_tfhub:
        from edparser.layers.transformers.albert_tokenization import FullTokenizer
        with stdout_redirected(to=os.devnull):
            model_url = fetch_tfhub_albert_model(transformer,
                                                 os.path.join(hanlp_home(), 'thirdparty', 'tfhub.dev', 'google',
                                                              transformer))
        albert = True
        spm_model_file = glob.glob(os.path.join(model_url, 'assets', '*.model'))
        assert len(spm_model_file) == 1, 'No sentencepiece model found or multiple models found'
        spm_model_file = spm_model_file[0]
    elif transformer in bert_models_google:
        from bert.tokenization.bert_tokenization import FullTokenizer
        model_url = bert_models_google[transformer]
        albert = False
    else:
        raise ValueError(
            f'Unknown model {transformer}, available ones: {list(bert_models_google.keys()) + list(zh_albert_models_google.keys()) + list(albert_models_tfhub.keys())}')
    bert_dir = get_resource(model_url)
    if spm_model_file:
        vocab = glob.glob(os.path.join(bert_dir, 'assets', '*.vocab'))
    else:
        vocab = glob.glob(os.path.join(bert_dir, '*vocab*.txt'))
    assert len(vocab) == 1, 'No vocab found or multiple vocabs found'
    vocab = vocab[0]
    lower_case = any(key in transformer for key in ['uncased', 'multilingual', 'chinese', 'albert'])
    if spm_model_file:
        # noinspection PyTypeChecker
        tokenizer = FullTokenizer(vocab_file=vocab, spm_model_file=spm_model_file, do_lower_case=lower_case)
    else:
        tokenizer = FullTokenizer(vocab_file=vocab, do_lower_case=lower_case)
    if tokenizer_only:
        return tokenizer
    if spm_model_file:
        bert_params = albert_params(bert_dir)
    else:
        bert_params = bert.params_from_pretrained_ckpt(bert_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name='albert' if albert else "bert")
    if not max_seq_length:
        return l_bert, tokenizer, bert_dir
    l_input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="input_ids")
    l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="mask_ids")
    l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="token_type_ids")
    output = l_bert([l_input_ids, l_token_type_ids], mask=l_mask_ids)
    if not tagging:
        output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    if bert_params.hidden_dropout:
        output = tf.keras.layers.Dropout(bert_params.hidden_dropout, name='hidden_dropout')(output)
    logits = tf.keras.layers.Dense(num_labels, kernel_initializer=tf.keras.initializers.TruncatedNormal(
        bert_params.initializer_range))(output)
    model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids], outputs=logits)
    model.build(input_shape=(None, max_seq_length))
    if not spm_model_file:
        ckpt = glob.glob(os.path.join(bert_dir, '*.index'))
        assert ckpt, f'No checkpoint found under {bert_dir}'
        ckpt, _ = os.path.splitext(ckpt[0])
    with stdout_redirected(to=os.devnull):
        if albert:
            if spm_model_file:
                skipped_weight_value_tuples = bert.load_albert_weights(l_bert, bert_dir)
            else:
                # noinspection PyUnboundLocalVariable
                skipped_weight_value_tuples = load_stock_weights(l_bert, ckpt)
        else:
            # noinspection PyUnboundLocalVariable
            skipped_weight_value_tuples = bert.load_bert_weights(l_bert, ckpt)
    assert 0 == len(skipped_weight_value_tuples), f'failed to load pretrained {transformer}'
    return model, tokenizer
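A minimal sketch of calling `build_transformer` in tokenizer-only mode. The model key must be one of the names registered in `bert_models_google`, `zh_albert_models_google`, or `albert_models_tfhub`; the key used here is an assumption and may differ across bert-for-tf2 versions:

# Assumed key; replace with any name listed in bert_models_google for your installation.
tokenizer = build_transformer('multi_cased_L-12_H-768_A-12', tokenizer_only=True)
print(tokenizer.tokenize('HanLP loves dependency parsing'))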
Example No. 28
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-08 22:35
import tempfile

from edparser.components.parsers.conll import read_conll
from edparser.utils.io_util import get_resource, get_exitcode_stdout_stderr

CONLLX_EVAL = get_resource(
    'https://github.com/elikip/bist-parser/archive/master.zip' +
    '#bmstparser/src/utils/eval.pl')


def evaluate(gold_file, pred_file):
    """
    Evaluate using official CoNLL-X evaluation script (Yuval Krymolowski)

    Parameters
    ----------
    gold_file : str
                The gold conllx file
    pred_file : str
                The pred conllx file

    Returns
    -------
    uas : float
        unlabeled attachment score
    las : float
        labeled attachment score
    """
Example No. 29
 def load_vocab(self, save_dir, filename='vocab.json'):
     save_dir = get_resource(save_dir)
     vocab = SerializableDict()
     vocab.load_json(os.path.join(save_dir, filename))
     self.vocab.copy_from(vocab)