def __init__(self, filepath: str, padding=PAD, name=None, **kwargs):
    import fasttext
    self.padding = padding.encode('utf-8')
    self.filepath = filepath
    filepath = get_resource(filepath)
    assert os.path.isfile(filepath), f'Resolved path {filepath} is not a file'
    existed = global_cache.get(filepath, None)
    if existed:
        logger.debug('Use cached fasttext model [{}].'.format(filepath))
        self.model = existed
    else:
        logger.debug('Loading fasttext model from [{}].'.format(filepath))
        # fasttext prints a blank line here
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self.model = fasttext.load_model(filepath)
        global_cache[filepath] = self.model
    kwargs.pop('input_dim', None)
    kwargs.pop('output_dim', None)
    kwargs.pop('mask_zero', None)
    if not name:
        name = os.path.splitext(os.path.basename(filepath))[0]
    super().__init__(input_dim=len(self.model.words),
                     # fasttext returns a vector for any word, so 'king' merely probes the embedding size
                     output_dim=self.model['king'].size,
                     mask_zero=padding is not None,
                     trainable=False,
                     dtype=tf.string,
                     name=name,
                     **kwargs)
    embed_fn = np.frompyfunc(self.embed, 1, 1)
    # vf = np.vectorize(self.embed, otypes=[np.ndarray])
    self._embed_np = embed_fn
def _init():
    with open(get_resource(HANLP_CHAR_TABLE), encoding='utf-8') as src:
        for line in src:
            # each valid line is exactly 3 characters: source char, separator, target char
            cells = line.rstrip('\n')
            if len(cells) != 3:
                continue
            a, _, b = cells
            CharTable.convert[a] = b
def load_vocabs(self, save_dir, filename='vocabs.json'):
    save_dir = get_resource(save_dir)
    vocabs = SerializableDict()
    vocabs.load_json(os.path.join(save_dir, filename))
    for key, value in vocabs.items():
        vocab = VocabTF()
        vocab.copy_from(value)
        setattr(self.transform, key, vocab)
def file_to_inputs(self, filepath: str, gold=True):
    filepath = get_resource(filepath)
    with open(filepath, encoding='utf-8') as src:
        for line in src:
            sentence = line.strip()
            if not sentence:
                continue
            yield sentence
def load(self, save_dir: str, logger=edparser.utils.log_util.logger, **kwargs):
    self.meta['load_path'] = save_dir
    save_dir = get_resource(save_dir)
    self.load_config(save_dir)
    self.load_vocabs(save_dir)
    self.build(**merge_dict(self.config, training=False, logger=logger, **kwargs, overwrite=True, inplace=True))
    self.load_weights(save_dir, **kwargs)
    self.load_meta(save_dir)
def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only=False, load_kwargs=None,
                        **kwargs) -> Component:
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(None, save_dir, k).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' + \
                   f'{sorted(pretrained.ALL.keys())}\n' + \
                   f'Tips: it might be one of {similar_keys}'
        raise FileNotFoundError(f'The identifier {save_dir} resolves to a non-existent meta file {metapath}. {tips}')
    meta: dict = load_json(metapath)
    cls = meta.get('class_path', None)
    assert cls, f'{meta_filename} doesn\'t contain a class_path field'
    try:
        obj: Component = object_from_class_path(cls, **kwargs)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                if load_kwargs is None:
                    load_kwargs = {}
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, **load_kwargs)
                else:
                    obj.load(metapath, **load_kwargs)
            obj.meta['load_path'] = load_path
        return obj
    except Exception as e:
        eprint(f'Failed to load {identifier}. See the stack trace below')
        traceback.print_exc()
        model_version = meta.get("hanlp_version", "unknown")
        cur_version = version.__version__
        if model_version != cur_version:
            eprint(f'{identifier} was created with hanlp-{model_version}, while you are running {cur_version}. '
                   f'Try to upgrade hanlp with\n'
                   f'pip install --upgrade hanlp\n'
                   f'If the problem persists, please submit an issue to https://github.com/hankcs/HanLP/issues .')
        exit(1)
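# A minimal usage sketch for load_from_meta_file above. The paths are hypothetical placeholders:
# any save_dir containing a meta.json written by this library, or the meta.json file itself, works.
component = load_from_meta_file('/path/to/saved_component')            # a directory with meta.json
component = load_from_meta_file('/path/to/saved_component/meta.json')  # or point at the meta file directly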
def _load_lm(self, filepath):
    filepath = get_resource(filepath)
    lm = RNNLanguageModel()
    lm.load(filepath)
    model: tf.keras.Sequential = lm.model
    for idx, layer in enumerate(model.layers):
        if isinstance(layer, tf.keras.layers.LSTM):
            lm.model = tf.keras.Sequential(model.layers[:idx + 1])  # discard the dense layer
    return lm
def file_to_samples(self, filepath: str, gold=True):
    """
    Transform a file into samples.

    Parameters
    ----------
    filepath
        Path to the file, resolved through ``get_resource``.
    gold
        Whether the file contains gold labels.
    """
    filepath = get_resource(filepath)
    inputs = self.file_to_inputs(filepath, gold)
    yield from self.inputs_to_samples(inputs, gold)
def make(train):
    root = get_resource(SIGHAN2005)
    train = os.path.join(root, train.split('#')[-1])
    if not os.path.isfile(train):
        full = train.replace('_90.txt', '.utf8')
        logger.info(f'Splitting {full} into training set and valid set with 9:1 proportion')
        valid = train.replace('90.txt', '10.txt')
        split_file(full, train=0.9, valid=0.1, test=0, names={'train': train, 'valid': valid})
        assert os.path.isfile(train), f'Failed to make {train}'
        assert os.path.isfile(valid), f'Failed to make {valid}'
        logger.info(f'Successfully made {train} {valid}')
def load(self, save_dir: str, device=None, **kwargs):
    save_dir = get_resource(save_dir)
    self.load_config(save_dir)
    self.load_vocabs(save_dir)
    self.model = self.build_model(**merge_dict(self.config, training=False, logger=logger,
                                               **kwargs, overwrite=True, inplace=True))
    self.to(device)
    self.load_weights(save_dir, **kwargs)
def load_transform(self, save_dir) -> Transform:
    """
    Try to load the transform only. This method might fail because it avoids building the model.
    If it does fail, fall back to `load`, which is heavier but always works.

    :param save_dir: The path to load from.
    """
    save_dir = get_resource(save_dir)
    self.load_config(save_dir)
    self.load_vocabs(save_dir)
    self.transform.build_config()
    self.transform.lock_vocabs()
    return self.transform
def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=128, logger: logging.Logger = None,
             callbacks: List[tf.keras.callbacks.Callback] = None, warm_up=True, verbose=True, **kwargs):
    input_path = get_resource(input_path)
    file_prefix, ext = os.path.splitext(input_path)
    name = os.path.basename(file_prefix)
    if not name:
        name = 'evaluate'
    if save_dir and not logger:
        logger = init_logger(name=name, root_dir=save_dir,
                             level=logging.INFO if verbose else logging.WARN, mode='w')
    tst_data = self.transform.file_to_dataset(input_path, batch_size=batch_size)
    samples = self.num_samples_in(tst_data)
    num_batches = math.ceil(samples / batch_size)
    if warm_up:
        self.model.predict_on_batch(tst_data.take(1))
    if output:
        assert save_dir, 'Must pass save_dir in order to output'
        if isinstance(output, bool):
            output = os.path.join(save_dir, name) + '.predict' + ext
        elif isinstance(output, str):
            output = output
        else:
            raise RuntimeError('output ({}) must be of type bool or str'.format(repr(output)))
    timer = Timer()
    eval_outputs = self.evaluate_dataset(tst_data, callbacks, output, num_batches, **kwargs)
    loss, score, output = eval_outputs[0], eval_outputs[1], eval_outputs[2]
    delta_time = timer.stop()
    speed = samples / delta_time.delta_seconds
    if logger:
        f1: IOBES_F1_TF = None
        for metric in self.model.metrics:
            if isinstance(metric, IOBES_F1_TF):
                f1 = metric
                break
        extra_report = ''
        if f1:
            overall, by_type, extra_report = f1.state.result(full=True, verbose=False)
            extra_report = ' \n' + extra_report
        logger.info('Evaluation results for {} - '
                    'loss: {:.4f} - {} - speed: {:.2f} sample/sec{}'
                    .format(name + ext, loss,
                            format_scores(score) if isinstance(score, dict) else format_metrics(self.model.metrics),
                            speed, extra_report))
    if output:
        logger.info('Saving output to {}'.format(output))
        with open(output, 'w', encoding='utf-8') as out:
            self.evaluate_output(tst_data, out, num_batches, self.model.metrics)
    return (loss, score, speed) + eval_outputs[3:]
def convert_to_stanford_dependency_330(src, dst):
    logger.info(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version 3.3.0. '
                f'It might take a while...')
    sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
    sp_home = get_resource(sp_home)
    # jar_path = get_resource(f'{sp_home}#stanford-parser.jar')
    code, out, err = get_exitcode_stdout_stderr(
        f'java -cp {sp_home}/* edu.stanford.nlp.trees.EnglishGrammaticalStructure -basic -keepPunct -conllx '
        f'-treeFile {src}')
    with open(dst, 'w') as f:
        f.write(out)
    if code:
        raise RuntimeError(f'Conversion failed with code {code} for {src}. The err message is:\n {err}'
                           f'Do you have java installed? Do you have enough memory?')
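# A minimal call sketch for convert_to_stanford_dependency_330 above, with hypothetical paths.
# src must contain PTB-style constituency trees readable by the Stanford Parser, and Java must be
# installed with enough heap memory.
convert_to_stanford_dependency_330('data/ptb/dev.trees', 'data/ptb/dev.conllx')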
def export_model_for_serving(self, export_dir=None, version=1, overwrite=False, show_hint=False):
    assert self.model, 'You have to fit or load a model before exporting it'
    if not export_dir:
        assert 'load_path' in self.meta, 'When export_dir is not specified, load_path has to be present in meta'
        export_dir = get_resource(self.meta['load_path'])
    model_path = os.path.join(export_dir, str(version))
    if os.path.isdir(model_path) and not overwrite:
        logger.info(f'{model_path} exists, skip since overwrite = {overwrite}')
        return export_dir
    logger.info(f'Exporting to {export_dir} ...')
    tf.saved_model.save(self.model, model_path)
    logger.info(f'Successfully exported model to {export_dir}')
    if show_hint:
        logger.info(f'You can serve it through \n'
                    f'tensorflow_model_server --model_name={os.path.splitext(os.path.basename(self.meta["load_path"]))[0]} '
                    f'--model_base_path={export_dir} --rest_api_port=8888')
    return export_dir
def read_conll(filepath, underline_to_none=False, enhanced_collapse_empty_nodes=False):
    sent = []
    filepath: str = get_resource(filepath)
    if filepath.endswith('.conllu') and enhanced_collapse_empty_nodes is None:
        enhanced_collapse_empty_nodes = True
    with open(filepath, encoding='utf-8') as src:
        for idx, line in enumerate(src):
            if line.startswith('#'):
                continue
            line = line.strip()
            cells = line.split('\t')
            if line and cells:
                if enhanced_collapse_empty_nodes and '.' in cells[0]:
                    cells[0] = float(cells[0])
                    cells[6] = None
                else:
                    if '-' in cells[0] or '.' in cells[0]:
                        # sent[-1][1] += cells[1]
                        continue
                    cells[0] = int(cells[0])
                    try:
                        cells[6] = int(cells[6])
                    except ValueError:
                        cells[6] = 0
                        logger.exception(f'Wrong CoNLL format {filepath}:{idx + 1}\n{line}')
                if underline_to_none:
                    for i, x in enumerate(cells):
                        if x == '_':
                            cells[i] = None
                sent.append(cells)
            else:
                if enhanced_collapse_empty_nodes:
                    sent = collapse_enhanced_empty_nodes(sent)
                yield sent
                sent = []
    if sent:
        if enhanced_collapse_empty_nodes:
            sent = collapse_enhanced_empty_nodes(sent)
        yield sent
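# A minimal usage sketch for read_conll above, using the import path seen elsewhere in this
# repository and a hypothetical file path. Each yielded sentence is a list of rows whose ID
# (column 0) and HEAD (column 6) columns have been converted to numbers.
from edparser.components.parsers.conll import read_conll

for sent in read_conll('data/ptb/dev.conllx'):  # hypothetical path
    for cells in sent:
        token_id, form, head, deprel = cells[0], cells[1], cells[6], cells[7]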
def evaluate(gold_file, pred_file):
    """
    Evaluate using the official CoNLL-X evaluation script (Yuval Krymolowski)

    Parameters
    ----------
    gold_file : str
        The gold conllx file
    pred_file : str
        The pred conllx file

    Returns
    -------
    uas : float
        unlabeled attachment score
    las : float
        labeled attachment score
    """
    gold_file = get_resource(gold_file)
    fixed_pred_file = tempfile.NamedTemporaryFile().name
    copy_cols(gold_file, pred_file, fixed_pred_file, keep_comments=False)
    if gold_file.endswith('.conllu'):
        fixed_gold_file = tempfile.NamedTemporaryFile().name
        copy_cols(gold_file, gold_file, fixed_gold_file, keep_comments=False)
        gold_file = fixed_gold_file
    exitcode, out, err = get_exitcode_stdout_stderr(f'perl {CONLLX_EVAL} -q -b -g {gold_file} -s {fixed_pred_file}')
    if exitcode:
        raise RuntimeError(f'eval.pl exited with error code {exitcode} and error message {err} and output {out}.')
    lines = out.split('\n')[-4:]
    las = int(lines[0].split()[3]) / int(lines[0].split()[5])
    uas = int(lines[1].split()[3]) / int(lines[1].split()[5])
    return uas, las
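# A minimal usage sketch for the CoNLL-X evaluate() wrapper above. The import path is an
# assumption based on how conllx_eval is referenced elsewhere in this repository; the file
# paths are hypothetical, and perl must be on PATH since eval.pl runs as a subprocess.
from edparser.metrics.parsing.conllx_eval import evaluate

uas, las = evaluate('data/ptb/test.conllx', 'data/ptb/test.pred.conllx')
print(f'UAS: {uas:.2%} - LAS: {las:.2%}')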
def load_weights(self, save_dir, filename='model.pt', **kwargs):
    save_dir = get_resource(save_dir)
    self.model.load_state_dict(torch.load(os.path.join(save_dir, filename)), strict=False)
def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=True,
             input_dim=None, output_dim=None, unk=None, normalize=False,
             embeddings_initializer='VarianceScaling',
             embeddings_regularizer=None,
             activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None,
             name=None, cpu=True, **kwargs):
    filepath = get_resource(filepath)
    word2vec, _output_dim = load_word2vec(filepath)
    if output_dim:
        assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
    output_dim = _output_dim
    # if the `unk` token exists in the pretrained embeddings,
    # then replace it with a self-defined one, usually the one in the word vocab
    if unk and unk in word2vec:
        word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
    if vocab is None:
        vocab = VocabTF()
        vocab.update(word2vec.keys())
    if expand_vocab and vocab.mutable:
        for word in word2vec:
            vocab.get_idx(word.lower() if lowercase else word)
    if input_dim:
        assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
    input_dim = len(vocab)
    # init matrix
    self._embeddings_initializer = embeddings_initializer
    embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
    with tf.device('cpu:0') if cpu else DummyContext():
        pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
    # insert into pret_embs
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        # retry lower case
        if vec is None and lowercase:
            vec = word2vec.get(word.lower(), None)
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)
    if not name:
        name = os.path.splitext(os.path.basename(filepath))[0]
    super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs),
                     embeddings_regularizer,
                     activity_regularizer, embeddings_constraint, mask_zero, input_length, name=name,
                     **kwargs)
    self.filepath = filepath
    self.expand_vocab = expand_vocab
    self.lowercase = lowercase
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-25 16:04
import os
import tempfile
from typing import List

from edparser.metrics.parsing.conllx_eval import copy_cols
from edparser.common.structure import SerializableDict
from edparser.metrics.parsing import iwpt20_xud_eval
from edparser.metrics.parsing.iwpt20_xud_eval import load_conllu_file
from edparser.utils.io_util import get_resource, get_exitcode_stdout_stderr

UD_TOOLS_ROOT = get_resource(
    'https://github.com/UniversalDependencies/tools/archive/1650bd354bd158c75836cff6650ea35cc9928fc8.zip')
ENHANCED_COLLAPSE_EMPTY_NODES = os.path.join(UD_TOOLS_ROOT, 'enhanced_collapse_empty_nodes.pl')
CONLLU_QUICK_FIX = os.path.join(UD_TOOLS_ROOT, 'conllu-quick-fix.pl')


def run_perl(script, src, dst=None):
    if not dst:
        dst = tempfile.NamedTemporaryFile().name
    exitcode, out, err = get_exitcode_stdout_stderr(
        f'perl -I{os.path.expanduser("~/.local/lib/perl5")} {script} {src}')
    if exitcode:
        # cpanm -l ~/.local namespace::autoclean
        # cpanm -l ~/.local Moose
        # cpanm -l ~/.local MooseX::SemiAffordanceAccessor module
        raise RuntimeError(err)
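# A minimal call sketch for run_perl above, with a hypothetical CoNLL-U path.
# ENHANCED_COLLAPSE_EMPTY_NODES and CONLLU_QUICK_FIX resolve to scripts inside the UD tools
# archive fetched above; the Perl modules listed in the comments must be installed under
# ~/.local (e.g. via cpanm -l ~/.local) for them to run.
run_perl(ENHANCED_COLLAPSE_EMPTY_NODES, 'data/iwpt2020/en.pred.conllu', 'data/iwpt2020/en.collapsed.conllu')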
def load_meta(self, save_dir, filename='meta.json'):
    save_dir = get_resource(save_dir)
    metapath = os.path.join(save_dir, filename)
    if os.path.isfile(metapath):
        self.meta.update(load_json(metapath))
def load_file(self, filepath):
    filepath = get_resource(filepath)
    for words, tags in generator_words_tags_from_tsv(filepath, lower=False):
        yield {'word': words, 'tag': tags}
def load_weights(self, save_dir, filename='model.h5', **kwargs):
    assert self.model.built or self.model.weights, 'You must call self.model.build() in build_model() ' \
                                                   'in order to load it'
    save_dir = get_resource(save_dir)
    self.model.load_weights(os.path.join(save_dir, filename))
    '‹': "'",
    '›': "'",
}


def convert_to_stanford_dependency_330(src, dst):
    logger.info(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version 3.3.0. '
                f'It might take a while...')
    sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
    sp_home = get_resource(sp_home)
    # jar_path = get_resource(f'{sp_home}#stanford-parser.jar')
    code, out, err = get_exitcode_stdout_stderr(
        f'java -cp {sp_home}/* edu.stanford.nlp.trees.EnglishGrammaticalStructure -basic -keepPunct -conllx '
        f'-treeFile {src}')
    with open(dst, 'w') as f:
        f.write(out)
    if code:
        raise RuntimeError(f'Conversion failed with code {code} for {src}. The err message is:\n {err}'
                           f'Do you have java installed? Do you have enough memory?')


for s, d in zip([PTB_TRAIN, PTB_VALID, PTB_TEST], [PTB_SD330_TRAIN, PTB_SD330_VALID, PTB_SD330_TEST]):
    s = get_resource(s)
    home = os.path.dirname(s)
    d = os.path.join(home, d.split('/')[-1])
    if not os.path.isfile(d):
        convert_to_stanford_dependency_330(s, d)
def load_config(self, save_dir, filename='config.json'):
    save_dir = get_resource(save_dir)
    self.config.load_json(os.path.join(save_dir, filename))
def run(lang, do_train=True, do_eval=True, mbert=True):
    """
    Run training and decoding

    :param lang: Language code, 2 letters.
    :param do_train: Train model or not.
    :param do_eval: Evaluate performance (generating output) or not.
    :param mbert: Use mbert or language specific transformers.
    """
    dataset = f'data/iwpt2020/train-dev-combined/{lang}'
    trnfile = f'{dataset}/train.short.conllu'
    # for idx, sent in enumerate(read_conll(trnfile)):
    #     print(f'\r{idx}', end='')
    devfile = f'{dataset}/dev.short.conllu'
    testfile = f'data/iwpt2020/test-udpipe/{lang}.fixed.short.conllu'
    prefix = 'mbert'
    transformer = 'bert-base-multilingual-cased'
    if not mbert:
        prefix = 'bert'
        if lang == 'sv':
            transformer = "KB/bert-base-swedish-cased"
        if lang == 'ar':
            transformer = "asafaya/bert-base-arabic"
        elif lang == 'en':
            transformer = 'albert-xxlarge-v2'
        elif lang == 'ru':
            transformer = "DeepPavlov/rubert-base-cased"
        elif lang == 'fi':
            transformer = "TurkuNLP/bert-base-finnish-cased-v1"
        elif lang == 'it':
            transformer = "dbmdz/bert-base-italian-cased"
        elif lang == 'nl':
            transformer = "wietsedv/bert-base-dutch-cased"
        elif lang == 'et':
            transformer = get_resource(
                'http://dl.turkunlp.org/estonian-bert/etwiki-bert/pytorch/etwiki-bert-base-cased.tar.gz')
        elif lang == 'fr':
            transformer = 'camembert-base'
        elif lang == 'pl':
            transformer = "dkleczek/bert-base-polish-uncased-v1"
        elif lang == 'sk' or lang == 'bg' or lang == 'cs':
            transformer = get_resource(
                'http://files.deeppavlov.ai/deeppavlov_data/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt.tar.gz')
        else:
            prefix = 'mbert'
    save_dir = f'data/model/iwpt2020/{lang}/{prefix}_dep'
    # if do_train and os.path.isdir(save_dir):
    #     return
    strategy = tf.distribute.MirroredStrategy()
    print("Number of devices: {}".format(strategy.num_replicas_in_sync))
    with strategy.scope():
        parser = BiaffineTransformerDependencyParser(strategy=strategy)
        if do_train:
            parser.fit(trnfile,
                       devfile,
                       save_dir,
                       transformer,
                       batch_size=4096,
                       warmup_steps_ratio=.1,
                       samples_per_batch=150,
                       # max_samples_per_batch=75,
                       transformer_dropout=.33,
                       learning_rate=2e-3,
                       learning_rate_transformer=1e-5,
                       # max_seq_length=512,
                       # epochs=1
                       )
    logger = init_logger(name='test', root_dir=save_dir, mode='w')
    parser.config.tree = 'mst'
    # dep_dev_output = f'{save_dir}/{os.path.basename(devfile.replace(".conllu", ".dep.pred.conllu"))}'
    # if not os.path.isfile(dep_dev_output) or do_eval:
    #     parser.evaluate(devfile, save_dir, warm_up=False, output=dep_dev_output, logger=logger)
    dep_test_output = f'{save_dir}/{os.path.basename(testfile.replace(".conllu", ".dep.pred.conllu"))}'
    if not os.path.isfile(dep_test_output) or do_eval:
        parser.load(save_dir, tree='mst')
        parser.evaluate(testfile, save_dir, warm_up=False, output=dep_test_output, logger=None)
    # score = evaluate(devfile, dep_dev_output)
    # dep_dev_elas = score["ELAS"].f1
    # dep_dev_clas = score["CLAS"].f1
    # logger.info(f'DEP score for {lang}:')
    # logger.info(f'ELAS: {dep_dev_elas * 100:.2f} - CLAS:{dep_dev_clas * 100:.2f}')
    if do_train:
        print(f'Model saved in {save_dir}')
    save_dir = f'data/model/iwpt2020/{lang}/{prefix}_sdp'
    parser = BiaffineTransformerSemanticDependencyParser()
    if do_train and not os.path.isdir(save_dir):
        parser.fit(trnfile,
                   devfile,
                   save_dir,
                   transformer,
                   batch_size=1000 if lang == 'cs' else 3000,
                   warmup_steps_ratio=.1,
                   samples_per_batch=150,
                   # max_samples_per_batch=150,
                   transformer_dropout=.33,
                   learning_rate=2e-3,
                   learning_rate_transformer=1e-5,
                   # max_seq_length=512,
                   # epochs=1
                   )
    # (sdp_dev_elas, final_sdp_dev_output), (ensemble_dev_elas, final_ensemble_dev_output) = \
    #     eval_sdp_and_ensemble(parser, devfile, dep_dev_output, save_dir, lang, logger)
    (sdp_test_elas, final_sdp_test_output), (ensemble_test_elas, final_ensemble_test_output) = \
        eval_sdp_and_ensemble(parser, testfile, dep_test_output, save_dir, lang, logger, do_eval)
    save_dir = f'data/model/iwpt2020/{lang}/'
    # copyfile(dep_dev_output, save_dir + 'dev.dep.conllu')
    # copyfile(final_sdp_dev_output, save_dir + 'dev.sdp.conllu')
    # copyfile(final_ensemble_dev_output, save_dir + 'dev.ens.conllu')
    # dev_scores = [dep_dev_elas, sdp_dev_elas, ensemble_dev_elas]
    # winner = max(dev_scores)
    # widx = dev_scores.index(winner)
    dep_test_output = merge_long_sent(dep_test_output)
    evaluate(f'data/iwpt2020/test-udpipe/{lang}.fixed.conllu', dep_test_output)
    dep_test_output = dep_test_output.replace('.conllu', '.fixed.conllu')
    # if widx == 0:
    #     # dep wins, but we don't have output for dep, so let's do it below
    #     best_test_output = dep_test_output
    #     best_task = 'dep'
    # elif widx == 1:
    #     # sdp wins
    #     best_test_output = final_sdp_test_output
    #     best_task = 'sdp'
    # else:
    #     # ensemble wins
    #     best_test_output = final_ensemble_test_output
    #     best_task = 'ens'
    #
    # info = {
    #     'best_task': best_task,
    #     'dev_scores': dict((x, y) for x, y in zip(['dep', 'sdp', 'ens'], dev_scores))
    # }
    # save_json(info, save_dir + 'scores.json')
    # copyfile(best_test_output, save_dir + lang + '.conllu')
    # dev_json = 'data/model/iwpt2020/dev.json'
    # try:
    #     total = load_json(dev_json)
    # except FileNotFoundError:
    #     total = {}
    # total[lang] = info
    # save_json(total, dev_json)
    final_root = f'data/model/iwpt2020/{prefix}'
    dep_root = f'{final_root}/dep'
    sdp_root = f'{final_root}/sdp'
    ens_root = f'{final_root}/ens'
    outputs = [dep_test_output, final_sdp_test_output, final_ensemble_test_output]
    folders = [dep_root, sdp_root, ens_root]
    for o, f in zip(outputs, folders):
        os.makedirs(f, exist_ok=True)
        tmp = f'/tmp/{lang}.conllu'
        copyfile(o, tmp)
        remove_complete_edges(tmp, tmp)
        restore_collapse_edges(tmp, tmp)
        conllu_quick_fix(tmp, f'{f}/{lang}.conllu')
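# A minimal call sketch for run() above; language codes follow the IWPT 2020 shared task
# (e.g. 'en', 'fi'), and the data/iwpt2020/ layout is assumed to have been prepared by the
# accompanying preprocessing scripts.
if __name__ == '__main__':
    run('en', do_train=True, do_eval=True, mbert=False)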
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 13:22
import glob

from edparser.utils.io_util import get_resource
import os
import shutil

from iwpt2020 import cdroot

cdroot()
iwpt_data = 'data/iwpt2020'
downloaded_iwpt_data = get_resource(
    'https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3238/iwpt2020stdata.tgz?sequence=1&isAllowed=y')
if os.path.isdir(iwpt_data):
    shutil.rmtree(iwpt_data, ignore_errors=True)
train_dev = f'{iwpt_data}/train-dev'
os.makedirs(train_dev)
for treebank in glob.glob(f'{downloaded_iwpt_data}/UD_*'):
    shutil.copytree(treebank, f'{train_dev}/{os.path.basename(treebank)}')
shutil.copytree(f'{downloaded_iwpt_data}/test-blind', f'{iwpt_data}/test-blind')

from iwpt2020 import preprocess_testset

preprocess_testset.main()

from iwpt2020 import enhanced_collapse_empty_nodes_for_all

enhanced_collapse_empty_nodes_for_all.main()

from iwpt2020 import combine_treebanks

combine_treebanks.main()
def build_transformer(transformer, max_seq_length=None, num_labels=None, tagging=True, tokenizer_only=False):
    spm_model_file = None
    if transformer in zh_albert_models_google:
        from bert.tokenization.albert_tokenization import FullTokenizer
        model_url = zh_albert_models_google[transformer]
        albert = True
    elif transformer in albert_models_tfhub:
        from edparser.layers.transformers.albert_tokenization import FullTokenizer
        with stdout_redirected(to=os.devnull):
            model_url = fetch_tfhub_albert_model(transformer,
                                                 os.path.join(hanlp_home(), 'thirdparty', 'tfhub.dev', 'google',
                                                              transformer))
        albert = True
        spm_model_file = glob.glob(os.path.join(model_url, 'assets', '*.model'))
        assert len(spm_model_file) == 1, 'No sentencepiece model found or ambiguous models found'
        spm_model_file = spm_model_file[0]
    elif transformer in bert_models_google:
        from bert.tokenization.bert_tokenization import FullTokenizer
        model_url = bert_models_google[transformer]
        albert = False
    else:
        raise ValueError(
            f'Unknown model {transformer}, available ones: {list(bert_models_google.keys()) + list(zh_albert_models_google.keys()) + list(albert_models_tfhub.keys())}')
    bert_dir = get_resource(model_url)
    if spm_model_file:
        vocab = glob.glob(os.path.join(bert_dir, 'assets', '*.vocab'))
    else:
        vocab = glob.glob(os.path.join(bert_dir, '*vocab*.txt'))
    assert len(vocab) == 1, 'No vocab found or ambiguous vocabs found'
    vocab = vocab[0]
    lower_case = any(key in transformer for key in ['uncased', 'multilingual', 'chinese', 'albert'])
    if spm_model_file:
        # noinspection PyTypeChecker
        tokenizer = FullTokenizer(vocab_file=vocab, spm_model_file=spm_model_file, do_lower_case=lower_case)
    else:
        tokenizer = FullTokenizer(vocab_file=vocab, do_lower_case=lower_case)
    if tokenizer_only:
        return tokenizer
    if spm_model_file:
        bert_params = albert_params(bert_dir)
    else:
        bert_params = bert.params_from_pretrained_ckpt(bert_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name='albert' if albert else "bert")
    if not max_seq_length:
        return l_bert, tokenizer, bert_dir
    l_input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="input_ids")
    l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="mask_ids")
    l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="token_type_ids")
    output = l_bert([l_input_ids, l_token_type_ids], mask=l_mask_ids)
    if not tagging:
        output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    if bert_params.hidden_dropout:
        output = tf.keras.layers.Dropout(bert_params.hidden_dropout, name='hidden_dropout')(output)
    logits = tf.keras.layers.Dense(num_labels,
                                   kernel_initializer=tf.keras.initializers.TruncatedNormal(
                                       bert_params.initializer_range))(output)
    model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids], outputs=logits)
    model.build(input_shape=(None, max_seq_length))
    if not spm_model_file:
        ckpt = glob.glob(os.path.join(bert_dir, '*.index'))
        assert ckpt, f'No checkpoint found under {bert_dir}'
        ckpt, _ = os.path.splitext(ckpt[0])
    with stdout_redirected(to=os.devnull):
        if albert:
            if spm_model_file:
                skipped_weight_value_tuples = bert.load_albert_weights(l_bert, bert_dir)
            else:
                # noinspection PyUnboundLocalVariable
                skipped_weight_value_tuples = load_stock_weights(l_bert, ckpt)
        else:
            # noinspection PyUnboundLocalVariable
            skipped_weight_value_tuples = bert.load_bert_weights(l_bert, ckpt)
    assert 0 == len(skipped_weight_value_tuples), f'failed to load pretrained {transformer}'
    return model, tokenizer
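# A minimal usage sketch for build_transformer above. The model name is a hypothetical key;
# it must exist in bert_models_google, zh_albert_models_google or albert_models_tfhub to resolve.
tokenizer = build_transformer('uncased_L-12_H-768_A-12', tokenizer_only=True)
token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('hello world'))

# Passing max_seq_length and num_labels instead returns a Keras model with a dense head on top of
# the (tagging or pooled) transformer outputs, plus the matching tokenizer.
model, tokenizer = build_transformer('uncased_L-12_H-768_A-12', max_seq_length=128, num_labels=10)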
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-08 22:35
import tempfile

from edparser.components.parsers.conll import read_conll
from edparser.utils.io_util import get_resource, get_exitcode_stdout_stderr

CONLLX_EVAL = get_resource('https://github.com/elikip/bist-parser/archive/master.zip' + '#bmstparser/src/utils/eval.pl')


def evaluate(gold_file, pred_file):
    """
    Evaluate using the official CoNLL-X evaluation script (Yuval Krymolowski)

    Parameters
    ----------
    gold_file : str
        The gold conllx file
    pred_file : str
        The pred conllx file

    Returns
    -------
    uas : float
        unlabeled attachment score
    las : float
        labeled attachment score
    """
def load_vocab(self, save_dir, filename='vocab.json'):
    save_dir = get_resource(save_dir)
    vocab = SerializableDict()
    vocab.load_json(os.path.join(save_dir, filename))
    self.vocab.copy_from(vocab)