def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only=False, load_kwargs=None,
                        **kwargs) -> Component:
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            # Rank the available pretrained keys by their similarity to the identifier.
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(None, save_dir, k).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' \
                   f'{sorted(pretrained.ALL.keys())}\n' \
                   f'Tips: it might be one of {similar_keys}'
        raise FileNotFoundError(f'The identifier {save_dir} resolves to a non-existent meta file {metapath}. {tips}')
    meta: dict = load_json(metapath)
    cls = meta.get('class_path', None)
    assert cls, f'{meta_filename} doesn\'t contain a class_path field'
    try:
        obj: Component = object_from_class_path(cls, **kwargs)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                if load_kwargs is None:
                    load_kwargs = {}
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, **load_kwargs)
                else:
                    obj.load(metapath, **load_kwargs)
            obj.meta['load_path'] = load_path
        return obj
    except Exception:
        eprint(f'Failed to load {identifier}. See the stack trace below')
        traceback.print_exc()
        model_version = meta.get('hanlp_version', 'unknown')
        cur_version = version.__version__
        if model_version != cur_version:
            eprint(f'{identifier} was created with hanlp-{model_version}, while you are running {cur_version}. '
                   f'Try to upgrade hanlp with\n'
                   f'pip install --upgrade hanlp\n'
                   f'If the problem persists, please submit an issue to https://github.com/hankcs/HanLP/issues .')
        exit(1)
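
# Usage sketch (illustrative, not part of the source): `load_from_meta_file` accepts either a
# directory containing a meta.json or a direct path to the .json file. The paths below are
# hypothetical placeholders.
def _demo_load_from_meta_file():
    component = load_from_meta_file('/path/to/save_dir')             # resolves /path/to/save_dir/meta.json
    component = load_from_meta_file('/path/to/save_dir/meta.json')   # pointing at the meta file also works
    return component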

def file_to_inputs(self, filepath: str, gold=True):
    """Read TACRED-style JSON records and yield ((tokens, pos, ner, head, deprel, ss, se, os, oe), relation)."""
    data = load_json(filepath)
    for d in data:
        tokens = list(d['token'])
        ss, se = d['subj_start'], d['subj_end']
        os, oe = d['obj_start'], d['obj_end']  # note: these locals shadow the os module inside this function
        pos = d['stanford_pos']
        ner = d['stanford_ner']
        deprel = d['stanford_deprel']
        head = [int(x) for x in d['stanford_head']]
        assert any(x == 0 for x in head), 'every sentence must contain a root (head == 0)'
        relation = d['relation']
        yield (tokens, pos, ner, head, deprel, ss, se, os, oe), relation
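
# Illustrative sketch (not from the source): a minimal record in the schema read above, inferred
# from the keys this method accesses. Heads are 1-based with 0 marking the root. `reader` stands
# for whatever component defines file_to_inputs and is hypothetical here.
def _demo_file_to_inputs(reader):
    import json
    sample = {
        'token': ['He', 'founded', 'Acme'],
        'subj_start': 0, 'subj_end': 0,
        'obj_start': 2, 'obj_end': 2,
        'stanford_pos': ['PRP', 'VBD', 'NNP'],
        'stanford_ner': ['O', 'O', 'ORG'],
        'stanford_deprel': ['nsubj', 'root', 'dobj'],
        'stanford_head': ['2', '0', '2'],
        'relation': 'org:founded_by',
    }
    with open('demo.json', 'w') as f:
        json.dump([sample], f)
    for inputs, relation in reader.file_to_inputs('demo.json'):
        print(inputs, relation)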

def merge_long_sent(file, lang=None):
    """Write the sentences that were split off as too long back into their original positions."""
    if not lang:
        lang = os.path.basename(file).split('.')[0]
    long_sent: dict = load_json(f'data/iwpt2020/test-udpipe/{lang}.fixed.long.json')
    long_sent = {int(x): y for x, y in long_sent.items()}
    idx = 0
    fout = file.replace('.short', '')
    with open(fout, 'w') as out:
        for sent in load_conll(file):
            long = long_sent.get(idx, None)
            if long:
                out.write(f'{long}\n\n')
                idx += 1
            out.write(f'{sent}\n\n')
            idx += 1
    return fout
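
# Usage sketch (illustrative, not from the source): merge long sentences back into a '.short'
# CoNLL-U file. The path is a hypothetical placeholder; as the code above assumes, the
# corresponding data/iwpt2020/test-udpipe/{lang}.fixed.long.json must exist.
def _demo_merge_long_sent():
    merged = merge_long_sent('data/iwpt2020/test-udpipe/ar.short.conllu', lang='ar')
    print(f'Merged file written to {merged}')  # -> data/iwpt2020/test-udpipe/ar.conllu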

def eval_sdp_and_ensemble(parser, devfile, dep_dev_output, save_dir, lang, logger, do_eval=True):
    long_sent: dict = load_json(devfile.replace('.short.conllu', '.long.json'))
    long_sent = {int(x): y for x, y in long_sent.items()}
    sdp_dev_output = f'{save_dir}/{os.path.basename(devfile.replace(".conllu", ".sdp.pred.conllu"))}'
    sdp_dev_output = sdp_dev_output.replace('.short', '')
    if not os.path.isfile(sdp_dev_output) or do_eval:
        if not parser.model:
            parser.load(save_dir)
        scores = parser.evaluate(devfile, save_dir, warm_up=False, ret_scores=True, logger=logger,
                                 batch_size=256 if lang == 'cs' else None)[-1]
        sdp_to_dag(parser, scores, sdp_dev_output, long_sent)
    score = evaluate(devfile.replace('.short', ''), sdp_dev_output)
    final_sdp_dev_output = sdp_dev_output.replace('.conllu', '.fixed.conllu')
    sdp_elas = score['ELAS'].f1
    sdp_clas = score['CLAS'].f1
    logger.info(f'SDP score for {lang}:')
    logger.info(f'ELAS: {sdp_elas * 100:.2f} - CLAS: {sdp_clas * 100:.2f}')
    print(f'Model saved in {save_dir}')
    ensemble_output = f'{save_dir}/{os.path.basename(devfile.replace(".conllu", ".ensemble.pred.conllu"))}'
    # Same condition as above, so that `scores` (computed there) is defined whenever this branch runs.
    if not os.path.isfile(sdp_dev_output) or do_eval:
        sdp_to_dag(parser, scores, ensemble_output, long_sent, dep_dev_output)
    score = evaluate(devfile.replace('.short', ''), ensemble_output)
    final_ensemble_output = ensemble_output.replace('.conllu', '.fixed.conllu')
    logger.info(f'Ensemble score for {lang}:')
    ensemble_elas = score['ELAS'].f1
    logger.info(f'ELAS: {ensemble_elas * 100:.2f} - CLAS: {score["CLAS"].f1 * 100:.2f}')
    return (sdp_elas, final_sdp_dev_output), (ensemble_elas, final_ensemble_output)
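
# Usage sketch (illustrative, not from the source): the function returns (ELAS, fixed-output-path)
# pairs for the plain SDP run and for the ensemble with the dependency parser, so a caller can
# keep whichever scored higher. `parser`, `logger` and the paths are hypothetical.
def _demo_eval_sdp_and_ensemble(parser, logger):
    (sdp_elas, sdp_out), (ens_elas, ens_out) = eval_sdp_and_ensemble(
        parser, 'data/iwpt2020/ar-dev.short.conllu', 'data/iwpt2020/ar-dev.dep.pred.conllu',
        'data/model/iwpt2020/ar', 'ar', logger)
    return ens_out if ens_elas >= sdp_elas else sdp_out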

def from_meta(meta: Union[dict, str], **kwargs):
    if isinstance(meta, str):
        meta = load_json(meta)
    return Pipeline(*[load_from_meta(pipe) for pipe in meta['pipes']])
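
# Illustrative sketch (not from the source): `from_meta` accepts either a meta dict or the path
# to its JSON serialization; each entry of 'pipes' is handed to load_from_meta. The class_path
# below is a hypothetical placeholder for a loadable component.
def _demo_pipeline_from_meta():
    meta = {
        'pipes': [
            {'class_path': 'mypackage.MyTokenizer'},
        ]
    }
    return Pipeline.from_meta(meta)  # Pipeline.from_meta('pipeline.json') works too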

def load(self, filepath):
    meta = load_json(filepath)
    self.clear()
    self.extend(Pipeline.from_meta(meta))

def load_json(self, path):
    item = load_json(path)  # the module-level load_json helper, not this method
    return self.copy_from(item)

def load_meta(self, save_dir, filename='meta.json'):
    save_dir = get_resource(save_dir)
    metapath = os.path.join(save_dir, filename)
    if os.path.isfile(metapath):
        self.meta.update(load_json(metapath))
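
# Usage sketch (illustrative, not from the source): merge a saved meta.json into a component's
# existing meta; keys absent from the file keep their current values. `component` and the path
# are hypothetical.
def _demo_load_meta(component):
    component.load_meta('/path/to/save_dir')  # reads /path/to/save_dir/meta.json when present
    print(component.meta.get('class_path'))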