Example #1
0
 def __init__(self, mapper: Union[str, dict], src: str, dst: str = None) -> None:
     """Set up the lookup table used by this object.

     Args:
         mapper: Either a ready-made ``dict`` or a string. A string is first
             passed through ``get_resource`` and, if the result is still a
             string, read with ``load_json``.
         src: Forwarded to the parent constructor.
         dst: Forwarded to the parent constructor.

     Raises:
         ValueError: If the (resolved) mapper is neither a ``str`` nor a ``dict``.
     """
     super().__init__(src, dst)
     # Keep the caller's original argument, not the resolved one.
     self.mapper = mapper
     resolved = get_resource(mapper) if isinstance(mapper, str) else mapper
     if isinstance(resolved, str):
         self._table = load_json(resolved)
         return
     if isinstance(resolved, dict):
         self._table = resolved
         return
     raise ValueError(f'Unrecognized mapper type {resolved}')
Example #2
0
 def file_to_inputs(self, filepath: str, gold=True):
     """Yield ``(features, relation)`` pairs read from a JSON file.

     Each record loaded from ``filepath`` must provide the keys ``token``,
     ``subj_start``, ``subj_end``, ``obj_start``, ``obj_end``, ``stanford_pos``,
     ``stanford_ner``, ``stanford_deprel``, ``stanford_head`` and ``relation``.

     Args:
         filepath: Path handed to ``load_json``.
         gold: Not used by this method.

     Yields:
         A tuple ``((tokens, pos, ner, head, deprel, subj_start, subj_end,
         obj_start, obj_end), relation)`` for every record.
     """
     for sample in load_json(filepath):
         tokens = list(sample['token'])
         subj_start, subj_end = sample['subj_start'], sample['subj_end']
         obj_start, obj_end = sample['obj_start'], sample['obj_end']
         pos = sample['stanford_pos']
         ner = sample['stanford_ner']
         deprel = sample['stanford_deprel']
         head = [int(h) for h in sample['stanford_head']]
         # At least one head index must be 0.
         assert any(h == 0 for h in head)
         relation = sample['relation']
         features = (tokens, pos, ner, head, deprel, subj_start, subj_end, obj_start, obj_end)
         yield features, relation
Example #3
0
 def load_meta(self, save_dir, filename='meta.json'):
     """Merge the content of a meta file into ``self.meta`` if the file exists.

     Args:
         save_dir: Location passed through ``get_resource`` before use.
         filename: Name of the meta file inside the resolved folder.
     """
     metapath = os.path.join(get_resource(save_dir), filename)
     if not os.path.isfile(metapath):
         # Nothing to merge; leave self.meta untouched.
         return
     self.meta.update(load_json(metapath))
Example #4
0
def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only=False, verbose=HANLP_VERBOSE,
                        **kwargs) -> Component:
    """Load a saved component described by a meta file.

    The meta file is searched as ``meta_filename`` inside ``save_dir`` first. If it does not exist,
    ``config.json`` in the same folder is tried instead. A component found through ``meta_filename``
    is treated as a TensorFlow model and its classpath is remapped to the ``*_tf`` locations.

    Args:
        save_dir: The identifier of the component, resolved with ``get_resource``. If the resolved path
            ends with ``.json`` it is split into the folder and the meta filename.
        meta_filename (str): The meta file of that saved component, which stores the classpath and version.
        transform_only: If true, call ``load_transform(save_dir)`` on the component instead of ``load``.
        verbose: Forwarded to ``load`` when ``config.json`` exists in the folder.
        **kwargs: Extra parameters forwarded to ``load``.

    Returns:
        The object created from the classpath stored in the meta file.

    Raises:
        FileNotFoundError: If neither meta file exists.
        ModuleNotFoundError: If a module required by the component is missing.
        AssertionError: If the meta file contains no classpath.
        Exception: Any other loading error is re-raised after diagnostics are printed to stderr.
    """
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    if not os.path.isfile(metapath):
        tf_model = False
        metapath = os.path.join(save_dir, 'config.json')
    else:
        tf_model = True
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            # Rank every known key by its similarity to what the user typed so that the
            # suggestions are actually the closest spellings.
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(None, k, identifier).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' + \
                   f'{sorted(pretrained.ALL.keys())}\n' + \
                   f'Tips: it might be one of {similar_keys}'
        raise FileNotFoundError(f'The identifier {save_dir} resolves to a non-exist meta file {metapath}. {tips}')
    meta: dict = load_json(metapath)
    cls = meta.get('classpath', None)
    if not cls:
        cls = meta.get('class_path', None)  # For older version
    if tf_model:
        # tf models are trained with version <= 2.0. To migrate them to 2.1, map their classpath to new locations
        upgrade = {
            'hanlp.components.tok.TransformerTokenizer': 'hanlp.components.tok_tf.TransformerTokenizerTF',
            'hanlp.components.pos.RNNPartOfSpeechTagger': 'hanlp.components.pos_tf.RNNPartOfSpeechTaggerTF',
            'hanlp.components.pos.CNNPartOfSpeechTagger': 'hanlp.components.pos_tf.CNNPartOfSpeechTaggerTF',
            'hanlp.components.ner.TransformerNamedEntityRecognizer': 'hanlp.components.ner_tf.TransformerNamedEntityRecognizerTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineDependencyParserTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineSemanticDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineSemanticDependencyParserTF',
            'hanlp.components.tok.NgramConvTokenizer': 'hanlp.components.tok_tf.NgramConvTokenizerTF',
            'hanlp.components.classifiers.transformer_classifier.TransformerClassifier': 'hanlp.components.classifiers.transformer_classifier_tf.TransformerClassifierTF',
            'hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger': 'hanlp.components.taggers.transformers.transformer_tagger_tf.TransformerTaggerTF',
        }
        cls = upgrade.get(cls, cls)
    assert cls, f'{meta_filename} doesn\'t contain classpath field'
    try:
        obj: Component = object_from_classpath(cls)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, verbose=verbose, **kwargs)
                else:
                    obj.load(metapath, **kwargs)
            obj.config['load_path'] = load_path
        return obj
    except ModuleNotFoundError:
        raise ModuleNotFoundError('Some modules required by this model are missing. Please install the full version:'
                                  '\n\n\tpip install hanlp[full]') from None
    except Exception as e:
        eprint(f'Failed to load {identifier}.')
        from pkg_resources import parse_version
        # Fall back to a parseable version string: a non-version default such as 'unknown' may be
        # rejected by parse_version, which would hide the real loading error.
        model_version = meta.get("hanlp_version", '2.0.0-alpha.0')
        if model_version == '2.0.0':  # Quick fix: the first version used a wrong string
            model_version = '2.0.0-alpha.0'
        model_version = parse_version(model_version)
        installed_version = parse_version(version.__version__)
        try:
            latest_version = get_latest_info_from_pypi()
        except Exception:
            # Looking up the latest release is best-effort; diagnostics continue without it.
            latest_version = None
        if model_version > installed_version:
            eprint(f'{identifier} was created with hanlp-{model_version}, '
                   f'while you are running a lower version: {installed_version}. ')
        if installed_version != latest_version:
            eprint(
                f'Please upgrade HanLP with:\n'
                f'\n\tpip install --upgrade hanlp\n')
        eprint(
            'If the problem still persists, please submit an issue to https://github.com/hankcs/HanLP/issues\n'
            'When reporting an issue, make sure to paste the FULL ERROR LOG below.')

        eprint(f'{"ERROR LOG BEGINS":=^80}')
        import platform
        eprint(f'OS: {platform.platform()}')
        eprint(f'Python: {platform.python_version()}')
        import torch
        eprint(f'PyTorch: {torch.__version__}')
        if tf_model:
            try:
                import tensorflow
                tf_version = tensorflow.__version__
            except ModuleNotFoundError:
                tf_version = 'not installed'
            eprint(f'TensorFlow: {tf_version}')
        eprint(f'HanLP: {version.__version__}')
        import sys
        sys.stderr.flush()
        try:
            # Append a closing banner to the message so the pasted log has a clear end.
            if e.args and isinstance(e.args, tuple) and isinstance(e.args[0], str):
                e.args = (e.args[0] + f'\n{"ERROR LOG ENDS":=^80}',) + e.args[1:]
        except Exception:
            # Decorating the message is cosmetic; never let it replace the original error.
            pass
        raise e from None
Example #5
0
def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only=False, verbose=HANLP_VERBOSE,
                        **kwargs) -> Component:
    """
    Load a component from a ``meta.json`` (legacy TensorFlow component) or a ``config.json`` file.

    ``meta_filename`` is looked up inside ``save_dir`` first; if it is missing, ``config.json`` in the same
    folder is used. When neither exists and the identifier is one of the pretrained word2vec or fasttext
    values, a ``config.json`` describing an embedding component is written to the parent folder and loaded.

    Args:
        save_dir: The identifier.
        meta_filename (str): The meta file of that saved component, which stores the classpath and version.
        transform_only: Load and return only the transform.
        verbose: Passed to ``component.load()`` when ``config.json`` exists in the folder.
        **kwargs: Extra parameters passed to ``component.load()``.

    Returns:

        A component.

    Raises:
        FileNotFoundError: If no meta file can be found and the identifier is not a known embedding.
        ModuleNotFoundError: If a module required by the component is missing.
        ConnectionError: If loading raised a ``ValueError`` whose message mentions ``Internet connection``.
        version.NotCompatible: If ``check_version_conflicts`` reports conflicting installed versions.
    """
    # Keep the original identifier: save_dir is rebound to the resolved location below.
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    # A resolved path ending in .json is the meta file itself; split it into folder and filename.
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    # The presence of meta_filename marks a TensorFlow model; otherwise fall back to config.json.
    if not os.path.isfile(metapath):
        tf_model = False
        metapath = os.path.join(save_dir, 'config.json')
    else:
        tf_model = True
    cls = None
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            # Suggest the five pretrained keys most similar to the identifier.
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(None, k, identifier).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' + \
                   f'{sorted(pretrained.ALL.keys())}\n' + \
                   f'Tips: it might be one of {similar_keys}'
        # These components are not intended to be loaded in this way, but I'm tired of explaining it again and again
        if identifier in pretrained.word2vec.ALL.values():
            # Write a config.json for a word2vec embedding component into the parent folder.
            save_dir = os.path.dirname(save_dir)
            metapath = os.path.join(save_dir, 'config.json')
            save_json({'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbeddingComponent',
                       'embed': {'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbedding',
                                 'embed': identifier, 'field': 'token', 'normalize': 'l2'},
                       'hanlp_version': version.__version__}, metapath)
        elif identifier in pretrained.fasttext.ALL.values():
            # Same as above, for a fasttext embedding component.
            save_dir = os.path.dirname(save_dir)
            metapath = os.path.join(save_dir, 'config.json')
            save_json({'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbeddingComponent',
                       'embed': {'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbedding',
                                 'filepath': identifier, 'src': 'token'},
                       'hanlp_version': version.__version__}, metapath)
        else:
            raise FileNotFoundError(f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}')
    meta: dict = load_json(metapath)
    cls = meta.get('classpath', cls)
    if not cls:
        cls = meta.get('class_path', None)  # For older version
    if tf_model:
        # tf models are trained with version < 2.1. To migrate them to 2.1, map their classpath to new locations
        upgrade = {
            'hanlp.components.tok_tf.TransformerTokenizerTF': 'hanlp.components.tokenizers.tok_tf.TransformerTokenizerTF',
            'hanlp.components.pos.RNNPartOfSpeechTagger': 'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF',
            'hanlp.components.pos_tf.RNNPartOfSpeechTaggerTF': 'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF',
            'hanlp.components.pos_tf.CNNPartOfSpeechTaggerTF': 'hanlp.components.taggers.pos_tf.CNNPartOfSpeechTaggerTF',
            'hanlp.components.ner_tf.TransformerNamedEntityRecognizerTF': 'hanlp.components.ner.ner_tf.TransformerNamedEntityRecognizerTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineDependencyParserTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineSemanticDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineSemanticDependencyParserTF',
            'hanlp.components.tok_tf.NgramConvTokenizerTF': 'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF',
            'hanlp.components.classifiers.transformer_classifier.TransformerClassifier': 'hanlp.components.classifiers.transformer_classifier_tf.TransformerClassifierTF',
            'hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger': 'hanlp.components.taggers.transformers.transformer_tagger_tf.TransformerTaggerTF',
            'hanlp.components.tok.NgramConvTokenizer': 'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF',
        }
        # Classpaths without an entry in the table are kept unchanged.
        cls = upgrade.get(cls, cls)
    assert cls, f'{meta_filename} doesn\'t contain classpath field'
    try:
        obj: Component = object_from_classpath(cls)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                # Load from the folder when it has a config.json, otherwise from the meta file path.
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, verbose=verbose, **kwargs)
                else:
                    obj.load(metapath, **kwargs)
            # Record the identifier the caller originally passed in.
            obj.config['load_path'] = load_path
        return obj
    except ModuleNotFoundError as e:
        # In a debugging session re-raise the original error; otherwise replace it with install advice.
        if isdebugging():
            raise e from None
        else:
            raise ModuleNotFoundError(
                f'Some modules ({e.name} etc.) required by this model are missing. Please install the full version:'
                '\n\n\tpip install hanlp[full] -U') from None
    except ValueError as e:
        # A ValueError mentioning 'Internet connection' is converted to ConnectionError; others pass through.
        if e.args and isinstance(e.args[0], str) and 'Internet connection' in e.args[0]:
            raise ConnectionError(
                'Hugging Face 🤗 Transformers failed to download because your Internet connection is either off or bad.\n'
                'See https://hanlp.hankcs.com/docs/install.html#server-without-internet for solutions.') \
                from None
        raise e from None
    except Exception as e:
        # Some users often install an incompatible tf and put the blame on HanLP. Teach them the basics.
        try:
            you_installed_wrong_versions, extras = check_version_conflicts(extras=('full',) if tf_model else None)
        except:
            # The conflict check is best-effort; on failure continue with the generic diagnostics.
            you_installed_wrong_versions, extras = None, None
        if you_installed_wrong_versions:
            raise version.NotCompatible(you_installed_wrong_versions + '\nPlease reinstall HanLP in the right way:' +
                                        '\n\n\tpip install --upgrade hanlp' + (
                                            f'[{",".join(extras)}]' if extras else '')) from None
        eprint(f'Failed to load {identifier}.')
        from pkg_resources import parse_version
        # A meta file without a version is treated as created by 2.0.0-alpha.0.
        model_version = meta.get("hanlp_version", '2.0.0-alpha.0')
        if model_version == '2.0.0':  # Quick fix: the first version used a wrong string
            model_version = '2.0.0-alpha.0'
        model_version = parse_version(model_version)
        installed_version = parse_version(version.__version__)
        try:
            latest_version = get_latest_info_from_pypi()
        except:
            # If the latest version cannot be obtained, None is used and the upgrade hint below is printed.
            latest_version = None
        if model_version > installed_version:
            eprint(f'{identifier} was created with hanlp-{model_version}, '
                   f'while you are running a lower version: {installed_version}. ')
        if installed_version != latest_version:
            eprint(
                f'Please upgrade HanLP with:\n'
                f'\n\tpip install --upgrade hanlp\n')
        eprint(
            'If the problem still persists, please submit an issue to https://github.com/hankcs/HanLP/issues\n'
            'When reporting an issue, make sure to paste the FULL ERROR LOG below.')

        # Print environment details between BEGINS/ENDS banners so users can paste them into a report.
        eprint(f'{"ERROR LOG BEGINS":=^80}')
        import platform
        eprint(f'OS: {platform.platform()}')
        eprint(f'Python: {platform.python_version()}')
        import torch
        eprint(f'PyTorch: {torch.__version__}')
        if tf_model:
            try:
                import tensorflow
                tf_version = tensorflow.__version__
            except ModuleNotFoundError:
                tf_version = 'not installed'
            eprint(f'TensorFlow: {tf_version}')
        eprint(f'HanLP: {version.__version__}')
        import sys
        sys.stderr.flush()
        try:
            # Append the closing banner to the exception message when the first argument is a string.
            if e.args and isinstance(e.args, tuple) and isinstance(e.args[0], str):
                e.args = (e.args[0] + f'\n{"ERROR LOG ENDS":=^80}',) + e.args[1:]
        except:
            # Failing to decorate the message must not hide the original error.
            pass
        raise e from None
Example #6
0
 def load_json(self, path):
     """Read ``path`` with the module-level ``load_json`` and pass the result to ``self.copy_from``.

     Args:
         path: Location of the JSON file.

     Returns:
         Whatever ``self.copy_from`` returns for the loaded content.
     """
     return self.copy_from(load_json(path))
Example #7
0
 def from_config(meta: Union[dict, str], **kwargs):
     """Build a ``Pipeline`` from a configuration.

     Args:
         meta: A dict holding a ``'pipes'`` entry, or a string that is read with
             ``load_json`` to obtain such a dict.
         **kwargs: Accepted but not used.

     Returns:
         A ``Pipeline`` made of one ``load_from_meta`` result per entry of ``meta['pipes']``.
     """
     config = load_json(meta) if isinstance(meta, str) else meta
     components = [load_from_meta(pipe_meta) for pipe_meta in config['pipes']]
     return Pipeline(*components)
Example #8
0
 def load(self, filepath):
     """Replace the current content with the pipeline described in a JSON file.

     Args:
         filepath: Location of the JSON configuration, read with ``load_json``.
     """
     config = load_json(filepath)
     # Read the file before clearing so a failed read leaves the current content intact.
     self.clear()
     restored = Pipeline.from_config(config)
     self.extend(restored)
Example #9
0
 def load():
     """Return the JSON content of the resource named by ``HANLP_CHAR_TABLE_JSON``."""
     table_path = get_resource(HANLP_CHAR_TABLE_JSON)
     return load_json(table_path)