def main():
    """Command-line entry point: convert an OntoNotes release to jsonlines.

    Expects exactly two CLI arguments: the OntoNotes root path and the output
    path. Prints usage to stderr and exits with status 1 otherwise.
    """
    if len(sys.argv) != 3:
        eprint('2 arguments required: ontonotes_path output_path')
        # sys.exit is preferred over the site-injected exit(), which is not
        # guaranteed to exist when the script runs without the site module.
        sys.exit(1)
    ontonotes_path = sys.argv[1]
    output_path = sys.argv[2]
    make_ontonotes_jsonlines(ontonotes_path, output_path)
def download(url, save_path=None, save_dir=hanlp_home(), prefix=HANLP_URL, append_location=True,
             verbose=HANLP_VERBOSE):
    """Download ``url`` to a local file, reusing a cached copy when present.

    Args:
        url: The URL to download from.
        save_path: Explicit destination path; derived from ``url`` when omitted.
        save_dir: Root directory used when deriving the destination path.
        prefix: URL prefix stripped when deriving the destination path.
        append_location: Whether to append the URL's location to the derived path.
        verbose: ``True`` to print progress messages.

    Returns:
        The path to the downloaded (or already cached) file.
    """
    if not save_path:
        save_path = path_from_url(url, save_dir, prefix, append_location)
    if os.path.isfile(save_path):
        if verbose:
            eprint('Using local {}, ignore {}'.format(save_path, url))
        return save_path
    else:
        makedirs(parent_dir(save_path))
        if verbose:
            eprint('Downloading {} to {}'.format(url, save_path))
        # Download to a temp file first so an interrupted transfer never leaves
        # a half-written file at save_path.
        tmp_path = '{}.downloading'.format(save_path)
        remove_file(tmp_path)
        try:
            downloader = Downloader(
                url, tmp_path, 4,
                headers={
                    'User-agent': f'HanLP/{__version__} ({platform.platform()})'
                })
            if verbose:
                downloader.subscribe(DownloadCallback(show_header=False))
            downloader.start_sync()
        except BaseException as e:
            remove_file(tmp_path)
            url = url.split('#')[0]
            if not windows():
                hints_for_download = f'e.g. \nwget {url} -O {save_path}\n'
            else:
                hints_for_download = ' Use some decent downloading tools.\n'
            if not url.startswith(HANLP_URL):
                hints_for_download += 'For third party data, you may find it on our mirror site:\n' \
                                     'https://od.hankcs.com/hanlp/data/\n'
            # BUG FIX: check_outdated() needs the network; if it fails (likely,
            # since the download itself just failed) it must not mask the
            # original error with its own exception.
            try:
                installed_version, latest_version = check_outdated()
            except Exception:
                installed_version, latest_version = None, None
            if installed_version != latest_version:
                hints_for_download += f'Or upgrade to the latest version({latest_version}):\npip install -U hanlp'
            message = f'Download failed due to [red]{repr(e)}[/red]. Please download it to {save_path} by yourself. ' \
                      f'[yellow]{hints_for_download}[/yellow]' \
                      'See https://hanlp.hankcs.com/docs/install.html#install-models for instructions.'
            if verbose:
                cprint(message)
            if hasattr(e, 'msg'):
                e.msg += '\n' + remove_color_tag(message)
            raise e
        remove_file(save_path)
        os.rename(tmp_path, save_path)
        return save_path
def download(url, save_path=None, save_dir=hanlp_home(), prefix=HANLP_URL, append_location=True,
             verbose=HANLP_VERBOSE):
    """Download ``url`` to a local file, reusing a cached copy when present.

    Args:
        url: The URL to download from.
        save_path: Explicit destination path; derived from ``url`` when omitted.
        save_dir: Root directory used when deriving the destination path.
        prefix: URL prefix stripped when deriving the destination path.
        append_location: Whether to append the URL's location to the derived path.
        verbose: ``True`` to print progress messages.

    Returns:
        The path to the downloaded (or already cached) file.
    """
    if not save_path:
        save_path = path_from_url(url, save_dir, prefix, append_location)
    if os.path.isfile(save_path):
        if verbose:
            eprint('Using local {}, ignore {}'.format(save_path, url))
        return save_path
    else:
        makedirs(parent_dir(save_path))
        if verbose:
            eprint('Downloading {} to {}'.format(url, save_path))
        # Download to a temp file first so an interrupted transfer never leaves
        # a half-written file at save_path.
        tmp_path = '{}.downloading'.format(save_path)
        remove_file(tmp_path)
        try:
            downloader = Downloader(url, tmp_path, 4, headers={
                'User-agent': f'HanLP/{__version__} ({platform.platform()})'})
            if verbose:
                downloader.subscribe(DownloadCallback(show_header=False))
            downloader.start_sync()
        except BaseException as e:
            remove_file(tmp_path)
            url = url.split('#')[0]
            try:
                installed_version, latest_version = check_outdated()
            # FIX: was a bare ``except:`` — don't swallow KeyboardInterrupt or
            # SystemExit while composing the error message.
            except Exception:
                installed_version, latest_version = None, None  # No Internet
            if installed_version != latest_version:
                # Always prompt user to upgrade whenever a new version is available
                hints = f'[green]Please upgrade to the latest version ({latest_version}) with:[/green]' \
                        f'\n\n\t[yellow]pip install -U hanlp[/yellow]\n'
            else:
                # Otherwise, prompt user to re-try
                hints = f'[green]Please re-try or download it to {save_path} by yourself '
                if not windows():
                    hints += f'with:[/green]\n\n\t[yellow]wget {url} -O {save_path}[/yellow]\n\n'
                else:
                    hints += 'using some decent downloading tools.[/green]\n'
                if not url.startswith(HANLP_URL):
                    hints += 'For third party data, you may find it on our mirror site:\n' \
                             'https://od.hankcs.com/hanlp/data/\n'
                hints += 'See also https://hanlp.hankcs.com/docs/install.html#install-models for instructions.'
            message = f'Download failed due to [red]{repr(e)}[/red].\n' \
                      f'{hints}'
            if verbose:
                cprint(message)
            # Attach the plain-text hint to the exception so it survives re-raising.
            if hasattr(e, 'msg'):
                e.msg += '\n' + remove_color_tag(message)
            elif hasattr(e, 'args') and e.args and isinstance(e.args, tuple) and isinstance(e.args[0], str):
                e.args = (e.args[0] + '\n' + remove_color_tag(message),) + e.args[1:]
            raise e from None
        remove_file(save_path)
        os.rename(tmp_path, save_path)
        return save_path
def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only=False, verbose=HANLP_VERBOSE,
                        **kwargs) -> Component:
    """Load a saved component from its ``meta.json`` (legacy TF) or ``config.json`` file.

    Args:
        save_dir: An identifier which resolves to a saved component directory or meta file.
        meta_filename (str): The meta file of that saved component, which stores the classpath and version.
        transform_only: Load and return only the transform of the component.
        **kwargs: Extra arguments passed through to ``component.load()``.

    Returns:
        The loaded component.
    """
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    if not os.path.isfile(metapath):
        # No meta.json: assume a 2.1+ PyTorch component described by config.json.
        tf_model = False
        metapath = os.path.join(save_dir, 'config.json')
    else:
        tf_model = True
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            # BUG FIX: the original key lambda ignored its argument ``k`` and
            # compared ``save_dir`` with ``metapath`` for every key, so the
            # "similar keys" ranking was meaningless. Rank keys by similarity
            # to the user-supplied identifier instead.
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(None, k, identifier).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' + \
                   f'{sorted(pretrained.ALL.keys())}\n' + \
                   f'Tips: it might be one of {similar_keys}'
        raise FileNotFoundError(f'The identifier {save_dir} resolves to a non-exist meta file {metapath}. {tips}')
    meta: dict = load_json(metapath)
    cls = meta.get('classpath', None)
    if not cls:
        cls = meta.get('class_path', None)  # For older version
    if tf_model:
        # tf models are trained with version <= 2.0. To migrate them to 2.1,
        # map their classpath to new locations
        upgrade = {
            'hanlp.components.tok.TransformerTokenizer': 'hanlp.components.tok_tf.TransformerTokenizerTF',
            'hanlp.components.pos.RNNPartOfSpeechTagger': 'hanlp.components.pos_tf.RNNPartOfSpeechTaggerTF',
            'hanlp.components.pos.CNNPartOfSpeechTagger': 'hanlp.components.pos_tf.CNNPartOfSpeechTaggerTF',
            'hanlp.components.ner.TransformerNamedEntityRecognizer':
                'hanlp.components.ner_tf.TransformerNamedEntityRecognizerTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineDependencyParser':
                'hanlp.components.parsers.biaffine_parser_tf.BiaffineDependencyParserTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineSemanticDependencyParser':
                'hanlp.components.parsers.biaffine_parser_tf.BiaffineSemanticDependencyParserTF',
            'hanlp.components.tok.NgramConvTokenizer': 'hanlp.components.tok_tf.NgramConvTokenizerTF',
            'hanlp.components.classifiers.transformer_classifier.TransformerClassifier':
                'hanlp.components.classifiers.transformer_classifier_tf.TransformerClassifierTF',
            'hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger':
                'hanlp.components.taggers.transformers.transformer_tagger_tf.TransformerTaggerTF',
        }
        cls = upgrade.get(cls, cls)
    assert cls, f'{meta_filename} doesn\'t contain classpath field'
    try:
        obj: Component = object_from_classpath(cls)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, verbose=verbose, **kwargs)
                else:
                    obj.load(metapath, **kwargs)
            obj.config['load_path'] = load_path
        return obj
    except ModuleNotFoundError:
        raise ModuleNotFoundError('Some modules required by this model are missing. Please install the full version:'
                                  '\n\n\tpip install hanlp[full]') from None
    except Exception as e:
        # Loading failed for another reason: print a detailed diagnostic report
        # to stderr, then re-raise the original error.
        eprint(f'Failed to load {identifier}.')
        from pkg_resources import parse_version
        # FIX: default to a parseable version string; "unknown" is rejected by
        # modern parse_version implementations and matches the newer code path.
        model_version = meta.get("hanlp_version", '2.0.0-alpha.0')
        if model_version == '2.0.0':  # Quick fix: the first version used a wrong string
            model_version = '2.0.0-alpha.0'
        model_version = parse_version(model_version)
        installed_version = parse_version(version.__version__)
        try:
            latest_version = get_latest_info_from_pypi()
        # FIX: was a bare ``except:``; best-effort network check must not
        # swallow KeyboardInterrupt/SystemExit.
        except Exception:
            latest_version = None
        if model_version > installed_version:
            eprint(f'{identifier} was created with hanlp-{model_version}, '
                   f'while you are running a lower version: {installed_version}. ')
        if installed_version != latest_version:
            eprint(f'Please upgrade HanLP with:\n'
                   f'\n\tpip install --upgrade hanlp\n')
        eprint('If the problem still persists, please submit an issue to https://github.com/hankcs/HanLP/issues\n'
               'When reporting an issue, make sure to paste the FULL ERROR LOG below.')
        eprint(f'{"ERROR LOG BEGINS":=^80}')
        import platform
        eprint(f'OS: {platform.platform()}')
        eprint(f'Python: {platform.python_version()}')
        import torch
        eprint(f'PyTorch: {torch.__version__}')
        if tf_model:
            try:
                import tensorflow
                tf_version = tensorflow.__version__
            except ModuleNotFoundError:
                tf_version = 'not installed'
            eprint(f'TensorFlow: {tf_version}')
        eprint(f'HanLP: {version.__version__}')
        import sys
        sys.stderr.flush()
        try:
            if e.args and isinstance(e.args, tuple) and isinstance(e.args[0], str):
                e.args = (e.args[0] + f'\n{"ERROR LOG ENDS":=^80}',) + e.args[1:]
        # FIX: was a bare ``except:``; narrowed while keeping the best-effort intent.
        except Exception:
            pass
        raise e from None
def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only=False, verbose=HANLP_VERBOSE,
                        **kwargs) -> Component:
    """Load a component from a ``meta.json`` (legacy TensorFlow component) or a ``config.json`` file.

    Args:
        save_dir: The identifier.
        meta_filename (str): The meta file of that saved component, which stores the classpath and version.
        transform_only: Load and return only the transform.
        **kwargs: Extra parameters passed to ``component.load()``.

    Returns:
        A component.
    """
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    if not os.path.isfile(metapath):
        # No meta.json: assume a 2.1+ PyTorch component described by config.json.
        tf_model = False
        metapath = os.path.join(save_dir, 'config.json')
    else:
        tf_model = True
    cls = None
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(None, k, identifier).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' + \
                   f'{sorted(pretrained.ALL.keys())}\n' + \
                   f'Tips: it might be one of {similar_keys}'
        # These components are not intended to be loaded in this way, but I'm tired of explaining it again and again
        if identifier in pretrained.word2vec.ALL.values():
            save_dir = os.path.dirname(save_dir)
            metapath = os.path.join(save_dir, 'config.json')
            save_json({'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbeddingComponent',
                       'embed': {'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbedding',
                                 'embed': identifier, 'field': 'token', 'normalize': 'l2'},
                       'hanlp_version': version.__version__}, metapath)
        elif identifier in pretrained.fasttext.ALL.values():
            save_dir = os.path.dirname(save_dir)
            metapath = os.path.join(save_dir, 'config.json')
            save_json({'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbeddingComponent',
                       'embed': {'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbedding',
                                 'filepath': identifier, 'src': 'token'},
                       'hanlp_version': version.__version__}, metapath)
        else:
            raise FileNotFoundError(f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}')
    meta: dict = load_json(metapath)
    cls = meta.get('classpath', cls)
    if not cls:
        cls = meta.get('class_path', None)  # For older version
    if tf_model:
        # tf models are trained with version < 2.1. To migrate them to 2.1, map their classpath to new locations
        upgrade = {
            'hanlp.components.tok_tf.TransformerTokenizerTF':
                'hanlp.components.tokenizers.tok_tf.TransformerTokenizerTF',
            'hanlp.components.pos.RNNPartOfSpeechTagger': 'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF',
            'hanlp.components.pos_tf.RNNPartOfSpeechTaggerTF':
                'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF',
            'hanlp.components.pos_tf.CNNPartOfSpeechTaggerTF':
                'hanlp.components.taggers.pos_tf.CNNPartOfSpeechTaggerTF',
            'hanlp.components.ner_tf.TransformerNamedEntityRecognizerTF':
                'hanlp.components.ner.ner_tf.TransformerNamedEntityRecognizerTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineDependencyParser':
                'hanlp.components.parsers.biaffine_parser_tf.BiaffineDependencyParserTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineSemanticDependencyParser':
                'hanlp.components.parsers.biaffine_parser_tf.BiaffineSemanticDependencyParserTF',
            'hanlp.components.tok_tf.NgramConvTokenizerTF':
                'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF',
            'hanlp.components.classifiers.transformer_classifier.TransformerClassifier':
                'hanlp.components.classifiers.transformer_classifier_tf.TransformerClassifierTF',
            'hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger':
                'hanlp.components.taggers.transformers.transformer_tagger_tf.TransformerTaggerTF',
            'hanlp.components.tok.NgramConvTokenizer':
                'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF',
        }
        cls = upgrade.get(cls, cls)
    assert cls, f'{meta_filename} doesn\'t contain classpath field'
    try:
        obj: Component = object_from_classpath(cls)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, verbose=verbose, **kwargs)
                else:
                    obj.load(metapath, **kwargs)
            obj.config['load_path'] = load_path
        return obj
    except ModuleNotFoundError as e:
        if isdebugging():
            raise e from None
        else:
            raise ModuleNotFoundError(
                f'Some modules ({e.name} etc.) required by this model are missing. Please install the full version:'
                '\n\n\tpip install hanlp[full] -U') from None
    except ValueError as e:
        if e.args and isinstance(e.args[0], str) and 'Internet connection' in e.args[0]:
            raise ConnectionError(
                'Hugging Face 🤗 Transformers failed to download because your Internet connection is either off or bad.\n'
                'See https://hanlp.hankcs.com/docs/install.html#server-without-internet for solutions.') \
                from None
        raise e from None
    except Exception as e:
        # Some users often install an incompatible tf and put the blame on HanLP. Teach them the basics.
        try:
            you_installed_wrong_versions, extras = check_version_conflicts(extras=('full',) if tf_model else None)
        # FIX: was a bare ``except:`` — keep the best-effort behavior but let
        # KeyboardInterrupt/SystemExit propagate.
        except Exception:
            you_installed_wrong_versions, extras = None, None
        if you_installed_wrong_versions:
            raise version.NotCompatible(you_installed_wrong_versions + '\nPlease reinstall HanLP in the right way:' +
                                        '\n\n\tpip install --upgrade hanlp' + (
                                            f'[{",".join(extras)}]' if extras else '')) from None
        # Print a detailed diagnostic report to stderr, then re-raise.
        eprint(f'Failed to load {identifier}.')
        from pkg_resources import parse_version
        model_version = meta.get("hanlp_version", '2.0.0-alpha.0')
        if model_version == '2.0.0':  # Quick fix: the first version used a wrong string
            model_version = '2.0.0-alpha.0'
        model_version = parse_version(model_version)
        installed_version = parse_version(version.__version__)
        try:
            latest_version = get_latest_info_from_pypi()
        # FIX: was a bare ``except:``; narrowed to Exception.
        except Exception:
            latest_version = None
        if model_version > installed_version:
            eprint(f'{identifier} was created with hanlp-{model_version}, '
                   f'while you are running a lower version: {installed_version}. ')
        if installed_version != latest_version:
            eprint(f'Please upgrade HanLP with:\n'
                   f'\n\tpip install --upgrade hanlp\n')
        eprint('If the problem still persists, please submit an issue to https://github.com/hankcs/HanLP/issues\n'
               'When reporting an issue, make sure to paste the FULL ERROR LOG below.')
        eprint(f'{"ERROR LOG BEGINS":=^80}')
        import platform
        eprint(f'OS: {platform.platform()}')
        eprint(f'Python: {platform.python_version()}')
        import torch
        eprint(f'PyTorch: {torch.__version__}')
        if tf_model:
            try:
                import tensorflow
                tf_version = tensorflow.__version__
            except ModuleNotFoundError:
                tf_version = 'not installed'
            eprint(f'TensorFlow: {tf_version}')
        eprint(f'HanLP: {version.__version__}')
        import sys
        sys.stderr.flush()
        try:
            if e.args and isinstance(e.args, tuple) and isinstance(e.args[0], str):
                e.args = (e.args[0] + f'\n{"ERROR LOG ENDS":=^80}',) + e.args[1:]
        # FIX: was a bare ``except:``; narrowed while keeping the best-effort intent.
        except Exception:
            pass
        raise e from None
# NOTE(review): truncated fragment — this span begins mid-function (it looks like
# the tail of a CoNLL -> CoNLL-U conversion routine whose ``def`` line is outside
# this chunk: it rewrites each word's head/deprel into the ``deps`` field and
# writes the sentence out) followed by what appears to be module-level driver
# code that converts and concatenates the SemEval-2016 splits. It also appears
# to be cut off at the end (the statement after the final eprint is missing).
# Left byte-identical; reconstruct indentation from the original file — TODO confirm.
word.deps = [(word.head, word.deprel)] word.head = None word.deprel = None out.write(str(sent)) out.write('\n\n') for file in [ SEMEVAL2016_NEWS_TRAIN, SEMEVAL2016_NEWS_DEV, SEMEVAL2016_NEWS_TEST, SEMEVAL2016_TEXT_TRAIN, SEMEVAL2016_TEXT_DEV, SEMEVAL2016_TEXT_TEST ]: file = get_resource(file) conllu = os.path.splitext(file)[0] + '.conllu' if not os.path.isfile(conllu): eprint( f'Converting {os.path.basename(file)} to {os.path.basename(conllu)} ...' ) convert_conll_to_conllu(file) for group, part in zip( [[SEMEVAL2016_NEWS_TRAIN_CONLLU, SEMEVAL2016_TEXT_TRAIN_CONLLU], [SEMEVAL2016_NEWS_DEV_CONLLU, SEMEVAL2016_TEXT_DEV_CONLLU], [SEMEVAL2016_NEWS_TEST_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU]], ['train', 'valid', 'test']): root = get_resource(_SEMEVAL2016_HOME) dst = f'{root}/train/full.{part}.conllu' if not os.path.isfile(dst): group = [get_resource(x) for x in group] eprint( f'Concatenating {os.path.basename(group[0])} and {os.path.basename(group[1])} ' f'into full dataset {os.path.basename(dst)} ...')
def uncompress(path, dest=None, remove=True, verbose=HANLP_VERBOSE):
    """Uncompress a ``.zip`` / ``.tgz`` / tar / ``.gz`` file.

    Args:
        path: The path to a compressed file.
        dest: The dest folder.
        remove: Remove archive file after decompression.
        verbose: ``True`` to print log message.

    Returns:
        Destination path.
    """
    prefix, ext = split_if_compressed(path)
    folder_name = os.path.basename(prefix)
    file_is_zip = ext == '.zip'
    root_of_folder = None
    namelist = None
    if ext == '.gz':
        # Plain gzip stream (not an archive): decompress next to the source file.
        with gzip.open(path, 'rb') as f_in, open(prefix, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        if not dest:
            # BUG FIX: ``dest`` was never assigned on this branch, so the
            # function returned None instead of the decompressed file path.
            dest = prefix
    else:
        with zipfile.ZipFile(path, "r") if ext == '.zip' else tarfile.open(
                path, 'r:*') as archive:
            try:
                if not dest:
                    namelist = sorted(archive.namelist(
                    ) if file_is_zip else archive.getnames())
                    if namelist[0] == '.':
                        namelist = namelist[1:]
                    namelist = [
                        p[len('./'):] if p.startswith('./') else p for p in namelist
                    ]
                    if ext == '.tgz':
                        roots = set(x.split('/')[0] for x in namelist)
                        if len(roots) == 1:
                            root_of_folder = next(iter(roots))
                    else:
                        # only one file, root_of_folder = ''
                        root_of_folder = namelist[0].strip(
                            '/') if len(namelist) > 1 else ''
                    if all(
                            f.split('/')[0] == root_of_folder
                            for f in namelist[1:]) or not root_of_folder:
                        dest = os.path.dirname(
                            path)  # only one folder, unzip to the same dir
                    else:
                        root_of_folder = None
                        dest = prefix  # assume zip contains more than one files or folders
                if verbose:
                    eprint('Extracting {} to {}'.format(path, dest))
                archive.extractall(dest)
                if root_of_folder:
                    if root_of_folder != folder_name:
                        # move root to match folder name
                        os.rename(path_join(dest, root_of_folder),
                                  path_join(dest, folder_name))
                    dest = path_join(dest, folder_name)
                # BUG FIX: when the caller supplied ``dest``, ``namelist`` was
                # never built and this branch raised NameError.
                elif namelist is not None and len(namelist) == 1:
                    dest = path_join(dest, namelist[0])
            except (RuntimeError, KeyboardInterrupt) as e:
                remove = False
                # BUG FIX: ``dest`` may still be None if the failure happened
                # before a destination was chosen; os.path.exists(None) raises.
                if dest and os.path.exists(dest):
                    if os.path.isfile(dest):
                        os.remove(dest)
                    else:
                        shutil.rmtree(dest)
                raise e
    if remove:
        remove_file(path)
    return dest
def download(url, save_path=None, save_dir=hanlp_home(), prefix=HANLP_URL, append_location=True,
             verbose=HANLP_VERBOSE):
    """Download ``url`` with ``urlretrieve`` and a console progress line.

    Args:
        url: The URL to download from.
        save_path: Explicit destination path; derived from ``url`` when omitted.
        save_dir: Root directory used when deriving the destination path.
        prefix: URL prefix stripped when deriving the destination path.
        append_location: Whether to append the URL's location to the derived path.
        verbose: ``True`` to print progress messages.

    Returns:
        The path to the downloaded (or already cached) file.
    """
    if not save_path:
        save_path = path_from_url(url, save_dir, prefix, append_location)
    if os.path.isfile(save_path):
        if verbose:
            eprint('Using local {}, ignore {}'.format(save_path, url))
        return save_path
    else:
        makedirs(parent_dir(save_path))
        if verbose:
            eprint('Downloading {} to {}'.format(url, save_path))
        # Download to a temp file first so an interrupted transfer never leaves
        # a half-written file at save_path.
        tmp_path = '{}.downloading'.format(save_path)
        remove_file(tmp_path)
        try:
            def reporthook(count, block_size, total_size):
                # urlretrieve progress callback; shares timing state via globals
                # because urlretrieve calls it as a plain function.
                global start_time, progress_size
                if count == 0:
                    start_time = time.time()
                    progress_size = 0
                    return
                duration = time.time() - start_time
                duration = max(1e-8, duration)  # avoid division by zero on fast first ticks
                progress_size = int(count * block_size)
                if progress_size > total_size:
                    progress_size = total_size
                speed = int(progress_size / duration)
                ratio = progress_size / total_size
                ratio = max(1e-8, ratio)
                percent = ratio * 100
                eta = duration / ratio * (1 - ratio)
                speed = human_bytes(speed)
                progress_size = human_bytes(progress_size)
                if verbose:
                    sys.stderr.write("\r%.2f%%, %s/%s, %s/s, ETA %s " %
                                     (percent, progress_size, human_bytes(total_size), speed,
                                      time_util.report_time_delta(eta)))
                    sys.stderr.flush()

            import socket
            socket.setdefaulttimeout(10)
            opener = urllib.request.build_opener()
            opener.addheaders = [('User-agent', f'HanLP/{__version__}')]
            urllib.request.install_opener(opener)
            urlretrieve(url, tmp_path, reporthook)
            # FIX: terminate the progress line only when progress was printed;
            # previously this newline was emitted even with verbose=False.
            if verbose:
                eprint()
        except BaseException as e:
            remove_file(tmp_path)
            url = url.split('#')[0]
            if not windows():
                hints_for_download = f'e.g. \nwget {url} -O {save_path}\n'
            else:
                hints_for_download = ' Use some decent downloading tools.\n'
            if not url.startswith(HANLP_URL):
                hints_for_download += 'For third party data, you may find it on our mirror site:\n' \
                                     'https://od.hankcs.com/hanlp/data/\n'
            # BUG FIX: check_outdated() needs the network; if it fails (likely,
            # since the download itself just failed) it must not mask the
            # original download error.
            try:
                installed_version, latest_version = check_outdated()
            except Exception:
                installed_version, latest_version = None, None
            if installed_version != latest_version:
                hints_for_download += f'Or upgrade to the latest version({latest_version}):\npip install -U hanlp'
            message = f'Download failed due to [red]{repr(e)}[/red]. Please download it to {save_path} by yourself. ' \
                      f'[yellow]{hints_for_download}[/yellow]'
            if verbose:
                cprint(message)
            if hasattr(e, 'msg'):
                e.msg += '\n' + remove_color_tag(message)
            raise e
        remove_file(save_path)
        os.rename(tmp_path, save_path)
        return save_path