def is_lmf(source: AnyPath) -> bool:
    """Return True if *source* is a WN-LMF file."""
    path = Path(source).expanduser()
    # Cheap check first: reject anything that is not XML at all.
    if is_xml(path):
        with path.open(mode='rb') as fh:
            try:
                # _read_header() raises LMFError when the XML declaration
                # or doctype does not match a known WN-LMF schema.
                _read_header(fh)
            except LMFError:
                pass
            else:
                return True
    return False
def is_lmf(source: AnyPath) -> bool:
    """Return True if *source* is a WN-LMF XML file."""
    path = Path(source)
    # Only files with an .xml suffix are even considered.
    if path.suffix.lower() != '.xml':
        return False
    # Inspect the first two lines: they must be the expected XML
    # declaration followed by a recognized WN-LMF doctype.
    with path.open() as fh:
        first_line = fh.readline().rstrip()
        second_line = fh.readline().rstrip()
    return first_line == _XMLDECL and second_line in _DOCTYPES
def iterpackages(path: AnyPath) -> Iterator[Package]:
    """Yield any wordnet Packages found at *path*.

    The *path* argument can point to one of the following:
      - a lexical resource file
      - a wordnet package directory
      - a wordnet collection directory
      - a tar archive containing one of the above
      - a compressed (gzip or lzma) resource file or tar archive

    Raises:
        wn.Error: if *path* is not one of the recognized forms above
    """
    path = Path(path).expanduser()
    if path.is_dir():
        # Directories are either a single package or a collection of
        # packages; anything else is an error.
        if is_package_directory(path):
            yield Package(path)
        elif is_collection_directory(path):
            yield from Collection(path).packages()
        else:
            raise wn.Error(
                f'does not appear to be a valid package or collection: {path!s}'
            )
    elif tarfile.is_tarfile(path):
        with tarfile.open(path) as tar:
            # _check_tar() presumably validates member paths before
            # extraction — NOTE(review): confirm it guards against
            # path-traversal entries, since extractall() is used below
            # without the 3.12+ ``filter=`` argument.
            _check_tar(tar)
            # Extract to a temporary directory and recurse; the yielded
            # packages must be consumed before the directory is cleaned
            # up when this generator is exhausted or closed.
            with tempfile.TemporaryDirectory() as tmpdir:
                tar.extractall(path=tmpdir)
                contents = list(Path(tmpdir).iterdir())
                if len(contents) != 1:
                    raise wn.Error(
                        'archive may only have one resource, package, or collection'
                    )
                yield from iterpackages(contents[0])
    else:
        # Not a directory or tar archive: treat it as a (possibly
        # compressed) standalone resource file.
        decompressed: Path
        with _get_decompressed(path) as decompressed:
            if lmf.is_lmf(decompressed):
                yield _ResourceOnlyPackage(decompressed)
            else:
                raise wn.Error(
                    f'not a valid lexical resource: {path!s}'
                )
def dump(lexicons: LexicalResource, destination: AnyPath, version: str = '1.0') -> None:
    """Write wordnets in the WN-LMF format.

    Args:
        lexicons: a list of :class:`Lexicon` objects
        destination: path of the file to write
        version: WN-LMF schema version to serialize (default ``'1.0'``)

    Raises:
        LMFError: if *version* is not a supported schema version
    """
    if version not in _SCHEMAS:
        raise LMFError(f'invalid version: {version}')
    target = Path(destination).expanduser()
    doctype = _DOCTYPE.format(schema=_SCHEMAS[version])
    dc_uri = _DC_URIS[version]
    with target.open('wt', encoding='utf-8') as out:
        # Prologue: XML declaration, doctype, and the root element with
        # the version-appropriate Dublin Core namespace.
        out.write(_XMLDECL.decode('utf-8') + '\n')
        out.write(doctype + '\n')
        out.write(f'<LexicalResource xmlns:dc="{dc_uri}">\n')
        for lexicon in lexicons:
            _dump_lexicon(lexicon, out, version)
        out.write('</LexicalResource>\n')
def load(source: AnyPath) -> LexicalResource:
    """Load wordnets encoded in the WN-LMF format.

    Args:
        source: path to a WN-LMF file
    """
    source = Path(source).expanduser()
    # Peek at the file header first to learn which WN-LMF schema
    # version the document declares; _read_header() raises LMFError
    # for non-LMF input.
    with source.open('rb') as fh:
        version = _read_header(fh)
    # Pull-parse the document with start/end events so each lexicon
    # can be built and its subtree discarded without holding the
    # whole XML tree in memory.
    events = XMLEventIterator(ET.iterparse(source, events=('start', 'end')))
    root = events.start('LexicalResource')
    lexicons: List[Lexicon] = []
    while events.starts('Lexicon', 'LexiconExtension'):
        lexicons.append(_load_lexicon(events, version))
        # Drop already-processed children from the root to bound memory.
        root.clear()
    events.end(root.tag)
    list(events)  # consume remaining events, if any
    return lexicons
def is_collection_directory(path: AnyPath) -> bool:
    """Return True if *path* is a directory containing at least one package."""
    directory = Path(path)
    if not directory.is_dir():
        return False
    # Count package subdirectories; one or more makes a collection.
    return sum(map(is_package_directory, directory.iterdir())) >= 1
def is_package_directory(path: AnyPath) -> bool:
    """Return True if *path* is a directory with exactly one lexical resource."""
    directory = Path(path)
    if not directory.is_dir():
        return False
    resources = [entry for entry in directory.iterdir() if is_lexical_resource(entry)]
    # A package holds exactly one resource file — no more, no fewer.
    return len(resources) == 1
def is_collection_directory(path: AnyPath) -> bool:
    """Return ``True`` if *path* appears to be a wordnet collection."""
    directory = Path(path).expanduser()
    if not directory.is_dir():
        return False
    # A collection is any directory holding one or more packages.
    return sum(map(is_package_directory, directory.iterdir())) >= 1
def is_package_directory(path: AnyPath) -> bool:
    """Return ``True`` if *path* appears to be a Wordnet Package."""
    directory = Path(path).expanduser()
    if not directory.is_dir():
        return False
    lmf_files = [entry for entry in directory.iterdir() if lmf.is_lmf(entry)]
    # A package contains exactly one WN-LMF resource file.
    return len(lmf_files) == 1