def _get_lexicon_rowids_for_lexicon( rows: List[Tuple[int, str, str, str]], lexicon: str, ) -> Set[int]: lexmap: Dict[str, Dict[str, int]] = {} for rowid, id, version, _ in rows: lexmap.setdefault(id, {})[version] = rowid lex_match: Set[int] = set() for id_ver in lexicon.split(): id, _, ver = id_ver.partition(':') if id == '*': for vermap in lexmap.values(): for version, rowid in vermap.items(): if ver in ('', '*', version): lex_match.add(rowid) elif id in lexmap: if ver == '*': lex_match.update(rowid for rowid in lexmap[id].values()) elif ver == '': lex_match.add(max( lexmap[id].values())) # last installed version elif ver in lexmap[id]: lex_match.add(lexmap[id][ver]) else: raise wn.Error( f"no lexicon with id '{id}' found with version '{ver}'") else: raise wn.Error(f"no lexicon found with id '{id}'") return lex_match
def _get_lexicon_rowids( conn: sqlite3.Connection, lgcode: str = None, lexicon: str = None, ) -> Tuple[int, ...]: rowids: Set[int] = set() query = '''SELECT DISTINCT rowid, id, version FROM lexicons WHERE :lgcode ISNULL OR :lgcode = language''' rows = conn.execute(query, {'lgcode': lgcode}) if not lexicon: rowids.update(rowid for rowid, _, _ in rows) else: lexmap: Dict[str, Dict[str, int]] = {} for rowid, id, version in rows: lexmap.setdefault(id, {})[version] = rowid for id_ver in lexicon.split(): id, _, ver = id_ver.partition(':') if id == '*': assert not ver, "version not allowed on '*'" for proj in lexmap.values(): rowids.update(proj.values()) break if id not in lexmap: raise wn.Error(f'invalid lexicon id: {id}') if ver == '*': rowids.update(lexmap[id].values()) elif not ver: rowids.add(next(iter(lexmap[id].values()))) elif ver not in lexmap[id]: raise wn.Error(f'invalid lexicon version: {ver} ({id})') else: rowids.add(lexmap[id][ver]) return tuple(rowids)
def resource_file(self) -> Path:
    """Return the path of the package's single lexical resource file.

    Raises :exc:`wn.Error` if the package directory contains zero or
    more than one lexical resource.
    """
    files = [p for p in self._path.iterdir() if is_lexical_resource(p)]
    if len(files) == 1:
        return files[0]
    if not files:
        raise wn.Error(f'no resource found in package: {self._path!s}')
    raise wn.Error(
        f'multiple resource found in package: {self._path!s}')
def resource_file(self) -> Path:
    """Return the path of the package's resource file.

    Raises :exc:`wn.Error` if the package directory contains zero or
    more than one LMF file.
    """
    candidates = [member for member in self._path.iterdir() if lmf.is_lmf(member)]
    if not candidates:
        raise wn.Error(f'no resource found in package: {self._path!s}')
    if len(candidates) > 1:
        raise wn.Error(f'multiple resource found in package: {self._path!s}')
    return candidates[0]
def resource_file(self) -> Path:
    """Return the path of the package's resource file.

    Raises :exc:`wn.Error` if the package directory contains zero or
    more than one resource.
    """
    typed_files = _package_directory_types(self._path)
    count = len(typed_files)
    if count == 0:
        raise wn.Error(f'no resource found in package: {self._path!s}')
    if count > 1:
        raise wn.Error(
            f'multiple resource found in package: {self._path!s}')
    # each entry pairs the file path with its detected resource type
    return typed_files[0][0]
def ili(self, id: str) -> ILI:
    """Return the first ILI in this wordnet with identifier *id*.

    Raises:
        wn.Error: if no ILI has identifier *id*
    """
    iterable = find_ilis(id=id, lexicon_rowids=self._lexicon_ids)
    try:
        return ILI(*next(iterable))
    except StopIteration:
        # `from None` drops the uninformative StopIteration context
        # from the traceback shown to users
        raise wn.Error(f'no such ILI: {id}') from None
def _get_decompressed(source: Path) -> Iterator[Path]:
    """Yield a path to a decompressed copy of *source*.

    If *source* is not gzip- or lzma-compressed it is yielded as-is;
    otherwise the decompressed data is written to a temporary file
    which is removed when the context exits.

    Raises:
        wn.Error: if decompression fails
    """
    gzipped = is_gzip(source)
    xzipped = is_lzma(source)
    if not (gzipped or xzipped):
        yield source
    else:
        tmp = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)
        path = Path(tmp.name)
        try:
            try:
                if gzipped:
                    with gzip.open(source, 'rb') as gzip_src:
                        shutil.copyfileobj(gzip_src, tmp)
                else:  # xzipped
                    with lzma.open(source, 'rb') as lzma_src:
                        shutil.copyfileobj(lzma_src, tmp)
            finally:
                # close unconditionally (the original only closed on
                # success): Windows cannot reliably reopen or unlink the
                # file while the handle is still open
                tmp.close()
            yield path
        except (OSError, EOFError, lzma.LZMAError) as exc:
            raise wn.Error(f'could not decompress file: {source}') from exc
        finally:
            path.unlink()
def sense(self, id: str) -> Sense:
    """Return the first sense in this wordnet with identifier *id*.

    Raises:
        wn.Error: if no sense has identifier *id*
    """
    iterable = _db.find_senses(id=id, lexicon_rowids=self._lexicon_ids)
    try:
        lexid, rowid, id, entry_id, synset_id = next(iterable)
        return Sense(id, entry_id, synset_id, lexid, rowid, self)
    except StopIteration:
        # suppress the uninformative StopIteration context
        raise wn.Error(f'no such sense: {id}') from None
def synset(self, id: str) -> Synset:
    """Return the first synset in this wordnet with identifier *id*.

    Raises:
        wn.Error: if no synset has identifier *id*
    """
    iterable = _db.find_synsets(id=id, lexicon_rowids=self._lexicon_ids)
    try:
        lexid, rowid, id, pos, ili = next(iterable)
        return Synset(id, pos, ili, lexid, rowid, self)
    except StopIteration:
        # suppress the uninformative StopIteration context
        raise wn.Error(f'no such synset: {id}') from None
def word(self, id: str) -> Word:
    """Return the first word in this wordnet with identifier *id*.

    Raises:
        wn.Error: if no lexical entry has identifier *id*
    """
    iterable = _db.find_entries(id=id, lexicon_rowids=self._lexicon_ids)
    try:
        lexid, rowid, id, pos, forms = next(iterable)
        return Word(id, pos, forms, lexid, rowid, self)
    except StopIteration:
        # suppress the uninformative StopIteration context
        raise wn.Error(f'no such lexical entry: {id}') from None
def sense(self, id: str) -> Sense:
    """Return the first sense in this wordnet with identifier *id*.

    Raises:
        wn.Error: if no sense has identifier *id*
    """
    iterable = find_senses(id=id, lexicon_rowids=self._lexicon_ids)
    try:
        return Sense(*next(iterable), self)
    except StopIteration:
        # suppress the uninformative StopIteration context
        raise wn.Error(f'no such sense: {id}') from None
def get_metadata(rowid: int, table: str) -> Metadata:
    """Return the metadata mapping stored on row *rowid* of *table*.

    Raises :exc:`wn.Error` if *table* is not a metadata-bearing table.
    """
    conn = connect()
    tablename = _SANITIZED_METADATA_TABLES.get(table)
    if tablename is None:
        raise wn.Error(f"'{table}' does not contain metadata")
    row = conn.execute(
        f'SELECT metadata FROM {tablename} WHERE rowid=?', (rowid,)
    ).fetchone()
    # a NULL metadata column falls back to an empty mapping
    return row[0] or {}
def add(
    source: AnyPath,
    progress_handler: Optional[Type[ProgressHandler]] = ProgressBar,
) -> None:
    """Add the LMF file at *source* to the database.

    The file at *source* may be gzip-compressed or plain text XML.

    >>> wn.add('english-wordnet-2020.xml')
    Added ewn:2020 (English WordNet)

    The *progress_handler* parameter takes a subclass of
    :class:`wn.util.ProgressHandler`. An instance of the class will be
    created, used, and closed by this function.
    """
    # fall back to the no-op handler when explicitly disabled
    if progress_handler is None:
        progress_handler = ProgressHandler
    progress = progress_handler(message='Database')
    logger.info('adding project to database')
    logger.info(' database: %s', wn.config.database_path)
    logger.info(' project file: %s', source)
    try:
        for package in iterpackages(source):
            package_type = package.type
            if package_type == _WORDNET:
                _add_lmf(package.resource_file(), progress)
            elif package_type == _ILI:
                _add_ili(package.resource_file(), progress)
            else:
                raise wn.Error(f'unknown package type: {package_type}')
    finally:
        progress.close()
def word(self, id: str) -> Word:
    """Return the first word in this wordnet with identifier *id*.

    Raises:
        wn.Error: if no lexical entry has identifier *id*
    """
    iterable = find_entries(id=id, lexicon_rowids=self._lexicon_ids)
    try:
        return Word(*next(iterable), self)
    except StopIteration:
        # suppress the uninformative StopIteration context
        raise wn.Error(f'no such lexical entry: {id}') from None
def _check_tar(tar: tarfile.TarFile) -> None: """Check the tarfile to avoid potential security issues. Currently collections and packages have the following constraints: - Only regular files or directories - No paths starting with '/' or containing '..' """ for info in tar.getmembers(): if not (info.isfile() or info.isdir()): raise wn.Error( f'tarfile member is not a regular file or directory: {info.name}' ) if info.name.startswith('/') or '..' in info.name: raise wn.Error( f'tarfile member paths may not be absolute or contain ..: {info.name}' )
def synset(self, id: str) -> Synset:
    """Return the first synset in this wordnet with identifier *id*.

    Raises:
        wn.Error: if no synset has identifier *id*
    """
    iterable = _db.find_synsets(id=id, lexicon_rowids=self._lexicon_ids)
    try:
        lexid, rowid, id, pos, ili = next(iterable)
        return Synset(id, pos, ili, lexid, rowid, self)
    except StopIteration:
        # suppress the uninformative StopIteration context
        raise wn.Error(f'no such synset: {id}') from None
def shortest_path(self, other: 'Synset', simulate_root: bool = False) -> List['Synset']:
    """Return the shortest hypernym path from this synset to *other*.

    Raises :exc:`wn.Error` if the synsets are not connected.
    """
    paths = self._shortest_hyp_paths(other, simulate_root)
    best = min(paths, key=lambda k: len(paths[k]), default=None)
    if best is None:
        raise wn.Error(f'no path between {self!r} and {other!r}')
    return paths[best]
def _get_lexicon_rowids( conn: sqlite3.Connection, lgcode: str = None, lexicon: str = None, ) -> List[int]: rows = conn.execute( 'SELECT rowid, id, version, language FROM lexicons').fetchall() lg_match: Set[int] = set() if lgcode: lg_match.update(rowid for rowid, _, _, language in rows if language == lgcode) if not lg_match: raise wn.Error(f"no lexicon found with language code '{lgcode}'") else: lg_match.update(row[0] for row in rows) lex_match: Set[int] = set() lex_specs = lexicon.split() if lexicon else [] if not lex_specs or '*' in lex_specs or '*:' in lex_specs: lex_match.update(row[0] for row in rows) else: lexmap: Dict[str, Dict[str, int]] = {} for rowid, id, version, _ in rows: lexmap.setdefault(id, {})[version] = rowid for id_ver in lex_specs: id, _, ver = id_ver.partition(':') if id == '*': raise wn.Error("version not allowed when lexicon id is '*'") elif id not in lexmap: raise wn.Error(f"no lexicon found with id '{id}'") if not ver: lex_match.add(next(iter(lexmap[id].values()))) elif ver not in lexmap[id]: raise wn.Error( f"no lexicon with id '{id}' found with version '{ver}'") else: lex_match.add(lexmap[id][ver]) result = lg_match & lex_match if rows and not result: raise wn.Error( f'no lexicon found with lgcode={lgcode!r} and lexicon={lexicon!r}') return sorted(result)
def iterpackages(path: AnyPath) -> Iterator[Package]:
    """Yield any wordnet Packages found at *path*.

    The *path* argument can point to one of the following:
      - a lexical resource file
      - a wordnet package directory
      - a wordnet collection directory
      - a tar archive containing one of the above
      - a compressed (gzip or lzma) resource file or tar archive
    """
    path = Path(path).expanduser()
    if path.is_dir():
        if is_package_directory(path):
            yield Package(path)
            return
        if is_collection_directory(path):
            yield from Collection(path).packages()
            return
        raise wn.Error(
            f'does not appear to be a valid package or collection: {path!s}'
        )
    elif tarfile.is_tarfile(path):
        with tarfile.open(path) as tar:
            # validate member names before extracting anything
            _check_tar(tar)
            with tempfile.TemporaryDirectory() as tmpdir:
                tar.extractall(path=tmpdir)
                entries = list(Path(tmpdir).iterdir())
                if len(entries) != 1:
                    raise wn.Error(
                        'archive may only have one resource, package, or collection'
                    )
                yield from iterpackages(entries[0])
    else:
        with _get_decompressed(path) as decompressed:
            if not lmf.is_lmf(decompressed):
                raise wn.Error(
                    f'not a valid lexical resource: {path!s}'
                )
            yield _ResourceOnlyPackage(decompressed)
def get_lexicalized(rowid: int, table: str) -> bool:
    """Return whether row *rowid* of *table* is marked as lexicalized.

    Raises :exc:`wn.Error` if *table* has no lexicalized column.
    """
    conn = connect()
    tablename = _SANITIZED_LEXICALIZED_TABLES.get(table)
    if tablename is None:
        raise wn.Error(f"'{table}' does not mark lexicalization")
    # the sentinel rowid represents "no row" and is never lexicalized
    if rowid == NON_ROWID:
        return False
    row = conn.execute(
        f'SELECT lexicalized FROM {tablename} WHERE rowid=?', (rowid,)
    ).fetchone()
    return row[0]
def _get_lexicon_rowids_for_lang(rows: List[Tuple[int, str, str, str]], lang: Optional[str]) -> Set[int]: lg_match: Set[int] = set() if lang: lg_match.update(rowid for rowid, _, _, language in rows if language == lang) if not lg_match: raise wn.Error(f"no lexicon found with language code '{lang}'") else: lg_match.update(row[0] for row in rows) return lg_match
def _precheck(lexicons: Sequence[Lexicon]) -> None:
    """Raise :exc:`wn.Error` if identifiers collide across *lexicons*."""
    seen: Set[str] = set()
    for lex in lexicons:
        lexids = (lex._id,)
        ids = {lex.id}
        for finder in (find_entries, find_senses, find_synsets):
            ids.update(row[0] for row in finder(lexicon_rowids=lexids))
        # TODO: syntactic behaviours
        if seen & ids:
            raise wn.Error('cannot export: non-unique identifiers in lexicons')
        seen.update(ids)
def _download(url: str, path: Path, progress: ProgressHandler) -> None:
    """Stream *url* into *path*, reporting progress.

    On failure or cancellation the partial file is removed and
    :exc:`wn.Error` is raised (other exceptions propagate after cleanup).
    """
    size: int = 0  # Content-Length as reported by the server
    try:
        with open(path, 'wb') as f:
            progress.set(status='Requesting')
            with requests.get(url, stream=True, timeout=TIMEOUT) as response:
                size = int(response.headers.get('Content-Length', 0))
                progress.set(total=size, status='Receiving')
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    if not chunk:
                        continue
                    f.write(chunk)
                    progress.update(len(chunk))
            progress.set(status='Complete')
    except requests.exceptions.RequestException as exc:
        path.unlink()
        raise wn.Error(f'Download failed at {size} bytes') from exc
    except KeyboardInterrupt as exc:
        path.unlink()
        raise wn.Error(f'Download cancelled at {size} bytes') from exc
    except Exception:
        # unexpected error: clean up the partial file, then re-raise
        path.unlink()
        raise
def shortest_path(
    self, other: 'Synset', simulate_root: bool = False
) -> List['Synset']:
    """Return the shortest path from the synset to the *other* synset.

    The starting synset itself is not included in the returned path.

    Arguments:
        other: endpoint synset of the path
        simulate_root: if :python:`True`, ensure any two synsets are
            always connected by positing a fake root node

    Raises :exc:`wn.Error` if the synsets are not connected.
    """
    paths = self._shortest_hyp_paths(other, simulate_root)
    best = min(paths, key=lambda k: len(paths[k]), default=None)
    if best is None:
        raise wn.Error(f'no path between {self!r} and {other!r}')
    # drop the starting synset from the path
    return paths[best][1:]
def find_lexicons(
    lexicon: str,
    lang: str = None,
) -> Iterator[_Lexicon]:
    """Yield lexicon rows matching the *lexicon* specifier and *lang*.

    Raises :exc:`wn.Error` when lexicons exist but none match.
    """
    conn = connect()
    rows = conn.execute(
        'SELECT rowid, id, version, language FROM lexicons').fetchall()
    rowids = _get_lexicon_rowids_for_lang(rows, lang)
    # '*' matches everything, so the (somewhat expensive) specifier
    # resolution can be skipped in that common case
    if lexicon != '*':
        rowids &= _get_lexicon_rowids_for_lexicon(rows, lexicon)
    if rows and not rowids:
        raise wn.Error(
            f'no lexicon found with lang={lang!r} and lexicon={lexicon!r}')
    yield from (_get_lexicon(conn, rowid) for rowid in sorted(rowids))
def get_examples(
    rowid: int,
    table: str,
    lexicon_rowids: Sequence[int],
) -> List[Tuple[str, str, int]]:
    """Return ``(example, language, rowid)`` triples for *rowid* in *table*.

    Raises :exc:`wn.Error` if *table* has no associated examples table.
    """
    conn = connect()
    prefix = _SANITIZED_EXAMPLE_PREFIXES.get(table)
    if prefix is None:
        raise wn.Error(f"'{table}' does not have examples")
    query = f'''
        SELECT example, language, rowid
          FROM {prefix}_examples
         WHERE {prefix}_rowid = ?
           AND lexicon_rowid IN ({_qs(lexicon_rowids)})
    '''
    params = (rowid, *lexicon_rowids)
    return conn.execute(query, params).fetchall()
def _insert_sense_relations(lexicon, lexid, lexidmap, cur, progress): progress.set(status='Sense Relations') # need to separate relations into those targeting senses vs synsets synset_ids = {ss.id for ss in lexicon.synsets} sense_ids = {s.id for e in lexicon.lexical_entries for s in e.senses} s_s_rels = [] s_ss_rels = [] for entry in lexicon.lexical_entries: for sense in entry.senses: slid = lexidmap.get(sense.id, lexid) for relation in sense.relations: target_id = relation.target tlid = lexidmap.get(target_id, lexid) if target_id in sense_ids: s_s_rels.append((sense.id, slid, tlid, relation)) elif target_id in synset_ids: s_ss_rels.append((sense.id, slid, tlid, relation)) else: raise wn.Error( f'relation target is not a known sense or synset: {target_id}' ) hyperparams = [ ('sense_relations', SENSE_QUERY, s_s_rels), ('sense_synset_relations', SYNSET_QUERY, s_ss_rels), ] for table, target_query, rels in hyperparams: query = f''' INSERT INTO {table} VALUES (null,?,({SENSE_QUERY}),({target_query}),({RELTYPE_QUERY}),?) ''' for batch in _split(rels): data = [(lexid, sense_id, slid, relation.target, tlid, relation.type, relation.meta) for sense_id, slid, tlid, relation in batch] cur.executemany(query, data) progress.update(len(data))
def download(
    project_or_url: str,
    add: bool = True,
    progress_handler: Optional[Type[ProgressHandler]] = ProgressBar,
) -> Path:
    """Download the resource specified by *project_or_url*.

    First the URL of the resource is determined and then, depending on
    the parameters, the resource is downloaded and added to the
    database. The function then returns the path of the cached file.

    If *project_or_url* starts with `'http://'` or `'https://'`, then
    it is taken to be the URL for the resource. Otherwise,
    *project_or_url* is taken as a :ref:`project specifier
    <lexicon-specifiers>` and the URL is taken from a matching entry in
    Wn's project index. If no project matches the specifier,
    :exc:`wn.Error` is raised.

    If the URL has been downloaded and cached before, the cached file
    is used. Otherwise the URL is retrieved and stored in the cache.

    If the *add* parameter is ``True`` (default), the downloaded
    resource is added to the database.

    >>> wn.download('ewn:2020')
    Added ewn:2020 (English WordNet)

    The *progress_handler* parameter takes a subclass of
    :class:`wn.util.ProgressHandler`. An instance of the class will be
    created, used, and closed by this function.
    """
    if is_url(project_or_url):
        url = project_or_url
    else:
        # resolve a project specifier through Wn's project index
        url = config.get_project_info(project_or_url)['resource_url']
    logger.info('download url: %s', url)
    path = config.get_cache_path(url)
    logger.info('download cache path: %s', path)

    if progress_handler is None:
        progress_handler = ProgressHandler
    progress = progress_handler(message='Download', unit=' bytes')
    try:
        if path.exists():
            progress.flash(f'Cached file found: {path!s}')
        else:
            _download(url, path, progress)
    finally:
        progress.close()

    if add:
        try:
            add_to_db(path, progress_handler=progress_handler)
        except wn.Error as exc:
            raise wn.Error(
                f'could not add downloaded file: {path}\n You might try '
                'deleting the cached file and trying the download again.'
            ) from exc
    return path
def lch(synset1: Synset, synset2: Synset, max_depth: int = 0) -> float:
    """Return the Leacock-Chodorow similarity of *synset1* and *synset2*.

    Arguments:
        synset1: the first synset
        synset2: the second synset
        max_depth: the taxonomy depth used for normalization; must be
            greater than 0

    Raises:
        wn.Error: if *max_depth* is not greater than 0
    """
    # validate before the potentially expensive path search (the
    # original computed the path first, wasting work and possibly
    # masking this error with a no-path error)
    if max_depth <= 0:
        raise wn.Error('max_depth must be greater than 0')
    distance = len(synset1.shortest_path(synset2, simulate_root=True))
    return -math.log((distance + 1) / (2 * max_depth))