Ejemplo n.º 1
0
def _get_lexicon_rowids_for_lexicon(
    rows: List[Tuple[int, str, str, str]],
    lexicon: str,
) -> Set[int]:
    lexmap: Dict[str, Dict[str, int]] = {}
    for rowid, id, version, _ in rows:
        lexmap.setdefault(id, {})[version] = rowid

    lex_match: Set[int] = set()
    for id_ver in lexicon.split():
        id, _, ver = id_ver.partition(':')

        if id == '*':
            for vermap in lexmap.values():
                for version, rowid in vermap.items():
                    if ver in ('', '*', version):
                        lex_match.add(rowid)

        elif id in lexmap:
            if ver == '*':
                lex_match.update(rowid for rowid in lexmap[id].values())
            elif ver == '':
                lex_match.add(max(
                    lexmap[id].values()))  # last installed version
            elif ver in lexmap[id]:
                lex_match.add(lexmap[id][ver])
            else:
                raise wn.Error(
                    f"no lexicon with id '{id}' found with version '{ver}'")

        else:
            raise wn.Error(f"no lexicon found with id '{id}'")

    return lex_match
Ejemplo n.º 2
0
def _get_lexicon_rowids(
        conn: sqlite3.Connection,
        lgcode: str = None,
        lexicon: str = None,
) -> Tuple[int, ...]:
    rowids: Set[int] = set()
    query = '''SELECT DISTINCT rowid, id, version
                 FROM lexicons
                WHERE :lgcode ISNULL OR :lgcode = language'''
    rows = conn.execute(query, {'lgcode': lgcode})
    if not lexicon:
        rowids.update(rowid for rowid, _, _ in rows)
    else:
        lexmap: Dict[str, Dict[str, int]] = {}
        for rowid, id, version in rows:
            lexmap.setdefault(id, {})[version] = rowid
        for id_ver in lexicon.split():
            id, _, ver = id_ver.partition(':')
            if id == '*':
                assert not ver, "version not allowed on '*'"
                for proj in lexmap.values():
                    rowids.update(proj.values())
                break
            if id not in lexmap:
                raise wn.Error(f'invalid lexicon id: {id}')
            if ver == '*':
                rowids.update(lexmap[id].values())
            elif not ver:
                rowids.add(next(iter(lexmap[id].values())))
            elif ver not in lexmap[id]:
                raise wn.Error(f'invalid lexicon version: {ver} ({id})')
            else:
                rowids.add(lexmap[id][ver])
    return tuple(rowids)
Ejemplo n.º 3
0
 def resource_file(self) -> Path:
     """Return the path of the package's resource file."""
     found = [p for p in self._path.iterdir() if is_lexical_resource(p)]
     if not found:
         raise wn.Error(f'no resource found in package: {self._path!s}')
     elif len(found) > 1:
         raise wn.Error(
             f'multiple resource found in package: {self._path!s}')
     return found[0]
Ejemplo n.º 4
0
 def resource_file(self) -> Path:
     """Return the path of the package's resource file."""
     candidates = [p for p in self._path.iterdir() if lmf.is_lmf(p)]
     if len(candidates) > 1:
         raise wn.Error(f'multiple resource found in package: {self._path!s}')
     if not candidates:
         raise wn.Error(f'no resource found in package: {self._path!s}')
     return candidates[0]
Ejemplo n.º 5
0
 def resource_file(self) -> Path:
     """Return the path of the package's resource file."""
     typed = _package_directory_types(self._path)
     if not typed:
         raise wn.Error(f'no resource found in package: {self._path!s}')
     if len(typed) > 1:
         raise wn.Error(
             f'multiple resource found in package: {self._path!s}')
     # each entry is a pair whose first member is the path
     return typed[0][0]
Ejemplo n.º 6
0
 def ili(self, id: str) -> ILI:
     """Return the first ILI in this wordnet with identifier *id*."""
     results = find_ilis(id=id, lexicon_rowids=self._lexicon_ids)
     try:
         row = next(results)
     except StopIteration:
         raise wn.Error(f'no such ILI: {id}')
     return ILI(*row)
Ejemplo n.º 7
0
def _get_decompressed(source: Path) -> Iterator[Path]:
    """Yield a path to a decompressed copy of *source*.

    If *source* is neither gzip- nor lzma-compressed, *source* itself
    is yielded.  Otherwise the decompressed content is written to a
    temporary file whose path is yielded and which is removed when the
    generator resumes.

    Raises:
        wn.Error: if decompression fails
    """
    gzipped = is_gzip(source)
    xzipped = is_lzma(source)
    if not (gzipped or xzipped):
        yield source
    else:
        tmp = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)
        path = Path(tmp.name)
        try:
            try:
                if gzipped:
                    with gzip.open(source, 'rb') as gzip_src:
                        shutil.copyfileobj(gzip_src, tmp)
                else:  # xzipped
                    with lzma.open(source, 'rb') as lzma_src:
                        shutil.copyfileobj(lzma_src, tmp)
            finally:
                # close even when decompression raises; otherwise the
                # handle leaks and the unlink below can fail on Windows
                # (Windows cannot reliably reopen/remove an open file)
                tmp.close()

            yield path

        except (OSError, EOFError, lzma.LZMAError) as exc:
            raise wn.Error(f'could not decompress file: {source}') from exc

        finally:
            path.unlink()
Ejemplo n.º 8
0
 def sense(self, id: str) -> Sense:
     """Return the first sense in this wordnet with identifier *id*."""
     results = _db.find_senses(id=id, lexicon_rowids=self._lexicon_ids)
     try:
         row = next(results)
     except StopIteration:
         raise wn.Error(f'no such sense: {id}')
     lexid, rowid, sense_id, entry_id, synset_id = row
     return Sense(sense_id, entry_id, synset_id, lexid, rowid, self)
Ejemplo n.º 9
0
 def synset(self, id: str) -> Synset:
     """Return the first synset in this wordnet with identifier *id*."""
     results = _db.find_synsets(id=id, lexicon_rowids=self._lexicon_ids)
     try:
         row = next(results)
     except StopIteration:
         raise wn.Error(f'no such synset: {id}')
     lexid, rowid, synset_id, pos, ili = row
     return Synset(synset_id, pos, ili, lexid, rowid, self)
Ejemplo n.º 10
0
 def word(self, id: str) -> Word:
     """Return the first word in this wordnet with identifier *id*."""
     results = _db.find_entries(id=id, lexicon_rowids=self._lexicon_ids)
     try:
         row = next(results)
     except StopIteration:
         raise wn.Error(f'no such lexical entry: {id}')
     lexid, rowid, entry_id, pos, forms = row
     return Word(entry_id, pos, forms, lexid, rowid, self)
Ejemplo n.º 11
0
 def sense(self, id: str) -> Sense:
     """Return the first sense in this wordnet with identifier *id*."""
     results = find_senses(id=id, lexicon_rowids=self._lexicon_ids)
     try:
         row = next(results)
     except StopIteration:
         raise wn.Error(f'no such sense: {id}')
     return Sense(*row, self)
Ejemplo n.º 12
0
def get_metadata(rowid: int, table: str) -> Metadata:
    """Return the metadata stored for row *rowid* of *table*.

    Raises:
        wn.Error: if *table* is not a metadata-bearing table
    """
    conn = connect()
    tablename = _SANITIZED_METADATA_TABLES.get(table)
    if tablename is None:
        raise wn.Error(f"'{table}' does not contain metadata")
    # tablename comes from the sanitized mapping, so interpolation is safe
    row = conn.execute(
        f'SELECT metadata FROM {tablename} WHERE rowid=?', (rowid, )
    ).fetchone()
    return row[0] or {}
Ejemplo n.º 13
0
def add(
    source: AnyPath,
    progress_handler: Optional[Type[ProgressHandler]] = ProgressBar,
) -> None:
    """Add the LMF file at *source* to the database.

    The file at *source* may be gzip-compressed or plain text XML.

    >>> wn.add('english-wordnet-2020.xml')
    Added ewn:2020 (English WordNet)

    The *progress_handler* parameter takes a subclass of
    :class:`wn.util.ProgressHandler`. An instance of the class will be
    created, used, and closed by this function.

    """
    handler_cls = ProgressHandler if progress_handler is None else progress_handler
    progress = handler_cls(message='Database')

    logger.info('adding project to database')
    logger.info('  database: %s', wn.config.database_path)
    logger.info('  project file: %s', source)

    try:
        for package in iterpackages(source):
            # dispatch on the package type
            adder = {_WORDNET: _add_lmf, _ILI: _add_ili}.get(package.type)
            if adder is None:
                raise wn.Error(f'unknown package type: {package.type}')
            adder(package.resource_file(), progress)
    finally:
        progress.close()
Ejemplo n.º 14
0
 def word(self, id: str) -> Word:
     """Return the first word in this wordnet with identifier *id*."""
     results = find_entries(id=id, lexicon_rowids=self._lexicon_ids)
     try:
         row = next(results)
     except StopIteration:
         raise wn.Error(f'no such lexical entry: {id}')
     return Word(*row, self)
Ejemplo n.º 15
0
def _check_tar(tar: tarfile.TarFile) -> None:
    """Check the tarfile to avoid potential security issues.

    Currently collections and packages have the following constraints:
    - Only regular files or directories
    - No paths starting with '/' or containing '..'
    """
    for info in tar.getmembers():
        if not (info.isfile() or info.isdir()):
            raise wn.Error(
                f'tarfile member is not a regular file or directory: {info.name}'
            )
        if info.name.startswith('/') or '..' in info.name:
            raise wn.Error(
                f'tarfile member paths may not be absolute or contain ..: {info.name}'
            )
Ejemplo n.º 16
0
 def synset(self, id: str) -> Synset:
     """Return the first synset in this wordnet with identifier *id*."""
     results = _db.find_synsets(id=id, lexicon_rowids=self._lexicon_ids)
     try:
         row = next(results)
     except StopIteration:
         raise wn.Error(f'no such synset: {id}')
     lexid, rowid, synset_id, pos, ili = row
     return Synset(synset_id, pos, ili, lexid, rowid, self)
Ejemplo n.º 17
0
 def shortest_path(self,
                   other: 'Synset',
                   simulate_root: bool = False) -> List['Synset']:
     """Return the shortest path from this synset to *other*.

     Raises:
         wn.Error: if no path connects the two synsets
     """
     pathmap = self._shortest_hyp_paths(other, simulate_root)
     best = min(pathmap, key=lambda k: len(pathmap[k]), default=None)
     if best is None:
         raise wn.Error(f'no path between {self!r} and {other!r}')
     return pathmap[best]
Ejemplo n.º 18
0
def _get_lexicon_rowids(
    conn: sqlite3.Connection,
    lgcode: str = None,
    lexicon: str = None,
) -> List[int]:
    rows = conn.execute(
        'SELECT rowid, id, version, language FROM lexicons').fetchall()

    lg_match: Set[int] = set()
    if lgcode:
        lg_match.update(rowid for rowid, _, _, language in rows
                        if language == lgcode)
        if not lg_match:
            raise wn.Error(f"no lexicon found with language code '{lgcode}'")
    else:
        lg_match.update(row[0] for row in rows)

    lex_match: Set[int] = set()
    lex_specs = lexicon.split() if lexicon else []
    if not lex_specs or '*' in lex_specs or '*:' in lex_specs:
        lex_match.update(row[0] for row in rows)
    else:
        lexmap: Dict[str, Dict[str, int]] = {}
        for rowid, id, version, _ in rows:
            lexmap.setdefault(id, {})[version] = rowid
        for id_ver in lex_specs:
            id, _, ver = id_ver.partition(':')
            if id == '*':
                raise wn.Error("version not allowed when lexicon id is '*'")
            elif id not in lexmap:
                raise wn.Error(f"no lexicon found with id '{id}'")
            if not ver:
                lex_match.add(next(iter(lexmap[id].values())))
            elif ver not in lexmap[id]:
                raise wn.Error(
                    f"no lexicon with id '{id}' found with version '{ver}'")
            else:
                lex_match.add(lexmap[id][ver])

    result = lg_match & lex_match
    if rows and not result:
        raise wn.Error(
            f'no lexicon found with lgcode={lgcode!r} and lexicon={lexicon!r}')

    return sorted(result)
Ejemplo n.º 19
0
def iterpackages(path: AnyPath) -> Iterator[Package]:
    """Yield any wordnet Packages found at *path*.

    The *path* argument can point to one of the following:
      - a lexical resource file
      - a wordnet package directory
      - a wordnet collection directory
      - a tar archive containing one of the above
      - a compressed (gzip or lzma) resource file or tar archive
    """
    path = Path(path).expanduser()

    if path.is_dir():
        if is_package_directory(path):
            yield Package(path)
        elif is_collection_directory(path):
            yield from Collection(path).packages()
        else:
            raise wn.Error(
                f'does not appear to be a valid package or collection: {path!s}'
            )

    elif tarfile.is_tarfile(path):
        with tarfile.open(path) as tf:
            # validate members before extracting anything
            _check_tar(tf)
            with tempfile.TemporaryDirectory() as tmpdir:
                tf.extractall(path=tmpdir)
                entries = list(Path(tmpdir).iterdir())
                if len(entries) != 1:
                    raise wn.Error(
                        'archive may only have one resource, package, or collection'
                    )
                yield from iterpackages(entries[0])

    else:
        with _get_decompressed(path) as decompressed:
            if not lmf.is_lmf(decompressed):
                raise wn.Error(
                    f'not a valid lexical resource: {path!s}'
                )
            yield _ResourceOnlyPackage(decompressed)
Ejemplo n.º 20
0
def get_lexicalized(rowid: int, table: str) -> bool:
    """Return the lexicalized flag for row *rowid* of *table*.

    Raises:
        wn.Error: if *table* does not carry a lexicalized column
    """
    conn = connect()
    tablename = _SANITIZED_LEXICALIZED_TABLES.get(table)
    if tablename is None:
        raise wn.Error(f"'{table}' does not mark lexicalization")
    # the sentinel rowid never corresponds to a stored row
    if rowid == NON_ROWID:
        return False
    row = conn.execute(
        f'SELECT lexicalized FROM {tablename} WHERE rowid=?', (rowid, )
    ).fetchone()
    return row[0]
Ejemplo n.º 21
0
def _get_lexicon_rowids_for_lang(rows: List[Tuple[int, str, str, str]],
                                 lang: Optional[str]) -> Set[int]:
    lg_match: Set[int] = set()
    if lang:
        lg_match.update(rowid for rowid, _, _, language in rows
                        if language == lang)
        if not lg_match:
            raise wn.Error(f"no lexicon found with language code '{lang}'")
    else:
        lg_match.update(row[0] for row in rows)
    return lg_match
Ejemplo n.º 22
0
def _precheck(lexicons: Sequence[Lexicon]) -> None:
    """Ensure entry/sense/synset identifiers are unique across *lexicons*."""
    seen: Set[str] = set()
    for lex in lexicons:
        lexids = (lex._id, )
        ids = {lex.id}
        for find in (find_entries, find_senses, find_synsets):
            ids.update(row[0] for row in find(lexicon_rowids=lexids))
        # TODO: syntactic behaviours
        if seen & ids:
            raise wn.Error('cannot export: non-unique identifiers in lexicons')
        seen |= ids
Ejemplo n.º 23
0
def _download(url: str, path: Path, progress: ProgressHandler) -> None:
    """Stream the resource at *url* into the file at *path*.

    Progress is reported through *progress*.  On request failure or
    keyboard interrupt the partial file is removed and :class:`wn.Error`
    is raised with the original exception chained; any other exception
    also removes the partial file but propagates unchanged.
    """
    # total size per the Content-Length header (0 if absent); used in
    # the error messages below
    size: int = 0
    try:
        with open(path, 'wb') as f:
            progress.set(status='Requesting')
            with requests.get(url, stream=True, timeout=TIMEOUT) as response:
                size = int(response.headers.get('Content-Length', 0))
                progress.set(total=size, status='Receiving')
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)
                    progress.update(len(chunk))
                progress.set(status='Complete')
    except requests.exceptions.RequestException as exc:
        # network/HTTP failure: discard the partial file, raise wn.Error
        path.unlink()
        raise wn.Error(f'Download failed at {size} bytes') from exc
    except KeyboardInterrupt as exc:
        # user cancelled: clean up but keep the cause chained
        path.unlink()
        raise wn.Error(f'Download cancelled at {size} bytes') from exc
    except Exception:
        # anything else: clean up and propagate unchanged
        path.unlink()
        raise
Ejemplo n.º 24
0
    def shortest_path(
            self, other: 'Synset', simulate_root: bool = False
    ) -> List['Synset']:
        """Return the shortest path from the synset to the *other* synset.

        Arguments:
            other: endpoint synset of the path
            simulate_root: if :python:`True`, ensure any two synsets
              are always connected by positing a fake root node

        """
        pathmap = self._shortest_hyp_paths(other, simulate_root)
        best = min(pathmap, key=lambda k: len(pathmap[k]), default=None)
        if best is None:
            raise wn.Error(f'no path between {self!r} and {other!r}')
        # drop the starting synset itself from the returned path
        return pathmap[best][1:]
Ejemplo n.º 25
0
def find_lexicons(
    lexicon: str,
    lang: str = None,
) -> Iterator[_Lexicon]:
    """Yield the stored lexicons matching *lexicon* and *lang*."""
    conn = connect()
    rows = conn.execute(
        'SELECT rowid, id, version, language FROM lexicons').fetchall()
    rowids = _get_lexicon_rowids_for_lang(rows, lang)
    # '*' matches everything, so skip the more expensive spec filtering
    if lexicon != '*':
        rowids &= _get_lexicon_rowids_for_lexicon(rows, lexicon)
    if rows and not rowids:
        raise wn.Error(
            f'no lexicon found with lang={lang!r} and lexicon={lexicon!r}')
    yield from (_get_lexicon(conn, rowid) for rowid in sorted(rowids))
Ejemplo n.º 26
0
def get_examples(
    rowid: int,
    table: str,
    lexicon_rowids: Sequence[int],
) -> List[Tuple[str, str, int]]:
    """Return (example, language, rowid) rows for *rowid* in *table*.

    Raises:
        wn.Error: if *table* has no associated examples table
    """
    conn = connect()
    prefix = _SANITIZED_EXAMPLE_PREFIXES.get(table)
    if prefix is None:
        raise wn.Error(f"'{table}' does not have examples")
    # prefix comes from the sanitized mapping, so interpolation is safe
    query = f'''
        SELECT example, language, rowid
          FROM {prefix}_examples
         WHERE {prefix}_rowid = ?
           AND lexicon_rowid IN ({_qs(lexicon_rowids)})
    '''
    params = (rowid, *lexicon_rowids)
    return conn.execute(query, params).fetchall()
Ejemplo n.º 27
0
def _insert_sense_relations(lexicon, lexid, lexidmap, cur, progress):
    """Insert the sense relations of *lexicon* into the database.

    Each relation is routed to one of two tables depending on whether
    its target is a sense or a synset; a relation targeting neither
    raises :class:`wn.Error`.  Rows are inserted in batches through
    *cur* and *progress* is advanced per batch.
    """
    progress.set(status='Sense Relations')
    # need to separate relations into those targeting senses vs synsets
    synset_ids = {ss.id for ss in lexicon.synsets}
    sense_ids = {s.id for e in lexicon.lexical_entries for s in e.senses}
    s_s_rels = []   # sense -> sense relations
    s_ss_rels = []  # sense -> synset relations
    for entry in lexicon.lexical_entries:
        for sense in entry.senses:
            # fall back to lexid when the id is not in lexidmap
            slid = lexidmap.get(sense.id, lexid)
            for relation in sense.relations:
                target_id = relation.target
                tlid = lexidmap.get(target_id, lexid)
                if target_id in sense_ids:
                    s_s_rels.append((sense.id, slid, tlid, relation))
                elif target_id in synset_ids:
                    s_ss_rels.append((sense.id, slid, tlid, relation))
                else:
                    raise wn.Error(
                        f'relation target is not a known sense or synset: {target_id}'
                    )
    # (table, target-resolving subquery, collected relations)
    hyperparams = [
        ('sense_relations', SENSE_QUERY, s_s_rels),
        ('sense_synset_relations', SYNSET_QUERY, s_ss_rels),
    ]
    for table, target_query, rels in hyperparams:
        query = f'''
            INSERT INTO {table}
            VALUES (null,?,({SENSE_QUERY}),({target_query}),({RELTYPE_QUERY}),?)
        '''
        # _split yields batches for executemany; progress advances per batch
        for batch in _split(rels):
            data = [(lexid, sense_id, slid, relation.target, tlid,
                     relation.type, relation.meta)
                    for sense_id, slid, tlid, relation in batch]
            cur.executemany(query, data)
            progress.update(len(data))
Ejemplo n.º 28
0
def download(
    project_or_url: str,
    add: bool = True,
    progress_handler: Optional[Type[ProgressHandler]] = ProgressBar,
) -> Path:
    """Download the resource specified by *project_or_url*.

    First the URL of the resource is determined and then, depending on
    the parameters, the resource is downloaded and added to the
    database.  The function then returns the path of the cached file.

    If *project_or_url* starts with `'http://'` or `'https://'`, then
    it is taken to be the URL for the resource. Otherwise,
    *project_or_url* is taken as a :ref:`project specifier
    <lexicon-specifiers>` and the URL is taken from a matching entry
    in Wn's project index. If no project matches the specifier,
    :exc:`wn.Error` is raised.

    If the URL has been downloaded and cached before, the cached file
    is used. Otherwise the URL is retrieved and stored in the cache.

    If the *add* parameter is ``True`` (default), the downloaded
    resource is added to the database.

    >>> wn.download('ewn:2020')
    Added ewn:2020 (English WordNet)

    The *progress_handler* parameter takes a subclass of
    :class:`wn.util.ProgressHandler`. An instance of the class will be
    created, used, and closed by this function.

    """
    if is_url(project_or_url):
        url = project_or_url
    else:
        url = config.get_project_info(project_or_url)['resource_url']
    logger.info('download url: %s', url)

    path = config.get_cache_path(url)
    logger.info('download cache path: %s', path)

    handler_cls = ProgressHandler if progress_handler is None else progress_handler
    progress = handler_cls(message='Download', unit=' bytes')

    try:
        if path.exists():
            progress.flash(f'Cached file found: {path!s}')
        else:
            _download(url, path, progress)
    finally:
        progress.close()

    if add:
        try:
            add_to_db(path, progress_handler=progress_handler)
        except wn.Error as exc:
            raise wn.Error(
                f'could not add downloaded file: {path}\n  You might try '
                'deleting the cached file and trying the download again.'
            ) from exc

    return path
Ejemplo n.º 29
0
def lch(synset1: Synset, synset2: Synset, max_depth: int = 0) -> float:
    """Return the Leacock-Chodorow similarity of *synset1* and *synset2*."""
    distance = len(synset1.shortest_path(synset2, simulate_root=True))
    if max_depth <= 0:
        raise wn.Error('max_depth must be greater than 0')
    return -math.log((distance + 1) / (2 * max_depth))