Example #1
def parse_lemma_pos_index(lemma_pos_index):
    lemma, pos, synset_index_str = lemma_pos_index.lower().rsplit('.', 2)
    synset_index = int(synset_index_str) - 1
    # Get the offset for this synset
    try:
        offset = _lemma_pos_offset_map[lemma][pos][synset_index]
    except KeyError:
        message = 'no lemma %r with part of speech %r'
        raise WordNetError(message % (lemma, pos))
    except IndexError:
        n_senses = len(_lemma_pos_offset_map[lemma][pos])
        message = "lemma %r with part of speech %r has only %i %s"
        message = message % (lemma, pos, n_senses, "sense")
        raise WordNetError(message + 's' if n_senses > 1 else message)

    # If there is confusion between an adjective and a satellite
    # adjective, users usually don't care, so we resolve it and
    # raise a warning.
    if pos in ['a', 's']:
        # Raise an error if the user wants an `s` but the offset is in `a`.
        if pos == 's' and offset in _synset_offset_cache['a']:
            message = ('adjective satellite requested but '
                       'only plain adjective found for lemma %r')
            raise WordNetError(message % lemma)
        # Warn and change the POS if the user wants
        # an `a` but the offset is in `s`.
        elif pos == 'a' and offset in _synset_offset_cache['s']:
            message = ('plain adjective requested but '
                       'only adjective satellite found for lemma %r')
            warnings.warn(message % lemma)
            pos = 's'  # Edit user specified POS.
    return pos, offset
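A standalone check of the name-splitting step above; the synset name 'dog.n.01' is only an illustration:

lemma, pos, synset_index_str = 'dog.n.01'.lower().rsplit('.', 2)
print(lemma, pos, int(synset_index_str) - 1)  # prints: dog n 0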
Example #2
    def synset_from_pos_and_offset(self, pos, offset):
        if pos not in POS_LIST:
            raise WordNetError(
                'Part-of-Speech should be one of these: {}'.format(POS_LIST))
        offset = int(offset)
        try:
            return _synset_offset_cache[pos][offset]
        except KeyError:
            # Fall back across the adjective / satellite distinction.
            if pos == 's' and offset in _synset_offset_cache['a']:
                return _synset_offset_cache['a'][offset]
            if pos == 'a' and offset in _synset_offset_cache['s']:
                return _synset_offset_cache['s'][offset]
            raise WordNetError(
                'Part-of-Speech and Offset combination not found in WordNet: {} + {}'
                .format(pos, offset))
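A minimal sketch of the a/s fallback with a toy cache; the offset and synset label are made up, not real WordNet data:

_cache = {'a': {123: 'Synset(able.a.01)'}, 's': {}}
pos, offset = 's', 123
synset = _cache[pos].get(offset) or _cache['a'].get(offset)
print(synset)  # the 's' request falls back to the 'a' entry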
Example #3
    def get_version(self):
        filename = os.path.join(self.wordnet_data_dir, 'data.adj')
        with io.open(filename, encoding='utf8') as fin:
            for line in fin:
                match = re.search(r'WordNet (\d+\.\d+) Copyright', line)
                if match:
                    self._version = match.group(1)
                    return self._version
        raise WordNetError("Cannot find version number in {}".format(filename))
Example #4
def parse_wordnet_ic_line(ic_line):
    offset, value, *has_root = ic_line.strip().split()
    pos, has_root = offset[-1], bool(has_root)
    offset, value = int(offset[:-1]), float(value)
    if pos not in ['n', 'v']:
        raise WordNetError("Unidentified part of speech in "
                           "WordNet Information Content "
                           "file for field {}".format(line))
    return offset, value, pos, has_root
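The unpacking can be checked on a hand-written line in the ic-*.dat style, where the presence of a third field marks a taxonomy root:

offset, value, *has_root = '1740n 1915712.0 ROOT'.split()
pos, has_root = offset[-1], bool(has_root)
offset, value = int(offset[:-1]), float(value)
print(offset, value, pos, has_root)  # prints: 1740 1915712.0 n True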
Example #5
    def lch_similarity(self,
                       synset1,
                       synset2,
                       simulate_root=True,
                       _max_depth=None,
                       if_none_return=None):
        """
        Leacock Chodorow Similarity:
        Return a score denoting how similar two word senses are, based on the
        shortest path that connects the senses (as above) and the maximum depth
        of the taxonomy in which the senses occur. The relationship is given as
        -log(p/2d) where p is the shortest path length and d is the taxonomy
        depth.
        :type  synset1: Synset
        :param synset1: The first ``Synset`` being compared.
        :type  synset2: Synset
        :param synset2: The ``Synset`` that ``synset1`` is being compared to.
        :type simulate_root: bool
        :param simulate_root: The various verb taxonomies do not
            share a single root which disallows this metric from working for
            synsets that are not connected. This flag (True by default)
            creates a fake root that connects all the taxonomies. Set it
            to false to disable this behavior. For the noun taxonomy,
            there is usually a default root except for WordNet version 1.6.
            If you are using wordnet 1.6, a fake root will be added for nouns
            as well.
        :return: A score denoting the similarity of the two ``Synset`` objects,
            normally greater than 0. None is returned if no connecting path
            could be found. If a ``Synset`` is compared with itself, the
            maximum score is returned, which varies depending on the taxonomy
            depth.
        """

        if synset1._pos != synset2._pos:
            raise WordNetError('Computing the lch similarity requires '
                               '%s and %s to have the same part of speech.' %
                               (synset1, synset2))

        # Note that `simulate_root` here only expresses the caller's intent;
        # ultimately the synset's `_needs_root()` and `simulate_root` together
        # decide whether a root is really needed.
        need_root = synset1._needs_root() and simulate_root
        # Hack to handle adjectives and adverbs, where _needs_root() returns None.
        if need_root is None:
            if if_none_return:
                need_root = True
            else:  # Emulate NLTK's behavior to return None.
                return if_none_return
        # FIXME: how to make subclass overwrite values in kwargs?
        # By default use the static value from wn.constants
        depth = _max_depth if _max_depth else WN_MAX_DEPTH['3.0'][need_root][
            synset1._pos]
        distance = self.shortest_path_distance(synset1,
                                               synset2,
                                               simulate_root=need_root)

        if distance is None or distance < 0 or depth == 0:
            return if_none_return
        return -math.log((distance + 1) / (2.0 * depth))
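A worked instance of the -log((p + 1) / (2 * d)) computation, with an assumed shortest path p = 0 (a synset compared with itself) and an assumed taxonomy depth d = 19:

import math
p, d = 0, 19
print(-math.log((p + 1) / (2.0 * d)))  # ~3.6376, the ceiling for this depth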
Example #6
    def lemma(self, name, lang='eng'):
        '''Return the lemma object that matches the name.'''
        # cannot simply split on first '.',
        # e.g.: '.45_caliber.a.01..45_caliber'
        separator = SENSENUM_RE.search(name).end()
        synset_name, lemma_name = name[:separator - 1], name[separator:]

        synset = self.synset(synset_name)
        for lemma in synset.lemmas(lang):
            if lemma._name == lemma_name:
                return lemma
        raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name))
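The split can be reproduced standalone; SENSENUM_RE is assumed here to be a pattern like the one below (the module may define it differently):

import re
SENSENUM_RE = re.compile(r'\.\d\d\.')  # assumption: matches '.01.' and the like
name = '.45_caliber.a.01..45_caliber'
separator = SENSENUM_RE.search(name).end()
print(name[:separator - 1], '|', name[separator:])
# prints: .45_caliber.a.01 | .45_caliber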
Example #7
    def _load_all_synsets(self):
        for pos_tag in _FILEMAP.values():
            filename = os.path.join(self.wordnet_data_dir,
                                    'data.{}'.format(pos_tag))
            with io.open(filename, encoding='utf8') as fin:
                for line in fin:
                    # Skip documentation and empty lines.
                    if line.startswith(' ') or not line.strip():
                        continue
                    try:
                        synset, lemmas = parse_wordnet_line(
                            line, lexname_type=self.lexname_type)
                        _synset_offset_cache[synset._pos][
                            synset._offset] = synset
                    except Exception as e:
                        err_msg = "Error parsing this line from {}:\n".format(
                            'data.{}'.format(pos_tag))
                        raise WordNetError(err_msg + line) from e
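The skip rule can be sanity-checked in isolation; the first string mimics an indented license-header line, the last mimics a data line:

for line in ['  1 This software and database...', '\n', '00001740 03 n 01 entity ...']:
    print(line.startswith(' ') or not line.strip())  # True, True, False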
Example #8
    def _compute_max_depth_once(self, pos, simulate_root):
        """
        Compute the max depth for the given part of speech.  This is
        used by the lch similarity metric.

        This function should never be used!!!
        It should be computed once, then put into wn.constants.
        """
        depth = 0
        for ss in self.all_synsets(pos):
            try:
                depth = max(depth, ss.max_depth())
            except RuntimeError as e:
                _msg = '{} throws an error when searching for max_depth'.format(
                    ss)
                raise WordNetError(_msg) from e
        if simulate_root:
            depth += 1
        return depth
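A sketch of the intended one-off precomputation, assuming `wn` is a loaded instance of this reader; the resulting table is what WN_MAX_DEPTH in wn.constants would hold:

WN_MAX_DEPTH = {'3.0': {True: {}, False: {}}}
for pos in 'nvar':
    for simulate_root in (True, False):
        WN_MAX_DEPTH['3.0'][simulate_root][pos] = \
            wn._compute_max_depth_once(pos, simulate_root)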
Example #9
    def _related(self, relation_symbol, sort=True):
        if relation_symbol not in self._pointers:
            return []
        related_synsets = []
        for pos, offset in self._pointers[relation_symbol]:
            if pos in ['s', 'a']:
                try:
                    related_synset = _synset_offset_cache['a'][offset]
                except KeyError:
                    try:
                        related_synset = _synset_offset_cache['s'][offset]
                    except KeyError:
                        raise WordNetError(
                            'Part-of-Speech and Offset combination not found in WordNet: {} + {}'
                            .format(pos, offset))
            else:
                related_synset = _synset_offset_cache[pos][offset]
            related_synsets.append(related_synset)

        return sorted(related_synsets) if sort else related_synsets
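A toy illustration of the pointer table this method walks; '@' is the WordNet pointer symbol for hypernyms, and the offset here is made up:

_pointers = {'@': [('n', 2083346)]}
for pos, offset in _pointers.get('@', []):
    print(pos, offset)  # prints: n 2083346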
Example #10
    def _load_lemma_pos_offset_map(self):
        for pos_tag in _FILEMAP.values():
            filename = os.path.join(self.wordnet_data_dir,
                                    'index.{}'.format(pos_tag))
            with io.open(filename, encoding='utf8') as fin:
                for line in fin:
                    if line.startswith(' '):
                        continue
                    try:
                        lemma, pos, synset_offsets = parse_index_line(line)
                    except Exception as e:  # When there are inconsistencies.
                        ##if self.wordnet_33:
                        ##    lemma, pos, synset_offsets = parse_index_line(fix_inconsistent_line(line))
                        ##else:
                        raise WordNetError(
                            'Error parsing:\n{}\nfrom {}'.format(
                                line, filename)) from e
                    # Cache the map.
                    _lemma_pos_offset_map[lemma][pos] = synset_offsets
                    # Mirror adjective offsets under 's' so satellite
                    # lookups also succeed.
                    if pos == 'a':
                        _lemma_pos_offset_map[lemma]['s'] = synset_offsets
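The mirroring of 'a' offsets under 's' can be illustrated with a toy map; the lemma and offset are made up:

from collections import defaultdict
_lemma_pos_offset_map = defaultdict(dict)
_lemma_pos_offset_map['able']['a'] = [1017]
_lemma_pos_offset_map['able']['s'] = _lemma_pos_offset_map['able']['a']
print(_lemma_pos_offset_map['able']['s'])  # prints: [1017]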
Example #11
def parse_sense_key(sense_key):
    """
    Retrieves synset based on a given sense_key. Sense keys can be
    obtained from lemma.key()
    From https://wordnet.princeton.edu/wordnet/man/senseidx.5WN.html:
    A sense_key is represented as:
        lemma % lex_sense (e.g. 'dog%1:18:01::')
    where lex_sense is encoded as:
        ss_type:lex_filenum:lex_id:head_word:head_id

    lemma:      ASCII text of word/collocation, in lower case
    ss_type:    synset type for the sense (1 digit int)
                The synset type is encoded as follows:
                1    NOUN
                2    VERB
                3    ADJECTIVE
                4    ADVERB
                5    ADJECTIVE SATELLITE
    lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int)
    lex_id:      when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int)
    head_word:   lemma of the first word in satellite's head synset
                 Only used if sense is in an adjective satellite synset
    head_id:     uniquely identifies sense in a lexicographer file when paired with head_word
                 Only used if head_word is present (2 digit int)
    """
    sense_key_regex = re.compile(r"(.*)%(.*):(.*):(.*):(.*):(.*)")
    lemma, ss_type, _, lex_id, _, _ = sense_key_regex.match(sense_key).groups()
    # check that information extracted from sense_key is valid
    error = None
    if not lemma:
        error = "lemma"
    elif int(ss_type) not in _synset_types:
        error = "ss_type"
    elif int(lex_id) < 0 or int(lex_id) > 99:
        error = "lex_id"
    if error:
        raise WordNetError(
            "valid {} could not be extracted from the sense key".format(error))
    pos = _synset_types[int(ss_type)]
    return lemma, pos, lex_id
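The regex and the unpacking can be exercised on the docstring's own example key:

import re
sense_key_regex = re.compile(r"(.*)%(.*):(.*):(.*):(.*):(.*)")
print(sense_key_regex.match('dog%1:18:01::').groups())
# prints: ('dog', '1', '18', '01', '', '')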