コード例 #1
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
    def __init__(self, stem, suffixes, morphs, tags, wordforms):
        """ Container for the result of Morph.inflection_analyse(), holding
        what is needed to examine the inflections of one lexeme. Every
        argument is passed through icepy_decode() before being stored.

        *stem* - The part of the lexeme that stays constant throughout all
        inflections. A stem may contain 'variables', which usually correspond
        to vowel transformations. For example, the verb 'drjúpa' has the stem
        *dr(au|jú|o|y)p*, the noun 'banki' has *b(a|ö)nk* and 'hús' simply
        has *hús*.

        *suffixes* - A list of the suffixes following the stem, one per word
        form.

        *morphs* - A list, parallel to *suffixes*, specifying which (if any)
        values should be substituted for the variable parts of the stem.

        *tags* - A list, parallel to the others, with the tag of each word
        form.

        *wordforms* - A list of all the word forms, in the same order as the
        other lists.
        """
        fields = (
            ('stem', stem),
            ('suffixes', suffixes),
            ('morphs', morphs),
            ('tags', tags),
            ('wordforms', wordforms),
        )
        # Decode and store one field at a time, in declaration order.
        for attr_name, raw_value in fields:
            setattr(self, attr_name, icepy_decode(raw_value))
コード例 #2
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
    def iterate_words_grouped(self, show_status=False, decode=True):
        """ Returns an iterator over all the word forms in the maps, grouped by
        lemma. That is, yields a list for every lexeme in the maps.

        Accepts the same options as iterate_words().
        """
        #TODO: refactor this and iterate_words to use more of the same code
        i = 0
        total = len(self.id_lemma)
        for prefix,idlist in self.prefix_map.iteritems():
            for lemma_id, suffix_id, wordform_count in idlist:
                group = []
                for suffix,tags in self.id_suffixes[suffix_id].iteritems():
                    for tag in tags:
                        if decode:
                            try:
                                group.append((icepy_decode(prefix+suffix),icepy_decode(tag)))
                            except KeyError:
                                print "Could not decode word/tag %s/%s!" % (repr(prefix+suffix),repr(tag))
                                raise
                        else:
                            group.append((prefix+suffix,tag))
                i += 1
                if show_status:
                    print '%d / %d\r' % (i, total),
                    sys.stdout.flush()
                yield group

        if show_status: print '%d / %d' % (i, total)
コード例 #3
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
    def lemmatise(self, wordform, category=None, tag=None, return_original=True):
        """ Return a tuple (lemma,tag) with the lemma of the input wordform,
        along with the assumed tag.

        The whole word is looked up directly first; if that gives nothing the
        word is expanded instead. From the first non-empty set of candidates
        the top pick is taken and its lemma and match tag are returned,
        decoded.

        If nothing is found (or the input does not pass _check_input()) the
        input is returned back as (wordform, tag or category or None). If
        return_original=False is set, None is returned instead.
        """
        if self._check_input(wordform,category,tag):
            # Strategies in order of preference: direct lookup, then expansion.
            attempts = (
                (self.lookup, (wordform, category, tag)),
                (self.expand, (wordform, category)),
            )
            for search, search_args in attempts:
                found = search(*search_args)
                if found:
                    #TODO: (vastly) improve selection process
                    best = found.top_pick()
                    return icepy_decode((best.lemma, best.match_tag))

        # Nothing matched: fall back to the original input, or to None.
        if not return_original:
            return None
        return (wordform, tag or category or None)
コード例 #4
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
 def _decode(self):
     """ Return a new AnalysisMatch built from this one, with prefix,
     suffix, lemma and match tag passed through icepy_decode(). The
     remaining constructor arguments are passed on unchanged.
     """
     prefix = icepy_decode(self.prefix)
     suffix = icepy_decode(self.suffix)
     lemma = icepy_decode(self.lemma)
     match_tag = icepy_decode(self.match_tag)
     decoded = AnalysisMatch(prefix, suffix, self.lemma_id, lemma,
                             self.otb_count, self.suffix_id, match_tag,
                             self.type, self.parts)
     decoded.tag_count = self.tag_count
     # NOTE(review): tag_pattern_count is filled from tag_count, not from
     # self.tag_pattern_count - possibly a copy-paste slip; confirm intent.
     decoded.tag_pattern_count = self.tag_count
     return decoded
コード例 #5
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
    def inflections(self, match):
        """ Return a list of (wordform, tag) tuples, both decoded, covering
        every suffix/tag combination stored for the suffix_id of an
        AnalysisMatch object.

        Raises ValueError if *match* is not an AnalysisMatch instance.
        """
        if not isinstance(match, AnalysisMatch):
            raise ValueError('input object must be AnalysisMatch instance')

        suffix_map = self.id_suffixes[match.suffix_id]
        return [(icepy_decode(match.prefix+suffix), icepy_decode(tag))
                for suffix, tags in suffix_map.iteritems()
                for tag in tags]
コード例 #6
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
    def wordforms(self, match):
        """ Return a list of the decoded word forms for an AnalysisMatch
        object's lexeme: its prefix joined with each suffix stored under its
        suffix_id.

        Raises ValueError if *match* is not an AnalysisMatch instance.
        """
        if not isinstance(match, AnalysisMatch):
            raise ValueError('input object must be AnalysisMatch instance')

        forms = []
        for suffix in self.id_suffixes[match.suffix_id]:
            forms.append(icepy_decode(match.prefix+suffix))
        return forms
コード例 #7
0
def test_codec():
    for args in LOOKUP_TEST:
        word = args[0]
        try:
            encoded = icepy_encode(word)
        except ValueError:
            print "Could not encode %s" % repr(word)
            continue
        decoded = icepy_decode(encoded)

        assert word==decoded
コード例 #8
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
    def iterate_words(self, show_status=False, decode=True):
        """ Returns an iterator over all the word forms in the maps.

        Set show_status=True to have the generator print a progress counter.

        By default the output is a decoded unicode string. Set decode=False if
        you need the IcePy encoded string.
        """
        i = 0
        total = self._count_wordforms()
        for prefix,idlist in self.prefix_map.iteritems():
            for lemma_id, suffix_id, wordform_count in idlist:
                for suffix,tags in self.id_suffixes[suffix_id].iteritems():
                    for tag in tags:
                        i += 1
                        if show_status and i%1000==0:
                            print '%d / %d\r' % (i, total),
                            sys.stdout.flush()
                        if decode:
                            yield icepy_decode(prefix+suffix), icepy_decode(tag)
                        else:
                            yield prefix+suffix, tag

        if show_status: print '%d / %d' % (i, total)
コード例 #9
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
 def by_tag(self, tag):
     """ Return the match whose decoded match tag equals *tag*, or None if
     there is none. If several matches share the tag, the last one in
     *matches* is returned.
     """
     by_decoded_tag = {}
     for m in self.matches:
         # Later matches overwrite earlier ones with the same tag.
         by_decoded_tag[icepy_decode(m.match_tag)] = m
     return by_decoded_tag.get(tag)
コード例 #10
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
 def tags(self):
     """ Return a set with the decoded match tag of every entry in
     *matches*.
     """
     found = set()
     for m in self.matches:
         found.add(icepy_decode(m.match_tag))
     return found
コード例 #11
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
 def decoded_lemma(self):
     """ Return the lemma of the match, passed through icepy_decode().
     """
     lemma = icepy_decode(self.lemma)
     return lemma
コード例 #12
0
ファイル: morph.py プロジェクト: sverrirab/IcePy
 def decoded_tag(self):
     """ Return the match tag of the match, passed through icepy_decode().
     """
     tag = icepy_decode(self.match_tag)
     return tag