Esempio n. 1
0
def get_grammeme_classes(parses):
    """ Classify the grammemes of ``parses`` by parse state.

    ``parses`` is an iterable of ``ParseInfo``-like objects exposing
    ``tag`` and ``state`` attributes. The result is a plain dict of
    mutually disjoint sets::

        {
            ParseInfo.UNIVOCAL: {grammemes of univocal parses},
            ParseInfo.AMBIG: {grammemes of ambiguous parses only},
            ParseInfo.DISCARDED: {grammemes of discarded parses only},
        }

    If no univocal grammemes were collected, the grammemes common to
    every ambiguous parse are used as the univocal set instead.
    """
    classes = defaultdict(set)
    grammemes_by_state = defaultdict(list)

    for parse in parses:
        grammemes = tag2grammemes(parse.tag)
        grammemes_by_state[parse.state].append(grammemes)
        classes[parse.state] |= grammemes

    univocal = classes[ParseInfo.UNIVOCAL]
    if not univocal:
        # Nothing is univocal on its own: fall back to whatever all the
        # ambiguous parses agree on.
        univocal = classes[ParseInfo.AMBIG].copy()
        for grammemes in grammemes_by_state[ParseInfo.AMBIG]:
            univocal &= grammemes
        classes[ParseInfo.UNIVOCAL] = univocal

    discarded = classes[ParseInfo.DISCARDED]
    ambiguous = classes[ParseInfo.AMBIG]

    # Make the classes disjoint; ``discarded`` must be reduced by the full
    # ambiguous set, i.e. before univocal grammemes are removed from it.
    discarded -= univocal
    discarded -= ambiguous
    ambiguous -= univocal
    return dict(classes)
Esempio n. 2
0
def get_grammeme_classes(parses):
    """ Group the grammemes found in ``parses`` into state classes.

    Each item of ``parses`` is expected to provide ``tag`` and ``state``
    (as ``ParseInfo`` does). A regular dict is returned::

        {
            ParseInfo.UNIVOCAL: {set of univocal grammemes},
            ParseInfo.AMBIG: {set of possible, non-univocal grammemes},
            ParseInfo.DISCARDED: {set of grammemes seen only in
                                  discarded parses},
        }

    When there are no univocal grammemes, the intersection of the
    grammemes of all ambiguous parses takes their place.
    """
    state_grammemes = defaultdict(set)
    parse_grammemes = defaultdict(list)

    for info in parses:
        current = tag2grammemes(info.tag)
        parse_grammemes[info.state].append(current)
        state_grammemes[info.state] |= current

    certain = state_grammemes[ParseInfo.UNIVOCAL]
    if not certain:
        # Fallback: grammemes shared by every ambiguous parse are certain.
        certain = state_grammemes[ParseInfo.AMBIG].copy()
        state_grammemes[ParseInfo.UNIVOCAL] = certain
        for current in parse_grammemes[ParseInfo.AMBIG]:
            certain &= current

    rejected = state_grammemes[ParseInfo.DISCARDED]
    possible = state_grammemes[ParseInfo.AMBIG]

    # Order matters: strip ``rejected`` using the complete ``possible`` set
    # first, then drop the certain grammemes from ``possible``.
    rejected -= certain
    rejected -= possible
    possible -= certain
    return dict(state_grammemes)
Esempio n. 3
0
def estimate_conditional_tag_probability(morph, corpus_filename, logger=None):
    """
    Estimate P(t|w) from an OpenCorpora xml dump.

    Only corpus words for which ``morph.tag`` yields more than one tag are
    counted, and samples whose grammeme set is exactly ``UNKN`` are
    skipped. The counts are turned into probabilities with simple
    Laplace smoothing.

    Progress is reported through ``with_progress``; ``logger`` defaults to
    this module's logger.

    Returns a ``(cpd, cfd)`` tuple: the conditional probability
    distribution and the conditional frequency distribution it is
    built from.
    """
    import nltk
    import opencorpora

    log = logging.getLogger(__name__) if logger is None else logger

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        ConditionalProbDist whose probdist_factory is also given the
        'condition' value. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                freqdist = cfdist[condition]
                self[condition] = probdist_factory(freqdist, condition)

    def probdist_factory(fd, condition):
        # Use at least as many bins as there are tags for the word.
        tag_count = len(morph.tag(condition))
        return nltk.LaplaceProbDist(fd, bins=max(tag_count, fd.B()))

    reader = opencorpora.CorpusReader(corpus_filename)

    # The corpus is materialised first and only then filtered, each step
    # under its own progress wrapper.
    corpus_words = list(
        with_progress(
            _disambiguated_words(reader),
            "Reading disambiguated words from corpus"
        )
    )

    ambiguous_words = []
    for word, tag in with_progress(corpus_words,
                                   "Filtering out non-ambiguous words"):
        if len(morph.tag(word)) <= 1:
            continue
        lowered = word.lower()
        grammemes = tag2grammemes(tag)
        if grammemes != set(['UNKN']):
            ambiguous_words.append((lowered, grammemes))

    log.info("Computing P(t|w)")

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    return _ConditionalProbDist(cfd, probdist_factory), cfd
Esempio n. 4
0
def estimate_conditional_tag_probability(morph, corpus_filename, logger=None):
    """
    Estimate P(t|w) based on an OpenCorpora xml dump.

    The estimate uses counts of disambiguated corpus words that ``morph``
    tags in more than one way (samples reduced to just ``UNKN`` are left
    out), smoothed with simple Laplace smoothing.

    ``logger`` is used for the status message; when omitted, the logger
    named after this module is used.

    Returns ``(cpd, cfd)``: the conditional probability distribution and
    the conditional frequency distribution behind it.
    """
    import nltk
    import opencorpora

    if logger is None:
        logger = logging.getLogger(__name__)

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        Subclass that hands the 'condition' to probdist_factory together
        with the frequency distribution.
        See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for cond in cfdist:
                self[cond] = probdist_factory(cfdist[cond], cond)

    def _ambiguous_samples(words):
        # Yield (lowercased word, grammemes) for words with several tags,
        # skipping samples that are nothing but UNKN.
        for word, tag in words:
            if len(morph.tag(word)) > 1:
                sample = (word.lower(), tag2grammemes(tag))
                if sample[1] != set(['UNKN']):
                    yield sample

    def probdist_factory(fd, condition):
        # Never use fewer bins than the number of tags for the word.
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    reader = opencorpora.CorpusReader(corpus_filename)

    disambig_words = list(
        with_progress(_disambiguated_words(reader),
                      "Reading disambiguated words from corpus"))

    ambiguous_words = list(
        _ambiguous_samples(
            with_progress(disambig_words,
                          "Filtering out non-ambiguous words")))

    logger.info("Computing P(t|w)")

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
Esempio n. 5
0
def estimate_conditional_tag_probability(morph, corpus_filename):
    """
    Estimate P(t|w) from an OpenCorpora xml dump.

    Counts come from disambiguated corpus words for which ``morph.tag``
    gives more than one tag; samples whose grammemes are exactly
    ``UNKN`` are ignored. Simple Laplace smoothing is applied.

    The corpus words are streamed lazily into the frequency distribution.

    Returns a ``(cpd, cfd)`` tuple: the conditional probability
    distribution and the conditional frequency distribution.
    """
    import nltk
    import opencorpora

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        ConditionalProbDist that also passes the 'condition' variable to
        probdist_factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                freqdist = cfdist[condition]
                self[condition] = probdist_factory(freqdist, condition)

    def _ambiguous_samples(words):
        # Lazily yield (lowercased word, grammemes) for ambiguous words.
        for word, tag in words:
            if len(morph.tag(word)) <= 1:
                continue
            lowered = word.lower()
            grammemes = tag2grammemes(tag)
            if grammemes != set(['UNKN']):
                yield lowered, grammemes

    def probdist_factory(fd, condition):
        # Use at least as many bins as there are tags for the word.
        tag_count = len(morph.tag(condition))
        return nltk.LaplaceProbDist(fd, bins=max(tag_count, fd.B()))

    reader = opencorpora.CorpusReader(corpus_filename)
    ambiguous_words = _ambiguous_samples(_disambiguated_words(reader))

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    return _ConditionalProbDist(cfd, probdist_factory), cfd
Esempio n. 6
0
def estimate_conditional_tag_probability(morph, corpus_filename):
    """
    Estimate P(t|w) based on an OpenCorpora xml dump.

    Disambiguated corpus words that ``morph`` tags in more than one way
    are counted (except samples consisting of ``UNKN`` alone) and the
    counts are smoothed with simple Laplace smoothing.

    Returns ``(cpd, cfd)``: the conditional probability distribution and
    the conditional frequency distribution it was computed from.
    """
    import nltk
    import opencorpora

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        Variant of ConditionalProbDist that gives probdist_factory the
        'condition' as a second argument.
        See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for cond in cfdist:
                self[cond] = probdist_factory(cfdist[cond], cond)

    def probdist_factory(fd, condition):
        # Never use fewer bins than the number of tags for the word.
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    reader = opencorpora.CorpusReader(corpus_filename)
    corpus_words = _disambiguated_words(reader)

    # Stage 1: keep words with several tags, lowercased, with grammemes.
    tagged = (
        (word.lower(), tag2grammemes(tag))
        for word, tag in corpus_words
        if len(morph.tag(word)) > 1
    )
    # Stage 2: drop samples that are nothing but UNKN.
    unknown_only = set(['UNKN'])
    ambiguous_words = (pair for pair in tagged if pair[1] != unknown_only)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
Esempio n. 7
0
 def grammemes(self):
     """Return the result of ``tag2grammemes`` for this object's ``tag``."""
     own_tag = self.tag
     return tag2grammemes(own_tag)
Esempio n. 8
0
 def grammemes(self):
     """Convert ``self.tag`` with ``tag2grammemes`` and return the result."""
     converted = tag2grammemes(self.tag)
     return converted