Example 1
def estimate_conditional_tag_probability(morph, corpus_filename, logger=None):
    """
    Estimate P(t|w) based on an OpenCorpora XML dump.

    Probability is estimated based on counts of disambiguated
    ambiguous words, using simple Laplace smoothing.
    """
    # ``with_progress``, ``_disambiguated_words`` and ``tag2grammemes`` are
    # helpers defined elsewhere in the surrounding module.
    import logging
    import nltk
    import opencorpora

    if logger is None:
        logger = logging.getLogger(__name__)

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        This ConditionalProbDist subclass passes 'condition' variable to
        probdist_factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                self[condition] = probdist_factory(cfdist[condition], condition)

    reader = opencorpora.CorpusReader(corpus_filename)

    disambig_words = list(
        with_progress(
            _disambiguated_words(reader),
            "Reading disambiguated words from corpus"
        )
    )

    disambig_words = with_progress(disambig_words, "Filtering out non-ambiguous words")
    ambiguous_words = [
        (w, gr) for (w, gr) in (
            (w.lower(), tag2grammemes(t))
            for (w, t) in disambig_words
            if len(morph.tag(w)) > 1
        ) if gr != set(['UNKN'])
    ]

    logger.info("Computing P(t|w)")

    def probdist_factory(fd, condition):
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
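
A minimal usage sketch for the function above (not part of the original source). It assumes pymorphy2, nltk and opencorpora-tools are installed; the corpus path is a placeholder. Conditions in the returned ConditionalProbDist are lowercase word forms, and samples are the grammeme sets produced by tag2grammemes.

import pymorphy2

morph = pymorphy2.MorphAnalyzer()
# "annot.opcorpora.xml" is a hypothetical local path to an OpenCorpora dump.
cpd, cfd = estimate_conditional_tag_probability(morph, "annot.opcorpora.xml")

word = "стали"  # ambiguous form: a noun form ("сталь") or a verb form ("стать")
if word in cpd:
    for grammemes in cpd[word].samples():
        print(grammemes, cpd[word].prob(grammemes))
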
Example 2
def _suffixes_prediction_data(words, paradigm_popularity, gramtab, paradigms, suffixes,
                              min_ending_freq, min_paradigm_popularity, max_suffix_length,
                              paradigm_prefixes):

    # ``logger``, ``collections``, ``with_progress``, ``_popular_keys`` and
    # ``_get_suffixes_dawg_data`` are defined at module level in the original source.
    logger.debug('calculating prediction data: removing non-productive paradigms..')
    productive_paradigms = _popular_keys(paradigm_popularity, min_paradigm_popularity)

    # ["suffix"] => number of occurrences
    # this is for removing non-productive suffixes
    ending_counts = collections.defaultdict(int)

    # [form_prefix_id]["suffix"]["POS"][(para_id, idx)] => number of occurrences
    # this is for selecting most popular parses
    prefix_endings = {}
    for form_prefix_id in range(len(paradigm_prefixes)):
        prefix_endings[form_prefix_id] = collections.defaultdict(
                                    lambda: collections.defaultdict(
                                        lambda: collections.defaultdict(int)))

    logger.debug('calculating prediction data: checking word endings..')
    for word, (para_id, idx) in with_progress(words, "Checking word endings"):

        if para_id not in productive_paradigms:
            continue

        paradigm = paradigms[para_id]

        form_count = len(paradigm) // 3

        tag = gramtab[paradigm[form_count + idx]]
        form_prefix_id = paradigm[2*form_count + idx]
        form_prefix = paradigm_prefixes[form_prefix_id]
        form_suffix = suffixes[paradigm[idx]]

        assert len(word) >= len(form_prefix+form_suffix), word
        assert word.startswith(form_prefix), word
        assert word.endswith(form_suffix), word

        if len(word) == len(form_prefix) + len(form_suffix):
            # pseudo-paradigms are useless for prediction
            continue

        POS = tuple(tag.replace(' ', ',', 1).split(','))[0]

        for i in range(max(len(form_suffix), 1), max_suffix_length+1): #was: 1,2,3,4,5
            word_end = word[-i:]
            ending_counts[word_end] += 1
            prefix_endings[form_prefix_id][word_end][POS][(para_id, idx)] += 1

    dawgs_data = []

    for form_prefix_id in sorted(prefix_endings.keys()):
        logger.debug('calculating prediction data: preparing DAFSA #%d..' % form_prefix_id)
        endings = prefix_endings[form_prefix_id]
        dawgs_data.append(
            _get_suffixes_dawg_data(endings, ending_counts, min_ending_freq)
        )

    return dawgs_data
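
The indexing above assumes that each paradigm is stored as a flat array: a paradigm with N forms holds 3*N numbers, namely N suffix ids, then N gramtab ids, then N prefix ids. The following toy illustration (made-up tables, not real dictionary data) shows that layout and the POS extraction:

# Hypothetical tables standing in for suffixes / gramtab / paradigm_prefixes.
suffixes = ["", "а", "у"]
gramtab = ["NOUN,masc sing,nomn", "NOUN,masc sing,gent", "NOUN,masc sing,datv"]
paradigm_prefixes = ["", "по"]

#           suffix ids | tag ids  | prefix ids
paradigm = [0, 1, 2,     0, 1, 2,   0, 0, 0]
form_count = len(paradigm) // 3                                  # 3 forms

idx = 1                                                          # second form
form_suffix = suffixes[paradigm[idx]]                            # "а"
tag = gramtab[paradigm[form_count + idx]]                        # "NOUN,masc sing,gent"
form_prefix = paradigm_prefixes[paradigm[2 * form_count + idx]]  # ""
POS = tag.replace(' ', ',', 1).split(',')[0]                     # "NOUN"
print(form_prefix, form_suffix, tag, POS)
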
Example 3
def parse_opencorpora_xml(filename):
    """
    Parse OpenCorpora dict XML and return a ``ParsedDictionary`` namedtuple.
    """

    # ``iterparse``, ``logger``, ``get_dictionary_info``, ``xml_clear_elem``,
    # ``with_progress``, ``_word_forms_from_xml_elem`` and ``ParsedDictionary``
    # are imported/defined elsewhere in the surrounding module.
    links = []
    lexemes = {}
    grammemes = []

    version, revision = get_dictionary_info(filename)
    logger.info("dictionary v%s, rev%s", version, revision)
    interesting_tags = set(['grammeme', 'lemma', 'link'])

    def _parse(filename):
        for ev, elem in iterparse(filename):
            if elem.tag not in interesting_tags:
                continue
            yield ev, elem

    logger.info("parsing XML dictionary")

    for ev, elem in with_progress(_parse(filename), "XML parsing"):
        if elem.tag == 'grammeme':
            name = elem.find('name').text
            parent = elem.get('parent')
            alias = elem.find('alias').text
            description = elem.find('description').text

            grammeme = (name, parent, alias, description)
            grammemes.append(grammeme)
            xml_clear_elem(elem)

        elif elem.tag == 'lemma':
            lex_id, word_forms = _word_forms_from_xml_elem(elem)
            lexemes[lex_id] = word_forms
            xml_clear_elem(elem)

        elif elem.tag == 'link':
            link_tuple = (
                elem.get('from'),
                elem.get('to'),
                elem.get('type'),
            )
            links.append(link_tuple)
            xml_clear_elem(elem)

    return ParsedDictionary(
        lexemes=lexemes,
        links=links,
        grammemes=grammemes,
        version=version,
        revision=revision
    )
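
The parser above streams the dump with iterparse and frees processed elements to keep memory bounded; xml_clear_elem is assumed to do roughly what elem.clear() does below. A self-contained sketch of that pattern on a tiny made-up document shaped like the <grammeme> entries read above:

import io
from xml.etree.ElementTree import iterparse

xml = io.BytesIO(b"""
<dictionary version="0.92" revision="123456">
  <grammemes>
    <grammeme parent="">
      <name>POST</name><alias>x</alias><description>part of speech</description>
    </grammeme>
  </grammemes>
</dictionary>
""")

for event, elem in iterparse(xml):        # yields 'end' events by default
    if elem.tag == 'grammeme':
        print(elem.find('name').text, elem.get('parent'),
              elem.find('alias').text, elem.find('description').text)
        elem.clear()                      # drop the already-processed subtree
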
Example 4
def simplify_tags(parsed_dict, skip_space_ambiguity=True):
    """
    Simplify tags in ``parsed_dict``; ``parsed_dict`` is modified in place.
    """
    logger.info("simplifying tags: looking for tag spellings")
    spellings = _get_tag_spellings(parsed_dict)

    logger.info(
        "simplifying tags: looking for spelling duplicates "
        "(skip_space_ambiguity: %s)", skip_space_ambiguity)
    tag_replaces = _get_duplicate_tag_replaces(spellings, skip_space_ambiguity)
    logger.debug("%d duplicate tags will be removed", len(tag_replaces))

    logger.info("simplifying tags: fixing")
    for lex_id in with_progress(parsed_dict.lexemes, "Simplifying tags"):
        new_lexeme = [(word, _simplify_tag(tag, tag_replaces))
                      for word, tag in parsed_dict.lexemes[lex_id]]
        parsed_dict.lexemes[lex_id] = new_lexeme
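
_simplify_tag and tag_replaces are not shown in this snippet, so the sketch below uses a plain dict lookup as a hypothetical stand-in; it only illustrates the rewrite-each-lexeme-in-place pattern used above.

# Hypothetical data: two spellings of the same grammeme set,
# differing only in grammeme order.
lexemes = {
    1: [("ёж", "NOUN,anim,masc sing,nomn"),
        ("ежи", "NOUN,anim,masc plur,nomn")],
    2: [("кот", "NOUN,anim,masc nomn,sing")],  # duplicate spelling of lexeme 1's first tag
}
tag_replaces = {"NOUN,anim,masc nomn,sing": "NOUN,anim,masc sing,nomn"}

for lex_id in lexemes:
    lexemes[lex_id] = [(word, tag_replaces.get(tag, tag))
                       for word, tag in lexemes[lex_id]]

print(lexemes[2])  # [('кот', 'NOUN,anim,masc sing,nomn')]
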
Example 5
def _itertags(parsed_dict):
    for lex_id in with_progress(parsed_dict.lexemes,
                                "Looking for tag spellings"):
        for word, tag in parsed_dict.lexemes[lex_id]:
            yield tag
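
One plausible consumer of this tag stream (hypothetical; the real _get_tag_spellings is not shown here) simply counts how often each tag string occurs across all lexemes:

import collections

def count_tag_spellings(parsed_dict):
    # Count every tag string yielded by _itertags.
    return collections.Counter(_itertags(parsed_dict))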