Ejemplo n.º 1
0
def load_dict(path, gramtab_format='opencorpora-int'):
    """
    Load a pymorphy2 dictionary from the folder ``path``.

    ``gramtab_format`` selects both the Tag class and the gramtab file
    that are loaded. The loaded pieces are returned bundled in a
    ``LoadedDictionary``.
    """

    def _in_dict_dir(name):
        # Resolve a data file name relative to the dictionary folder.
        return os.path.join(path, name)

    meta = _load_meta(_in_dict_dir('meta.json'))
    _assert_format_is_compatible(meta, path)

    Tag = _load_tag_class(gramtab_format, _in_dict_dir('grammemes.json'))

    # The gramtab is stored as strings; wrap each one in the Tag class.
    raw_tags = _load_gramtab(meta, gramtab_format, path)
    gramtab = [Tag(raw_tag) for raw_tag in raw_tags]

    suffixes = json_read(_in_dict_dir('suffixes.json'))
    paradigm_prefixes = json_read(_in_dict_dir('paradigm-prefixes.json'))
    paradigms = _load_paradigms(_in_dict_dir('paradigms.array'))
    words = dawg.WordsDawg().load(_in_dict_dir('words.dawg'))

    prediction_prefixes = dawg.DAWG().load(
        _in_dict_dir('prediction-prefixes.dawg'))

    # One prediction-suffixes DAWG file is expected per paradigm prefix.
    prediction_suffixes_dawgs = []
    for prefix_id, _ in enumerate(paradigm_prefixes):
        suffixes_path = _in_dict_dir('prediction-suffixes-%s.dawg' % prefix_id)

        # NOTE(review): this check disappears under ``python -O``.
        assert os.path.exists(suffixes_path)

        suffixes_dawg = dawg.PredictionSuffixesDAWG().load(suffixes_path)
        prediction_suffixes_dawgs.append(suffixes_dawg)

    return LoadedDictionary(
        meta, gramtab, suffixes, paradigms, words,
        prediction_prefixes, prediction_suffixes_dawgs,
        Tag, paradigm_prefixes,
    )
Ejemplo n.º 2
0
def estimate_tag_cpd(corpus_filename,
                     out_path,
                     min_word_freq,
                     update_meta=True):
    """
    Estimate P(t|w) from ``corpus_filename`` and save it as a DAWG.

    The analyzer is created from the dictionary in ``out_path`` and the
    result is written there as ``p_t_given_w.intdawg``.
    ``min_word_freq`` is converted with ``int()`` before use. When
    ``update_meta`` is true, ``meta.json`` in ``out_path`` is extended
    with P(t|w) entries and written back.
    """
    from pymorphy2.opencorpora_dict.probability import (
        build_cpd_dawg, estimate_conditional_tag_probability)

    morph = pymorphy2.MorphAnalyzer(out_path, probability_estimator_cls=None)

    logger.info("Estimating P(t|w) from %s" % corpus_filename)
    cpd, cfd = estimate_conditional_tag_probability(morph, corpus_filename)

    logger.info("Encoding P(t|w) as DAWG")
    cpd_dawg = build_cpd_dawg(morph, cpd, int(min_word_freq))
    cpd_dawg.save(os.path.join(out_path, 'p_t_given_w.intdawg'))

    if update_meta:
        logger.info("Updating meta information")
        meta_path = os.path.join(out_path, 'meta.json')
        meta = json_read(meta_path)
        # meta is extended with (key, value) pairs.
        new_entries = [
            ('P(t|w)', True),
            ('P(t|w)_unique_words', len(cpd.conditions())),
            ('P(t|w)_outcomes', cfd.N()),
            ('P(t|w)_min_word_freq', int(min_word_freq)),
        ]
        meta.extend(new_entries)
        json_write(meta_path, meta)

    logger.info('\nDone.')
Ejemplo n.º 3
0
def estimate_tag_cpd(corpus_filename, out_path, min_word_freq, update_meta=True):
    """
    Build the P(t|w) DAWG for the dictionary stored in ``out_path``.

    Probabilities are estimated from ``corpus_filename`` and saved to
    ``p_t_given_w.intdawg`` inside ``out_path``. ``min_word_freq`` is
    passed through ``int()``. With ``update_meta`` set, P(t|w) entries
    are appended to ``meta.json`` in the same folder.
    """
    from pymorphy2.opencorpora_dict.probability import (
        build_cpd_dawg,
        estimate_conditional_tag_probability,
    )

    analyzer = pymorphy2.MorphAnalyzer(out_path, probability_estimator_cls=None)

    logger.info("Estimating P(t|w) from %s" % corpus_filename)
    cpd, cfd = estimate_conditional_tag_probability(analyzer, corpus_filename)

    logger.info("Encoding P(t|w) as DAWG")
    encoded = build_cpd_dawg(analyzer, cpd, int(min_word_freq))
    target_filename = os.path.join(out_path, 'p_t_given_w.intdawg')
    encoded.save(target_filename)

    if update_meta:
        logger.info("Updating meta information")
        meta_filename = os.path.join(out_path, 'meta.json')
        meta = json_read(meta_filename)
        # Record that P(t|w) data is present, plus basic statistics.
        meta.extend([
            ('P(t|w)', True),
            ('P(t|w)_unique_words', len(cpd.conditions())),
            ('P(t|w)_outcomes', cfd.N()),
            ('P(t|w)_min_word_freq', int(min_word_freq)),
        ])
        json_write(meta_filename, meta)

    logger.info('\nDone.')
Ejemplo n.º 4
0
def _load_gramtab(meta, gramtab_format, path):
    """ Load gramtab (a list of tags) """
    gramtab_formats = meta.get('gramtab_formats', {})
    if gramtab_format not in gramtab_formats:
        raise ValueError("This gramtab format (%s) is unavailable; available formats: %s" % (gramtab_format, gramtab_formats.keys()))

    gramtab_filename = os.path.join(path, gramtab_formats[gramtab_format])
    return json_read(gramtab_filename)
Ejemplo n.º 5
0
def _load_gramtab(meta, gramtab_format, path):
    """ Load gramtab (a list of tags) """

    gramtab_formats = meta.get('gramtab_formats', {})
    if gramtab_format not in gramtab_formats:
        raise ValueError("This gramtab format (%s) is unavailable; available formats: %s" % (gramtab_format, gramtab_formats.keys()))

    gramtab_filename = os.path.join(path, gramtab_formats[gramtab_format])
    return json_read(gramtab_filename)
Ejemplo n.º 6
0
def _load_tag_class(gramtab_format, grammemes_filename):
    """
    Load and initialize Tag class (according to ``gramtab_format``).

    The class is taken from ``tagset.registry`` and initialized with the
    grammemes read from ``grammemes_filename``. Raises ``ValueError``
    when the format is not in the registry.
    """
    registry = tagset.registry
    if gramtab_format not in registry:
        raise ValueError("This gramtab format ('%s') is unsupported." % gramtab_format)

    # FIXME: clone the class
    # The registry class itself is initialized and returned, not a copy.
    tag_cls = registry[gramtab_format]

    tag_cls._init_grammemes(json_read(grammemes_filename))
    return tag_cls
Ejemplo n.º 7
0
def _load_tag_class(gramtab_format, grammemes_filename):
    """
    Return the Tag class registered for ``gramtab_format``, initialized
    with the grammemes stored in ``grammemes_filename``.

    Raises ``ValueError`` for a format missing from ``tagset.registry``.
    """
    format_is_supported = gramtab_format in tagset.registry
    if not format_is_supported:
        message = "This gramtab format ('%s') is unsupported." % gramtab_format
        raise ValueError(message)

    # FIXME: clone the class
    # For now the class from the registry is initialized in place.
    Tag = tagset.registry[gramtab_format]

    known_grammemes = json_read(grammemes_filename)
    Tag._init_grammemes(known_grammemes)

    return Tag
Ejemplo n.º 8
0
def _load_tag_class(gramtab_format, grammemes_filename):
    """
    Load and initialize Tag class (according to ``gramtab_format``).

    Grammemes are read from ``grammemes_filename`` and passed to the
    class found in ``tagset.registry``. An unregistered format raises
    ``ValueError`` before anything is read.
    """
    registry = tagset.registry
    if gramtab_format not in registry:
        raise ValueError("This gramtab format ('%s') is unsupported." % gramtab_format)

    loaded_grammemes = json_read(grammemes_filename)
    tag_cls = registry[gramtab_format]

    # FIXME: clone the class, e.g. by subclassing it with a copied
    # KNOWN_GRAMMEMES, instead of initializing the registry class in place.
    tag_cls._init_grammemes(loaded_grammemes)
    return tag_cls
Ejemplo n.º 9
0
def _load_meta(filename):
    """
    Load metadata from ``filename`` as a mapping.

    ``parse_float=str`` is passed to ``json_read`` (presumably so floats
    stay strings; confirm against ``json_read``). The result is an
    ``OrderedDict`` when ``collections`` provides one, else a ``dict``.
    """
    raw_meta = json_read(filename, parse_float=str)
    # Fall back to a plain dict where OrderedDict does not exist.
    mapping_type = getattr(collections, 'OrderedDict', dict)
    return mapping_type(raw_meta)