Ejemplo n.º 1
0
class SingleTagProbabilityEstimator(object):
    """Re-rank parses and tags of a word using P(tag | word) estimates.

    The estimates are read from a ``ConditionalProbDistDAWG`` loaded from
    the ``p_t_given_w.intdawg`` file of a dictionary directory.
    """

    def __init__(self, dict_path):
        """Load the conditional distribution stored under ``dict_path``."""
        dawg_file = os.path.join(dict_path, 'p_t_given_w.intdawg')
        self.p_t_given_w = ConditionalProbDistDAWG().load(dawg_file)

    def apply_to_parses(self, word, word_lower, parses):
        """Replace the score of every parse with its P(tag | word) estimate.

        ``parses`` is a sequence of
        ``(word, tag, normal_form, score, methods_stack)`` tuples. An empty
        ``parses`` is returned unchanged.

        When the looked-up probabilities sum to zero, the parses keep their
        order and their scores are multiplied by one over the sum of
        ``_score_getter`` values (presumably the score field -- verify
        against its definition). That division raises ``ZeroDivisionError``
        if the sum is zero. Otherwise a new list is returned in which each
        score is replaced by the looked-up probability, ordered by
        descending ``_score_getter``.
        """
        if not parses:
            return parses

        estimates = []
        for (_form, tag, _normal_form, _score, _stack) in parses:
            estimates.append(self.p_t_given_w.prob(word_lower, tag))

        if sum(estimates) == 0:
            # no P(t|w) information is available; return normalized estimate
            norm = 1.0 / sum(map(_score_getter, parses))
            rescaled = []
            for (form, tag, normal_form, score, methods_stack) in parses:
                rescaled.append(
                    (form, tag, normal_form, score * norm, methods_stack)
                )
            return rescaled

        # replace score with P(t|w) probability
        replaced = []
        for parse, estimate in zip(parses, estimates):
            form, tag, normal_form, _score, methods_stack = parse
            replaced.append((form, tag, normal_form, estimate, methods_stack))
        replaced.sort(key=_score_getter, reverse=True)
        return replaced

    def apply_to_tags(self, word, word_lower, tags):
        """Return ``tags`` ordered by descending P(tag | ``word_lower``).

        An empty ``tags`` is returned unchanged; ``word`` is not used.
        """
        if not tags:
            return tags

        def tag_probability(tag):
            return self.p_t_given_w.prob(word_lower, tag)

        return sorted(tags, key=tag_probability, reverse=True)
Ejemplo n.º 2
0
class ProbabilityEstimator(object):
    """Score parses and order tags by conditional tag probability P(t|w).

    Probabilities come from the ``ConditionalProbDistDAWG`` stored in
    ``p_t_given_w.intdawg`` inside the dictionary directory.
    """

    def __init__(self, dict_path):
        """Load ``p_t_given_w.intdawg`` located in ``dict_path``."""
        distribution_path = os.path.join(dict_path, 'p_t_given_w.intdawg')
        self.p_t_given_w = ConditionalProbDistDAWG().load(distribution_path)

    def apply_to_parses(self, word, word_lower, parses):
        """Return ``parses`` re-scored with P(t|w) for ``word_lower``.

        Each parse is a ``(word, tag, normal_form, score, methods_stack)``
        tuple. Empty input is handed back as is. If the probabilities of
        all tags sum to zero, the original order is kept and every score is
        scaled by ``1.0 / sum(map(_score_getter, parses))`` (this raises
        ``ZeroDivisionError`` when that sum is zero). Otherwise the score
        of each parse is replaced with its probability and the result is
        sorted by ``_score_getter`` in descending order.
        """
        if not parses:
            return parses

        prob_of = self.p_t_given_w.prob
        probs = []
        for (_form, tag, _normal_form, _score, _methods_stack) in parses:
            probs.append(prob_of(word_lower, tag))

        if sum(probs) != 0:
            # replace score with P(t|w) probability
            rescored = [
                (form, tag, normal_form, prob, methods_stack)
                for (form, tag, normal_form, _score, methods_stack), prob
                in zip(parses, probs)
            ]
            rescored.sort(key=_score_getter, reverse=True)
            return rescored

        # no P(t|w) information is available; return normalized estimate
        scale = 1.0 / sum(map(_score_getter, parses))
        normalized = []
        for (form, tag, normal_form, score, methods_stack) in parses:
            normalized.append(
                (form, tag, normal_form, score * scale, methods_stack)
            )
        return normalized

    def apply_to_tags(self, word, word_lower, tags):
        """Sort ``tags`` from most to least probable for ``word_lower``.

        Empty input is handed back as is; ``word`` itself is not consulted.
        """
        if not tags:
            return tags

        def by_probability(tag):
            return self.p_t_given_w.prob(word_lower, tag)

        return sorted(tags, key=by_probability, reverse=True)
Ejemplo n.º 3
0
def build_cpd_dawg(morph, cpd, min_frequency):
    """
    Encode conditional tag probabilities as a ``ConditionalProbDistDAWG``.

    A word is kept when its frequency distribution in ``cpd`` has at least
    ``min_frequency`` outcomes and ``_all_the_same`` is false for its tag
    probabilities. For every kept word, each ``(word, tag)`` pair and its
    probability are passed to ``ConditionalProbDistDAWG``.

    The resulting DAWG is documented to store a ``"word:tag"`` key with a
    ``probability*1000000`` integer value; that encoding is done by
    ``ConditionalProbDistDAWG``, not by this function.
    """
    frequent_words = []
    for candidate, dist in cpd.items():
        if dist.freqdist().N() >= min_frequency:
            frequent_words.append(candidate)

    def iter_records():
        # Lazily yield ((word, tag), probability) for the words worth storing.
        for word in frequent_words:
            tag_probs = _tag_probabilities(morph, word, cpd)
            if _all_the_same(tag_probs):
                continue
            for tag, prob in tag_probs.items():
                yield (word, tag), prob

    return ConditionalProbDistDAWG(iter_records())
Ejemplo n.º 4
0
 def __init__(self, dict_path):
     """Load the ``p_t_given_w.intdawg`` distribution found in ``dict_path``."""
     dawg_file = os.path.join(dict_path, 'p_t_given_w.intdawg')
     loaded = ConditionalProbDistDAWG().load(dawg_file)
     self.p_t_given_w = loaded
Ejemplo n.º 5
0
 def __init__(self, dict_path):
     """Read ``p_t_given_w.intdawg`` from ``dict_path`` into ``p_t_given_w``."""
     distribution_path = os.path.join(dict_path, 'p_t_given_w.intdawg')
     distribution = ConditionalProbDistDAWG().load(distribution_path)
     self.p_t_given_w = distribution