class SingleTagProbabilityEstimator(object):
    """
    Re-score parses and order tags using P(t|w) values.

    The probabilities are read from the ``p_t_given_w.intdawg`` file
    located in the dictionary folder passed to the constructor.
    """

    def __init__(self, dict_path):
        """Load the P(t|w) distribution stored under ``dict_path``."""
        dawg_file = os.path.join(dict_path, 'p_t_given_w.intdawg')
        self.p_t_given_w = ConditionalProbDistDAWG().load(dawg_file)

    def apply_to_parses(self, word, word_lower, parses):
        """
        Return ``parses`` with their scores re-estimated.

        Each parse is a ``(word, tag, normal_form, score, methods_stack)``
        tuple. When at least one tag has a non-zero P(t|w) for
        ``word_lower``, every score is replaced with that probability and
        the result is sorted with ``_score_getter`` as the key, in
        descending order. Otherwise the existing scores are divided by
        their total and the original order is kept. A falsy ``parses`` is
        returned as is.
        """
        if not parses:
            return parses

        tag_probs = [
            self.p_t_given_w.prob(word_lower, tag)
            for (_form, tag, _normal, _old_score, _stack) in parses
        ]

        if sum(tag_probs) == 0:
            # Nothing is known about P(t|w) for this word: fall back to
            # the current scores, rescaled by their total.
            scale = 1.0 / sum(map(_score_getter, parses))
            rescaled = []
            for (form, tag, normal_form, score, stack) in parses:
                rescaled.append((form, tag, normal_form, score * scale, stack))
            return rescaled

        # P(t|w) is available: it takes the place of the previous score.
        rescored = [
            (form, tag, normal_form, prob, stack)
            for (form, tag, normal_form, _old_score, stack), prob
            in zip(parses, tag_probs)
        ]
        rescored.sort(key=_score_getter, reverse=True)
        return rescored

    def apply_to_tags(self, word, word_lower, tags):
        """
        Return ``tags`` ordered by descending P(t|w) for ``word_lower``.

        A falsy ``tags`` is returned as is; otherwise a new list is built.
        """
        if not tags:
            return tags

        def tag_probability(tag):
            return self.p_t_given_w.prob(word_lower, tag)

        return sorted(tags, key=tag_probability, reverse=True)
class ProbabilityEstimator(object):
    """
    Estimate parse scores and tag order from P(t|w) data.

    The data comes from the ``p_t_given_w.intdawg`` file found in the
    dictionary folder given to the constructor.
    """

    def __init__(self, dict_path):
        """Load the P(t|w) distribution stored under ``dict_path``."""
        dawg_file = os.path.join(dict_path, 'p_t_given_w.intdawg')
        self.p_t_given_w = ConditionalProbDistDAWG().load(dawg_file)

    def apply_to_parses(self, word, word_lower, parses):
        """
        Return ``parses`` with updated scores.

        Parses are ``(word, tag, normal_form, score, methods_stack)``
        tuples. If every tag has zero P(t|w) for ``word_lower``, the
        current scores are divided by their total and the order is left
        untouched. Otherwise each score is replaced with the tag's P(t|w)
        and the list is sorted with ``_score_getter`` as the key, in
        descending order. A falsy ``parses`` is returned as is.
        """
        if not parses:
            return parses

        probabilities = []
        for (_form, tag, _normal, _old_score, _stack) in parses:
            probabilities.append(self.p_t_given_w.prob(word_lower, tag))

        if sum(probabilities) != 0:
            # P(t|w) is known for at least one tag: use it as the score.
            updated = [
                (form, tag, normal_form, prob, stack)
                for (form, tag, normal_form, _old_score, stack), prob
                in zip(parses, probabilities)
            ]
            updated.sort(key=_score_getter, reverse=True)
            return updated

        # No P(t|w) information: keep the existing estimate, rescaled by
        # the total of the current scores.
        factor = 1.0 / sum(map(_score_getter, parses))
        return [
            (form, tag, normal_form, score * factor, stack)
            for (form, tag, normal_form, score, stack) in parses
        ]

    def apply_to_tags(self, word, word_lower, tags):
        """
        Return ``tags`` ordered by descending P(t|w) for ``word_lower``.

        A falsy ``tags`` is returned as is; otherwise a new list is built.
        """
        if not tags:
            return tags

        def by_probability(tag):
            return self.p_t_given_w.prob(word_lower, tag)

        return sorted(tags, key=by_probability, reverse=True)
def build_cpd_dawg(morph, cpd, min_frequency):
    """
    Encode conditional tag probability information as a DAWG.

    Only words whose frequency distribution in ``cpd`` has at least
    ``min_frequency`` observations are considered, and words for which
    ``_all_the_same`` is true of their tag probabilities are left out.

    For each remaining word and tag the resulting DAWG stores a
    ``"word:tag"`` key with a ``probability*1000000`` integer value.
    """
    frequent_words = []
    for word, fd in cpd.items():
        if fd.freqdist().N() >= min_frequency:
            frequent_words.append(word)

    def informative_records():
        # Lazily yield (word, tag probabilities) pairs, dropping words
        # rejected by ``_all_the_same``.
        for word in frequent_words:
            tag_probs = _tag_probabilities(morph, word, cpd)
            if not _all_the_same(tag_probs):
                yield word, tag_probs

    def dawg_items():
        # Flatten the per-word mappings into ((word, tag), prob) items.
        for word, tag_probs in informative_records():
            for tag, prob in tag_probs.items():
                yield (word, tag), prob

    return ConditionalProbDistDAWG(dawg_items())
def __init__(self, dict_path):
    """
    Load the P(t|w) distribution from the dictionary folder.

    The ``p_t_given_w.intdawg`` file inside ``dict_path`` is loaded with
    ``ConditionalProbDistDAWG`` and kept as ``self.p_t_given_w``.
    """
    dawg_file = os.path.join(dict_path, 'p_t_given_w.intdawg')
    self.p_t_given_w = ConditionalProbDistDAWG().load(dawg_file)