def next_for_guessed_voc_token(self, prev_tags_set: set, lword: str, anals: list or set, guesser: HashSuffixGuesser) -> dict:
    """Compute (transition, emission) log-probability pairs for a guessed but
    in-vocabulary token.

    For every context in ``prev_tags_set`` and every analysis tag in ``anals``
    the result maps ``prev_tags -> {tag: (transition_log_prob, emission_log_prob)}``.
    Returns an empty dict when ``anals`` is empty.

    NOTE(review): this method is defined twice in this file; the later
    duplicate definition shadows this one — one copy should be removed.
    """
    rrr = dict()
    for tag in anals:
        new_tag = guesser.mapper.map(tag)
        if new_tag > self.model.data.tag_vocabulary.max_index():
            # Tag unseen during training: fall back to fixed weights,
            # identical for every context.
            # NOTE(review): UNKOWN_TAG_TRANSITION is a misspelled project
            # constant (sic) — kept as-is to match its definition.
            for prev_tags in prev_tags_set:
                rrr.setdefault(prev_tags, dict())[tag] = (UNKOWN_TAG_TRANSITION, UNKNOWN_TAG_WEIGHT)
        else:
            # Emission: suffix-guesser log-probability corrected by the tag's
            # apriori log-probability (Bayesian inversion, as in TnT).
            apriori_prob = self.model.compiled_data.apriori_tag_probs[new_tag]
            log_apriori_prob = math.log(apriori_prob)
            tag_log_prob = guesser.tag_log_probability(lword, tag)
            if tag_log_prob == UNKNOWN_VALUE:
                emission_prob = UNKNOWN_TAG_WEIGHT
            else:
                emission_prob = tag_log_prob - log_apriori_prob
            # Bug fix: the original shared ONE tag_probs dict between every
            # prev_tags key of rrr, so the context-dependent transition
            # probability written for one context clobbered the others.
            # setdefault gives each context its own dict.
            for prev_tags in prev_tags_set:
                transition_prob = self.model.compiled_data.tag_transition_model.log_prob(prev_tags.token_list, tag)
                rrr.setdefault(prev_tags, dict())[tag] = (transition_prob, emission_prob)
    return rrr
def next_for_guessed_voc_token(self, prev_tags_set: set, lword: str, anals: list or set, guesser: HashSuffixGuesser) -> dict:
    """Return per-context (transition, emission) log-probability pairs for a
    guessed token whose mapped tag may or may not be in the training vocabulary.

    Result shape: ``{prev_tags: {tag: (transition_log_prob, emission_log_prob)}}``;
    empty when ``anals`` is empty.

    NOTE(review): this def appears twice in this file (this later copy shadows
    the earlier, identical one) — one copy should be removed.
    """
    rrr = dict()
    for tag in anals:
        new_tag = guesser.mapper.map(tag)
        if new_tag > self.model.data.tag_vocabulary.max_index():
            # Unknown mapped tag: fixed fallback weights for every context.
            # NOTE(review): UNKOWN_TAG_TRANSITION is a misspelled project
            # constant (sic) — kept to match its definition elsewhere.
            for prev_tags in prev_tags_set:
                rrr.setdefault(prev_tags, dict())[tag] = (UNKOWN_TAG_TRANSITION, UNKNOWN_TAG_WEIGHT)
        else:
            # Emission = guesser log P(tag | word) minus the apriori tag
            # log-probability (TnT-style correction).
            apriori_prob = self.model.compiled_data.apriori_tag_probs[new_tag]
            log_apriori_prob = math.log(apriori_prob)
            tag_log_prob = guesser.tag_log_probability(lword, tag)
            if tag_log_prob == UNKNOWN_VALUE:
                emission_prob = UNKNOWN_TAG_WEIGHT
            else:
                emission_prob = tag_log_prob - log_apriori_prob
            # Bug fix: the original aliased a single tag_probs dict under every
            # prev_tags key, so per-context transition probabilities overwrote
            # each other; each context now owns its dict.
            for prev_tags in prev_tags_set:
                transition_prob = self.model.compiled_data.tag_transition_model.log_prob(prev_tags.token_list, tag)
                rrr.setdefault(prev_tags, dict())[tag] = (transition_prob, emission_prob)
    return rrr
def next_for_guessed_oov_token(self, prev_tags_set: set, lword: str, guesser: HashSuffixGuesser):
    """Compute (transition, emission) log-probability pairs for an
    out-of-vocabulary token via the suffix guesser.

    Result shape: ``{prev_tags: {tag: (transition_log_prob, emission_log_prob)}}``.

    NOTE(review): this method is defined twice in this file; the later
    duplicate shadows this one — one copy should be removed.
    """
    rrr = dict()
    guessed_tags = guesser.tag_log_probabilities(lword)
    pruned_guessed_tags = self.prune_guessed_tags(guessed_tags)
    # Hoisted: the apriori correction of each guessed emission does not depend
    # on the context, so compute it once per tag, not once per (context, tag).
    emissions = []
    for tag, tag_log_prob in pruned_guessed_tags:
        apriori_log_prob = math.log(self.model.compiled_data.apriori_tag_probs[tag])
        emissions.append((tag, tag_log_prob - apriori_log_prob))
    # Bug fix: the original created ONE tag_probs dict before the loop and
    # assigned it to every prev_tags key, so each context's transition
    # probabilities overwrote the previous context's. Each context now gets
    # its own dict.
    for prev_tags in prev_tags_set:
        tag_probs = dict()
        for tag, emission_prob in emissions:
            tag_trans_prob = self.model.compiled_data.tag_transition_model.log_prob(prev_tags.token_list, tag)
            tag_probs[tag] = (tag_trans_prob, emission_prob)
        rrr[prev_tags] = tag_probs
    return rrr
def next_for_guessed_oov_token(self, prev_tags_set: set, lword: str, guesser: HashSuffixGuesser):
    """Return per-context (transition, emission) log-probability pairs for an
    OOV token, using the suffix guesser's pruned tag guesses.

    NOTE(review): this def appears twice in this file (this later copy shadows
    the earlier, identical one) — one copy should be removed.
    """
    rrr = dict()
    pruned = self.prune_guessed_tags(guesser.tag_log_probabilities(lword))
    # Context-independent part hoisted out of the context loop: emission is
    # the guessed log-probability corrected by the apriori tag log-probability.
    corrected = [
        (tag, log_prob - math.log(self.model.compiled_data.apriori_tag_probs[tag]))
        for tag, log_prob in pruned
    ]
    # Bug fix: the original reused a single tag_probs dict for every context,
    # so all rrr values aliased one dict and transition probabilities were
    # clobbered across contexts; build a fresh dict per context instead.
    for prev_tags in prev_tags_set:
        tag_probs = dict()
        for tag, emission_prob in corrected:
            tag_trans_prob = self.model.compiled_data.tag_transition_model.log_prob(prev_tags.token_list, tag)
            tag_probs[tag] = (tag_trans_prob, emission_prob)
        rrr[prev_tags] = tag_probs
    return rrr
def prune_guessed_tags(self, guessed_tags: dict) -> set:  # set of pairs
    """Keep only the strongest guessed tags, so weak guesses do not mislead
    unknown-word tagging ("TnT - A Statistical Part-of-Speech Tagger",
    Brants 2000, sec. 2.3, step 4).

    Drops guesses whose log-probability is more than ``self.suf_theta`` below
    the best guess, then caps the result at ``self.max_guessed_tags`` entries,
    keeping the most probable ones.

    :param guessed_tags: mapping of tag -> log-probability from the guesser
    :return: set of (tag, log_probability) pairs
    """
    max_tag = HashSuffixGuesser.max_probability_tag(guessed_tags)
    min_val = guessed_tags[max_tag] - self.suf_theta
    # Threshold filter: only guesses within suf_theta of the best survive.
    s = {(k, v) for k, v in guessed_tags.items() if v > min_val}
    if len(s) > self.max_guessed_tags:
        # Bug fix: the original sorted descending and removed entries from the
        # FRONT of that list, i.e. it discarded the MOST probable guesses when
        # over the cap. Keep the top-scoring max_guessed_tags entries instead.
        ranked = sorted(s, key=lambda ent: ent[1], reverse=True)
        s = set(ranked[:self.max_guessed_tags])
    return s
def create_guesser(self, theta: float) -> HashSuffixGuesser:
    """Build a suffix guesser over this model's representation.

    :param theta: suffix-guesser smoothing/threshold parameter
    :return: a new HashSuffixGuesser instance
    """
    representation = self.representation
    return HashSuffixGuesser(representation, theta)