Beispiel #1
0
 def next_for_guessed_voc_token(self, prev_tags_set: set, lword: str,
                                anals: list or set,
                                guesser: HashSuffixGuesser) -> dict:
     """Score the candidate tags in ``anals`` for ``lword`` in every context.

     For each tag, ``guesser.mapper.map(tag)`` is compared with the largest
     index of the model's tag vocabulary:

     * above that index, the tag gets the fixed pair
       ``(UNKOWN_TAG_TRANSITION, UNKNOWN_TAG_WEIGHT)`` in every context;
     * otherwise the emission score is the guesser's log probability of the
       tag for ``lword`` minus the log of the a-priori probability stored
       for the mapped tag (or ``UNKNOWN_TAG_WEIGHT`` when the guesser
       returns ``UNKNOWN_VALUE``), and the transition score comes from the
       tag transition model for that specific context.

     :param prev_tags_set: contexts to score against; each element must be
         hashable and expose ``token_list``.
     :param lword: word form handed to the guesser.
     :param anals: iterable of candidate tags.
     :param guesser: suffix guesser providing ``mapper`` and
         ``tag_log_probability``.
     :return: ``{prev_tags: {tag: (transition_prob, emission_prob)}}``.
         Every context owns its own inner dict, so a transition score
         stored for one context never leaks into another.
     """
     rrr = dict()
     for tag in anals:
         new_tag = guesser.mapper.map(tag)
         if new_tag > self.model.data.tag_vocabulary.max_index():
             # The mapped tag lies outside the model's tag vocabulary:
             # fall back to the fixed weights, identical for all contexts.
             for prev_tags in prev_tags_set:
                 rrr.setdefault(prev_tags, dict())[tag] = (
                     UNKOWN_TAG_TRANSITION, UNKNOWN_TAG_WEIGHT)
         else:
             apriori_prob = self.model.compiled_data.apriori_tag_probs[
                 new_tag]
             log_apriori_prob = math.log(apriori_prob)
             tag_log_prob = guesser.tag_log_probability(lword, tag)
             if tag_log_prob == UNKNOWN_VALUE:
                 emission_prob = UNKNOWN_TAG_WEIGHT
             else:
                 emission_prob = tag_log_prob - log_apriori_prob
             for prev_tags in prev_tags_set:
                 transition_prob = self.model.compiled_data.tag_transition_model.log_prob(
                     prev_tags.token_list, tag)
                 # A single dict shared by all contexts would be overwritten
                 # here on every iteration, leaving each context with the
                 # transition score of whichever context came last.
                 rrr.setdefault(prev_tags, dict())[tag] = (
                     transition_prob, emission_prob)
     return rrr
 def next_for_guessed_voc_token(self, prev_tags_set: set,
                                lword: str,
                                anals: list or set,
                                guesser: HashSuffixGuesser) -> dict:
     """Score the candidate tags in ``anals`` for ``lword`` in every context.

     For each tag, ``guesser.mapper.map(tag)`` is compared with the largest
     index of the model's tag vocabulary:

     * above that index, the tag gets the fixed pair
       ``(UNKOWN_TAG_TRANSITION, UNKNOWN_TAG_WEIGHT)`` in every context;
     * otherwise the emission score is the guesser's log probability of the
       tag for ``lword`` minus the log of the a-priori probability stored
       for the mapped tag (or ``UNKNOWN_TAG_WEIGHT`` when the guesser
       returns ``UNKNOWN_VALUE``), and the transition score comes from the
       tag transition model for that specific context.

     :param prev_tags_set: contexts to score against; each element must be
         hashable and expose ``token_list``.
     :param lword: word form handed to the guesser.
     :param anals: iterable of candidate tags.
     :param guesser: suffix guesser providing ``mapper`` and
         ``tag_log_probability``.
     :return: ``{prev_tags: {tag: (transition_prob, emission_prob)}}``.
         Every context owns its own inner dict, so a transition score
         stored for one context never leaks into another.
     """
     rrr = dict()
     for tag in anals:
         new_tag = guesser.mapper.map(tag)
         if new_tag > self.model.data.tag_vocabulary.max_index():
             # The mapped tag lies outside the model's tag vocabulary:
             # fall back to the fixed weights, identical for all contexts.
             for prev_tags in prev_tags_set:
                 rrr.setdefault(prev_tags, dict())[tag] = (
                     UNKOWN_TAG_TRANSITION, UNKNOWN_TAG_WEIGHT)
         else:
             apriori_prob = self.model.compiled_data.apriori_tag_probs[new_tag]
             log_apriori_prob = math.log(apriori_prob)
             tag_log_prob = guesser.tag_log_probability(lword, tag)
             if tag_log_prob == UNKNOWN_VALUE:
                 emission_prob = UNKNOWN_TAG_WEIGHT
             else:
                 emission_prob = tag_log_prob - log_apriori_prob
             for prev_tags in prev_tags_set:
                 transition_prob = self.model.compiled_data.tag_transition_model.log_prob(
                     prev_tags.token_list, tag)
                 # A single dict shared by all contexts would be overwritten
                 # here on every iteration, leaving each context with the
                 # transition score of whichever context came last.
                 rrr.setdefault(prev_tags, dict())[tag] = (
                     transition_prob, emission_prob)
     return rrr
Beispiel #3
0
 def next_for_guessed_oov_token(self, prev_tags_set: set, lword: str,
                                guesser: HashSuffixGuesser):
     """Score the guesser's pruned tag candidates for ``lword`` per context.

     The guesser's tag log probabilities for ``lword`` are reduced with
     ``self.prune_guessed_tags``; every surviving ``(tag, log_prob)`` pair
     is then scored against each context in ``prev_tags_set``.

     :param prev_tags_set: contexts to score against; each element must be
         hashable and expose ``token_list``.
     :param lword: word form handed to the guesser.
     :param guesser: suffix guesser providing ``tag_log_probabilities``.
     :return: ``{prev_tags: {tag: (transition_prob, emission_prob)}}`` where
         ``emission_prob`` is the guessed log probability minus the log of
         the tag's a-priori probability. Each context gets its own inner
         dict.
     """
     rrr = dict()
     guessed_tags = guesser.tag_log_probabilities(lword)
     pruned_guessed_tags = self.prune_guessed_tags(guessed_tags)
     for prev_tags in prev_tags_set:
         # Build a fresh dict for every context: reusing one dict across
         # contexts would make all of them report the transition scores of
         # the context that happened to be processed last.
         tag_probs = dict()
         for tag, emission_prob in pruned_guessed_tags:
             tag_trans_prob = self.model.compiled_data.tag_transition_model.log_prob(
                 prev_tags.token_list, tag)
             apriori_prob = math.log(
                 self.model.compiled_data.apriori_tag_probs[tag])
             tag_probs[tag] = (tag_trans_prob, emission_prob - apriori_prob)
         rrr[prev_tags] = tag_probs
     return rrr
 def next_for_guessed_oov_token(self, prev_tags_set: set,
                                lword: str,
                                guesser: HashSuffixGuesser):
     """Score the guesser's pruned tag candidates for ``lword`` per context.

     The guesser's tag log probabilities for ``lword`` are reduced with
     ``self.prune_guessed_tags``; every surviving ``(tag, log_prob)`` pair
     is then scored against each context in ``prev_tags_set``.

     :param prev_tags_set: contexts to score against; each element must be
         hashable and expose ``token_list``.
     :param lword: word form handed to the guesser.
     :param guesser: suffix guesser providing ``tag_log_probabilities``.
     :return: ``{prev_tags: {tag: (transition_prob, emission_prob)}}`` where
         ``emission_prob`` is the guessed log probability minus the log of
         the tag's a-priori probability. Each context gets its own inner
         dict.
     """
     rrr = dict()
     guessed_tags = guesser.tag_log_probabilities(lword)
     pruned_guessed_tags = self.prune_guessed_tags(guessed_tags)
     for prev_tags in prev_tags_set:
         # Build a fresh dict for every context: reusing one dict across
         # contexts would make all of them report the transition scores of
         # the context that happened to be processed last.
         tag_probs = dict()
         for tag, emission_prob in pruned_guessed_tags:
             tag_trans_prob = self.model.compiled_data.tag_transition_model.log_prob(
                 prev_tags.token_list, tag)
             apriori_prob = math.log(self.model.compiled_data.apriori_tag_probs[tag])
             tag_probs[tag] = (tag_trans_prob, emission_prob - apriori_prob)
         rrr[prev_tags] = tag_probs
     return rrr
Beispiel #5
0
 def prune_guessed_tags(self, guessed_tags: dict) -> set:  # set of pairs
     """Keep only the most probable guessed tags.

     Keeps the highest-probability tags so that unlikely guesses do not
     mislead the tagging of unknown words. See Brants (2000), "TnT - A
     Statistical Part-of-Speech Tagger", section 2.3, item 4.

     A tag survives when its value is strictly greater than the value of
     the tag returned by ``HashSuffixGuesser.max_probability_tag`` minus
     ``self.suf_theta``. If more than ``self.max_guessed_tags`` tags
     survive, only the ``self.max_guessed_tags`` highest-valued ones are
     kept.

     :param guessed_tags: mapping of tag to its guessed value.
     :return: set of ``(tag, value)`` pairs.
     """
     max_tag = HashSuffixGuesser.max_probability_tag(guessed_tags)
     min_val = guessed_tags[max_tag] - self.suf_theta
     survivors = [(k, v) for k, v in guessed_tags.items() if v > min_val]
     if len(survivors) > self.max_guessed_tags:
         # Sort best-first and cut off the tail. Removing entries from the
         # front of the descending list would throw away the best
         # candidates and keep the weakest ones.
         survivors.sort(key=lambda ent: ent[1], reverse=True)
         del survivors[max(self.max_guessed_tags, 0):]
     return set(survivors)
 def prune_guessed_tags(self, guessed_tags: dict) -> set:  # set of pairs
     """Keep only the most probable guessed tags.

     Keeps the highest-probability tags so that unlikely guesses do not
     mislead the tagging of unknown words. See Brants (2000), "TnT - A
     Statistical Part-of-Speech Tagger", section 2.3, item 4.

     A tag survives when its value is strictly greater than the value of
     the tag returned by ``HashSuffixGuesser.max_probability_tag`` minus
     ``self.suf_theta``. If more than ``self.max_guessed_tags`` tags
     survive, only the ``self.max_guessed_tags`` highest-valued ones are
     kept.

     :param guessed_tags: mapping of tag to its guessed value.
     :return: set of ``(tag, value)`` pairs.
     """
     max_tag = HashSuffixGuesser.max_probability_tag(guessed_tags)
     min_val = guessed_tags[max_tag] - self.suf_theta
     survivors = [(k, v) for k, v in guessed_tags.items() if v > min_val]
     if len(survivors) > self.max_guessed_tags:
         # Sort best-first and cut off the tail. Removing entries from the
         # front of the descending list would throw away the best
         # candidates and keep the weakest ones.
         survivors.sort(key=lambda ent: ent[1], reverse=True)
         del survivors[max(self.max_guessed_tags, 0):]
     return set(survivors)
Beispiel #7
0
 def create_guesser(self, theta: float) -> HashSuffixGuesser:
     """Build a ``HashSuffixGuesser`` from ``self.representation`` and ``theta``.

     :param theta: value passed through unchanged as the guesser's second
         constructor argument.
     :return: the newly constructed guesser.
     """
     guesser = HashSuffixGuesser(self.representation, theta)
     return guesser