class RawModelData:
    """Mutable container for the statistics gathered during training.

    Holds the tag/emission n-gram models, the lemma suffix/frequency
    tables and the case-sensitive word-form suffix trees that the
    trainer fills, and compiles them into a ``CompiledModelData``.
    """

    def __init__(self, tagging_order, emission_order):
        # Collector of training statistics.
        self.stat = Statistics()
        # N-gram model of the tags.
        self.tag_ngram_model = NGramModel(tagging_order + 1)
        # Model of the original word forms and their preceding tags.
        self.std_emission_ngram_model = NGramModel(emission_order + 1)
        # Model of the special tokens and their preceding tags.
        self.spec_emission_ngram_model = NGramModel(2)
        self.eos_tag = None
        # Lemma suffix frequency table (formerly a HashLemmaTree).
        self.lemma_suffix_tree = HashSuffixTree(100)
        # Lemma frequency table.
        self.lemma_freq_tree = HashSuffixTree(5)
        # Lemma unigram frequency table.
        self.lemma_unigram_model = LemmaUnigramModel()
        # Case-sensitive suffix tables of the word forms; filled in later
        # by the trainer (see the suffix-tree building step).
        self.lower_suffix_tree = None  # HashSuffixTree(0)
        self.upper_suffix_tree = None  # HashSuffixTree(0)
        self.lemma_lambdas = list()
        # Function-local import, presumably to dodge a circular module
        # dependency — TODO confirm before moving it to the top of the file.
        from purepos.model.combiner import default_combiner
        # LogLinearBiCombiner merging guesser- and unigram-model scores.
        self.combiner = default_combiner()

    def compile(self) -> CompiledModelData:
        """Freeze the raw counts into probability models and guessers."""
        compiled = CompiledModelData()
        compiled.unigram_lemma_model = self.lemma_unigram_model
        compiled.tag_transition_model = self.tag_ngram_model.create_probability_model()
        compiled.standard_emission_model = self.std_emission_ngram_model.create_probability_model()
        compiled.spec_tokens_emission_model = self.spec_emission_ngram_model.create_probability_model()
        compiled.apriori_tag_probs = self.tag_ngram_model.word_apriori_probs()
        # One theta, derived from the apriori tag distribution, parametrizes
        # every suffix guesser below.
        theta = HashSuffixTree.calculate_theta(compiled.apriori_tag_probs)
        compiled.lower_case_suffix_guesser = self.lower_suffix_tree.create_guesser(theta)
        compiled.upper_case_suffix_guesser = self.upper_suffix_tree.create_guesser(theta)
        compiled.lemma_guesser = self.lemma_suffix_tree.create_guesser(theta)
        compiled.suffix_lemma_model = self.lemma_freq_tree.create_guesser(theta)
        compiled.combiner = self.combiner
        return compiled
def build_suffix_trees(self):
    """Build the case-sensitive suffix trees from rare word forms.

    Called during training, after the corpus has been read: every word
    form at or below the rare-frequency threshold feeds either the
    lower-case or the upper-case suffix guesser, per tag.
    """
    raw = self.raw_model_data
    lexicon = self.data.standard_tokens_lexicon
    raw.lower_suffix_tree = HashSuffixTree(self.data.suffix_length)
    raw.upper_suffix_tree = HashSuffixTree(self.data.suffix_length)
    for word, tag_map in lexicon.representation.items():
        # Frequent words are handled by the lexicon itself; only rare
        # ones contribute to the guessers.
        if lexicon.word_count(word) > self.data.rare_frequency:
            continue
        lower_word = word.lower()
        if lower_word == word:
            for tag in tag_map.keys():
                word_tag_freq = lexicon.wordcount_for_tag(word, tag)
                raw.lower_suffix_tree.add_word(lower_word, tag, word_tag_freq)
                raw.stat.increment_lower_guesser_items(word_tag_freq)
        else:
            for tag in tag_map.keys():
                word_tag_freq = lexicon.wordcount_for_tag(word, tag)
                # NOTE(review): the lowercased form is inserted into the
                # upper-case tree as well — this mirrors the original code;
                # confirm it is intentional (vs. adding `word`).
                raw.upper_suffix_tree.add_word(lower_word, tag, word_tag_freq)
                raw.stat.increment_upper_guesser_items(word_tag_freq)
def __init__(self, tagging_order, emission_order): self.stat = Statistics() # Statistics about trainig # Címkék ngram modellje self.tag_ngram_model = NGramModel(tagging_order + 1) # Eredeti szóalakok és a megelőző cimkék modellje self.std_emission_ngram_model = NGramModel(emission_order + 1) # Speciális tokenek és a megelőző cimkék modellje self.spec_emission_ngram_model = NGramModel(2) self.eos_tag = None # Lemma suffix gyakorisági táblázat (HashLemmaTree volt.) self.lemma_suffix_tree = HashSuffixTree(100) # Lemma gyakorisági táblázat self.lemma_freq_tree = HashSuffixTree(5) # Lemma gyakorisági táblázat self.lemma_unigram_model = LemmaUnigramModel() # Szóalakok suffix gyakorisági táblázata kis- és nagybetűérzékenyen. self.lower_suffix_tree = None # HashSuffixTree(0) self.upper_suffix_tree = None # HashSuffixTree(0) self.lemma_lambdas = list() # LogLinearBiCombiner a guesserből és az unigram modellből származó adatok kombinálásához. from purepos.model.combiner import default_combiner self.combiner = default_combiner()
class RawModelData:
    """Raw (count-based) model data accumulated during training.

    Aggregates the tag/emission n-gram models, lemma tables and suffix
    trees, and compiles them into a ready-to-use ``CompiledModelData``.
    """

    def __init__(self, tagging_order, emission_order):
        """Initialize the empty containers filled during training.

        :param tagging_order: order of the tag model (``tagging_order + 1`` grams)
        :param emission_order: order of the emission model (``emission_order + 1`` grams)
        """
        self.stat = Statistics()  # Statistics about training
        # N-gram model of the tags
        self.tag_ngram_model = NGramModel(tagging_order + 1)
        # Model of the original word forms and the preceding tags
        self.std_emission_ngram_model = NGramModel(emission_order + 1)
        # Model of the special tokens and the preceding tags
        self.spec_emission_ngram_model = NGramModel(2)
        self.eos_tag = None
        # Lemma suffix frequency table (formerly a HashLemmaTree)
        self.lemma_suffix_tree = HashSuffixTree(100)
        # Lemma frequency table
        self.lemma_freq_tree = HashSuffixTree(5)
        # Lemma unigram frequency table
        self.lemma_unigram_model = LemmaUnigramModel()
        # Case-sensitive suffix frequency tables of the word forms;
        # None until the suffix-tree building step runs.
        self.lower_suffix_tree = None  # HashSuffixTree(0)
        self.upper_suffix_tree = None  # HashSuffixTree(0)
        self.lemma_lambdas = list()
        # LogLinearBiCombiner for combining guesser- and unigram-model data.
        # Local import — presumably avoids a circular import; TODO confirm.
        from purepos.model.combiner import default_combiner
        self.combiner = default_combiner()

    def compile(self) -> CompiledModelData:
        """Compile the raw counts into probability models and suffix guessers."""
        c = CompiledModelData()
        c.unigram_lemma_model = self.lemma_unigram_model
        c.tag_transition_model = self.tag_ngram_model.create_probability_model()
        c.standard_emission_model = self.std_emission_ngram_model.create_probability_model()
        c.spec_tokens_emission_model = self.spec_emission_ngram_model.create_probability_model()
        c.apriori_tag_probs = self.tag_ngram_model.word_apriori_probs()
        # The same theta (from the apriori tag probabilities) drives all guessers.
        theta = HashSuffixTree.calculate_theta(c.apriori_tag_probs)
        c.lower_case_suffix_guesser = self.lower_suffix_tree.create_guesser(theta)
        c.upper_case_suffix_guesser = self.upper_suffix_tree.create_guesser(theta)
        c.lemma_guesser = self.lemma_suffix_tree.create_guesser(theta)
        c.suffix_lemma_model = self.lemma_freq_tree.create_guesser(theta)
        c.combiner = self.combiner
        return c
def calculate_params(self, doc: Document, raw_modeldata: RawModelData, modeldata: ModelData):
    """Estimate the interpolation lambdas for the three lemma models.

    For every token in *doc*, compares how well the unigram model, the
    suffix guesser and the lemma-frequency model predict the gold lemma
    relative to each model's own best candidate; the model with the
    clearest advantage has its lambda increased by that margin. The three
    lambdas are then normalized to sum to 1 and appended to
    ``self.lambdas`` in the order: unigram, suffix, lemma-frequency.
    """
    apriori_probs = raw_modeldata.tag_ngram_model.word_apriori_probs()
    theta = HashSuffixTree.calculate_theta(apriori_probs)
    lemma_suffix_guesser = raw_modeldata.lemma_suffix_tree.create_guesser(theta)
    lemma_prob = raw_modeldata.lemma_freq_tree.create_guesser(theta)
    lemma_unigram_model = raw_modeldata.lemma_unigram_model
    # Start from 1.0 (additive smoothing) so no lambda can end up zero.
    lambda_s = 1.0
    lambda_u = 1.0
    lambda_l = 1.0
    for sentence in doc.sentences():
        for tok in sentence:
            # Candidate lemmas (as tokens) with their suffix-guesser scores.
            suffix_probs = lemma.batch_convert(lemma_suffix_guesser.tag_log_probabilities(
                tok.token), tok.token, modeldata.tag_vocabulary)
            # Score the same candidates with the unigram model...
            uni_probs = dict()
            for t in suffix_probs.keys():
                uniscore = lemma_unigram_model.log_prob(t.stem)
                uni_probs[t] = uniscore
            # ...and with the lemma-frequency model (keyed by main POS tag).
            lemma_probs = dict()
            for t in suffix_probs.keys():
                lemma_score = lemma_prob.tag_log_probability(t.stem, lemma.main_pos_tag(t.tag))
                lemma_probs[t] = lemma_score
            # Each model's best candidate and its score.
            uni_max = max(uni_probs.items(), key=lambda e: e[1])
            # suffix_probs values are pairs; the log-probability is at index 1.
            t = max(suffix_probs.items(), key=lambda e: e[1][1])
            suffix_max = (t[0], t[1][1])
            lemma_max = max(lemma_probs.items(), key=lambda e: e[1])
            # Scores each model assigns to the gold token itself.
            act_uni_prob = lemma_unigram_model.log_prob(tok.stem)
            act_lemma_prob = lemma_prob.tag_log_probability(tok.stem, lemma.main_pos_tag(
                tok.tag))
            if tok in suffix_probs.keys():
                act_suff_prob = suffix_probs[tok][1]
            else:
                # Gold lemma not among the guesser's candidates.
                act_suff_prob = UNKOWN_VALUE
            # Margins (<= 0): gold score minus the model's own best score.
            uni_prop = act_uni_prob - uni_max[1]
            suff_prop = act_suff_prob - suffix_max[1]
            lemma_prop = act_lemma_prob - lemma_max[1]
            # Reward only a strict winner; ties leave the lambdas untouched.
            if uni_prop > suff_prop and uni_prop > lemma_prop:
                lambda_u += uni_prop
            elif suff_prop > uni_prop and suff_prop > lemma_prop:
                lambda_s += suff_prop
            elif lemma_prop > uni_prop and lemma_prop > suff_prop:
                lambda_l += lemma_prop
    # Normalize so the three weights sum to 1.
    s = lambda_u + lambda_s + lambda_l
    lambda_u /= s
    lambda_s /= s
    lambda_l /= s
    self.lambdas.append(lambda_u)
    self.lambdas.append(lambda_s)
    self.lambdas.append(lambda_l)
def compile(self) -> CompiledModelData:
    """Turn the accumulated raw counts into a compiled, queryable model."""
    model = CompiledModelData()
    model.unigram_lemma_model = self.lemma_unigram_model
    # Freeze each n-gram count table into a probability model.
    model.tag_transition_model = self.tag_ngram_model.create_probability_model()
    model.standard_emission_model = self.std_emission_ngram_model.create_probability_model()
    model.spec_tokens_emission_model = self.spec_emission_ngram_model.create_probability_model()
    model.apriori_tag_probs = self.tag_ngram_model.word_apriori_probs()
    # A single theta, computed from the tag priors, parametrizes every guesser.
    theta = HashSuffixTree.calculate_theta(model.apriori_tag_probs)
    model.lower_case_suffix_guesser = self.lower_suffix_tree.create_guesser(theta)
    model.upper_case_suffix_guesser = self.upper_suffix_tree.create_guesser(theta)
    model.lemma_guesser = self.lemma_suffix_tree.create_guesser(theta)
    model.suffix_lemma_model = self.lemma_freq_tree.create_guesser(theta)
    model.combiner = self.combiner
    return model
def compile(self) -> CompiledModelData:
    """Compile the raw training counts into probability models and guessers.

    Returns a ``CompiledModelData`` whose transition/emission models are
    derived from the n-gram count tables and whose suffix guessers share a
    theta computed from the apriori tag probabilities.
    """
    c = CompiledModelData()
    c.unigram_lemma_model = self.lemma_unigram_model
    c.tag_transition_model = self.tag_ngram_model.create_probability_model()
    c.standard_emission_model = self.std_emission_ngram_model.create_probability_model()
    c.spec_tokens_emission_model = self.spec_emission_ngram_model.create_probability_model()
    c.apriori_tag_probs = self.tag_ngram_model.word_apriori_probs()
    # One theta from the tag priors parametrizes all four guessers below.
    theta = HashSuffixTree.calculate_theta(c.apriori_tag_probs)
    c.lower_case_suffix_guesser = self.lower_suffix_tree.create_guesser(theta)
    c.upper_case_suffix_guesser = self.upper_suffix_tree.create_guesser(theta)
    c.lemma_guesser = self.lemma_suffix_tree.create_guesser(theta)
    c.suffix_lemma_model = self.lemma_freq_tree.create_guesser(theta)
    c.combiner = self.combiner
    return c