Ejemplo n.º 1
0
class RawModelData:
    """Mutable container for the counts gathered during training.

    Holds the raw n-gram models and suffix/frequency tables; compile()
    freezes them into the probability models used at tagging time.
    """

    def __init__(self, tagging_order, emission_order):
        # Bookkeeping about the training run.
        self.stat = Statistics()
        # N-gram model over the tag sequence.
        self.tag_ngram_model = NGramModel(tagging_order + 1)
        # Original word forms conditioned on the preceding tags.
        self.std_emission_ngram_model = NGramModel(emission_order + 1)
        # Special tokens conditioned on the preceding tag.
        self.spec_emission_ngram_model = NGramModel(2)
        self.eos_tag = None
        # Lemma suffix frequency table (formerly HashLemmaTree).
        self.lemma_suffix_tree = HashSuffixTree(100)
        # Lemma frequency table.
        self.lemma_freq_tree = HashSuffixTree(5)
        # Lemma unigram counts.
        self.lemma_unigram_model = LemmaUnigramModel()
        # Case-sensitive suffix tables of word forms; built later during
        # training, hence initialised to None here.
        self.lower_suffix_tree = None
        self.upper_suffix_tree = None
        self.lemma_lambdas = []
        # Local import avoids a circular dependency with the combiner module.
        from purepos.model.combiner import default_combiner
        # LogLinearBiCombiner mixing guesser- and unigram-model scores.
        self.combiner = default_combiner()

    def compile(self) -> CompiledModelData:
        """Freeze the raw counts into a CompiledModelData instance."""
        compiled = CompiledModelData()
        compiled.unigram_lemma_model = self.lemma_unigram_model
        # Smoothed probability models from the raw n-gram counts.
        compiled.tag_transition_model = self.tag_ngram_model.create_probability_model()
        compiled.standard_emission_model = self.std_emission_ngram_model.create_probability_model()
        compiled.spec_tokens_emission_model = self.spec_emission_ngram_model.create_probability_model()
        compiled.apriori_tag_probs = self.tag_ngram_model.word_apriori_probs()
        # theta weights the suffix guessers by the a-priori tag distribution.
        theta = HashSuffixTree.calculate_theta(compiled.apriori_tag_probs)
        compiled.lower_case_suffix_guesser = self.lower_suffix_tree.create_guesser(theta)
        compiled.upper_case_suffix_guesser = self.upper_suffix_tree.create_guesser(theta)
        compiled.lemma_guesser = self.lemma_suffix_tree.create_guesser(theta)
        compiled.suffix_lemma_model = self.lemma_freq_tree.create_guesser(theta)
        compiled.combiner = self.combiner
        return compiled
Ejemplo n.º 2
0
 def build_suffix_trees(self):
     """Build the case-aware suffix trees from rare tokens after the corpus is read."""
     data = self.data
     raw = self.raw_model_data
     raw.lower_suffix_tree = HashSuffixTree(data.suffix_length)
     raw.upper_suffix_tree = HashSuffixTree(data.suffix_length)
     lexicon = data.standard_tokens_lexicon
     for word, tag_map in lexicon.representation.items():
         # Only rare word forms feed the guessers.
         if lexicon.word_count(word) > data.rare_frequency:
             continue
         lower_word = word.lower()
         is_lower = lower_word == word
         for tag in tag_map.keys():
             freq = lexicon.wordcount_for_tag(word, tag)
             if is_lower:
                 raw.lower_suffix_tree.add_word(lower_word, tag, freq)
                 raw.stat.increment_lower_guesser_items(freq)
             else:
                 # NOTE(review): the lowercased form is added to the *upper*
                 # tree too — presumably intentional, but verify against the
                 # casing the upper-case guesser uses at lookup time.
                 raw.upper_suffix_tree.add_word(lower_word, tag, freq)
                 raw.stat.increment_upper_guesser_items(freq)
Ejemplo n.º 3
0
 def __init__(self, tagging_order, emission_order):
     """Set up the raw counting structures needed while training the tagger."""
     # Statistics about the training run.
     self.stat = Statistics()
     # Tag-sequence n-gram model.
     self.tag_ngram_model = NGramModel(tagging_order + 1)
     # Word forms given the preceding tags.
     self.std_emission_ngram_model = NGramModel(emission_order + 1)
     # Special tokens given the preceding tag.
     self.spec_emission_ngram_model = NGramModel(2)
     self.eos_tag = None
     # Suffix frequency table of lemmata (was HashLemmaTree).
     self.lemma_suffix_tree = HashSuffixTree(100)
     # Frequency table of lemmata.
     self.lemma_freq_tree = HashSuffixTree(5)
     # Unigram counts of lemmata.
     self.lemma_unigram_model = LemmaUnigramModel()
     # Case-aware suffix tables of word forms; filled in later.
     self.lower_suffix_tree = None
     self.upper_suffix_tree = None
     self.lemma_lambdas = []
     # Imported locally to break a circular import with the combiner module.
     from purepos.model.combiner import default_combiner
     # LogLinearBiCombiner: combines guesser and unigram-model scores.
     self.combiner = default_combiner()
Ejemplo n.º 4
0
 def __init__(self, tagging_order, emission_order):
     """Initialise every raw model component collected during training."""
     self.stat = Statistics()  # training statistics
     # N-gram model of the tag sequence.
     self.tag_ngram_model = NGramModel(tagging_order + 1)
     # Model of original word forms and their preceding tags.
     self.std_emission_ngram_model = NGramModel(emission_order + 1)
     # Model of special tokens and their preceding tag.
     self.spec_emission_ngram_model = NGramModel(2)
     self.eos_tag = None
     # Lemma suffix frequency table (replaces the old HashLemmaTree).
     self.lemma_suffix_tree = HashSuffixTree(100)
     # Lemma frequency table.
     self.lemma_freq_tree = HashSuffixTree(5)
     # Lemma unigram model.
     self.lemma_unigram_model = LemmaUnigramModel()
     # Case-sensitive word-form suffix tables, built in a later pass.
     self.lower_suffix_tree = None
     self.upper_suffix_tree = None
     self.lemma_lambdas = []
     # Deferred import: avoids a circular dependency with the combiner.
     from purepos.model.combiner import default_combiner
     # LogLinearBiCombiner for guesser + unigram-model data.
     self.combiner = default_combiner()
Ejemplo n.º 5
0
class RawModelData:
    """Raw, trainable model state: n-gram counts plus suffix/frequency tables.

    compile() converts everything into the immutable probability models the
    tagger consumes.
    """

    def __init__(self, tagging_order, emission_order):
        self.stat = Statistics()  # statistics about the training run
        # Tag n-gram model.
        self.tag_ngram_model = NGramModel(tagging_order + 1)
        # Word forms conditioned on preceding tags.
        self.std_emission_ngram_model = NGramModel(emission_order + 1)
        # Special tokens conditioned on the preceding tag.
        self.spec_emission_ngram_model = NGramModel(2)
        self.eos_tag = None
        # Lemma suffix frequency table (formerly HashLemmaTree).
        self.lemma_suffix_tree = HashSuffixTree(100)
        # Lemma frequency table.
        self.lemma_freq_tree = HashSuffixTree(5)
        # Lemma unigram model.
        self.lemma_unigram_model = LemmaUnigramModel()
        # Case-sensitive suffix tables of word forms; populated later.
        self.lower_suffix_tree = None
        self.upper_suffix_tree = None
        self.lemma_lambdas = []
        # Local import breaks the circular dependency on the combiner module.
        from purepos.model.combiner import default_combiner
        # LogLinearBiCombiner combining guesser and unigram-model scores.
        self.combiner = default_combiner()

    def compile(self) -> CompiledModelData:
        """Build and return the compiled (tagging-time) model data."""
        model = CompiledModelData()
        model.unigram_lemma_model = self.lemma_unigram_model
        # Convert raw n-gram counts into smoothed probability models.
        model.tag_transition_model = self.tag_ngram_model.create_probability_model()
        model.standard_emission_model = self.std_emission_ngram_model.create_probability_model()
        model.spec_tokens_emission_model = self.spec_emission_ngram_model.create_probability_model()
        model.apriori_tag_probs = self.tag_ngram_model.word_apriori_probs()
        # Suffix-guesser weight derived from the a-priori tag probabilities.
        theta = HashSuffixTree.calculate_theta(model.apriori_tag_probs)
        model.lower_case_suffix_guesser = self.lower_suffix_tree.create_guesser(theta)
        model.upper_case_suffix_guesser = self.upper_suffix_tree.create_guesser(theta)
        model.lemma_guesser = self.lemma_suffix_tree.create_guesser(theta)
        model.suffix_lemma_model = self.lemma_freq_tree.create_guesser(theta)
        model.combiner = self.combiner
        return model
Ejemplo n.º 6
0
 def calculate_params(self, doc: Document,
                      raw_modeldata: RawModelData,
                      modeldata: ModelData):
     """Estimate the three lemma interpolation weights (unigram, suffix,
     frequency model) from the training document and append them, in that
     order, to self.lambdas."""
     apriori_probs = raw_modeldata.tag_ngram_model.word_apriori_probs()
     theta = HashSuffixTree.calculate_theta(apriori_probs)
     lemma_suffix_guesser = raw_modeldata.lemma_suffix_tree.create_guesser(theta)
     lemma_prob = raw_modeldata.lemma_freq_tree.create_guesser(theta)
     lemma_unigram_model = raw_modeldata.lemma_unigram_model
     # Each weight starts from 1.0 and accumulates winning margins.
     lambda_s = 1.0
     lambda_u = 1.0
     lambda_l = 1.0
     for sentence in doc.sentences():
         for tok in sentence:
             # Candidate lemmata with their suffix-guesser log-scores.
             suffix_probs = lemma.batch_convert(
                 lemma_suffix_guesser.tag_log_probabilities(tok.token),
                 tok.token, modeldata.tag_vocabulary)
             # Score every candidate under the unigram and frequency models.
             uni_probs = {cand: lemma_unigram_model.log_prob(cand.stem)
                          for cand in suffix_probs.keys()}
             lemma_probs = {cand: lemma_prob.tag_log_probability(
                                cand.stem, lemma.main_pos_tag(cand.tag))
                            for cand in suffix_probs.keys()}
             # Best candidate according to each of the three models.
             uni_max = max(uni_probs.items(), key=lambda e: e[1])
             best = max(suffix_probs.items(), key=lambda e: e[1][1])
             suffix_max = (best[0], best[1][1])
             lemma_max = max(lemma_probs.items(), key=lambda e: e[1])
             # Scores of the gold analysis under each model.
             act_uni_prob = lemma_unigram_model.log_prob(tok.stem)
             act_lemma_prob = lemma_prob.tag_log_probability(
                 tok.stem, lemma.main_pos_tag(tok.tag))
             if tok in suffix_probs.keys():
                 act_suff_prob = suffix_probs[tok][1]
             else:
                 act_suff_prob = UNKOWN_VALUE
             # Margin of the gold analysis vs. each model's best guess;
             # the strictly winning model is credited with its margin.
             uni_prop = act_uni_prob - uni_max[1]
             suff_prop = act_suff_prob - suffix_max[1]
             lemma_prop = act_lemma_prob - lemma_max[1]
             if uni_prop > suff_prop and uni_prop > lemma_prop:
                 lambda_u += uni_prop
             elif suff_prop > uni_prop and suff_prop > lemma_prop:
                 lambda_s += suff_prop
             elif lemma_prop > uni_prop and lemma_prop > suff_prop:
                 lambda_l += lemma_prop
     # Normalise the three weights so they sum to one.
     total = lambda_u + lambda_s + lambda_l
     self.lambdas.extend((lambda_u / total, lambda_s / total, lambda_l / total))
Ejemplo n.º 7
0
 def compile(self) -> CompiledModelData:
     """Compile the raw counts into the probability models used for tagging."""
     compiled = CompiledModelData()
     compiled.unigram_lemma_model = self.lemma_unigram_model
     # Smoothed n-gram probability models.
     compiled.tag_transition_model = self.tag_ngram_model.create_probability_model()
     compiled.standard_emission_model = self.std_emission_ngram_model.create_probability_model()
     compiled.spec_tokens_emission_model = self.spec_emission_ngram_model.create_probability_model()
     compiled.apriori_tag_probs = self.tag_ngram_model.word_apriori_probs()
     # Suffix-guesser weight from the a-priori tag distribution.
     theta = HashSuffixTree.calculate_theta(compiled.apriori_tag_probs)
     compiled.lower_case_suffix_guesser = self.lower_suffix_tree.create_guesser(theta)
     compiled.upper_case_suffix_guesser = self.upper_suffix_tree.create_guesser(theta)
     compiled.lemma_guesser = self.lemma_suffix_tree.create_guesser(theta)
     compiled.suffix_lemma_model = self.lemma_freq_tree.create_guesser(theta)
     compiled.combiner = self.combiner
     return compiled
Ejemplo n.º 8
0
 def compile(self) -> CompiledModelData:
     """Turn the raw training counts into a CompiledModelData for tagging."""
     model = CompiledModelData()
     model.unigram_lemma_model = self.lemma_unigram_model
     # Build smoothed probability models from the n-gram counts.
     model.tag_transition_model = self.tag_ngram_model.create_probability_model()
     model.standard_emission_model = self.std_emission_ngram_model.create_probability_model()
     model.spec_tokens_emission_model = self.spec_emission_ngram_model.create_probability_model()
     model.apriori_tag_probs = self.tag_ngram_model.word_apriori_probs()
     # theta: guesser weight derived from the a-priori tag probabilities.
     theta = HashSuffixTree.calculate_theta(model.apriori_tag_probs)
     model.lower_case_suffix_guesser = self.lower_suffix_tree.create_guesser(theta)
     model.upper_case_suffix_guesser = self.upper_suffix_tree.create_guesser(theta)
     model.lemma_guesser = self.lemma_suffix_tree.create_guesser(theta)
     model.suffix_lemma_model = self.lemma_freq_tree.create_guesser(theta)
     model.combiner = self.combiner
     return model