def read(self, text: str):
    # Split on double line separators: each blank-line-delimited block is one sentence.
    sentences = []
    for sent in text.split(self.linesep + self.linesep):
        if len(sent) > 1:  # skip empty and single-character leftovers between separators
            sentences.append(self.sentence_parser.read(sent))
    paragraph = Paragraph(sentences)
    document = Document()
    document.append(paragraph)
    return document
def read(self, text: str):
    # Parses the whole(!) analysed corpus at once: every non-empty line is one sentence.
    sentences = []
    for line in text.split(self.linesep):
        if len(line) > 0:
            sentences.append(self.sentence_parser.read(line))
    paragraph = Paragraph(sentences)
    document = Document()
    document.append(paragraph)
    return document
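# --- Hedged usage sketch (not from the repo): a hypothetical helper showing how
# either read() above would be driven. The helper and the file name are
# illustrative assumptions; reader construction is repo-specific and not shown.
def read_corpus(path, reader):
    # Read the whole analysed corpus into memory and hand it to the reader, which
    # returns a Document holding one Paragraph of parsed sentences.
    with open(path, encoding="utf-8") as fh:
        return reader.read(fh.read())

# e.g.: document = read_corpus("analysed_corpus.txt", reader)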
def train(self, document: Document):
    # TODO: read the input line by line. See the issue:
    # https://github.com/ppke-nlpg/purepos-python3/issues/5
    self.raw_model_data.eos_tag = self.data.tag_vocabulary.add_element(ModelData.EOS_TAG)
    for sentence in document.sentences():
        mysentence = Sentence(sentence)
        self.add_sentence_markers(mysentence)
        self.add_sentence(mysentence)
    self.build_suffix_trees()
    self.raw_model_data.combiner.calculate_params(document, self.raw_model_data, self.data)
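# --- Hedged usage sketch (not from the repo): a hypothetical end-to-end helper.
# Both arguments are instances of the classes excerpted above; their constructors
# are repo-specific, so they are taken as parameters instead of being built here.
def train_from_file(path, corpus_reader, trainer):
    # Parse the analysed corpus into a Document, then train on it; train() builds
    # the n-gram and suffix models and finally the interpolation lambdas.
    with open(path, encoding="utf-8") as fh:
        trainer.train(corpus_reader.read(fh.read()))
    return trainer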
def calculate_params(self, doc: Document, raw_modeldata: RawModelData, modeldata: ModelData):
    apriori_probs = raw_modeldata.tag_ngram_model.word_apriori_probs()
    theta = HashSuffixTree.calculate_theta(apriori_probs)
    lemma_suffix_guesser = raw_modeldata.lemma_suffix_tree.create_guesser(theta)
    lemma_prob = raw_modeldata.lemma_freq_tree.create_guesser(theta)
    lemma_unigram_model = raw_modeldata.lemma_unigram_model
    lambda_s = 1.0
    lambda_u = 1.0
    lambda_l = 1.0
    for sentence in doc.sentences():
        for tok in sentence:
            # Candidate (lemma, tag) pairs with their suffix-model log probabilities.
            suffix_probs = lemma.batch_convert(
                lemma_suffix_guesser.tag_log_probabilities(tok.token),
                tok.token, modeldata.tag_vocabulary)
            # Score every candidate with the unigram and lemma-frequency models too.
            uni_probs = {t: lemma_unigram_model.log_prob(t.stem)
                         for t in suffix_probs.keys()}
            lemma_probs = {t: lemma_prob.tag_log_probability(t.stem, lemma.main_pos_tag(t.tag))
                           for t in suffix_probs.keys()}
            # Best candidate under each model.
            uni_max = max(uni_probs.items(), key=lambda e: e[1])
            t = max(suffix_probs.items(), key=lambda e: e[1][1])
            suffix_max = (t[0], t[1][1])
            lemma_max = max(lemma_probs.items(), key=lambda e: e[1])
            # Scores of the gold analysis under each model.
            act_uni_prob = lemma_unigram_model.log_prob(tok.stem)
            act_lemma_prob = lemma_prob.tag_log_probability(tok.stem,
                                                            lemma.main_pos_tag(tok.tag))
            if tok in suffix_probs.keys():
                act_suff_prob = suffix_probs[tok][1]
            else:
                act_suff_prob = UNKOWN_VALUE  # fallback for analyses the suffix model lacks
            # Log-probability gaps (<= 0) between the gold analysis and each model's best.
            uni_prop = act_uni_prob - uni_max[1]
            suff_prop = act_suff_prob - suffix_max[1]
            lemma_prop = act_lemma_prob - lemma_max[1]
            # Reward the model that came closest to its own optimum on the gold analysis.
            if uni_prop > suff_prop and uni_prop > lemma_prop:
                lambda_u += uni_prop
            elif suff_prop > uni_prop and suff_prop > lemma_prop:
                lambda_s += suff_prop
            elif lemma_prop > uni_prop and lemma_prop > suff_prop:
                lambda_l += lemma_prop
    # Normalize the interpolation weights so that they sum to 1.
    s = lambda_u + lambda_s + lambda_l
    lambda_u /= s
    lambda_s /= s
    lambda_l /= s
    self.lambdas.append(lambda_u)
    self.lambdas.append(lambda_s)
    self.lambdas.append(lambda_l)
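# --- Minimal self-contained sketch (assumption: plain floats stand in for the
# model scores; none of the repo's classes are used). It isolates the weighting
# scheme above: each token contributes its log-probability gaps (gold score minus
# the model's own best score, so each gap is <= 0), the model with the largest
# gap "wins" the token, and the accumulated weights are normalized to sum to 1.
def estimate_lambdas(token_gaps):
    """token_gaps: iterable of (uni_prop, suff_prop, lemma_prop) log-prob gaps."""
    lambda_u = lambda_s = lambda_l = 1.0
    for uni_prop, suff_prop, lemma_prop in token_gaps:
        if uni_prop > suff_prop and uni_prop > lemma_prop:
            lambda_u += uni_prop
        elif suff_prop > uni_prop and suff_prop > lemma_prop:
            lambda_s += suff_prop
        elif lemma_prop > uni_prop and lemma_prop > suff_prop:
            lambda_l += lemma_prop
    total = lambda_u + lambda_s + lambda_l
    return lambda_u / total, lambda_s / total, lambda_l / total

# Three tokens: the unigram model is closest to its optimum twice, the suffix model once.
print(estimate_lambdas([(-0.1, -0.5, -0.9), (-0.2, -0.8, -0.4), (-1.0, -0.3, -0.6)]))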