def create_blob(self):
    """Lemmatize the cleaned data with HanTa and build TextBlob-de objects.

    Uses the HanoverTagger (Christian Wartena (2019), "A Probabilistic
    Morphology Model for German Lemmatization", KONVENS 2019, pp. 40-49)
    to produce a per-token lemma list, then a TextBlob-de lemma string and
    a polarity blob from the same cleaned data.

    Returns:
        tuple: (self.blob_lemma, self.blob_polarity, self.hanta_lemma)
    """
    print(".")
    print(' >>> HanoverTagger GermaLemma with TigerCorpus <<<')
    tagger = ht.HanoverTagger('morphmodel_ger.pgz')

    # Flatten self.clean_data into one token list for the sentence tagger.
    # (Join-then-split replaces the original inline `convert` helper.)
    tokens = ' '.join(self.clean_data).split()
    tags = tagger.tag_sent(tokens)

    # Each tag entry is indexable; position 1 holds the lemma.
    self.hanta_lemma = [tag[1] for tag in tags]

    # Second pipeline: TextBlob-de over the raw cleaned data, lemmatized
    # with GermaLemma, then re-wrapped as a blob for polarity analysis.
    blob_wtf = tbde(str(self.clean_data))
    self.blob_lemma = _lemmatizer.lemmatize(str(blob_wtf))
    self.blob_polarity = tbde(str(self.blob_lemma))

    print(" -/- ")
    return self.blob_lemma, self.blob_polarity, self.hanta_lemma
def tfidf_calculate(self):
    """Compute TF-IDF over the tag-filtered lemma list, split into thirds.

    Filters self.blob_lemma (pairs of (word, tag)) down to words whose tag
    is not 'N', splits the result into three documents of roughly equal
    size, and prints the three highest-scoring TF-IDF words per document.

    Returns:
        list: the filtered word list (also stored as self.sliste_n).
    """
    # BUGFIX: the original wrote `y not in ('N')` — a parenthesized string,
    # not a tuple — i.e. substring membership in 'N'. For the single-letter
    # tags intended here the explicit comparison is the correct form.
    self.sliste_n = [x for (x, y) in self.blob_lemma if y != 'N']

    def tf(word, blob):
        # Term frequency of `word` within one document blob.
        return blob.words.count(word) / len(blob.words)

    def n_containing(word, bloblist):
        # Number of documents that contain `word`.
        return sum(1 for blob in bloblist if word in blob.words)

    def idf(word, bloblist):
        # +1 smoothing avoids division by zero for unseen words.
        return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

    def tfidf(word, blob, bloblist):
        return tf(word, blob) * idf(word, bloblist)

    # Document boundaries are derived from the length of the ORIGINAL
    # cleaned data, not the filtered list — preserved from the original.
    third = int(len(self.clean_data) * 0.333)
    documents = [
        tbde(str(self.sliste_n[0:third])),
        tbde(str(self.sliste_n[third:third * 2])),
        tbde(str(self.sliste_n[third * 2:len(self.clean_data)])),
    ]

    for number, document in enumerate(documents, start=1):
        print("Top words in document {}".format(number))
        scores = {word: tfidf(word, document, documents)
                  for word in document.words}
        ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
        for word, score in ranked[:3]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 4)))

    return self.sliste_n
def create_blob(self):
    """Build the blob objects: a lemmatized representation and a polarity blob.

    Wraps the cleaned data in a TextBlob-de object, lemmatizes its string
    form, and wraps the lemmatized text again for polarity analysis.

    NOTE(review): this file defines `create_blob` twice with different
    return arities; whichever definition comes later in the class body
    shadows the earlier one — confirm which is intended.

    Returns:
        tuple: (self.blob_lemma, self.blob_polarity)
    """
    raw_blob = tbde(str(self.clean_data))
    lemmatized = _lemmatizer.lemmatize(str(raw_blob))
    self.blob_lemma = lemmatized
    self.blob_polarity = tbde(str(lemmatized))
    return self.blob_lemma, self.blob_polarity