def create_blob(self):
    """HanTa lemmatization happens here."""
    print(".")
    print(' >>> HanoverTagger GermaLemma with TigerCorpus <<<')
    tagger = ht.HanoverTagger('morphmodel_ger.pgz')
    # See: Christian Wartena (2019). A Probabilistic Morphology Model for German Lemmatization.
    # In: Proceedings of the 15th Conference on Natural Language Processing (KONVENS 2019): Long Papers. Pp. 40-49, Erlangen.

    def convert(lst):
        # flatten the cleaned data into a single token list
        return ' '.join(lst).split()

    pepe = convert(self.clean_data)
    tags = tagger.tag_sent(pepe)

    # keep only the lemma from each (word, lemma, POS) triple
    lemma_list = [item[1] for item in tags]
    self.hanta_lemma = lemma_list
    print(lemma_list)

    blob_wtf = tbde(str(self.clean_data))
    # blob_wtf.words.singularize()
    self.blob_lemma = _lemmatizer.lemmatize(str(blob_wtf))
    self.blob_polarity = tbde(str(self.blob_lemma))
    # blob_wtf.parse()
    print(" -/- ")
    # print(" TF-IDF evaluation ")
    return self.blob_lemma, self.blob_polarity, self.hanta_lemma
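# A minimal standalone sketch (separate from the class above) showing the
# tag_sent output that create_blob relies on: HanTa returns one
# (word, lemma, POS) triple per token, which is why item[1] is the lemma.
from HanTa import HanoverTagger as ht

demo_tagger = ht.HanoverTagger('morphmodel_ger.pgz')
print(demo_tagger.tag_sent(['Die', 'Katzen', 'schliefen']))
# e.g. [('Die', 'die', 'ART'), ('Katzen', 'Katze', 'NN'), ('schliefen', 'schlafen', 'VVFIN')]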
def get_lemmalist(worte, which):
    # Load the HanoverTagger
    tagger = ht.HanoverTagger('morphmodel_ger.pgz')
    # Lemmatize and tag the words in worte
    tags = []
    for wort in worte:
        tag = tagger.analyze(wort, taglevel=1)
        tags.append(tag)
    # Keep only the words from the (lemma, tag) tuples
    lemma_list = []
    for tup in tags:
        if which == "all":
            # Add all lemmatized words to lemma_list
            lemma_list.append(tup[0])
        elif which == "nouns":
            # Alternatively: add only the lemmatized nouns to lemma_list
            if tup[1] == 'NN':
                lemma_list.append(tup[0])
    # Iterate over a copy so removing entries is safe
    for w in lemma_list[:]:
        # Drop all words that consist of a single character
        if len(w) == 1:
            lemma_list.remove(w)
        # Drop all "words" that contain no word characters (special characters)
        elif not re.search(r'\w+', w):
            lemma_list.remove(w)
    return lemma_list
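# Hypothetical usage of get_lemmalist, assuming `re` and the HanTa import
# (`from HanTa import HanoverTagger as ht`) are available at module level:
worte = ['Die', 'Häuser', 'standen', 'am', 'Fluss', '!']
print(get_lemmalist(worte, 'nouns'))  # e.g. ['Haus', 'Fluss']
print(get_lemmalist(worte, 'all'))    # lemmas of all tokens, minus 1-char and punctuation entries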
def __init__(self, descriptions, convert_lower, remove_punctuation):
    self.convert_lower = convert_lower
    self.remove_punctuation = remove_punctuation
    self.all_languages = descriptions.languages
    assert "sent_tokenize" in descriptions.proc_steps
    # taggers work best on full sentences, so stopwords must still be present
    assert "removestopwords" not in descriptions.proc_steps
    if "de" in descriptions.languages:
        # see https://textmining.wp.hs-hannover.de/Preprocessing.html#Lemmatisierung
        self.german_tagger = ht.HanoverTagger('morphmodel_ger.pgz')
    if "en" in descriptions.languages:
        self.english_lemmatizer = nltk_WordNetLemmatizer()
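# Hypothetical companion method for the constructor above (not in the
# original): dispatch a token list to the right lemmatizer per language.
# HanTa tags whole sentences, while WordNetLemmatizer works per token.
def lemmatize_tokens(self, tokens, lang):
    if lang == "de":
        return [lemma for _, lemma, _ in self.german_tagger.tag_sent(tokens)]
    if lang == "en":
        return [self.english_lemmatizer.lemmatize(tok) for tok in tokens]
    return tokens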
def __init__(self, title_list):
    config = Config()
    config.read()
    self.title_list = title_list
    self.processed = []
    self.languages = config.langs
    # language guesses for all titles; mainly kept for debugging
    self.lang_guesses = []
    self.selection = []
    self.selection_langs = []
    nltk.data.path.append(config.nltk_data)
    self.lemmatizers = {
        "english": WordNetLemmatizer(),
        "german": HanoverTagger.HanoverTagger("morphmodel_ger.pgz"),
    }
def input_text_to_temp(self, text):
    # Insert every word of the text into the temporary table so it can be
    # analyzed against the banked words.
    # Remove line breaks
    text = text.replace('\n', ' ')
    # Collapse double spacing
    text = text.replace('  ', ' ')
    tagger = ht.HanoverTagger('morphmodel_ger.pgz')
    sentences = nltk.sent_tokenize(text, language='german')
    final_list = []
    for sentence in sentences:
        final_list.append(nltk.tokenize.word_tokenize(sentence, language='german'))
    tokens = list(chain.from_iterable(final_list))
    tags = tagger.tag_sent(tokens)
    # keep the lemma of every noun
    nouns_from_sent = [lemma for (word, lemma, pos) in tags if pos == "NN"]
    self.new_list = nouns_from_sent
    for noun in self.new_list:
        mycursor.execute(
            'INSERT INTO new_schema.temp_table (word) VALUES (%s);', (noun, ))
    mydb.commit()
    self.percentage_success_fetch()
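# Assumed setup for input_text_to_temp (hypothetical values; only the
# schema/table names come from the INSERT statement above): a MySQL
# connection plus the temp table the method writes into.
import mysql.connector

mydb = mysql.connector.connect(host='localhost', user='user',
                               password='secret', database='new_schema')
mycursor = mydb.cursor()
mycursor.execute(
    'CREATE TABLE IF NOT EXISTS new_schema.temp_table (word VARCHAR(255));')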
from sklearn.preprocessing import LabelBinarizer
import spacy
import de_core_news_md

nlp = de_core_news_md.load()

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

"""**Load the data**"""

train_df = pd.read_csv('train.csv', sep=";")
test_reduced_df = pd.read_csv('test_reduced.csv', sep=";")

tagger = ht.HanoverTagger('morphmodel_ger.pgz')


# Run the preprocessing pipeline and keep only the nouns.
def preprocess(text):
    try:
        text = text.lower()
    except:
        # lower() fails on NaN / non-string entries
        pass
    nouns = []
    try:
        # tokenize into sentences, then into words
        sentences = nltk.sent_tokenize(text, language='german')
        sentences_tok = [nltk.word_tokenize(sent, language='german') for sent in sentences]
        for sent in sentences_tok:
            tags = tagger.tag_sent(sent)
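# The preprocess function above is cut off right after tagging. A
# self-contained sketch of the noun-filtering step it is building toward
# (function name is an assumption, mirroring the other HanTa snippets here):
def nouns_from_tags(tags):
    return [lemma for (word, lemma, pos) in tags if pos == 'NN']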
def __init__(self, plugin_manager):
    super(Plugin, self).__init__(plugin_manager)
    self.tagger = HanoverTagger.HanoverTagger('morphmodel_ger.pgz')
    self.thread = None
    self.sig.execute_action.connect(self.replace_text)
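# Hypothetical sketch of the replace_text slot wired up above (the original
# does not show it): lemmatize the given text with the plugin's tagger.
def replace_text(self, text):
    tokens = text.split()
    lemmas = [lemma for _, lemma, _ in self.tagger.tag_sent(tokens)]
    return ' '.join(lemmas)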