Example #1
    def create_blob(self):
        """
        This is where HanTa comes in.
        """
        print(".")
        print('   >>>  HanoverTagger  GermaLemma  with TigerCorpus   <<<')
        tagger = ht.HanoverTagger('morphmodel_ger.pgz')
        # See: Christian Wartena (2019). A Probabilistic Morphology Model for German Lemmatization.
        # In: Proceedings of the 15th Conference on Natural Language Processing (KONVENS 2019): Long Papers. Pp. 40-49, Erlangen.

        def convert(lst):
            # flatten the cleaned documents into a single token list
            return ' '.join(lst).split()

        pepe = convert(self.clean_data)
        tags = tagger.tag_sent(pepe)

        # tag_sent returns (word, lemma, POS) tuples; keep only the lemmas
        lemma_list = []
        for item in tags:
            lemma_list.append(item[1])

        self.hanta_lemma = lemma_list

        # tbde (presumably textblob_de's TextBlobDE) and _lemmatizer come from the surrounding project
        blob_wtf = tbde(str(self.clean_data))
        # blob_wtf.words.singularize()
        self.blob_lemma = _lemmatizer.lemmatize(str(blob_wtf))
        self.blob_polarity = tbde(str(self.blob_lemma))
        # blob_wtf.parse()
        print("                      -/-                             ")
        return self.blob_lemma, self.blob_polarity, self.hanta_lemma
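For orientation, here is a minimal standalone sketch of the tag_sent call used above; the import line and the sample tokens are assumptions for illustration, not taken from the project. It shows why the loop keeps item[1]: each element returned by tag_sent is a (surface form, lemma, POS tag) tuple.

from HanTa import HanoverTagger as ht

tagger = ht.HanoverTagger('morphmodel_ger.pgz')

# any pre-tokenized German sentence works; these tokens are made up
tokens = ['Die', 'Katzen', 'schlafen', 'auf', 'dem', 'Sofa']
for word, lemma, pos in tagger.tag_sent(tokens):
    # each item is a (surface form, lemma, POS tag) tuple
    print(word, lemma, pos)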
Example #2
def get_lemmalist(worte, which):
    # Load the HanoverTagger
    tagger = ht.HanoverTagger('morphmodel_ger.pgz')
    # Lemmatize and tag the words in worte
    tags = []
    for wort in worte:
        tag = tagger.analyze(wort, taglevel=1)
        tags.append(tag)
    # From the (lemma, POS) tuples, store only the words in a list
    lemma_list = []
    for tup in tags:
        # Add the lemmatized words to lemma_list
        if which == "all":
            lemma_list.append(tup[0])
        elif which == "nouns":
            # Alternatively: add only the lemmatized nouns to lemma_list
            if tup[1] == 'NN':
                lemma_list.append(tup[0])

    for w in lemma_list[:]:
        # Drop all words that consist of a single character
        if len(w) == 1:
            lemma_list.remove(w)
        # Drop all "words" that contain no letters at all (special characters)
        elif not re.search(r'\w+', w):
            lemma_list.remove(w)
    return lemma_list
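A brief usage sketch for get_lemmalist; the word list is invented, and it assumes that, as in the project this example comes from, from HanTa import HanoverTagger as ht and import re are in scope.

worte = ['Die', 'Häuser', 'der', 'Stadt', 'sind', 'alt', '!']
print(get_lemmalist(worte, 'all'))    # all lemmas, minus one-letter and letter-free tokens
print(get_lemmalist(worte, 'nouns'))  # noun lemmas only, e.g. ['Haus', 'Stadt']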
    def __init__(self, descriptions, convert_lower, remove_punctuation):
        self.convert_lower = convert_lower
        self.remove_punctuation = remove_punctuation
        self.all_languages = descriptions.languages
        assert "sent_tokenize" in descriptions.proc_steps
        assert "removestopwords" not in descriptions.proc_steps  # taggers work best on sentences
        if "de" in descriptions.languages:
            self.german_tagger = ht.HanoverTagger(
                'morphmodel_ger.pgz'
            )  # see https://textmining.wp.hs-hannover.de/Preprocessing.html#Lemmatisierung
        if "en" in descriptions.languages:
            self.english_lemmatizer = nltk_WordNetLemmatizer()
    def __init__(self, title_list):
        config = Config()
        config.read()

        self.title_list = title_list
        self.processed = []
        self.languages = config.langs
        # language guesses for all titles; kept mainly for debugging
        self.lang_guesses = []
        self.selection = []
        self.selection_langs = []
        nltk.data.path.append(config.nltk_data)
        self.lemmatizers = {
            "english": WordNetLemmatizer(),
            "german": HanoverTagger.HanoverTagger("morphmodel_ger.pgz"),
        }
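The two lemmatizers stored here expose different interfaces, so calling code has to dispatch on the language. The method below is a hypothetical sketch of such a dispatcher, not part of the original class:

    def lemmatize_word(self, word, language):
        # hypothetical helper: WordNetLemmatizer exposes lemmatize(),
        # while HanoverTagger.analyze() returns a (lemma, POS) pair
        if language == "german":
            return self.lemmatizers["german"].analyze(word)[0]
        return self.lemmatizers["english"].lemmatize(word)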
    def input_text_to_temp(self, text):

        # Insert every word contained in the text into the temporary table
        # so it can be analysed against the banked words.

        # Remove line breaks
        text = text.replace('\n', ' ')

        # Remove double spacing
        text = text.replace('  ', ' ')

        tagger = ht.HanoverTagger('morphmodel_ger.pgz')

        sentences = nltk.sent_tokenize(text, language='german')
        final_list = []

        for sentence in sentences:
            tokenized_sent = nltk.tokenize.word_tokenize(sentence,
                                                         language='german')
            final_list.append(tokenized_sent)

        x = list(chain.from_iterable(final_list))
        tags = tagger.tag_sent(x)

        nouns_from_sent = [lemma for (word, lemma, pos) in tags if pos == "NN"]

        self.new_list = nouns_from_sent

        for x in self.new_list:
            mycursor.execute(
                'INSERT INTO new_schema.temp_table (word) VALUES (%s);', (x, ))
        mydb.commit()

        self.percentage_success_fetch()
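input_text_to_temp relies on module-level mydb and mycursor objects that are not shown in this excerpt. A minimal sketch of how they might be created with mysql-connector-python; host, credentials, and schema name are placeholders:

import mysql.connector

# placeholder connection details; the original project's settings are not shown
mydb = mysql.connector.connect(host='localhost', user='user',
                               password='secret', database='new_schema')
mycursor = mydb.cursor()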
import pandas as pd
import nltk
from sklearn.preprocessing import LabelBinarizer
from HanTa import HanoverTagger as ht

import spacy
import de_core_news_md
nlp = de_core_news_md.load()

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

"""**Load the data**"""

train_df = pd.read_csv('train.csv', sep=";")
test_reduced_df = pd.read_csv('test_reduced.csv', sep=";")

tagger = ht.HanoverTagger('morphmodel_ger.pgz')

# Run the preprocessing pipeline and keep only the nouns.
def preprocess(text):
    try:
        text = text.lower()
    except:
        pass
    nouns = []
    try:
        # tokenize into sentences, then into words
        sentences = nltk.sent_tokenize(text, language='german')
        sentences_tok = [nltk.word_tokenize(sent, language='german') for sent in sentences]

        for sent in sentences_tok:
            tags = tagger.tag_sent(sent)
            # keep the lemmas of all tokens tagged as nouns (NN)
            nouns.extend([lemma for (word, lemma, pos) in tags if pos == 'NN'])
    except:
        pass
    return nouns
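An illustrative call; the sentence is made up. With the tagger defined above, preprocess returns the lemmas of the tokens tagged as nouns in the (lower-cased) text:

sample = "Die Studierenden der Hochschule Hannover lesen viele Bücher."
print(preprocess(sample))  # a list of noun lemmas extracted from the sentence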
    def __init__(self, plugin_manager):
        super(Plugin, self).__init__(plugin_manager)
        self.tagger = HanoverTagger.HanoverTagger('morphmodel_ger.pgz')
        self.thread = None
        self.sig.execute_action.connect(self.replace_text)