Ejemplo n.º 1
    def __init__(self, BASEDIR, session_only=False, cycle_time=1):
        """Set up the 'contentrank' recommender.

        Forwards ``BASEDIR``, ``session_only`` and ``cycle_time`` to the
        parent initializer, caches the header layouts delivered by
        ``Mapping`` together with the column positions looked up in them,
        and creates the German stemming helpers and the running counters.
        """
        super().__init__(BASEDIR, session_only, cycle_time)
        self.name = 'contentrank'

        # Header layouts for recommendation, event and update records.
        header_source = Mapping()
        rec_header = header_source.get_header_rec()
        event_header = header_source.get_header_event()
        update_header = header_source.get_header_update()
        self.rec_mapping = rec_header
        self.event_mapping = event_header
        self.update_mapping = update_header

        # Column positions, resolved once so later code can index directly.
        self.item_id_idx = rec_header.index('ITEM_SOURCE')
        self.publisher_id_idx = rec_header.index('PUBLISHER')
        self.recs_idx = event_header.index('recs')
        self.limit_idx = rec_header.index('limit')
        self.title_idx = update_header.index('title')
        self.text_idx = update_header.index('text')
        self.update_id_idx = update_header.index('id')
        self.update_domainid_idx = update_header.index('domainid')

        # German text-processing helpers.
        self.germanStemmer = GermanStemmer(ignore_stopwords=True)
        self.stopwords = stopwords.words('german')
        self.stems = {}  # (item, [stem, stem, stem])

        # Running counters, all starting at zero.
        self.correct = 0
        self.total_events = 0
        self.nrrows = 0

        self.counts = {}
Ejemplo n.º 2
def cosine_preprocess(texts, pickle_name, pickle_folder='pickle'):
    """Stem and stop-word-filter German texts, caching the result on disk.

    Args:
        texts: Iterable of raw text strings.
        pickle_name: File name of the cache inside ``pickle_folder``.
        pickle_folder: Directory holding the cache file; created if missing.

    Returns:
        A list with one space-joined string of stemmed tokens per input
        text, or the previously pickled object when the cache file exists.
    """
    pickle_path = os.path.join(pickle_folder, pickle_name)

    # Return from disk if possible for efficiency reasons.
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only point this at cache files written by this function.
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)

    # Loop-invariant helpers: build them once instead of once per text and
    # keep the stop words in a set for constant-time membership tests.
    stemmer = GermanStemmer()
    stop_words = set(stopwords.words('german'))

    processed = []
    for text in tqdm(texts):
        tokens = [
            stemmer.stem(token) for token in word_tokenize(text)
            if token not in stop_words
        ]
        processed.append(' '.join(tokens))

    # Pickle the output, creating the cache directory first when needed.
    if pickle_folder:
        os.makedirs(pickle_folder, exist_ok=True)
    with open(pickle_path, 'wb') as f:
        pickle.dump(processed, f)

    return processed
Ejemplo n.º 3
def evaluate_dnn(path:str):
    """Evaluate the stored intent-classification model on the test commands.

    Loads the tag mappings, the pickled vectorizer ``cv.p`` and the model
    ``dnn_intent_classification.h5`` from ``path``, then scores the commands
    found in ``Data/commands/Test/testingdata.json``.

    Args:
        path: Directory containing the mappings, vectorizer and model.

    Returns:
        A tuple ``(accuracy, confusion_matrix)`` where the confusion matrix
        is a DataFrame labelled with the values of ``int_to_tag``.
    """
    with open(os.path.join(path, "tag_to_int.json"), "rt") as f:
        tag_to_int = json.load(f)
    with open(os.path.join(path, "int_to_tag.json"), "rt") as f:
        int_to_tag = json.load(f)

    # Use a context manager so the vectorizer file is closed again.
    with open(os.path.join(path, "cv.p"), "rb") as f:
        cv = pickle.load(f)
    stemmer = GermanStemmer()
    model_name = "dnn_intent_classification.h5"
    model = load_model(os.path.join(path, model_name))

    with open(os.path.join("Data", "commands", "Test", "testingdata.json"), "rt") as f:
        val_data = json.load(f)

    X = []
    y = []

    for tag, commands in val_data.items():
        for command in commands:
            command = " ".join(stemmer.stem(c) for c in sorted(word_tokenize(command)))
            X.append(transform_command_BoW(command, cv))
            # Ground-truth label for this sample; without it y stays empty
            # and cannot be compared with the predictions.
            # NOTE(review): assumes tag_to_int maps each test tag to the
            # model's output index - confirm against the training code.
            y.append(tag_to_int[tag])

    X = np.array(X)
    y = np.array(y)

    predictions = model.predict(X)
    predicted_indices = np.argmax(predictions, 1)

    accuracy = accuracy_score(y, predicted_indices)
    print("acc: ", accuracy)
    cm = confusion_matrix(y, predicted_indices)
    cm = pd.DataFrame(cm, index=int_to_tag.values(), columns=int_to_tag.values())

    return (accuracy, cm)
Ejemplo n.º 4
def build_stems(pattern: str, category: Category,
                elements: List[Tuple[Category, Set[str]]],
                total_stems: Set[str]) -> Set[str]:
    """Builds a set of stems for all words used in the pattern.

    Args:
        pattern: The pattern to tokenize and stem.
        category: The category of the pattern.
        elements:
            A mutable list of (category, stem) pairs that the new stems will
            be appended to.
        total_stems:
            The set of total stems before this function was invoked.
            Will not be mutated.

    Returns:
        The union of total_stems and stems found in the pattern.
    """
    # Tokenize pattern into words
    words = nltk.word_tokenize(pattern)
    # Get stems for the pattern's words, as a set to avoid duplicates
    stemmer = GermanStemmer()
    stems: Set[str] = {stemmer.stem(w.lower()) for w in words}
    # Record the stems together with their category in the pattern list.
    elements.append((category, stems))
    # Build a new set rather than using ``|=``, which would update the
    # caller's set in place and break the "will not be mutated" contract.
    return total_stems | stems
Ejemplo n.º 5
 def __init__(self, config):
   """Store ``config`` and, when ``config.stem`` is set, pick a stemmer.

   ``config.lang == 'en'`` selects ``PorterStemmer``, ``'de'`` selects
   ``GermanStemmer`` and any other language falls back to ``IdStemmer``.
   No ``stemmer`` attribute is created when ``config.stem`` is falsy.
   """
   self.config = config
   if config.stem:
     if config.lang == 'en':
       self.stemmer = PorterStemmer()
     elif config.lang == 'de':
       self.stemmer = GermanStemmer()
     else:
       # Previously this assignment ran unconditionally after the 'de'
       # branch and replaced the German stemmer that had just been created.
       # NOTE(review): if IdStemmer is meant for one specific language
       # code only, narrow this branch accordingly.
       self.stemmer = IdStemmer()
Ejemplo n.º 6
def _check_NE_yeah(gram):
    """Look up a named-entity tag for the token sequence ``gram``.

    The space-joined tokens are looked up in ``entities``; "O" is the
    default when nothing matches. A two-token gram whose first token is in
    ``vornamen`` and whose second is in ``nachnamen`` is tagged "PER".

    NOTE(review): the fallback section below is incomplete in this copy of
    the source (see the inline notes) and does not parse as written.
    """
    tag = entities.get(" ".join(gram), "O")

    if tag == "O":
        if len(gram) == 2:
            first, last = gram
            # First-name / last-name pair counts as a person.
            if first in vornamen and last in nachnamen:
                tag = "PER"

    if tag == "O":
            # NOTE(review): the extra indentation suggests an enclosing
            # statement was lost here - restore it from the upstream source.
            # Retry the lookup with the stemmed tokens.
            tag = entities.get(
                " ".join([GermanStemmer().stem(g) for g in gram]), "O")
            tag = entities.get(
                " ".join([
                    # NOTE(review): the comprehension's element expression
                    # is missing; it cannot be inferred from this file.
                    for g in gram
                ]), "O")

    return tag
Ejemplo n.º 7
def ner_features(sentence, i, history):
    # Build the feature dictionary for token ``i`` of ``sentence``.
    # ``history`` holds the tags assigned so far (its last entry is read
    # below).
    # NOTE(review): this copy of the function has lost several lines - the
    # ``else:`` branches of both conditionals and the end of the returned
    # dict - so it does not parse as written. Restore from upstream.
    # TODO: try using TreeTagger's POS tag
    wordO = sentence[i]
    word = wordO.string
    pos = wordO.pos
    stemmed = GermanStemmer().stem(word)

    if i == 0:
        # First token: no predecessor, use sentinel values.
        prevword, prevpos = "<START>", "<START>"
        last = "<START>"
        prevstemmed = "<START>"
        # NOTE(review): the four assignments below presumably belong to a
        # lost ``else:`` branch; as written they run for i == 0 as well.
        last = history[-1]
        prevword = sentence[i - 1].string
        prevpos = sentence[i - 1].pos
        prevstemmed = GermanStemmer().stem(sentence[i - 1].string)

    chunk = []
    if not wordO.chunk:
        knowledge_sources = "O"
        # NOTE(review): the next two lines presumably belong to a lost
        # ``else:`` branch (they read ``wordO.chunk``, which is falsy here).
        knowledge_sources = check_NE(convert(wordO.string), wordO.chunk)
        chunk = [w.string for w in wordO.chunk]

    stem_is_word = stemmed == word.lower()

    # Entity lookup for the stemmed form of the current word.
    knowledge_sources_stemmed = _check_NE_yeah([stemmed])

    # NOTE(review): the dict literal is not closed in this copy; further
    # keys may be missing.
    return {
        "knowledge": knowledge_sources,
        "knowledge_lemma": knowledge_sources_stemmed,
        "history": "+".join(history)[-2:],
        "pos": pos,
        "word": word,
        "stemmed": stemmed
Ejemplo n.º 8
def remove_stop_words(msg):
    """Remove German stop words from ``msg`` and stem the remaining words.

    Args:
        msg: Text to process; it is split into ``\\w+`` tokens.

    Returns:
        A list with the stem of every token that is not a German stop word,
        in their original order.
    """
    stemmer = GermanStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('german'))

    # The filter previously had no body, so nothing was ever collected and
    # the stemmer was never used.
    return [
        stemmer.stem(w) for w in tokenizer.tokenize(msg)
        if w not in stop_words
    ]
Ejemplo n.º 9
 def __init__(self):
     # Initialise the tweet counters, one stemmer and one stop-word
     # collection per language code, and the output file.
     self.tweets = 0
     self.related_tweets = 0
     self.stopwords = {}
     self.stemmers = {}
     self.stemmers["es"] = SpanishStemmer()
     self.stemmers["en"] = PorterStemmer()
     self.stemmers["fr"] = FrenchStemmer()
     self.stemmers["de"] = GermanStemmer()
     # NOTE(review): the next two calls are cut off in this copy (argument
     # and closing parenthesis missing) - restore from upstream.
     self.stopwords["es"] = self.load_stopwords_file(
     self.stopwords["en"] = self.load_stopwords_file(
     self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
     # NOTE(review): German stop words are stored under "ge" while the
     # German stemmer uses "de"; a lookup by one shared language code will
     # miss one of the two. Confirm which key callers use.
     self.stopwords["ge"] = self.load_stopwords_file("german_stopwords.txt")
     # Opened in append mode from the second command-line argument; this
     # method does not close the file.
     self.output_file = open(sys.argv[2], 'a')
Ejemplo n.º 10
    def __init__(self, essay: str, name: str, gazetteer_version: int = 1):
        """Initializes the Stringmatcher.

        :param essay: stored as ``self.essay``
        :param name: stored as ``self.essay_name`` and appended to
            ``RESULTS_PATH`` to form ``self.file_path``
        :param gazetteer_version: the gazetteer version that should be used.
            See the dict "version_subfolder" for what values are possible

        NOTE(review): the original text documented a ``file_path`` parameter
        that is not in the signature, and several statements of this method
        are missing in this copy (marked inline), so it does not parse as
        written.
        """
        # Initialize data structures
        self.essay = essay
        self.essay_name = name
        self.gazetteer_version = gazetteer_version
        self.tokens_without_stopwords = []
        self.found_entities = dict()
        self.stemmer = GermanStemmer()
        self.fastText_model = None
        self.spacy_model = None
        self.file_path = RESULTS_PATH + name

        if not os.path.exists(self.file_path):
        # NOTE(review): body of this ``if`` is missing (presumably it creates
        # the results directory - confirm).

        # retrieve the gazetteers that should be used for annotation
        self.gazetteers = sorted([
            f for f in os.listdir(PATH_GAZETTEERS +
            if os.path.isfile(PATH_GAZETTEERS +
                              version_subfolder[gazetteer_version] + f)
        # NOTE(review): the listing expression above lost the end of its
        # ``os.listdir(`` call and its closing brackets.
        print("Used gazetteers: %s" % (gazetteer_version))

        # retrieve gazetteers with already preprocessed entries if available (for efficiency reasons) or create new one
        if os.path.isfile(PATH_GAZETTEERS + "tokenized_gazetteers"):
            self.tokenized_gazetteers = pickle.load(
                open(PATH_GAZETTEERS + "tokenized_gazetteers", "rb"))
            # NOTE(review): an ``else:`` was presumably lost here; as written
            # the loaded cache is immediately replaced by an empty dict.
            self.tokenized_gazetteers = dict()
        # Tracks whether the cache gained entries and must be written back.
        changed = False
        for gazetteer_filename in self.gazetteers:
            # if there is not already a tokenized version of this gazetteer, tokenize it
            if not gazetteer_filename in self.tokenized_gazetteers.keys():
                    # NOTE(review): start and end of this assignment missing.
                    gazetteer_filename] = self.tokenize_gazetteer(
                changed = True
        if changed:
                        # NOTE(review): start of this call is missing
                        # (presumably a pickle dump of the cache - confirm).
                        open(PATH_GAZETTEERS + "tokenized_gazetteers", "wb"))
Ejemplo n.º 11
    def __init__(self,
                 ngram_range=(1, 1),
                 # NOTE(review): the rest of the parameter list (including
                 # ``lang`` and ``stop_words`` used below) is missing in
                 # this copy of the source.

        # Pick the stemmer for the requested language.
        if lang == 'de':
            self.stemmer = GermanStemmer()
            # NOTE(review): an ``else:`` was presumably lost here; as written
            # the German stemmer is replaced by the English one right away.
            self.stemmer = EnglishStemmer()

        # NOTE(review): the remaining arguments of this call are cut off.
        super(self.__class__, self).__init__(stop_words=stop_words,
Ejemplo n.º 12
    def stemWord(self, word, lng):
        '''Separates the word's changeable part with a '|' for wordfast

        Args:
            word: The word to split; it is lower-cased first.
            lng: Language code: 'ru', 'en' or 'de'.

        Returns:
            The lower-cased word unchanged when it has three characters or
            fewer. Otherwise "<stem>|<ending>"; when the stemmer returns a
            stem as long as the word, the last character is split off.

        Raises:
            SystemExit: for an unsupported language code, after printing the
                error message.
        '''
        if lng == 'ru':
            stemmer = RussianStemmer()
        elif lng == 'en':
            stemmer = PorterStemmer()
        elif lng == 'de':
            stemmer = GermanStemmer()
        else:
            # The message used to be printed for 'de' as well, and unknown
            # languages fell through to an unbound ``stemmer``.
            # NOTE(review): exiting follows the wording of the message;
            # confirm this matches the original behaviour.
            print('Language error. Exiting...')
            raise SystemExit(1)

        word = word.lower()  #otherwise the stemmer fails
        if len(word) <= 3:
            return word

        # Stem once instead of once per comparison/slice.
        stem_length = len(stemmer.stem(word))
        if len(word) == stem_length:
            return "{0}|{1}".format(word[:-1], word[-1])
        return "{0}|{1}".format(word[:stem_length], word[stem_length:])
Ejemplo n.º 13
    def _preprocess(text, mode=None):
        '''helper function to preprocess text. returns List of Sentences

        Args:
            text: Raw text; split into sentences with ``split_single``.
            mode: ``'lemmatize'`` or ``'stem'`` to normalise every token via
                the spaCy model ``de_core_news_sm`` before building the
                ``Sentence`` objects; a falsy value builds them directly
                with ``use_tokenizer=True``.

        NOTE(review): any other truthy ``mode`` returns the plain sentence
        strings without wrapping them in ``Sentence`` - confirm whether that
        is intended.
        '''
        sentences = split_single(text)
        if mode:
            nlp = spacy.load('de_core_news_sm')
            if mode == 'lemmatize':
                sentences = [
                    Sentence((' ').join([token.lemma_ for token in nlp(s)]))
                    for s in sentences
                ]
            elif mode == 'stem':
                stemmer = GermanStemmer()
                sentences = [
                    Sentence((' ').join(
                        [stemmer.stem(token.text) for token in nlp(s)]))
                    for s in sentences
                ]
        else:
            # No normalisation requested: wrap the raw sentences.
            sentences = [Sentence(s, use_tokenizer=True) for s in sentences]

        return sentences
Ejemplo n.º 14
def clean_text(text):
    """Normalise ``text`` to a string of stemmed, lower-case words.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the
    result is lower-cased and split on whitespace, words listed in
    ``etc/models/german.txt`` are dropped and the rest is stemmed.

    :param text: raw input text
    :return: the remaining stems joined by single spaces

    NOTE(review): the character filter also removes umlauts and 'ß', so
    German words containing them are split apart - confirm this is wanted.
    """
    # Stop words come from a local file (one per line) instead of
    # nltk.corpus.stopwords.words('german').
    file_path = r'etc/models/german.txt'
    with open(file_path) as file:
        # A set gives constant-time membership tests in the filter below.
        stop_words = set(file.read().split('\n'))

    gs = GermanStemmer()
    # Keep only alphabet and space characters, then lower-case and split.
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(gs.stem(word) for word in words if word not in stop_words)
Ejemplo n.º 15
def text_cleaner(text):
    """Clean a tweet-like text.

    Removes user handles, punctuation, digits and every character that is
    not a space, an ASCII letter or one of ``äöüÄÖÜ``.

    Returns:
        The cleaned string. The two local switches ``tokens`` and
        ``use_GermanStemmer`` (both off) select a token list or a list of
        stems instead.

    NOTE(review): 'ß' is not in the allowed characters and is therefore
    removed as well - confirm this is intended.
    """
    use_GermanStemmer = False
    tokens = False

    # Remove username handles
    # -? do we need the user names
    text = remove_handles(text)

    # Remove punctuation marks
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)

    # replace the umlauts
    # =============================================================================
    #         text = re.sub('ä', 'ae', text)
    #         text = re.sub('ö', 'oe', text)
    #         text = re.sub('ü', 'ue', text)
    #         text = re.sub('Ä', 'Ae', text)
    #         text = re.sub('Ö', 'Oe', text)
    #         text = re.sub('Ü', 'Ue', text)
    #         text = re.sub('ß', 'ss', text)
    # =============================================================================

    # remove the numbers
    text = re.sub(r'[0-9]+', '', text)

    # Remove emojis (and anything else outside the allowed characters)
    german_char = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ"
    text = ''.join(c for c in text if c in german_char)

    tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True)
    if tokens:
        return tokenizer.tokenize(text)
    if use_GermanStemmer:
        stemmer = GermanStemmer()
        return [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    # Default path: previously unreachable, so the function returned None.
    return text
Ejemplo n.º 16
def set_stemmer(stemmer_language):
    """Return a stemmer for ``stemmer_language``.

    ``"GER"`` yields a ``GermanStemmer``; every other value yields an
    ``EnglishStemmer``.
    """
    # The English stemmer used to be assigned unconditionally, so the
    # German one was never returned.
    if stemmer_language == "GER":
        return GermanStemmer()
    return EnglishStemmer()
Ejemplo n.º 17
# Leading "-"-separated field of every record's "Veröffentlichungsdatum",
# as a sorted integer array (presumably the publication year - TODO confirm
# the date format).
sorted_d = np.sort([int(x["Veröffentlichungsdatum"].split("-")[0]) for x in d])
# Map the labels 2010..2019 to the first position in sorted_d at which each
# distinct value starts. Labels are assigned by rank, so this presumes the
# data holds at most ten distinct values that line up with 2010-2019 -
# TODO confirm; an eleventh value would raise IndexError.
year_indices = {}
for ind, ind_year in enumerate(
        sorted([np.where(sorted_d == x)[0][0] for x in set(sorted_d)])):
    year_indices.update({list(range(2010, 2019 + 1))[ind]: ind_year})
# Open upper bound so that slicing for 2019 runs to the end of the data.
year_indices[2020] = None

# 70 k times 588 k is big, sizing down therefore
year = 2010
# NOTE(review): the positions were computed on the sorted array but are
# applied to d itself; assumes d is already ordered the same way - confirm.
d = d[year_indices[year]:year_indices[year + 1]]  # d[:500]

stop_words_en = stopwords.words('english')

stemmer = GermanStemmer()  # Cistem()

# Stop words loaded from the pickle (whitespace-stripped), followed by the
# NLTK English stop words.
# NOTE(review): unpickling runs arbitrary code; only load trusted files.
with open("stop_full.pkl", "rb") as f:
    stop_words = pickle.load(f)
    stop_words = [x.strip() for x in stop_words] + stop_words_en

def preprocess(text):
    # Lower-cases ``text``, collapses whitespace, removes every character
    # that is not a letter (including ÄÖÜäöüß) or a space, and splits the
    # result into words.
    # NOTE(review): the function ends here in this copy of the source
    # without returning anything; the remaining steps are missing.
    text = text.lower().split()
    # text = [w.split(".")[0].split(",")[0].split(":")[0].split(";")[0] for w in text]
    text = " ".join(text)
    remove_punctuation_regex = re.compile(
        r"[^A-ZÄÖÜäöüßa-z ]"
    )  # matches every character that is NOT A-Z, a-z, ÄÖÜäöüß or space " "
    text = re.sub(remove_punctuation_regex, "", text)
    text = text.split()
Ejemplo n.º 18
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from boilerpipe.extract import Extractor
from nltk.stem.snowball import GermanStemmer
from nltk import word_tokenize
import nltk.data
import os
import re

logger = logging.getLogger(__name__)

# German sentence tokenizer and stemmer shared by this module.
satztokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
stemmer = GermanStemmer()

# Load the stop-word list: one word per line in 'traindata/german'.
stoppwörter = []
with open('traindata/german', 'r') as f:
    for line in f:
        # Keep the text before the line break. The word used to be computed
        # and then dropped, leaving the list empty.
        stoppwörter.append(line.split('\n')[0])

def preprocess(text):
    '''Filter rules that normalise the text.'''
    # NOTE(review): the statements below are indented one level too deep
    # for the function body, so an enclosing statement appears to be missing
    # in this copy; the function is also cut off after the second rule.
        text = re.sub(
            "/innen|\*innen|/-innen", "innen",
            text)  # Unifies the different gender-inclusive spellings
        text = re.sub("-\s*\n", "", text)  # Removes hyphenation at line breaks
Ejemplo n.º 19
 def __init__(self):
     """Create the instance with a ``GermanStemmer`` stored as ``stemmer``."""
     self.stemmer = GermanStemmer()
Ejemplo n.º 20
# Exploration of test_df: missing values, cardinalities and the effect of
# each normalisation step on the number of distinct category/product names.
# print(...) with a single argument behaves the same on Python 2 and 3.
# NOTE(review): on Python 3 ``.str.encode`` yields bytes, which the regex
# replace and the stemmer below may not accept - confirm before porting.
print(test_df.isnull().sum())

# NOTE(review): these counts read ``data`` while everything else in this
# script reads ``test_df`` - confirm that is intended.
print('Unique restaurants: {}'.format(len(data['restaurant_id'].unique())))
print('Unique menu_category: {}'.format(len(data['menu_category'].unique())))
print('Unique product_name: {}'.format(len(data['product_name'].unique())))
print('Unique ingredients: {}'.format(len(data['ingredients'].unique())))
print(test_df.shape)

# One stemmer for all rows instead of a new instance per value.
german_stem = GermanStemmer().stem

# menu_category: drop non-ASCII, strip special characters, stem.
encode_menu = test_df['menu_category'].str.encode('ascii', errors='ignore')
print(len(encode_menu.unique()))
encode_menu.replace({r'[^a-zA-Z0-9\s,]':''}, regex=True, inplace=True)
print(len(encode_menu.unique()))
encode_menu = encode_menu.apply(german_stem)
print(len(encode_menu.unique()))

# product_name: same three steps.
encode_name = test_df['product_name'].str.encode('ascii', errors='ignore')
print(len(encode_name.unique()))
encode_name.replace({r'[^a-zA-Z0-9\s,]':''}, regex=True, inplace=True)
print(len(encode_name.unique()))
encode_name = encode_name.apply(german_stem)
print(len(encode_name.unique()))

# X = pd.concat([encode_menu, encode_name, test_df['restaurant_id'].astype('str')], axis=1)

# le = preprocessing.LabelEncoder()
# X_2 = X.apply(le.fit_transform)
# print X_2.head()
# print X_2.shape
Ejemplo n.º 21
class StringHandler:
    """Text clean-up helpers operating on a pandas string Series.

    The working copy is exposed through the ``ds`` property and starts as
    the lower-cased input; the untouched input is kept in ``ds_origin``.
    """

    # Shared stemmer, created once for all instances.
    _STEMMER = GermanStemmer()
    # Minimum SequenceMatcher ratio for two strings to count as similar.
    _P_SIMILARITY_THRESHOLD: float = 0.9

    def __init__(self, string_series: pd.Series):
        self._ds = string_series.str.lower()
        self.ds_origin = string_series

    def optimize(self):
        """Run the clean-up pipeline on the working series.

        NOTE(review): the pipeline steps are missing in the available
        source (only the disabled ``correct_spelling`` call survived), so
        this is currently a no-op. Restore the step list from upstream.
        """
        # self.correct_spelling()

    def reset(self):
        """Replace the working series by a copy of the original input."""
        self.ds = self.ds_origin.copy()

    # string manipulation

    def stem_words(self):
        """Stem every space-separated word of each entry."""
        self.ds = self.ds.apply(StringHandler.stem_sentence)

    def split_text(self):
        """Split each entry into a list of words at single spaces."""
        self.ds = self.ds.str.split(' ')

    def remove_noise(self):
        """Replace every non-alphanumeric character by a space."""
        # regex=True is required: newer pandas versions treat the pattern
        # as a literal string by default.
        self.ds = self.ds.str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)
        # NOTE(review): a second step removing leftover isolated substrings
        # that are not words/digits is missing in the available source.

    def build_sentence(self):
        """Join the non-empty words of each entry back into one string."""
        self.ds = self.ds.apply(lambda x: ' '.join(word.strip() for word in x if word))

    # nlp manipulation

    def correct_spelling(self):
        """Return, per unique entry, the other unique entries similar to it.

        Similarity is ``SequenceMatcher.ratio()`` above
        ``_P_SIMILARITY_THRESHOLD``. The working series is not modified.
        """
        uniques = self.get_unique_series
        threshold = self._P_SIMILARITY_THRESHOLD
        return uniques.apply(
            lambda x: [
                i for i in uniques
                if i != x and SequenceMatcher(None, x, i).ratio() > threshold
            ])

    @classmethod
    def stem_sentence(cls, sentence: str, split_char: str = ' '):
        """Stem each ``split_char``-separated word and join with spaces."""
        return ' '.join(cls._STEMMER.stem(word) for word in sentence.split(split_char))

    # properties

    @property
    def get_unique_series(self):
        """Sorted unique values of the working series, re-indexed from 0."""
        return pd.Series(self.ds.unique()).sort_values().reset_index(drop=True)

    @property
    def ds(self):
        """The working series."""
        return self._ds

    @ds.setter
    def ds(self, ds: pd.Series):
        # Reject anything that is not a non-empty Series. The raise used to
        # follow the assignment unconditionally, so every write failed.
        if not isinstance(ds, pd.Series) or ds.empty:
            raise TypeError('Wrong variable type or empty series')
        self._ds = ds
Ejemplo n.º 22

        return res

# In[3]:

def subwords(word):
    """Split ``word`` after its second element.

    Returns a two-item list ``[head, tail]`` where ``head`` holds the first
    two elements and ``tail`` the remainder (empty when there is none).
    """
    cut = 2
    head, tail = word[:cut], word[cut:]
    return [head, tail]

# In[27]:

# Stem function bound to a single GermanStemmer instance.
stem = GermanStemmer().stem

# Triples of (name, document transform, keyword options).
# NOTE(review): presumably options for a count vectorizer - confirm. The
# list is cut off in this copy of the source (closing brackets missing).
cnt_vect_splits = [
    # Lines with at most one element.
    ("short", lambda doc: [line for line in doc if len(line) <= 1], {}),
    # Lines with more than one element.
    ("long", lambda doc: [line for line in doc if len(line) > 1], {}),
    # Every word split by subwords(), the pieces combined with ``concat``
    # (defined elsewhere) and stemmed.
    ("subwords", lambda doc: [
        list(map(stem, concat(subwords(word) for word in line)))
        for line in doc
    ], {
        "ngram_range": (1, 1)

# Pairs of (feature name, function computing it from a document).
# NOTE(review): this list is cut off after its first entry as well.
doc_funcs = [
    # Number of letters, German umlauts and ß included.
    ("num_char", lambda doc: len(re.findall("[A-Za-zäöüÄÖÜß]", doc))),
Ejemplo n.º 23
import nltk
import sys
from string import punctuation
import re
from nltk.stem.snowball import GermanStemmer


# Pre-processing tools created once at import time.
# English Punkt sentence tokenizer; the German one is disabled below.
sents_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#sents_tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')
stemmerEn = nltk.PorterStemmer() # uses nltk Porter stemmer
wnl = nltk.WordNetLemmatizer()
stemmerDe = GermanStemmer() # uses nltk Snowballs stemmer for German

def split_into_sentences(text):
	"""Define the patterns used to split ``text`` into sentences.

	NOTE(review): only the pattern definitions are present in the available
	source; the splitting logic and the return statement are missing, so
	the function currently returns None.
	"""
	import re
	# Raw strings keep the backslashes literal; the plain literals relied
	# on invalid escape sequences such as "\s" and "\d".
	caps = r"([A-Z])"
	prefixes = r"(Mr|St|Mrs|Ms|Dr|dr|etc|vs|doc|art|no|inc|mr)[.]"
	suffixes = r"(Inc|Ltd|Jr|Sr|Co|gdp|hon)"
	starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
	acronyms = r"([A-Za-z][.][A-Za-z][.](?:[A-Za-z][.])?)"
	websites = r"[.](com|net|org|io|gov|de|fr|il|mk)"
	# Day number followed by a German month name (or "jahrestag").
	dates = r"(\d\d?)\.(\s+(januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember|jahrestag))"
	#dates = "(\d\d?)\."
	www = r"(www)\."
	times = r"(\d\d?)\.(\s?\d\d?)"
	full_date = r"(\d\d?)\.(\s?\d\d?)\.(\s?\d\d\d?\d?)"
import re
import preprocess_files
from nltk.stem.snowball import GermanStemmer

# Module-level German stemmer, created once at import time.
gs = GermanStemmer()
punctuations = '''!()-[]{};:'"\,<>/?@#$%^&*_~'''

def match_synms(tokens):
    """Replace listed tokens by their synonym, in place.

    ``tokens`` is a collection of mutable token lists. Every element found
    in the table returned by ``preprocess_files.read_synms_list()`` is
    overwritten with its entry there. The same ``tokens`` object is
    returned.
    """
    synonyms = preprocess_files.read_synms_list()
    for token_list in tokens:
        for position, token in enumerate(token_list):
            if token not in synonyms:
                continue
            token_list[position] = synonyms[token]

    return tokens

def _remove_punctuation(tokens):
    """Return the tokens that do not occur inside ``punctuations``.

    ``punctuations`` is a string, so the check is a substring test.
    """
    return [token for token in tokens if token not in punctuations]

def _remove_stopwords(tokens):
    '''Remove stop words from an array of tokens'''
    # NOTE(review): only the stop-word list is present in this copy of the
    # source; the filtering and the return statement are missing.

    # Hard-coded mix of English and German stop words plus bullet symbols.
    stopWords = ['the', 'to', '-', 'pr', 'der', 'is', 'of', 'die', 'in', 'and', 'und', '–', '•', '✔', '●', 'a']
Ejemplo n.º 25
def get_stem_relations(sentences, gn):
    """Gets verb-noun relations between each sentence and the next one.

    Args:
        sentences: Sequence of sentences; each sentence is a sequence of
            word dicts read with the keys 'lemma', 'orth', 'noun' and
            'verb'.
        gn: Not used in the visible part of this function.

    Returns:
        Array of word-pairs between two sentences

    NOTE(review): this copy of the function has lost many lines (closing
    brackets, the ``word_pairs.append(`` calls and the dict key of the
    relation label), so it does not parse as written. It also treats the
    result of ``map`` as a list (``in`` followed by ``.index``), which only
    works on Python 2.
    """
    # Init word pairs
    word_pairs = []

    # Init stemmer
    stemmer = GermanStemmer(ignore_stopwords=True)

    # Loop over every sentence
    for val, sentence in enumerate(sentences):
        # Is current sentence not the last
        # sentence? If so carry on
        if val != (len(sentences) - 1):
            # Get stems of all words in the next sentence
            stems_next_sentence = map(lambda x: stemmer.stem(x['lemma']),
                                      sentences[val + 1])

            # Nouns in next sentence
            # NOTE(review): closing bracket missing.
            nouns_next_sentence = [
                word['lemma'] for word in sentences[val + 1] if word['noun']

            # Nouns of current sentence
            # NOTE(review): closing bracket missing.
            words_current_sentence = [
                word for word in sentence if word['noun']

            # Loop over every word in current sentence
            for word in sentences[val]:
                # Stem of current word
                stem_current_word = stemmer.stem(word['lemma'])

                # Is the stemmed word in the next sentence, great.
                # If word is a lame 'sein', ignore it
                if (stem_current_word
                        in stems_next_sentence) and word['lemma'] != 'sein':
                    # Get index of stem that is related to current word
                    # NOTE(review): argument and closing parenthesis missing.
                    index_word_next_sentence = stems_next_sentence.index(

                    # Corresponding word in next sentence
                    # NOTE(review): rest of the expression missing.
                    corresponding_word = sentences[val +

                    # Only add word pairs if verb or noun
                    if word['noun'] or word['verb']:
                        # Get dictionary of word in next sentence
                        # NOTE(review): rest of the expression missing.
                        dict_next = sentences[val +

                        # We do not want to combine words
                        # that have the same grammatical function
                        # A noun should not be combined with a noun
                        # We are only interested in verb-noun relations
                        if word['verb'] and dict_next['noun']:
                            # Get all combinations of corresponding noun
                            # in next sentence an all nouns in current sentence
                            for wordCurrent in words_current_sentence:
                                # Append to list
                                # NOTE(review): the append call, the closing
                                # braces and the key of the label are missing.
                                    'source': {
                                        'word': corresponding_word['orth'],
                                        'lemma': corresponding_word['lemma'],
                                        'sentence': val
                                    'target': {
                                        'word': wordCurrent['orth'],
                                        'lemma': wordCurrent['lemma'],
                                        'sentence': val + 1
                                    'verb noun relation'

                        # Current word is noun and corresponding word is
                        # verb
                        elif word['noun'] and dict_next['verb']:
                            # Get all combinations of of noun in this sentence
                            # with nouns in next sentence
                            for wordNext in sentences[val + 1]:
                                # Do not use stupid 'sein'
                                if wordNext['noun']:
                                    # Append to list
                                    # NOTE(review): same gaps as in the
                                    # verb-noun branch above.
                                        'source': {
                                            'word': word['orth'],
                                            'lemma': word['lemma'],
                                            'sentence': val
                                        'target': {
                                            'word': wordNext['orth'],
                                            'lemma': wordNext['lemma'],
                                            'sentence': val + 1
                                        'noun verb relation'

    return word_pairs
Ejemplo n.º 26
 def stem_words(self, words):
     """Return the German stem of every word in ``words``, in order.

     An empty input yields an empty list.
     """
     stemmer = GermanStemmer()
     # The loop used to have no body, so the result was always empty.
     return [stemmer.stem(word) for word in words]
Ejemplo n.º 27
 def load_stemmer(self):
     """Create the stemmer matching ``self._stemming_lang``.

     ``Language.GERMAN`` selects a ``GermanStemmer``; every other language
     selects an ``EnglishStemmer``. The result is stored in
     ``self._stemmer``.
     """
     self._stemmer = None
     if self._stemming_lang == Language.GERMAN:
         self._stemmer = GermanStemmer()
     else:
         # Used to run unconditionally, replacing the German stemmer.
         self._stemmer = EnglishStemmer()
Ejemplo n.º 28
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import GermanStemmer
import os
import json
import nltk
import pandas as pd
import numpy as np

# Module-level German stemmer, configured with ignore_stopwords=True.
stemmer = GermanStemmer(ignore_stopwords=True)

# Both output locations currently resolve to the same "Output" directory.
CONFLICT_OUTPUT_PATH = os.path.join("Output")
CREATE_VOCABULARY = os.path.join("Output")

def combine_data_panning(dirpath: str, output_name: str = None):
    # Reads every file in ``dirpath`` as JSON and walks the entries stored
    # under its "actions" key.
    # NOTE(review): the function is cut off inside the inner loop in this
    # copy of the source; how the data is combined, and what
    # ``output_name`` is used for, cannot be seen here.
    # Keys of the command documents.
    ACTION = "actions"
    TAG = "tag"
    COMMANDS = "commands"

    # with open(os.path.join("Data", "stopwords.txt"), "rt") as f:
    #     stopwords = set(f.read().splitlines())

    document_pathes = [os.path.join(dirpath, x) for x in os.listdir(dirpath)]
    new_data = {}
    for i, document in enumerate(document_pathes):

        with open(document, "rt") as f:
            commands = json.load(f)

        # Fresh set per document.
        repeat = set()
        for action in commands[ACTION]:
from nlingua.stemmers import GermanSnowballStemmer
from nltk.stem.snowball import GermanStemmer
import codecs

if __name__ == '__main__':
    # Compare the nlingua German Snowball stemmer with NLTK's GermanStemmer
    # on every word of german_words.txt (one word per line).
    with codecs.open("german_words.txt", encoding="utf-8", mode="r") as f:
        # Strip only the line terminator. Cutting off the last character
        # truncated the final word when the file has no trailing newline
        # and left a "\r" behind for Windows line endings.
        words = [line.rstrip("\r\n") for line in f]

    correct = 0
    stemmer = GermanSnowballStemmer()
    stemmer2 = GermanStemmer()
    for word in words:
        a = stemmer.stem(word)
        b = stemmer2.stem(word)
        if a == b:
            correct += 1
            print(word, a, b)

    print(f"{correct}/{len(words)} correct")