def stemmer_middle_high_german(text_l, rem_umlauts=True, exceptions=exc_dict):
    """Stem a Middle High German text and return one output item per token.

    The input is normalized, tokenized, and every token is then handled by
    the first rule that applies:

    1. a token that is a key of ``exceptions`` is replaced by the stored value;
    2. a token whose first character is upper case is kept unchanged;
    3. a token contained in ``MHG_STOPS`` is kept unchanged (stop words are
       *not* removed from the output);
    4. any other token is passed to ``stem_helper``.

    :param text_l: text in string format
    :param rem_umlauts: choose whether to remove umlauts from string
        (forwarded to ``stem_helper`` as ``rem_umlaut``)
    :param exceptions: hard-coded dictionary for the cases the algorithm
        fails; ``None`` is treated as an empty dictionary
    :return: list with one entry per token, in token order
    """
    # Passing None used to "work" only because a bare ``except`` swallowed the
    # resulting TypeError; keep that behaviour explicit instead.
    if exceptions is None:
        exceptions = {}

    # Normalize text
    # NOTE(review): to_lower_beginning=True presumably lower-cases the first
    # word of each sentence, which the capital-letter rule below relies on.
    text_l = normalize_middle_high_german(
        text_l, to_lower_all=False, to_lower_beginning=True)

    # Tokenize text
    word_tokenizer = WordTokenizer("middle_high_german")
    text_l = word_tokenizer.tokenize(text_l)

    text = []
    for word in text_l:
        # Only a missing key means "not an exception"; any other error is a
        # real problem and must propagate.
        try:
            replacement = exceptions[word]
        except KeyError:
            pass
        else:
            text.append(replacement)
            continue

        if word[0].isupper():
            # MHG only uses upper case for locations, people, etc. So any word
            # that starts with a capital letter while not being at the start
            # of a sentence is excluded from stemming.
            text.append(word)
        elif word in MHG_STOPS:
            # Stop words are passed through unstemmed.
            text.append(word)
        else:
            text.append(stem_helper(word, rem_umlaut=rem_umlauts))
    return text
def stemmer_middle_high_german(text_l, rem_umlauts=True, exceptions=exc_dict):
    """Stem a Middle High German text and return one output item per token.

    After normalization and tokenization each token is resolved as follows
    (first matching rule wins):

    * key of ``exceptions``            -> the value stored in ``exceptions``
    * first character is upper case    -> token kept unchanged
    * member of ``MHG_STOPS``          -> token kept unchanged (not dropped)
    * otherwise                        -> result of ``stem_helper``

    :param text_l: text in string format
    :param rem_umlauts: choose whether to remove umlauts from string
        (forwarded to ``stem_helper`` as ``rem_umlaut``)
    :param exceptions: hard-coded dictionary for the cases the algorithm
        fails; ``None`` is treated as an empty dictionary
    :return: list with one entry per token, in token order
    """
    # A None dictionary previously slipped through only because a bare
    # ``except`` hid the TypeError; handle it explicitly.
    if exceptions is None:
        exceptions = {}

    # Normalize text
    # NOTE(review): to_lower_beginning=True presumably lower-cases
    # sentence-initial words, so remaining capitals mark proper nouns.
    text_l = normalize_middle_high_german(
        text_l, to_lower_all=False, to_lower_beginning=True)

    # Tokenize text
    word_tokenizer = WordTokenizer("middle_high_german")
    text_l = word_tokenizer.tokenize(text_l)

    text = []
    for word in text_l:
        # Catch only the "key not present" case; everything else (including
        # KeyboardInterrupt) must not be silently swallowed.
        try:
            replacement = exceptions[word]
        except KeyError:
            pass
        else:
            text.append(replacement)
            continue

        if word[0].isupper():
            # MHG only uses upper case for locations, people, etc. So any word
            # that starts with a capital letter while not being at the start
            # of a sentence is excluded from stemming.
            text.append(word)
        elif word in MHG_STOPS:
            # Stop words are passed through unstemmed.
            text.append(word)
        else:
            text.append(stem_helper(word, rem_umlaut=rem_umlauts))
    return text
def test_middle_high_german_normalizer(self):
    """
    Test Middle High German normalizer with ``punct=True``.

    The expected string is lower case, carries no punctuation marks and
    shows circumflex vowels where the input has macrons.
    """
    raw_line = "Si sprach: ‘herre Sigemunt, ir sult iȥ lāȥen stān"
    expected = 'si sprach herre sigemunt ir sult iȥ lâȥen stân'
    self.assertEqual(
        normalize_middle_high_german(raw_line, punct=True), expected)
def test_middle_high_german_normalizer_spelling(self):
    """
    Test Middle High German normalizer with ``alpha_conv=True``.

    The expected string is lower case and shows circumflex vowels where
    the input has macrons.
    """
    raw_line = "Mit ūf erbürten schilden in was ze strīte nōt"
    expected = 'mit ûf erbürten schilden in was ze strîte nôt'
    self.assertEqual(
        normalize_middle_high_german(raw_line, alpha_conv=True), expected)
def test_middle_high_german_normalizer(self):
    """
    Test Middle High German normalizer with its default arguments.

    The expected string is the input in lower case.
    """
    raw_line = "Dô erbiten si der nahte und fuoren über Rîn"
    expected = 'dô erbiten si der nahte und fuoren über rîn'
    self.assertEqual(normalize_middle_high_german(raw_line), expected)
def test_middle_high_german_normalizer(self):
    """
    Check the Middle High German normalizer called with ``punct=True``.

    Expected output: lower-cased text without punctuation marks, with
    circumflex vowels in place of the macron vowels of the input.
    """
    source_text = "Si sprach: ‘herre Sigemunt, ir sult iȥ lāȥen stān"
    wanted = 'si sprach herre sigemunt ir sult iȥ lâȥen stân'
    actual = normalize_middle_high_german(source_text, punct=True)
    self.assertEqual(actual, wanted)
def test_middle_high_german_normalizer_spelling(self):
    """
    Check the Middle High German normalizer called with ``alpha_conv=True``.

    Expected output: lower-cased text with circumflex vowels in place of
    the macron vowels of the input.
    """
    source_text = "Mit ūf erbürten schilden in was ze strīte nōt"
    wanted = 'mit ûf erbürten schilden in was ze strîte nôt'
    actual = normalize_middle_high_german(source_text, alpha_conv=True)
    self.assertEqual(actual, wanted)
def test_middle_high_german_normalizer(self):
    """
    Check the Middle High German normalizer called with default arguments.

    Expected output: the input text in lower case.
    """
    source_text = "Dô erbiten si der nahte und fuoren über Rîn"
    wanted = 'dô erbiten si der nahte und fuoren über rîn'
    actual = normalize_middle_high_german(source_text)
    self.assertEqual(actual, wanted)