Beispiel #1
0
def stemmer_middle_high_german(text_l, rem_umlauts = True, exceptions = exc_dict):
    """Stem every token of a Middle High German text.

    The text is normalized and tokenized; each token then contributes exactly
    one entry to the result, so the output has one element per token.

    text_l: text in string format
    rem_umlauts: choose whether to remove umlauts from string (forwarded to
        ``stem_helper`` as ``rem_umlaut``)
    exceptions: hard-coded dictionary for the cases the algorithm fails; a
        token found here is replaced by the mapped value. ``None`` is treated
        as "no exceptions".

    Returns a list in which each token is replaced by its exception entry,
    kept unchanged (capitalized words, stop words, empty tokens), or stemmed
    with ``stem_helper``.
    """
    # Earlier versions tolerated a non-subscriptable ``exceptions`` (e.g. None)
    # through a bare ``except``; keep that call working explicitly.
    if exceptions is None:
        exceptions = {}

    # Normalize text. Case is preserved (to_lower_all=False) because the
    # upper-case test below relies on it.
    # NOTE(review): to_lower_beginning presumably lower-cases sentence-initial
    # words only -- confirm against normalize_middle_high_german.
    normalized = normalize_middle_high_german(text_l, to_lower_all = False, to_lower_beginning = True)

    # Tokenize text
    word_tokenizer = WordTokenizer("middle_high_german")
    tokens = word_tokenizer.tokenize(normalized)

    text = []
    for word in tokens:
        if word in exceptions:
            # Hard-coded override wins over every other rule.
            text.append(exceptions[word])

        elif not word:
            # Guard: indexing word[0] on an empty token would raise IndexError.
            text.append(word)

        elif word[0].isupper():
            # MHG only uses upper case for locations, people, etc. So any word
            # that starts with a capital letter while not being at the start
            # of a sentence is kept as-is instead of being stemmed.
            text.append(word)

        elif word in MHG_STOPS:
            # Stop words are passed through unstemmed (they are not removed).
            text.append(word)

        else:
            text.append(stem_helper(word, rem_umlaut = rem_umlauts))
    return text
Beispiel #2
0
def stemmer_middle_high_german(text_l, rem_umlauts=True, exceptions=exc_dict):
    """Stem every token of a Middle High German text.

    The text is normalized and tokenized; each token then contributes exactly
    one entry to the result, so the output has one element per token.

    text_l: text in string format
    rem_umlauts: choose whether to remove umlauts from string (forwarded to
        ``stem_helper`` as ``rem_umlaut``)
    exceptions: hard-coded dictionary for the cases the algorithm fails; a
        token found here is replaced by the mapped value. ``None`` is treated
        as "no exceptions".

    Returns a list in which each token is replaced by its exception entry,
    kept unchanged (capitalized words, stop words, empty tokens), or stemmed
    with ``stem_helper``.
    """
    # Earlier versions tolerated a non-subscriptable ``exceptions`` (e.g. None)
    # through a bare ``except``; keep that call working explicitly.
    if exceptions is None:
        exceptions = {}

    # Normalize text. Case is preserved (to_lower_all=False) because the
    # upper-case test below relies on it.
    # NOTE(review): to_lower_beginning presumably lower-cases sentence-initial
    # words only -- confirm against normalize_middle_high_german.
    normalized = normalize_middle_high_german(text_l,
                                              to_lower_all=False,
                                              to_lower_beginning=True)

    # Tokenize text
    word_tokenizer = WordTokenizer("middle_high_german")
    tokens = word_tokenizer.tokenize(normalized)

    text = []
    for word in tokens:
        if word in exceptions:
            # Hard-coded override wins over every other rule.
            text.append(exceptions[word])

        elif not word:
            # Guard: indexing word[0] on an empty token would raise IndexError.
            text.append(word)

        elif word[0].isupper():
            # MHG only uses upper case for locations, people, etc. So any word
            # that starts with a capital letter while not being at the start
            # of a sentence is kept as-is instead of being stemmed.
            text.append(word)

        elif word in MHG_STOPS:
            # Stop words are passed through unstemmed (they are not removed).
            text.append(word)

        else:
            text.append(stem_helper(word, rem_umlaut=rem_umlauts))
    return text
Beispiel #3
0
    def test_middle_high_german_normalizer(self):
        """
        Normalizing with ``punct=True`` is expected to return the line in
        lower case, without its punctuation marks, and with the macron
        vowels written as circumflex vowels.
        """
        raw_line = "Si sprach: ‘herre Sigemunt, ir sult iȥ lāȥen stān"
        result = normalize_middle_high_german(raw_line, punct=True)

        self.assertEqual(
            result,
            'si sprach herre sigemunt ir sult iȥ lâȥen stân',
        )
Beispiel #4
0
    def test_middle_high_german_normalizer_spelling(self):
        """
        Normalizing with ``alpha_conv=True`` is expected to return the line in
        lower case with the macron vowels written as circumflex vowels.
        """
        raw_line = "Mit ūf erbürten schilden in was ze strīte nōt"
        expected = 'mit ûf erbürten schilden in was ze strîte nôt'

        result = normalize_middle_high_german(raw_line, alpha_conv=True)

        self.assertEqual(result, expected)
Beispiel #5
0
    def test_middle_high_german_normalizer(self):
        """
        Normalizing with default options is expected to return the line
        entirely in lower case.
        """
        raw_line = "Dô erbiten si der nahte und fuoren über Rîn"
        result = normalize_middle_high_german(raw_line)

        self.assertEqual(
            result,
            'dô erbiten si der nahte und fuoren über rîn',
        )
Beispiel #6
0
    def test_middle_high_german_normalizer(self):
        """
        With ``punct=True`` the normalizer output should be the lower-cased
        line with punctuation dropped and macron vowels turned into
        circumflex vowels.
        """
        sentence = "Si sprach: ‘herre Sigemunt, ir sult iȥ lāȥen stān"
        wanted = 'si sprach herre sigemunt ir sult iȥ lâȥen stân'

        got = normalize_middle_high_german(sentence, punct=True)

        self.assertEqual(got, wanted)
Beispiel #7
0
    def test_middle_high_german_normalizer_spelling(self):
        """
        With ``alpha_conv=True`` the normalizer output should be the
        lower-cased line with macron vowels turned into circumflex vowels.
        """
        sentence = "Mit ūf erbürten schilden in was ze strīte nōt"
        got = normalize_middle_high_german(sentence, alpha_conv=True)

        self.assertEqual(
            got,
            'mit ûf erbürten schilden in was ze strîte nôt',
        )
Beispiel #8
0
    def test_middle_high_german_normalizer(self):
        """
        With default options the normalizer output should be the input line
        entirely in lower case.
        """
        sentence = "Dô erbiten si der nahte und fuoren über Rîn"
        wanted = 'dô erbiten si der nahte und fuoren über rîn'

        got = normalize_middle_high_german(sentence)

        self.assertEqual(got, wanted)