Exemple #1
0
  def stem(self, word):
    """
    Reduce a Norwegian word to its stem.

    The input is lower-cased and handed back untouched when it is a
    stopword.  Otherwise three suffix-stripping steps run in order; each
    step acts on the first suffix of its list that terminates the R1
    region.

    :param word: The word that is stemmed.
    :type word: str or unicode
    :return: The stemmed form.
    :rtype: unicode

    """
    word = word.lower()
    if word in self.stopwords:
      return word

    r1 = self._r1_scandinavian(word, self.__vowels)

    # STEP 1: only the first listed suffix that ends R1 is considered.
    ending = next((s for s in self.__step1_suffixes if r1.endswith(s)), None)
    if ending is not None:
      if ending == "s":
        # A final "s" is dropped only after a valid s-ending letter, or
        # after a "k" whose predecessor is not a vowel.
        if word[-2] in self.__s_ending or (
          word[-2] == "k" and word[-3] not in self.__vowels
        ):
          word, r1 = word[:-1], r1[:-1]
      elif ending in ("erte", "ert"):
        word = suffix_replace(word, ending, "er")
        r1 = suffix_replace(r1, ending, "er")
      else:
        word, r1 = word[: -len(ending)], r1[: -len(ending)]

    # STEP 2: a matching suffix costs the word exactly one trailing letter.
    if any(r1.endswith(s) for s in self.__step2_suffixes):
      word, r1 = word[:-1], r1[:-1]

    # STEP 3: strip the first matching suffix in full.
    residual = next((s for s in self.__step3_suffixes if r1.endswith(s)), None)
    if residual is not None:
      word = word[: -len(residual)]

    return word
 def normalize(self, token):
     """
     Return ``token`` with removable marks stripped and hamza forms unified.

     Diacritics, kasheeda and Arabic punctuation marks are deleted, a
     trailing hamza variant is replaced by U+0621, and the remaining
     hamza/alef variants are mapped to U+0627, U+0648 or U+064A.
     """
     # Pure deletions: diacritics, kasheeda, punctuation marks (in that order).
     removable = (
         self.__vocalization,
         self.__kasheeda,
         self.__arabic_punctuation_marks,
     )
     for pattern in removable:
         token = pattern.sub('', token)

     # Normalize the last hamza: only the first matching ending is rewritten.
     trailing = next(
         (h for h in self.__last_hamzat if token.endswith(h)), None
     )
     if trailing is not None:
         token = suffix_replace(token, trailing, '\u0621')

     # Normalize the other hamzat, applied in this fixed order.
     rewrites = (
         (self.__initial_hamzat, '\u0627'),
         (self.__waw_hamza, '\u0648'),
         (self.__yeh_hamza, '\u064a'),
         (self.__alefat, '\u0627'),
     )
     for pattern, letter in rewrites:
         token = pattern.sub(letter, token)
     return token
	def stem(self, word):
		"""
		Split a Romanian word into its stem and the suffixes removed from it.

		Trailing characters found in ``special`` are peeled off first.  The
		rest of the word goes through the suffix-stripping steps 0-4; every
		piece that is cut off is recorded as ``tok_s + piece`` so that the
		recorded pieces appear in word order after the stem.

		The word is not lower-cased.  Stopwords (compared in lower case) are
		returned without stemming.

		:param word: The word that is stemmed.
		:type word: str or unicode
		:return: ``"<stem> <recorded suffixes> <punctuation> "`` -- the
			parts are joined by single spaces and a space is appended.
		:rtype: unicode

		"""
		# Peel trailing punctuation; it is re-attached to the returned string.
		punctuations = ""
		result = ""
		while word and word[-1] in special:
			punctuations = word[-1] + punctuations
			word = word[:-1]

		if word.lower() in stopwords.words('romanian'):
			return word + " " + punctuations + " "

		step1_success = False
		step2_success = False

		# 'u' and 'i' between two vowels are marked in upper case; the
		# marks are reverted just before returning.
		for i in range(1, len(word)-1):
			if word[i-1] in self.__vowels and word[i+1] in self.__vowels:
				if word[i] == "u":
					word = "".join((word[:i], "U", word[i+1:]))

				elif word[i] == "i":
					word = "".join((word[:i], "I", word[i+1:]))

		r1, r2 = self._r1r2_standard(word, self.__vowels)
		rv = self._rv_standard(word, self.__vowels)

		# STEP 0: Removal of plurals and other simplifications
		for suffix in self.__step0_suffixes:
			if word.endswith(suffix):
				if suffix in r1:
					if suffix in ("ul", "ului"):
						result = tok_s + suffix + " " + result
						word = word[:-len(suffix)]

						if suffix in rv:
							rv = rv[:-len(suffix)]
						else:
							rv = ""

					elif (suffix == "aua" or suffix == "atei" or
							(suffix == "ile" and word[-5:-3] != "ab")):
						result = tok_s + suffix[-2:] + " " + result
						word = word[:-2]

					elif suffix in ("ea", "ele", "elor"):
						# The leading "e" stays on the stem.
						result = tok_s + suffix[1:] + " " + result
						word = word[:-len(suffix)+1]

						if suffix in rv:
							rv = suffix_replace(rv, suffix, "e")
						else:
							rv = ""

					elif suffix in ("ii", "iua", "iei",
									"iile", "iilor", "ilor"):
						# The leading "i" stays on the stem.
						result = tok_s + suffix[1:] + " " + result
						word = word[:-len(suffix)+1]

						if suffix in rv:
							rv = suffix_replace(rv, suffix, "i")
						else:
							rv = ""

					elif suffix in ("a\u0163ie", "a\u0163ia"):
						# The leading "a" stays on the stem, so only the
						# remainder of the suffix is recorded (the whole
						# suffix used to be recorded, duplicating the "a").
						result = tok_s + suffix[1:] + " " + result
						word = word[:-len(suffix)+1]
				break

		# STEP 1: Reduction of combining suffixes
		# Repeated until a pass makes no replacement.
		while True:

			replacement_done = False

			for suffix in self.__step1_suffixes:
				if word.endswith(suffix):
					if suffix in r1:
						step1_success = True
						replacement_done = True

						if suffix in ("abilitate", "abilitati",
									"abilit\u0103i",
									"abilit\u0103\u0163i"):
							# Keep "abil" on the stem.
							result = tok_s + suffix[4:] + " " + result
							word = word[:-len(suffix)+4]

						elif suffix == "ibilitate":
							# Keep "ibil" on the stem.
							result = tok_s + suffix[4:] + " " + result
							word = word[:-len(suffix)+4]

						elif suffix in ("ivitate", "ivitati",
										"ivit\u0103i",
										"ivit\u0103\u0163i"):
							# Keep "iv" on the stem.
							result = tok_s + suffix[2:] + " " + result
							word = word[:-len(suffix)+2]

						elif suffix in ("icitate", "icitati", "icit\u0103i",
										"icit\u0103\u0163i", "icator",
										"icatori", "iciv", "iciva",
										"icive", "icivi", "iciv\u0103",
										"ical", "icala", "icale", "icali",
										"ical\u0103"):
							# Keep "ic" on the stem.
							result = tok_s + suffix[2:] + " " + result
							word = word[:-len(suffix)+2]

						elif suffix in ("ativ", "ativa", "ative", "ativi",
										"ativ\u0103", "a\u0163iune",
										"atoare", "ator", "atori",
										"\u0103toare",
										"\u0103tor", "\u0103tori"):
							# Keep the first two letters of the suffix.
							result = tok_s + suffix[2:] + " " + result
							word = word[:-len(suffix)+2]

							if suffix in r2:
								r2 = suffix_replace(r2, suffix, "at")

						elif suffix in ("itiv", "itiva", "itive", "itivi",
										"itiv\u0103", "i\u0163iune",
										"itoare", "itor", "itori"):
							# Keep the first two letters of the suffix.
							result = tok_s + suffix[2:] + " " + result
							word = word[:-len(suffix)+2]

							if suffix in r2:
								r2 = suffix_replace(r2, suffix, "it")
					else:
						step1_success = False
					break

			if not replacement_done:
				break

		# STEP 2: Removal of standard suffixes
		for suffix in self.__step2_suffixes:
			if word.endswith(suffix):
				if suffix in r2:
					step2_success = True

					if suffix in ("ism", "isme", "ist", "ista", "iste",
									"isti", "ist\u0103", "i\u015Fti"):
						# The first three letters stay on the stem.  For
						# the bare "ism"/"ist" nothing is left to strip:
						# slicing with word[:-0] would empty the word, so
						# the word is only cut when a tail exists.
						tail = suffix[3:]
						if tail:
							result = tok_s + tail + " " + result
							word = word[:-len(tail)]

					else:
						result = tok_s + suffix + " " + result
						word = word[:-len(suffix)]
				break

		# STEP 3: Removal of verb suffixes
		# Only attempted when neither step 1 nor step 2 succeeded.
		if not step1_success and not step2_success:
			for suffix in self.__step3_suffixes:
				if word.endswith(suffix):
					if suffix in rv:
						if suffix in ('seser\u0103\u0163i', 'seser\u0103m',
									'ser\u0103\u0163i', 'sese\u015Fi',
									'seser\u0103', 'ser\u0103m', 'sesem',
									'se\u015Fi', 'ser\u0103', 'sese',
									'a\u0163i', 'e\u0163i', 'i\u0163i',
									'\xE2\u0163i', 'sei', '\u0103m',
									'em', 'im', '\xE2m', 'se'):
							result = tok_s + suffix + " " + result
							word = word[:-len(suffix)]
							rv = rv[:-len(suffix)]
						else:
							# Every other suffix is removed only when the
							# letter before its first occurrence in RV is
							# none of the listed vowels.
							if (not rv.startswith(suffix) and
									rv[rv.index(suffix)-1] not in
									"aeio\u0103\xE2\xEE"):
								result = tok_s + suffix + " " + result
								word = word[:-len(suffix)]
						break

		# STEP 4: Removal of final vowel
		for suffix in ("ie", "a", "e", "i", "\u0103"):
			if word.endswith(suffix):
				if suffix in rv:
					result = tok_s + suffix + " " + result
					word = word[:-len(suffix)]
				break

		# Revert the upper-case marks set before step 0.
		word = word.replace("I", "i").replace("U", "u")

		return (word + " " + result).rstrip(" ") + " " + punctuations + " "
	def stem(self, word):
		"""
		Split a Dutch word into its stem and the suffixes removed from it.

		Trailing characters found in ``special`` are peeled off first.  The
		rest of the word goes through steps 1-3b of the suffix stripper;
		every piece that is cut off is recorded as ``tok_s + piece`` so that
		the recorded pieces appear in word order after the stem.

		The word is not lower-cased and vowel accents are kept.  Stopwords
		(compared in lower case) are returned without stemming.

		:param word: The word that is stemmed.
		:type word: str or unicode
		:return: ``"<stem> <recorded suffixes> <punctuation> "`` -- the
			parts are joined by single spaces and a space is appended.
		:rtype: unicode

		"""
		# Peel trailing punctuation; it is re-attached to the returned string.
		punctuations = ""
		result = ""
		while word and word[-1] in special:
			punctuations = word[-1] + punctuations
			word = word[:-1]

		# Set when step 1 rewrites "heden" to "heid", so that step 3a can
		# record the suffix in its original spelling.
		heden = False

		if word.lower() in stopwords.words('dutch'):
			return word + " " + punctuations + " "

		step2_success = False

		# An initial 'y', a 'y' after a vowel,
		# and an 'i' between self.__vowels is put into upper case.
		# As from now these are treated as consonants.
		# NOTE(review): the final lower-casing of 'I'/'Y' also lowers
		# capitals that were already present in the input -- confirm
		# whether that is acceptable now that the word is not lower-cased.
		if word.startswith("y"):
			word = "".join(("Y", word[1:]))

		for i in range(1, len(word)):
			if word[i-1] in self.__vowels and word[i] == "y":
				word = "".join((word[:i], "Y", word[i+1:]))

		for i in range(1, len(word)-1):
			if (word[i-1] in self.__vowels and word[i] == "i" and
					word[i+1] in self.__vowels):
				word = "".join((word[:i], "I", word[i+1:]))

		r1, r2 = self._r1r2_standard(word, self.__vowels)

		# R1 is adjusted so that the region before it
		# contains at least 3 letters.
		for i in range(1, len(word)):
			if word[i] not in self.__vowels and word[i-1] in self.__vowels:
				# word[:i+1] is the part in front of R1; it holds i + 1
				# letters and is never empty because i >= 1.
				if i + 1 < 3:
					r1 = word[3:]
				break

		# STEP 1
		for suffix in self.__step1_suffixes:
			if r1.endswith(suffix):
				if suffix == "heden":
					word = suffix_replace(word, suffix, "heid")
					r1 = suffix_replace(r1, suffix, "heid")
					heden = True
					if r2.endswith("heden"):
						r2 = suffix_replace(r2, suffix, "heid")

				elif (suffix in ("ene", "en") and
						not word.endswith("heden") and
						word[-len(suffix)-1] not in self.__vowels and
						word[-len(suffix)-3:-len(suffix)] != "gem"):
					result = tok_s + suffix + " " + result
					word = word[:-len(suffix)]
					r1 = r1[:-len(suffix)]
					r2 = r2[:-len(suffix)]
					# Undouble a final kk/dd/tt.
					if word.endswith(("kk", "dd", "tt")):
						result = tok_s + word[-1] + " " + result
						word = word[:-1]
						r1 = r1[:-1]
						r2 = r2[:-1]

				elif (suffix in ("se", "s") and
						word[-len(suffix)-1] not in self.__vowels and
						word[-len(suffix)-1] != "j"):
					result = tok_s + suffix + " " + result
					word = word[:-len(suffix)]
					r1 = r1[:-len(suffix)]
					r2 = r2[:-len(suffix)]
				break

		# STEP 2
		if r1.endswith("e") and word[-2] not in self.__vowels:
			step2_success = True
			result = tok_s + word[-1] + " " + result
			word = word[:-1]
			r1 = r1[:-1]
			r2 = r2[:-1]

			if word.endswith(("kk", "dd", "tt")):
				result = tok_s + word[-1] + " " + result
				word = word[:-1]
				r1 = r1[:-1]
				r2 = r2[:-1]

		# STEP 3a
		if r2.endswith("heid") and word[-5] != "c":
			# Record the spelling the input actually had.
			if heden:
				result = tok_s + "heden " + result
			else:
				result = tok_s + "heid " + result
			word = word[:-4]
			r1 = r1[:-4]
			r2 = r2[:-4]

			if (r1.endswith("en") and word[-3] not in self.__vowels and
					word[-5:-2] != "gem"):
				result = tok_s + "en " + result
				word = word[:-2]
				r1 = r1[:-2]
				r2 = r2[:-2]

				if word.endswith(("kk", "dd", "tt")):
					result = tok_s + word[-1] + " " + result
					word = word[:-1]
					r1 = r1[:-1]
					r2 = r2[:-1]

		# STEP 3b: Derivational suffixes
		for suffix in self.__step3b_suffixes:
			if r2.endswith(suffix):
				if suffix in ("end", "ing"):
					result = tok_s + suffix + " " + result
					word = word[:-3]
					r2 = r2[:-3]

					if r2.endswith("ig") and word[-3] != "e":
						# The piece removed here is "ig"; the outer suffix
						# used to be recorded a second time instead.
						result = tok_s + "ig" + " " + result
						word = word[:-2]
					else:
						if word.endswith(("kk", "dd", "tt")):
							result = tok_s + word[-1] + " " + result
							word = word[:-1]

				elif suffix == "ig" and word[-3] != "e":
					result = tok_s + suffix + " " + result
					word = word[:-2]

				elif suffix == "lijk":
					result = tok_s + suffix + " " + result
					word = word[:-4]
					r1 = r1[:-4]

					# Same test as step 2, applied to the shortened word.
					if r1.endswith("e") and word[-2] not in self.__vowels:
						result = tok_s + word[-1] + " " + result
						word = word[:-1]
						if word.endswith(("kk", "dd", "tt")):
							result = tok_s + word[-1] + " " + result
							word = word[:-1]

				elif suffix == "baar":
					result = tok_s + suffix + " " + result
					word = word[:-4]

				elif suffix == "bar" and step2_success:
					result = tok_s + suffix + " " + result
					word = word[:-3]
				break

		# STEP 4 (undoubling a vowel) is not performed in this variant.

		# All occurrences of 'I' and 'Y' are put back into lower case.
		word = word.replace("I", "i").replace("Y", "y")

		return (word + " " + result).rstrip(" ") + " " + punctuations + " "
def _swap_region_suffix(region, suffix, replacement, fallback=""):
    """Swap the trailing ``suffix`` of a region string for ``replacement``.

    ``fallback`` is returned when the region is shorter than the suffix,
    i.e. when the suffix starts before the region does.
    """
    if len(region) >= len(suffix):
        return region[:-len(suffix)] + replacement
    return fallback


def stem(word, verbose=False):
    """Stem an English word with a Snowball-style suffix stripper.

    With ``verbose`` left at ``False`` no stemming takes place and the
    lower-cased word is returned.  With ``verbose=True`` the word is
    lower-cased, returned as is when it appears in
    ``stopwords.words('english')``, and otherwise run through steps 0-5.

    :param word: The word that is stemmed.
    :type word: str
    :param verbose: Run the full stemming algorithm when true.
    :type verbose: bool
    :return: The lower-cased word, or its stem when ``verbose`` is true.
    :rtype: str
    """
    if not verbose:
        return word.lower()

    # The vowels include "y"; consonant uses of "y" are marked as "Y" below.
    vowels = "aeiouy"
    double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr",
                         "tt")
    li_ending = "cdeghkmnrt"
    step0_suffixes = ("'s'", "'s", "'")
    step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
    step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
    step2_suffixes = ('ization', 'ational', 'fulness', 'ousness',
                      'iveness', 'tional', 'biliti', 'lessli', 'entli',
                      'ation', 'alism', 'aliti', 'ousli', 'iviti', 'fulli',
                      'enci', 'anci', 'abli', 'izer', 'ator', 'alli',
                      'bli', 'ogi', 'li')
    step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti',
                      'ative', 'ical', 'ness', 'ful')
    step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment',
                      'ant', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive',
                      'ize', 'ion', 'al', 'er', 'ic')

    word = word.lower()

    if word in stopwords.words('english'):
        return word

    # Remove a starting apostrophe.
    if word.startswith("\x27"):
        word = word[1:]

    # Special cases with y:
    # "y" counts as a vowel at the end of a word (candy, deny), when the
    # word has no other vowel (gym), and in the middle of a syllable
    # (system).  A "y" that starts the word or follows a vowel is not a
    # vowel; those are marked as "Y" so the vowel tests below skip them.
    if word.startswith("y"):
        word = "".join(("Y", word[1:]))

    for i in range(1, len(word)):
        if word[i - 1] in vowels and word[i] == "y":
            word = "".join((word[:i], "Y", word[i + 1:]))

    r1 = ""
    r2 = ""

    # R1 is the region after the first non-vowel following a vowel,
    # or is the null region at the end of the word if there is no
    # such non-vowel.
    #
    # R2 is the region after the first non-vowel following a vowel
    # in R1, or is the null region at the end of the word if there
    # is no such non-vowel.
    if word.startswith(("gener", "commun", "arsen")):
        # For these prefixes R1 starts right after the prefix.
        if word.startswith(("gener", "arsen")):
            r1 = word[5:]
        else:
            r1 = word[6:]
    else:
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                r1 = word[i + 1:]
                break

    for i in range(1, len(r1)):
        if r1[i] not in vowels and r1[i - 1] in vowels:
            r2 = r1[i + 1:]
            break

    # Step 0
    # Remove the possessive suffixes 's', 's and '.
    for suffix in step0_suffixes:
        if word.endswith(suffix):
            word = word[:-len(suffix)]
            r1 = r1[:-len(suffix)]
            r2 = r2[:-len(suffix)]
            break

    # Step 1a
    # Deal with the "regular" suffixes such as ied, ies, sses.
    for suffix in step1a_suffixes:
        if word.endswith(suffix):
            if suffix == 'sses':
                word = word[:-2]
                r1 = r1[:-2]
                # This used to assign to a stray name, leaving R2 stale.
                r2 = r2[:-2]
            elif suffix in ("ied", "ies"):
                if len(word[:-len(suffix)]) > 1:
                    # For regular words the last two letters go.
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]
                else:
                    # For short words, like pies, only the last letter goes.
                    word = word[:-1]
                    r1 = r1[:-1]
                    r2 = r2[:-1]
            elif suffix == "s":
                # A bare s is always removed.
                word = word[:-1]
                r1 = r1[:-1]
                r2 = r2[:-1]
            break

    # Step 1b
    for suffix in step1b_suffixes:
        if word.endswith(suffix):
            if suffix in ("eed", "eedly"):
                # eed/eedly become ee, but only inside R1.
                if r1.endswith(suffix):
                    word = word[:-len(suffix)] + "ee"
                    r1 = _swap_region_suffix(r1, suffix, "ee")
                    # R2 gets its own update (it used to overwrite R1).
                    r2 = _swap_region_suffix(r2, suffix, "ee")
            else:
                # ed, edly, ing, ingly: delete when the part in front of
                # the suffix contains a vowel.  This branch belongs to
                # the suffix test above; it used to hang off the R1 test,
                # so these four suffixes were never removed.
                step1b_vowel_found = any(
                    letter in vowels for letter in word[:-len(suffix)]
                )
                if step1b_vowel_found:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                    r2 = r2[:-len(suffix)]

                    # After the deletion, at/bl/iz get an e appended
                    # (ate, ble, ize).
                    if word.endswith(("at", "bl", "iz")):
                        word = "".join((word, "e"))
                        r1 = "".join((r1, "e"))

                        if len(word) > 5 or len(r1) >= 3:
                            r2 = "".join((r2, "e"))

                    # A doubled consonant loses one letter.
                    elif word.endswith(double_consonants):
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]

                    # A short word gets an e appended.
                    elif (r1 == "" and len(word) >= 3 and word[-1]
                          not in vowels and word[-1] not in "wxY"
                          and word[-2] in vowels
                          and word[-3] not in vowels) or (
                              r1 == "" and len(word) == 2
                              and word[0] in vowels
                              and word[1] not in vowels):

                        word = "".join((word, "e"))

                        if len(r1) > 0:
                            r1 = "".join((r1, "e"))

                        if len(r2) > 0:
                            r2 = "".join((r2, "e"))
            break

    # Step 1c
    # A final y or Y after a non-vowel becomes i.
    if len(word) > 2 and word[-1] in "yY" and word[-2] not in vowels:
        word = "".join((word[:-1], "i"))
        r1 = r1[:-1] + "i" if r1 else ""
        r2 = r2[:-1] + "i" if r2 else ""

    # Step 2
    # The first suffix that ends the word is rewritten to the desired
    # ending, provided it lies in R1.
    for suffix in step2_suffixes:
        if word.endswith(suffix):
            if r1.endswith(suffix):
                if suffix == "tional":
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]

                elif suffix in ("enci", "anci", "abli"):
                    word = "".join((word[:-1], "e"))
                    r1 = r1[:-1] + "e" if r1 else ""
                    r2 = r2[:-1] + "e" if r2 else ""

                elif suffix == "entli":
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]

                elif suffix in ("izer", "ization"):
                    word = word[:-len(suffix)] + "ize"
                    r1 = _swap_region_suffix(r1, suffix, "ize")
                    r2 = _swap_region_suffix(r2, suffix, "ize")

                elif suffix in ("ational", "ation", "ator"):
                    word = word[:-len(suffix)] + "ate"
                    r1 = _swap_region_suffix(r1, suffix, "ate")
                    # When the suffix only partly overlaps R2, R2 is
                    # left holding the final e of the new ending.
                    r2 = _swap_region_suffix(r2, suffix, "ate", "e")

                elif suffix in ("alism", "aliti", "alli"):
                    word = word[:-len(suffix)] + "al"
                    r1 = _swap_region_suffix(r1, suffix, "al")
                    r2 = _swap_region_suffix(r2, suffix, "al")

                elif suffix == "fulness":
                    word = word[:-4]
                    r1 = r1[:-4]
                    r2 = r2[:-4]

                elif suffix in ("ousli", "ousness"):
                    word = word[:-len(suffix)] + "ous"
                    r1 = _swap_region_suffix(r1, suffix, "ous")
                    r2 = _swap_region_suffix(r2, suffix, "ous")

                elif suffix in ("iveness", "iviti"):
                    word = word[:-len(suffix)] + "ive"
                    r1 = _swap_region_suffix(r1, suffix, "ive")
                    r2 = _swap_region_suffix(r2, suffix, "ive", "e")

                elif suffix in ("biliti", "bli"):
                    word = word[:-len(suffix)] + "ble"
                    r1 = _swap_region_suffix(r1, suffix, "ble")
                    r2 = _swap_region_suffix(r2, suffix, "ble")

                elif suffix == "ogi" and word[-4] == "l":
                    word = word[:-1]
                    r1 = r1[:-1]
                    r2 = r2[:-1]

                elif suffix in ("fulli", "lessli"):
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]

                elif suffix == "li" and word[-3] in li_ending:
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]
            break

    # Step 3
    for suffix in step3_suffixes:
        if word.endswith(suffix):
            if r1.endswith(suffix):
                if suffix == "tional":
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]
                elif suffix == "ational":
                    word = word[:-len(suffix)] + "ate"
                    r1 = _swap_region_suffix(r1, suffix, "ate")
                    r2 = _swap_region_suffix(r2, suffix, "ate")
                elif suffix == "alize":
                    word = word[:-3]
                    r1 = r1[:-3]
                    r2 = r2[:-3]
                elif suffix in ("icate", "iciti", "ical"):
                    word = word[:-len(suffix)] + "ic"
                    r1 = _swap_region_suffix(r1, suffix, "ic")
                    r2 = _swap_region_suffix(r2, suffix, "ic")
                elif suffix in ("ful", "ness"):
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
                elif suffix == "ative" and r2.endswith(suffix):
                    word = word[:-5]
                    r1 = r1[:-5]
                    r2 = r2[:-5]
            break

    # Step 4
    # Delete the first matching suffix when it lies in R2.
    for suffix in step4_suffixes:
        if word.endswith(suffix):
            if r2.endswith(suffix):
                if suffix == "ion":
                    # ion goes only after s or t.
                    if word[-4] in "st":
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
            break

    # Step 5
    if r2.endswith("l") and word[-2] == "l":
        word = word[:-1]
    elif r2.endswith("e"):
        word = word[:-1]
    elif r1.endswith("e"):
        if len(word) >= 4 and (word[-2] in vowels or word[-2] in "wxY"
                               or word[-3] not in vowels
                               or word[-4] in vowels):
            word = word[:-1]

    # Put the consonant-y marks back into lower case.
    word = word.replace("Y", "y")

    return word
Exemple #6
0
    def stem(self, word):
        """
        Reduce a Spanish word to its stem.

        The word is lower-cased and handed back untouched when it is a
        stopword.  Otherwise it passes through the attached-pronoun step,
        the standard-suffix step, the two verb-suffix steps (only when no
        standard suffix was removed) and the residual-suffix step, after
        which accented characters are replaced.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()
        if word in self.stopwords:
            return word

        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)
        step1_success = False

        # STEP 0: Attached pronoun
        # Only the first pronoun that ends both the word and RV counts.
        pronoun = next(
            (s for s in self.__step0_suffixes
             if word.endswith(s) and rv.endswith(s)),
            None,
        )
        if pronoun is not None:
            rv_front = rv[:-len(pronoun)]
            word_front = word[:-len(pronoun)]
            follows_verb_form = rv_front.endswith(
                ("ando", "ar", "er", "iendo", "ir"))
            if follows_verb_form or (rv_front.endswith("yendo")
                                     and word_front.endswith("uyendo")):
                word = self.__replace_accented(word_front)
                r1 = self.__replace_accented(r1[:-len(pronoun)])
                r2 = self.__replace_accented(r2[:-len(pronoun)])
                rv = self.__replace_accented(rv_front)

        # STEP 1: Standard suffix removal
        # Only the first listed suffix that ends the word is examined.
        standard = next(
            (s for s in self.__step1_suffixes if word.endswith(s)), None)
        if standard is not None:
            if standard == "amente" and r1.endswith(standard):
                step1_success = True
                word, r2, rv = word[:-6], r2[:-6], rv[:-6]

                if r2.endswith("iv"):
                    word, r2, rv = word[:-2], r2[:-2], rv[:-2]
                    if r2.endswith("at"):
                        word, rv = word[:-2], rv[:-2]
                elif r2.endswith(("os", "ic", "ad")):
                    word, rv = word[:-2], rv[:-2]

            elif r2.endswith(standard):
                step1_success = True

                if standard in ("adora", "ador", "acion", "adoras",
                                "adores", "aciones", "ante", "antes",
                                "ancia", "ancias"):
                    word = word[:-len(standard)]
                    r2 = r2[:-len(standard)]
                    rv = rv[:-len(standard)]
                    if r2.endswith("ic"):
                        word, rv = word[:-2], rv[:-2]

                elif standard in ("logia", "logias"):
                    word = suffix_replace(word, standard, "log")
                    rv = suffix_replace(rv, standard, "log")

                elif standard in ("ucion", "uciones"):
                    word = suffix_replace(word, standard, "u")
                    rv = suffix_replace(rv, standard, "u")

                elif standard in ("encia", "encias"):
                    word = suffix_replace(word, standard, "ente")
                    rv = suffix_replace(rv, standard, "ente")

                elif standard == "mente":
                    word = word[:-len(standard)]
                    r2 = r2[:-len(standard)]
                    rv = rv[:-len(standard)]
                    if r2.endswith(("ante", "able", "ible")):
                        word, rv = word[:-4], rv[:-4]

                elif standard in ("idad", "idades"):
                    word = word[:-len(standard)]
                    r2 = r2[:-len(standard)]
                    rv = rv[:-len(standard)]
                    for inner in ("abil", "ic", "iv"):
                        if r2.endswith(inner):
                            word = word[:-len(inner)]
                            rv = rv[:-len(inner)]

                elif standard in ("ivo", "iva", "ivos", "ivas"):
                    word = word[:-len(standard)]
                    r2 = r2[:-len(standard)]
                    rv = rv[:-len(standard)]
                    if r2.endswith("at"):
                        word, rv = word[:-2], rv[:-2]

                else:
                    word = word[:-len(standard)]
                    rv = rv[:-len(standard)]

        if not step1_success:
            # STEP 2a: Verb suffixes beginning 'y'
            # Removed only when the letter right before the suffix is "u".
            y_suffix = next(
                (s for s in self.__step2a_suffixes
                 if rv.endswith(s) and word[-len(s) - 1:-len(s)] == "u"),
                None,
            )
            if y_suffix is not None:
                word = word[:-len(y_suffix)]
                rv = rv[:-len(y_suffix)]

            # STEP 2b: Other verb suffixes
            verb_suffix = next(
                (s for s in self.__step2b_suffixes if rv.endswith(s)), None)
            if verb_suffix is not None:
                word = word[:-len(verb_suffix)]
                rv = rv[:-len(verb_suffix)]
                if verb_suffix in ("en", "es", "eis", "emos"):
                    # A preceding "gu" loses its "u".
                    if word.endswith("gu"):
                        word = word[:-1]
                    if rv.endswith("gu"):
                        rv = rv[:-1]

        # STEP 3: Residual suffix
        residual = next(
            (s for s in self.__step3_suffixes if rv.endswith(s)), None)
        if residual is not None:
            word = word[:-len(residual)]
            if residual in ("e", "\xE9"):
                rv = rv[:-len(residual)]
                if word[-2:] == "gu" and rv.endswith("u"):
                    word = word[:-1]

        return self.__replace_accented(word)