def pluralize(self, word):
    '''Pluralize a Spanish noun.

    Args:
        word: The singular noun. It may be a Unicode string
            (e.g. u"palabra") or a byte string encoded in UTF-8 or
            Latin-1; it is normalised with ``utils.unicodify`` first.

    Returns:
        The plural form, converted back with ``utils.deunicodify`` so the
        output is encoded the same way as the input. A word that ends
        with an entry of ``self.non_changing_words`` is returned
        unchanged.
    '''
    # All internal calculations are done in Unicode.
    word, origType = utils.unicodify(word)

    # Ordered [pattern, replacement] pairs: the first pattern that matches
    # is applied and its result returned. In a replacement that starts
    # with "|", every "|N" token is replaced by capture group N with its
    # written accent removed.
    rules = [
        [u'(?i)([aeiou])x$', u'\\1x'],  # This could fail if the word is oxytone.
        [u'(?i)([áéíóú])([ns])$', u'|1\\2es'],
        [u'(?i)(^[bcdfghjklmnñpqrstvwxyz]*)an$', u'\\1anes'],  # clan->clanes
        # NOTE(review): every word this rule matches is already caught by
        # the accented-vowel + [ns] rule above, so it looks unreachable.
        [u'(?i)([áéíóú])s$', u'|1ses'],
        [u'(?i)(^[bcdfghjklmnñpqrstvwxyz]*)([aeiou])([ns])$',
         u'\\1\\2\\3es'],  # tren->trenes
        [u'(?i)([aeiouáéó])$', u'\\1s'],  # casa->casas, padre->padres, papá->papás
        [u'(?i)([aeiou])s$', u'\\1s'],  # atlas->atlas, virus->virus, etc.
        [u'(?i)([éí])(s)$', u'|1\\2es'],  # inglés->ingleses
        [u'(?i)z$', u'ces'],  # luz->luces
        [u'(?i)([íú])$', u'\\1es'],  # ceutí->ceutíes, tabú->tabúes
        # Anglicisms such as puenting, frac, crack, show
        # (in which cases could this fail?)
        [u'(?i)(ng|[wckgtp])$', u'\\1s'],
        [u'(?i)$', u'es'],  # ELSE +es (e.g. árbol->árboles)
    ]

    # Words with an invariable ending are returned as they came in. The
    # comparison is a suffix test on the lower-cased word.
    lower_cased_word = word.lower()
    for uncountable_word in self.non_changing_words:
        if lower_cased_word[-1 * len(uncountable_word):] == uncountable_word:
            return utils.deunicodify(word, origType)

    # Irregular nouns: the whole word must match the irregular singular.
    # ``items()`` is used instead of ``iteritems()`` so the loop runs on
    # both Python 2 and Python 3.
    for irregular_singular, irregular_plural in self.irregular_words.items():
        match = re.search(u'(?i)(^' + irregular_singular + u')$', word,
                          re.IGNORECASE)
        if match:
            # Keep the first character exactly as typed (preserves the
            # caller's capitalisation) and append the rest of the plural.
            result = re.sub(
                u'(?i)' + irregular_singular + u'$',
                match.expand(u'\\1')[0] + irregular_plural[1:],
                word)
            return utils.deunicodify(result, origType)

    for pattern, replacement in rules:
        match = re.search(pattern, word, re.IGNORECASE)
        if not match:
            continue

        if replacement.startswith(u'|'):
            # Groups are numbered from 1 up to and including len(groups).
            # The previous ``range(1, len(groups))`` skipped the last
            # group, leaving a literal "|N" in the output for rules with
            # a single group.
            # self.string_replace is assumed to translate each character
            # of the first set into its counterpart in the second --
            # confirm against its definition.
            for k, group in enumerate(match.groups(), 1):
                replacement = replacement.replace(
                    u'|%d' % k,
                    self.string_replace(group, u'ÁÉÍÓÚáéíóú', u'AEIOUaeiou'))

        result = re.sub(pattern, replacement, word)

        # Add the written accent to nouns that become proparoxytone
        # (stressed on the antepenultimate syllable) when pluralized,
        # e.g. esmóquines, jóvenes. Skipped when the singular already
        # carries an accent.
        match = re.search(u'(?i)([aeiou]).{1,3}([aeiou])nes$', result)
        if match and not re.search(u'(?i)[áéíóú]', word):
            result = result.replace(
                match.group(0),
                self.string_replace(match.group(1), u'AEIOUaeiou',
                                    u'ÁÉÍÓÚáéíóú') + match.group(0)[1:])

        return utils.deunicodify(result, origType)

    return utils.deunicodify(word, origType)
def singularize(self, word):
    '''Singularize a Spanish noun.

    Args:
        word: The plural noun. It may be a Unicode string
            (e.g. u"palabras") or a byte string encoded in UTF-8 or
            Latin-1; it is normalised with ``utils.unicodify`` first.

    Returns:
        The singular form, converted back with ``utils.deunicodify`` so
        the output is encoded the same way as the input. A word that ends
        with an entry of ``self.non_changing_words`` is returned
        unchanged.
    '''
    # All internal calculations are done in Unicode.
    word, origType = utils.unicodify(word)

    # Ordered [pattern, replacement] pairs: the first pattern that matches
    # is applied and its result returned. In a replacement that starts
    # with "~", every "~N" token is replaced by capture group N with a
    # written accent added.
    # The patterns contain no backslashes, so plain u'' literals are
    # identical to the former ur'' ones (ur'' is a syntax error on
    # Python 3).
    rules = [
        [u'(?i)^([bcdfghjklmnñpqrstvwxyz]*)([aeiou])([ns])es$',
         u'\\1\\2\\3'],
        [u'(?i)([aeiou])([ns])es$', u'~1\\2'],
        [u'(?i)shes$', u'sh'],  # flashes->flash
        [u'(?i)oides$', u'oide'],  # androides->androide
        [u'(?i)(sis|tis|xis)$', u'\\1'],  # crisis, apendicitis, praxis
        [u'(?i)(é)s$', u'\\1'],  # bebés->bebé
        [u'(?i)(ces)$', u'z'],  # luces->luz
        [u'(?i)([^e])s$', u'\\1'],  # casas->casa
        [u'(?i)([bcdfghjklmnñprstvwxyz]{2,}e)s$', u'\\1'],  # cofres->cofre
        [u'(?i)([ghñptv]e)s$', u'\\1'],  # llaves->llave, radiocasetes->radiocasete
        [u'(?i)jes$', u'je'],  # ejes->eje
        [u'(?i)ques$', u'que'],  # tanques->tanque
        [u'(?i)es$', u''],  # ELSE remove _es_ monitores->monitor
    ]

    # Words with an invariable ending are returned as they came in. The
    # comparison is a suffix test on the lower-cased word.
    lower_cased_word = word.lower()
    for uncountable_word in self.non_changing_words:
        if lower_cased_word[-1 * len(uncountable_word):] == uncountable_word:
            return utils.deunicodify(word, origType)

    # Irregular nouns: the whole word must match the irregular plural.
    # ``items()`` is used instead of ``iteritems()`` so the loop runs on
    # both Python 2 and Python 3.
    for irregular_singular, irregular_plural in self.irregular_words.items():
        match = re.search(u'(^' + irregular_plural + u')$', word,
                          re.IGNORECASE)
        if match:
            # Keep the first character exactly as typed (preserves the
            # caller's capitalisation) and append the rest of the singular.
            result = re.sub(
                u'(?i)' + irregular_plural + u'$',
                match.expand(u'\\1')[0] + irregular_singular[1:],
                word)
            return utils.deunicodify(result, origType)

    for pattern, replacement in rules:
        match = re.search(pattern, word, re.IGNORECASE)
        if not match:
            continue

        accent_added = replacement.startswith(u'~')
        if accent_added:
            # Groups are numbered from 1 up to and including len(groups);
            # the previous ``range(1, len(groups))`` skipped the last one.
            # self.string_replace is assumed to translate each character
            # of the first set into its counterpart in the second --
            # confirm against its definition.
            for k, group in enumerate(match.groups(), 1):
                replacement = replacement.replace(
                    u'~%d' % k,
                    self.string_replace(group, u'AEIOUaeiou', u'ÁÉÍÓÚáéíóú'))

        result = re.sub(pattern, replacement, word)

        # Double-accent cleanup (a bit hacky). When the accent rule fires
        # on a plural that already carries an accent (e.g. "jóvenes"), the
        # result gets two accents ("jóvén"); in that case strip all of
        # them ("joven"). The former guard also required the input to
        # have no accent at all, which can never hold together with a
        # two-accent result, so the cleanup never ran.
        if accent_added and re.search(u'(?i)[áéíóú].*[áéíóú]', result):
            result = self.string_replace(result, u'ÁÉÍÓÚáéíóú',
                                         u'AEIOUaeiou')

        return utils.deunicodify(result, origType)

    return utils.deunicodify(word, origType)
def pluralize(self, word):
    '''Pluralize a Spanish noun.

    Args:
        word: The singular noun. It may be a Unicode string
            (e.g. u"palabra") or a byte string encoded in UTF-8 or
            Latin-1; it is normalised with ``utils.unicodify`` first.

    Returns:
        The plural form, converted back with ``utils.deunicodify`` so the
        output is encoded the same way as the input. A word that ends
        with an entry of ``self.non_changing_words`` is returned
        unchanged.
    '''
    # All internal calculations are done in Unicode.
    word, origType = utils.unicodify(word)

    # Ordered [pattern, replacement] pairs: the first pattern that matches
    # is applied and its result returned. In a replacement that starts
    # with "|", every "|N" token is replaced by capture group N with its
    # written accent removed.
    rules = [
        [u'(?i)([aeiou])x$', u'\\1x'],  # This could fail if the word is oxytone.
        [u'(?i)([áéíóú])([ns])$', u'|1\\2es'],
        [u'(?i)(^[bcdfghjklmnñpqrstvwxyz]*)an$', u'\\1anes'],  # clan->clanes
        # NOTE(review): every word this rule matches is already caught by
        # the accented-vowel + [ns] rule above, so it looks unreachable.
        [u'(?i)([áéíóú])s$', u'|1ses'],
        [u'(?i)(^[bcdfghjklmnñpqrstvwxyz]*)([aeiou])([ns])$',
         u'\\1\\2\\3es'],  # tren->trenes
        [u'(?i)([aeiouáéó])$', u'\\1s'],  # casa->casas, padre->padres, papá->papás
        [u'(?i)([aeiou])s$', u'\\1s'],  # atlas->atlas, virus->virus, etc.
        [u'(?i)([éí])(s)$', u'|1\\2es'],  # inglés->ingleses
        [u'(?i)z$', u'ces'],  # luz->luces
        [u'(?i)([íú])$', u'\\1es'],  # ceutí->ceutíes, tabú->tabúes
        # Anglicisms such as puenting, frac, crack, show
        # (in which cases could this fail?)
        [u'(?i)(ng|[wckgtp])$', u'\\1s'],
        [u'(?i)$', u'es'],  # ELSE +es (e.g. árbol->árboles)
    ]

    # Words with an invariable ending are returned as they came in. The
    # comparison is a suffix test on the lower-cased word.
    lower_cased_word = word.lower()
    for uncountable_word in self.non_changing_words:
        if lower_cased_word[-1 * len(uncountable_word):] == uncountable_word:
            return utils.deunicodify(word, origType)

    # Irregular nouns: the whole word must match the irregular singular.
    # ``items()`` is used instead of ``iteritems()`` so the loop runs on
    # both Python 2 and Python 3.
    for irregular_singular, irregular_plural in self.irregular_words.items():
        match = re.search(u'(?i)(^' + irregular_singular + u')$', word,
                          re.IGNORECASE)
        if match:
            # Keep the first character exactly as typed (preserves the
            # caller's capitalisation) and append the rest of the plural.
            result = re.sub(
                u'(?i)' + irregular_singular + u'$',
                match.expand(u'\\1')[0] + irregular_plural[1:],
                word)
            return utils.deunicodify(result, origType)

    for pattern, replacement in rules:
        match = re.search(pattern, word, re.IGNORECASE)
        if not match:
            continue

        if replacement.startswith(u'|'):
            # Groups are numbered from 1 up to and including len(groups).
            # The previous ``range(1, len(groups))`` skipped the last
            # group, leaving a literal "|N" in the output for rules with
            # a single group.
            # self.string_replace is assumed to translate each character
            # of the first set into its counterpart in the second --
            # confirm against its definition.
            for k, group in enumerate(match.groups(), 1):
                replacement = replacement.replace(
                    u'|%d' % k,
                    self.string_replace(group, u'ÁÉÍÓÚáéíóú', u'AEIOUaeiou'))

        result = re.sub(pattern, replacement, word)

        # Add the written accent to nouns that become proparoxytone
        # (stressed on the antepenultimate syllable) when pluralized,
        # e.g. esmóquines, jóvenes. Skipped when the singular already
        # carries an accent.
        match = re.search(u'(?i)([aeiou]).{1,3}([aeiou])nes$', result)
        if match and not re.search(u'(?i)[áéíóú]', word):
            result = result.replace(
                match.group(0),
                self.string_replace(match.group(1), u'AEIOUaeiou',
                                    u'ÁÉÍÓÚáéíóú') + match.group(0)[1:])

        return utils.deunicodify(result, origType)

    return utils.deunicodify(word, origType)
def singularize(self, word):
    '''Singularize a Spanish noun.

    Args:
        word: The plural noun. It may be a Unicode string
            (e.g. u"palabras") or a byte string encoded in UTF-8 or
            Latin-1; it is normalised with ``utils.unicodify`` first.

    Returns:
        The singular form, converted back with ``utils.deunicodify`` so
        the output is encoded the same way as the input. A word that ends
        with an entry of ``self.non_changing_words`` is returned
        unchanged.
    '''
    # All internal calculations are done in Unicode.
    word, origType = utils.unicodify(word)

    # Ordered [pattern, replacement] pairs: the first pattern that matches
    # is applied and its result returned. In a replacement that starts
    # with "~", every "~N" token is replaced by capture group N with a
    # written accent added.
    # The patterns contain no backslashes, so plain u'' literals are
    # identical to the former ur'' ones (ur'' is a syntax error on
    # Python 3).
    rules = [
        [u'(?i)^([bcdfghjklmnñpqrstvwxyz]*)([aeiou])([ns])es$',
         u'\\1\\2\\3'],
        [u'(?i)([aeiou])([ns])es$', u'~1\\2'],
        [u'(?i)shes$', u'sh'],  # flashes->flash
        [u'(?i)oides$', u'oide'],  # androides->androide
        [u'(?i)(sis|tis|xis)$', u'\\1'],  # crisis, apendicitis, praxis
        [u'(?i)(é)s$', u'\\1'],  # bebés->bebé
        [u'(?i)(ces)$', u'z'],  # luces->luz
        [u'(?i)([^e])s$', u'\\1'],  # casas->casa
        [u'(?i)([bcdfghjklmnñprstvwxyz]{2,}e)s$', u'\\1'],  # cofres->cofre
        [u'(?i)([ghñptv]e)s$', u'\\1'],  # llaves->llave, radiocasetes->radiocasete
        [u'(?i)jes$', u'je'],  # ejes->eje
        [u'(?i)ques$', u'que'],  # tanques->tanque
        [u'(?i)es$', u''],  # ELSE remove _es_ monitores->monitor
    ]

    # Words with an invariable ending are returned as they came in. The
    # comparison is a suffix test on the lower-cased word.
    lower_cased_word = word.lower()
    for uncountable_word in self.non_changing_words:
        if lower_cased_word[-1 * len(uncountable_word):] == uncountable_word:
            return utils.deunicodify(word, origType)

    # Irregular nouns: the whole word must match the irregular plural.
    # ``items()`` is used instead of ``iteritems()`` so the loop runs on
    # both Python 2 and Python 3.
    for irregular_singular, irregular_plural in self.irregular_words.items():
        match = re.search(u'(^' + irregular_plural + u')$', word,
                          re.IGNORECASE)
        if match:
            # Keep the first character exactly as typed (preserves the
            # caller's capitalisation) and append the rest of the singular.
            result = re.sub(
                u'(?i)' + irregular_plural + u'$',
                match.expand(u'\\1')[0] + irregular_singular[1:],
                word)
            return utils.deunicodify(result, origType)

    for pattern, replacement in rules:
        match = re.search(pattern, word, re.IGNORECASE)
        if not match:
            continue

        accent_added = replacement.startswith(u'~')
        if accent_added:
            # Groups are numbered from 1 up to and including len(groups);
            # the previous ``range(1, len(groups))`` skipped the last one.
            # self.string_replace is assumed to translate each character
            # of the first set into its counterpart in the second --
            # confirm against its definition.
            for k, group in enumerate(match.groups(), 1):
                replacement = replacement.replace(
                    u'~%d' % k,
                    self.string_replace(group, u'AEIOUaeiou', u'ÁÉÍÓÚáéíóú'))

        result = re.sub(pattern, replacement, word)

        # Double-accent cleanup (a bit hacky). When the accent rule fires
        # on a plural that already carries an accent (e.g. "jóvenes"), the
        # result gets two accents ("jóvén"); in that case strip all of
        # them ("joven"). The former guard also required the input to
        # have no accent at all, which can never hold together with a
        # two-accent result, so the cleanup never ran.
        if accent_added and re.search(u'(?i)[áéíóú].*[áéíóú]', result):
            result = self.string_replace(result, u'ÁÉÍÓÚáéíóú',
                                         u'AEIOUaeiou')

        return utils.deunicodify(result, origType)

    return utils.deunicodify(word, origType)