def filter_pos_infinitive(self, s, category_list=None, allowed=False):
    '''
    Filter words by grammatical category (POS: Part-of-Speech tag) and
    normalize the words that are kept: plural nouns are singularized, verbs
    are converted to the infinitive and adjectives to their predicative form.

    If allowed is True only the POS tags in category_list are kept.
    If allowed is False every POS tag except those in category_list is kept.

    POS tags that can be used in category_list:
        nouns        = ['NN', 'NNS', 'NNP', 'NNPS']
        verbs        = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        adjectives   = ['JJ', 'JJR', 'JJS']
        determiners  = ['DT']
        conjunctions = ['IN', 'CC']
        adverbs      = ['RB', 'RBR', 'RBS']
        modals       = ['MD']
        utterances   = ['UH']

    In:  (s:string, category_list:list of strings, allowed:boolean)
    Out: (string) the kept words joined by single spaces; when category_list
         is empty or omitted, s itself is returned (decoded to text if it
         was a byte string).
    '''
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    adjective_tags = ('JJ', 'JJR', 'JJS')

    # Decode byte strings. "xmlcharrefreplace" is an encode-only error
    # handler, so "replace" is used; `bytes` is `str` on Python 2, which
    # keeps this check working on both major versions.
    if isinstance(s, bytes):
        s = s.decode("utf-8", "replace")

    # Nothing to filter: return early, before paying for the tagger.
    if not category_list:
        return s

    # Kept as an equality test so values such as 0 still select the
    # "exclude" mode exactly like the original `allowed == False`.
    excluding = (allowed == False)  # noqa: E712

    kept = []
    for pos in self.pos_tagging(s):
        # Entries look like 'word:TAG'; split on the LAST colon so that
        # words which themselves contain ':' (e.g. '10:30') stay intact.
        parts = pos.rsplit(':', 1)
        word, tag = parts[0], parts[1]

        # Skip the entry when its membership in category_list does not
        # match the selected mode.
        if (tag in category_list) == excluding:
            continue

        if tag == 'NNS':
            word = singularize(word)
        elif tag in verb_tags:
            # Fall back to the surface form if no infinitive is returned,
            # otherwise the final join would fail on a non-string.
            word = conjugate(word, INFINITIVE) or word
        elif tag in adjective_tags:
            word = predicative(word)
        kept.append(word)

    return u' '.join(kept)
def pos_tagging_infinitive(self, s):
    '''
    Grammatical category of each word, a.k.a. Part-of-Speech (POS) tagging,
    with normalization: adjectives are converted to predicative form, plural
    nouns are singularized and verbs are converted to the infinitive.

    e.g. ella:PRP maneja:VBD carros:NNS rojos:JJ
        PRP: pronoun -----------------> ella
        VBD: verb in past tense ------> manejar (infinitive, re-tagged VB)
        NNS: noun in plural ----------> carro   (singularized, re-tagged NN)
        JJ:  adjective ---------------> rojo    (predicative, re-tagged JJ)

    In:  (s:string) text to tag
    Out: (list) list of strings in the form 'word:category'
    '''
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    adjective_tags = ('JJ', 'JJR', 'JJS')

    # Decode BEFORE parsing. The original decoded after parse(s), so the
    # decoded value was never used. "xmlcharrefreplace" is an encode-only
    # error handler, hence "replace"; `bytes` is `str` on Python 2.
    if isinstance(s, bytes):
        s = s.decode("utf-8", "replace")

    tagged = []
    # parse(s).split() is iterated as sentences, each a sequence of tokens
    # whose first two fields are the word and its POS tag.
    for sentence in parse(s).split():
        for token in sentence:
            word, tag = token[0], token[1]
            if tag == 'NNS':
                tagged.append(singularize(word) + ":NN")
            elif tag in verb_tags:
                # Keep the surface form if no infinitive is returned, so the
                # string concatenation below cannot fail on a non-string.
                infinitive = conjugate(word, INFINITIVE) or word
                tagged.append(infinitive + ":VB")
            elif tag in adjective_tags:
                tagged.append(predicative(word) + ":JJ")
            else:
                tagged.append(word + ':' + tag)
    return tagged
def convertir(cambiar):
    """
    Return a dictionary with the singular words turned into plurals and
    vice versa.

    `cambiar` has the shape {'s': [singular words], 'p': [plural words]}.
    The result holds the pluralized 's' words under 'p' and the
    singularized 'p' words under 's'.
    """
    return {
        'p': [pluralize(palabra) for palabra in cambiar['s']],
        's': [singularize(palabra) for palabra in cambiar['p']],
    }
def parse_NP(words):
    """
    Parse a noun phrase and return its (number, gender, noun).

    Three shapes are tried in order; a pattern only counts when it covers
    every word of the phrase:
        1. determiner determiner adjective/noun  (e.g. "todos los perros")
        2. determiner adjective/noun             (e.g. "el perro")
        3. adjective/noun                        (e.g. "verde")

    Returns a tuple (number, gender, noun): number is 's' or 'p', gender is
    'm' or 'f', and noun is the singularized head word.

    If no shape matches, a message is printed and the process exits.
    """
    number = "s"
    noun = ""
    gender = 'm'
    t = Word_list_to_Text(words)

    # Example: todos los perros
    m = pattern_match("{DT} {DT} {JJ*|NN*}", t)
    if m and len(m) == len(t.words):
        determiner = m.group(2)[0].string
        # Resolve the noun first: the original called learn_gender() while
        # `noun` was still the empty string, unlike the branch below.
        noun = singularize(m.group(3)[0].string)
        learn_gender(determiner, noun)
        if determiner in plural_words:
            number = 'p'
        if determiner in female_words:
            gender = 'f'
        return number, gender, noun

    # Example: el perro
    m = pattern_match("{DT} {JJ*|NN*}", t)
    if m and len(m) == len(t.words):
        determiner = m.group(1)[0].string
        noun = singularize(m.group(2)[0].string)
        learn_gender(determiner, noun)
        if determiner in plural_words:
            number = 'p'
        if determiner in female_words:
            gender = 'f'
        return number, gender, noun

    # Example: verde
    m = pattern_match("{JJ*|NN*}", t)
    if m and len(m) == len(t.words):
        noun = m.group(1)[0].string
        # A word that pluralizing leaves unchanged is treated as plural.
        if noun == pluralize(noun):
            number = "p"
        noun = singularize(noun)
        # TODO: gender
        return number, gender, noun

    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print("parse_NP() : not found %s" % (t,))
    # NOTE(review): exit status 0 signals success to the caller's shell even
    # though parsing failed; kept as-is because callers may depend on it.
    sys.exit(0)
def pword(text):
    """
    Reduce a word to a base form.

    The first character of the second '/'-separated field of parse(text)
    is inspected: 'V' (verb) yields lemma(text); anything else (noun or
    adjective) yields singularize(text).
    """
    etiqueta_inicial = parse(text).split('/')[1][0]
    return lemma(text) if etiqueta_inicial == 'V' else singularize(text)
def convertir(dicc):
    """
    Swap grammatical number: words under 's' are pluralized and stored
    under 'p'; words under 'p' are singularized and stored under 's'.
    Any other key is ignored.
    """
    dicc_nuevo = {}
    for clave, palabras in dicc.items():
        if clave == 's':
            dicc_nuevo['p'] = [pluralize(palabra) for palabra in palabras]
        elif clave == 'p':
            dicc_nuevo['s'] = [singularize(palabra) for palabra in palabras]
    return dicc_nuevo
def convertir(dic):
    """
    Swap grammatical number of the word lists in `dic`.

    The list under 's' is pluralized and stored under 'p'. The list under
    any other key is singularized and stored under 's' (so a later non-'s'
    key overwrites an earlier one).
    """
    resultado = {}
    for clave, palabras in dic.items():
        if clave != 's':
            resultado['s'] = [singularize(palabra) for palabra in palabras]
            continue
        resultado['p'] = [pluralize(palabra) for palabra in palabras]
    return resultado
def test_singularize(self):
    # Assert the accuracy of the singularization algorithm: more than 93%
    # of the noun lemmas in the corpus must be recovered from their longest
    # listed word form.
    from pattern.db import Datasheet
    corpus = os.path.join(PATH, "corpora", "wordforms-es-davies.csv")
    # Group the noun word forms by lemma.
    formas_por_lema = {}
    for fila in Datasheet.load(corpus):
        w, lemma, tag, f = fila
        if tag != "n":
            continue
        formas_por_lema.setdefault(lemma, []).append(w)
    aciertos = 0
    total = 0
    for sg, formas in formas_por_lema.items():
        # The longest form is taken as the plural.
        mas_larga = sorted(formas, key=len, reverse=True)[0]
        aciertos += 1 if es.singularize(mas_larga) == sg else 0
        total += 1
    self.assertTrue(float(aciertos) / total > 0.93)
    print("pattern.es.singularize()")
def test_singularize(self):
    # Assert the accuracy of the singularization algorithm: more than 93%
    # of the noun lemmas in the corpus must be recovered from their longest
    # listed word form.
    from pattern.db import Datasheet
    test = {}
    # Group the noun ("n") word forms by lemma.
    for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
        if tag == "n":
            test.setdefault(lemma, []).append(w)
    i, n = 0, 0
    for sg, pl in test.items():
        # The longest form is taken as the plural.
        pl = sorted(pl, key=len, reverse=True)[0]
        if es.singularize(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.93)
    # Parenthesized single-argument form: valid and identical in output on
    # both Python 2 and Python 3 (the bare print statement is a SyntaxError
    # on Python 3).
    print("pattern.es.singularize()")
def modifica_linea(dialogo):
    """
    Normalize a line of dialogue word by word.

    The text is parsed, split on single spaces, and each token is split on
    '/' into fields: field 0 is the word and field 1 its tag. Words tagged
    'VB' are conjugated to the infinitive, words tagged 'NN' are
    singularized, and all others are kept unchanged. The resulting words
    are joined with single spaces.
    """
    etiquetado = parse(dialogo)
    palabras = []
    for token in etiquetado.split(' '):
        campos = token.split('/')
        texto, etiqueta = campos[0], campos[1]
        if etiqueta == 'VB':
            texto = conjugate(texto, INFINITIVE)
        elif etiqueta == 'NN':
            texto = singularize(texto)
        palabras.append(texto)
    return " ".join(palabras)
def tagLemma(self, word_old):
    """
    Return a normalized form of `word_old` based on its POS tag.

    Plural nouns ('NNS') are singularized, verbs and modals are conjugated
    to the infinitive, and anything else is returned unchanged. When the
    tagger yields several tokens, the result for the last one is returned;
    when it yields none, `word_old` itself is returned.
    """
    verb_tags = ("VB", "VBG", "VBZ", "VBP", "VBD", "VBN", "MD")
    # Default result: avoids an UnboundLocalError when tag() yields nothing.
    x = word_old
    for word, pos in tag(word_old):
        if pos == "NNS":  # plurals
            x = singularize(word)
        elif pos in verb_tags:  # verbs to infinitive
            # Conjugation sometimes yields no result; keep the original
            # word in that case.
            x = conjugate(word, INFINITIVE) or word
        else:
            x = word
    return x
def unify_tokens(self):
    """
    Singularize nouns, conjugate verbs to the infinitive and convert
    adjectives to predicative form, in place, in the stored tokens.

    The category of token i is the first character of self._analysis[i][1]:
    'n' (noun), 'v' (verb) or 'a' (adjective); other tokens are left as-is.

    :return: Tokens
    :raises Exception: if no analysis has been produced yet.
    """
    if self._analysis is None:
        raise Exception('It\'s necessary execute first analize')
    for idx, token in enumerate(self._tokens):
        categoria = self._analysis[idx][1][0]
        if categoria == 'n':
            self._tokens[idx] = singularize(token)
        elif categoria == 'v':
            self._tokens[idx] = conjugate(token, INFINITIVE)
        elif categoria == 'a':
            self._tokens[idx] = predicative(token)
    return self._tokens
def stemming(self, tokens):
    """
    Normalize a list of tokens according to their POS tag.

    The tokens are joined with spaces and tagged. Singular nouns ('NN') and
    adjectives ('JJ') go through self.stemmer.stemming, verbs are
    lemmatized, plural nouns ('NNS') are singularized, and every other word
    is kept unchanged. Returns the list of resulting words.
    """
    stemmed_tags = ("NN", "JJ")
    verb_tags = ("VB", "VBG", "VBP", "VBZ", "VBN", "VBD")
    plural_tags = ("NNS",)

    normalized = []
    for token, pos in tag(" ".join(tokens)):
        # The tag groups are disjoint, so at most one branch applies.
        if pos in stemmed_tags:
            token = self.stemmer.stemming(token)
        elif pos in verb_tags:
            token = lemma(token)
        elif pos in plural_tags:
            token = singularize(token)
        normalized.append(token)
    return normalized
def _getSingularize(word, language):
    """
    Singularize `word` with the pattern module for `language`.

    Supported language codes are "es", "en", "it", "fr" and "de"; any other
    value falls back to English.

    Only the language package that is actually needed is imported, instead
    of importing all five on every call.
    """
    if language == "es":
        import pattern.es as pattern_lang  # @UnresolvedImport
    elif language == "it":
        import pattern.it as pattern_lang  # @UnresolvedImport
    elif language == "fr":
        import pattern.fr as pattern_lang  # @UnresolvedImport
    elif language == "de":
        import pattern.de as pattern_lang  # @UnresolvedImport
    else:
        # "en" and every unrecognized code use the English rules.
        import pattern.en as pattern_lang  # @UnresolvedImport
    return pattern_lang.singularize(word)
def convertir_corto(cambiar):
    """
    Return {'p': pluralized cambiar['s'], 's': singularized cambiar['p']}.
    """
    plurales = [pluralize(palabra) for palabra in cambiar['s']]
    singulares = [singularize(palabra) for palabra in cambiar['p']]
    # A dict literal can be returned directly as long as its opening brace
    # sits on the same line as `return`.
    return {'p': plurales, 's': singulares}
def pluralize_singularize():
    """Print the plural of 'gato' and then the singular of 'gatos'."""
    plural = pluralize('gato')
    print(plural)
    singular = singularize('gatos')
    print(singular)
def validacion(teclado, diccionario):
    """
    Validate one typed word against the Spanish Wiktionary and record it.

    Meant to be called inside a loop so that every word gets entered. The
    word is looked up; its definition and type are stored in `diccionario`
    under the word itself.

    Returns a dict {"info_palabra": diccionario, "validez": bool}, where
    "validez" tells whether the word could be validated.

    NOTE(review): the nesting below was reconstructed from a listing whose
    indentation had been lost -- confirm against the original file.
    """
    web = Wiktionary(language="es")
    articulo = web.search(teclado)  # WORD
    cambio = False
    try:
        # If `articulo` is None, `.sections` raises AttributeError, which is
        # handled by the outer except below.
        s = list(filter(lambda x: x.title == "Español", articulo.sections))
        etimologia = list(
            filter(lambda x: x.title == "Etimología", s[0].children))
        if etimologia == []:
            # No etymology section: retry with the singular form.
            teclado = singularize(teclado)
            cambio = True  # flag: the word was changed from plural to singular
        validado = False
        articulo = web.search(teclado)
        if articulo is not None:
            try:
                s = list(
                    filter(lambda x: x.title == "Español", articulo.sections))
                etimologia = list(
                    filter(lambda x: x.title == "Etimología", s[0].children))
                definicion = etimologia[0].content  # DEFINITION
                lista = [
                    "Adjetivo", "Verbo", "Verbo intransitivo", "Forma verbal",
                    "Verbo transitivo", "Forma adjetiva", "Sustantivo",
                    "Sustantivo masculino", "Sustantivo femenino"
                ]
                lista_verbos = [
                    "Verbo", "Verbo intransitivo", "Forma verbal",
                    "Verbo transitivo"
                ]
                lista_sustantivos = [
                    "Sustantivo", "Sustantivo masculino", "Sustantivo femenino"
                ]
                # NOTE(review): "Forma Adjetiva" here is capitalized
                # differently from "Forma adjetiva" in `lista`, so that
                # section title is never normalized to "Adjetivo".
                lista_adjetivos = ["Adjetivo", "Forma Adjetiva"]
                # Take the first known section title present in the article.
                for tipo in lista:
                    tipo_real = list(
                        filter(lambda x: x.title == tipo, s[0].children))
                    if tipo_real:
                        clasificacion = tipo
                        break
                # NOTE(review): if no title in `lista` matches,
                # `clasificacion` is unbound here and the resulting
                # UnboundLocalError is caught by neither except clause.
                # Collapse the specific title into one of three word types.
                if clasificacion in lista_adjetivos:
                    clasificacion = "Adjetivo"
                elif clasificacion in lista_sustantivos:
                    clasificacion = "Sustantivo"
                elif clasificacion in lista_verbos:
                    clasificacion = "Verbo"
                validado = True
            except IndexError:  # the Wiktionary page has no etymology (definition)
                validado = False
                definicion = ""  # defined to avoid a later reference-before-assignment error
                clasificacion = ""
        # NOTE(review): when the second search returns None, `definicion`
        # and `clasificacion` are never assigned before their use below.
    except AttributeError:
        validado = False
        definicion = ""  # defined to avoid a later reference-before-assignment error
        clasificacion = ""
    if cambio:
        # Restore the plural form so the entry is keyed by a plural word.
        teclado = pluralize(teclado)
    indice = teclado
    diccionario[indice] = {
        "Definición": definicion,
        "Tipo": clasificacion
    }  # entry with the word's definition and type
    if validado:
        datos = {
            "info_palabra": diccionario,
            "validez": True
        }  # the word was accepted
    else:
        datos = {
            "info_palabra": diccionario,
            "validez": False
        }  # the word was not accepted; only "validez" is meaningful
    return datos
def buscar(palabra, dic):
    """
    Look up `palabra` and store its type and description in `dic`.

    The word is searched in the Spanish Wiktionary first; if the article
    cannot be used, pattern and/or user input are used instead. A progress
    animation is shown while searching.

    Returns `dic` on success and False when the word is rejected or the
    user provides no type/description. If every attempt to fetch the
    article raises, the loop ends and the function returns None implicitly.

    NOTE(review): the nesting below was reconstructed from a listing whose
    indentation had been lost -- confirm against the original file.
    """
    engine = wik(language='es')
    articulo = None
    # NOTE(review): the original comment said "3 reconnections, one every
    # 1 second", but range(0, 2) gives two attempts and the sleep is 0.1 s.
    for i in range(0, 2):
        sg.PopupAnimated('loading.gif', alpha_channel=0.5)
        try:
            articulo = engine.article(singularize(palabra))
        except:
            # Fetch failed: wait briefly and retry.
            time.sleep(0.1)
        else:
            # The word is in Wiktionary and it is a Spanish word (articles
            # for words in other languages can also be found).
            if articulo != None and engine.article(palabra).sections[
                    1].title == 'Español':
                try:
                    seccion = articulo.sections[3].content
                    tipo = parsear_tipo(seccion)
                    descripcion = parsear_descripcion(seccion)
                    dic[palabra] = {'tipo': tipo, 'descripcion': descripcion}
                    reportar(palabra + ' está en wiktionary.')
                    sg.PopupAnimated(image_source=None)
                    return dic
                except:
                    # In Wiktionary, but the definition and type could not
                    # be parsed.
                    sg.PopupAnimated(image_source=None)
                    if onPattern(palabra):
                        tipo = clasificar(
                            singularize(palabra))  # get the type from pattern
                        if not esValido(tipo):
                            tipo = agregarTipo()
                            if not tipo:
                                return False
                        descripcion = agregarDescripcion()
                        if not descripcion:
                            return False
                        dic[palabra] = {
                            'tipo': tipo,
                            'descripcion': descripcion
                        }
                        reportar(palabra + ' está en pattern.')
                        return dic
                    else:
                        # Neither source is usable: ask the user for both.
                        tipo = agregarTipo()
                        if not tipo:
                            return False
                        descripcion = agregarDescripcion()
                        if not descripcion:
                            return False
                        dic[palabra] = {
                            'tipo': tipo,
                            'descripcion': descripcion
                        }
                        reportar(palabra +
                                 ' no está ni en wiktionary o pattern.')
                        return dic
            elif onPattern(palabra):
                # No usable Wiktionary article: check whether the word is
                # in pattern.
                sg.PopupAnimated(image_source=None)
                tipo = clasificar(palabra)
                if not esValido(tipo):
                    tipo = agregarTipo()
                    if not tipo:
                        return False
                descripcion = agregarDescripcion()
                if not descripcion:
                    return False
                reportar(palabra +
                         ' con tipo y descripción generada por el usuario.')
                # NOTE(review): this entry uses the key 'descripción' (with
                # an accent) while every other branch stores 'descripcion'
                # -- confirm which key readers of `dic` expect.
                dic[palabra] = {'tipo': tipo, 'descripción': descripcion}
                return dic
            else:
                sg.PopupAnimated(image_source=None)
                reportar(palabra + ' no está en wiktionary ni en pattern.')
                return False
# -*- coding: utf-8 -*-
# The original "# Encoding = UTF-8" comment does not match the source
# encoding declaration pattern ("coding" immediately followed by ":" or
# "="), so it was not recognized as a declaration.
from pattern.es import singularize, pluralize

print(singularize('caballos'))


def cambiapalabras(diccionario):
    """Swap the grammatical number of the words in `diccionario`.

    `diccionario` has two keys, 's' and 'p': the list under 's' holds
    singular words and the list under 'p' holds plural words. Any other key
    is ignored.

    Returns a new dictionary with the same keys in which the words under
    's' have been pluralized and the words under 'p' have been singularized
    (each list stays under its original key).
    """
    convertidores = {'s': pluralize, 'p': singularize}
    return {
        key: [convertidores[key](palabra) for palabra in diccionario.get(key)]
        for key in diccionario
        if key in convertidores
    }


cambiar = {
    's': ['gato', 'caballo', 'silla'],
    'p': ['informaticas', 'psicologas', 'ingenieras']
}
print(cambiapalabras(cambiar))
# prints (key order may vary with the Python version):
# {'p': ['informatica', 'psicologa', 'ingeniera'], 's': ['gatos', 'caballos', 'sillas']}