Python tag Examples, pattern.es.tag Python Examples

Example #1

0

Show file

def clasificar(palabra):
    """Tag ``palabra`` with pattern.es when one of its word lists knows it.

    The word is looked up, in this order, in ``verbs``, ``spelling`` and
    ``lexicon`` (the lexicon is also tried with the upper-cased and the
    capitalized form).

    Returns:
        ``None`` when ``palabra`` is the literal ``'q'``; the string
        ``'No existe esa palabra'`` when none of the lists contains the
        word; otherwise the result of ``tag()`` with the UNIVERSAL tagset.
    """
    if palabra == 'q':
        # presumably a "quit" sentinel typed by the user -- verify against caller
        return None

    # Imported only once there is something to classify.
    from pattern.es import verbs, tag, spelling, lexicon

    minuscula = palabra.lower()
    conocida = (minuscula in verbs
                or minuscula in spelling
                or minuscula in lexicon
                or palabra.upper() in lexicon
                or palabra.capitalize() in lexicon)
    if not conocida:
        return ('No existe esa palabra')

    return tag(palabra, tokenize=True, encoding='utf-8', tagset='UNIVERSAL')

Example #2

0

Show file

 def clasificar(palabra):
     """Return the Spanish name of the word class of ``palabra``.

     The first token's tag decides the answer: "NN" -> "Sustantivo",
     "JJ" -> "Adjetivo", "VB" -> "Verbo". Any other tag, or a tagging
     result without tokens, gives an empty string.
     """
     # Tag once and reuse the result instead of re-tagging per comparison.
     etiquetas = tag(palabra, tokenize=True, encoding='utf-8')
     if not etiquetas:
         return ""
     nombres = {"NN": "Sustantivo", "JJ": "Adjetivo", "VB": "Verbo"}
     return nombres.get(etiquetas[0][1], "")

Example #3

0

Show file

File: SopaDeLetras.py Project: julianMarques1/TrabajoFinalSeminarioDePython

def clasificar(palabra):
    """
    Classify the received word by the type Pattern assigns to it.

    The tagging result is printed, then the tag of the first token is
    mapped: 'JJ' -> 'adjetivo', 'NN' -> 'sustantivo', 'VB' -> 'verbo'.
    Returns None for any other tag or when no token was produced.
    """
    # Tag once; the same result is both printed and inspected.
    etiquetado = tag(palabra, tokenize=True, encoding='utf-8')
    print(etiquetado)
    if not etiquetado:
        return None
    nombres = {'JJ': 'adjetivo', 'NN': 'sustantivo', 'VB': 'verbo'}
    return nombres.get(etiquetado[0][1])

Example #4

0

Show file

def verbos(oracion):
    """Print each word of ``oracion`` whose tag is 'VB'.

    ``tag()`` is consumed as a sequence of (word, type) pairs, where the
    type is the part of speech (adjective, verb, etc.).
    """
    for palabra, tipo in tag(oracion):
        if tipo != 'VB':
            continue
        # Only the verbs reach this point.
        print(palabra)

Example #5

0

Show file

File: config.py Project: pibytes/pythonUNLP

def analizarpalabra(palabra, cat):
    """Print how Wiktionary and pattern.es classify ``palabra``.

    Args:
        palabra: word to look up.
        cat: value echoed to stdout right after the word; not used otherwise.

    Everything is reported through ``print``; nothing is returned.
    """
    print(palabra)
    print(cat)
    engine = Wiktionary(license=None, throttle=1.0,
                        language="es")  # Enter your license key.
    sch = engine.search(palabra)

    print('Wiktionary dice que')
    if sch is not None:
        # Each category is checked on its own, so several lines may print.
        categorias = (
            ('ES:Sustantivos', 'es sustantivo!'),
            ('ES:Adjetivos', 'es Adjetivo!'),
            ('ES:Verbos', 'es verbo!'),
        )
        for categoria, mensaje in categorias:
            if categoria in sch.categories:
                print(mensaje)
    else:
        print('no se encuentra en wiktionary')

    print('Pattern.es dice que es')
    # Common part-of-speech tags are NN (noun), VB (verb), JJ (adjective),
    # RB (adverb) and IN (preposition).
    tipo = tag(palabra, tokenize=True, encoding='utf-8')[0][1]
    print('Tipo:', tipo)
    nombres = {
        'NN': 'SUSTANTIVO',
        'VB': 'Verbo',
        'JJ': 'ADJETIVO',
        'RB': 'Adverbio',
        'IN': 'Preposición',
    }
    if tipo in nombres:
        print(nombres[tipo])

Example #6

0

Show file

 def __verificar_palabra_patterEs(self):
     '''
         Return the word type Pattern.es gives to the stored word.

         The tag of the first token is translated through ``self.__dic``.
         A single blank string is returned when an IndexError occurs.
     '''
     try:
         etiqueta = tag(self.__palabra)[0][1]
         tipo = self.__dic[etiqueta]
     except IndexError:
         tipo = ' '
     return tipo

Example #7

0

Show file

def verbosInfinitivos(str):
    """Return the infinitive of every word in ``str`` tagged "VB".

    The infinitives keep the order in which the verbs appear.
    """
    # tag() returns a list of (word, word type) tuples.
    return [
        conjugate(palabra, tense=INFINITIVE)
        for palabra, etiqueta in tag(str)
        if etiqueta == "VB"
    ]

Example #8

0

Show file

File: ner_es.py Project: eternalcrash/pyOrganizer

 def pos_tag_tokenlist(self, tokens):
     """
         POS-tag a sentence that is already split into tokens.

         Args:
             tokens: list of str tokens of the sentence
         Returns:
             A list of tuples of the form (word,pos_tag)
     """
     # tokenize=False: the caller already did the tokenizing.
     etiquetados = pat_es.tag(tokens, tokenize=False)
     return etiquetados

Example #9

0

Show file

def verbos_infinitivos(oracion):
    """Return, in order, the infinitive of each word of ``oracion`` tagged 'VB'."""
    # tag() gives (word, type) pairs -- the type being adjective, verb, etc.
    pares = tag(oracion)
    # Keep the verbs only and convert each one.
    return [
        conjugate(palabra, INFINITIVE)
        for palabra, tipo in pares
        if tipo == 'VB'
    ]

Example #10

0

Show file

def clasificar(palabra,nivel,clasificacionAleatoria):
    '''Tell whether the word is a valid answer for the given level.

    A word is considered at all only when it is in both ``spelling`` and
    ``lexicon``, or when it contains 'ñ' and is in ``lexicon``.
    Level 'Facil' accepts any such word, 'Medio' requires the tag 'NN' or
    'VB', and any other level requires the tag to equal
    ``clasificacionAleatoria``.
    '''
    con_enie_en_lexicon = 'ñ' in palabra and palabra in lexicon.keys()
    reconocida = (
        (palabra in spelling.keys() and palabra in lexicon.keys())
        or con_enie_en_lexicon
    )
    if not reconocida:
        return False

    if nivel == 'Facil':
        return True

    # Every remaining level depends on the tag of the first token.
    etiqueta = tag(palabra, tokenize=True, encoding='utf-8')[0][1]
    if nivel == 'Medio':
        return etiqueta in ('NN', 'VB')
    return True if etiqueta == clasificacionAleatoria else False

Example #11

0

Show file

def grammatical_tagging():
    """Print the tags of a fixed sample sentence, then pretty-print its parse."""
    oracion = "El perro negro muerde sin parar."
    etiquetas = tag(oracion)
    print(etiquetas)
    analisis = parse(oracion)
    pprint(analisis)


#pluralize_singularize()
#verb_conjugation()
#adjetives()
#grammatical_tagging()

Example #12

0

Show file

File: buscador.py Project: marcosazcona00/SopDeLetrasProyecto

 def __verificar_palabra_patterEs(self):
     '''
         Return the word type Pattern.es gives to the stored word.

         The tag of the first token is translated through ``self.__dic``.
         When the tagger returns no token (IndexError) or the tag has no
         entry in the dictionary (KeyError), a message is printed and a
         single blank string is returned.
     '''
     # No ``return`` inside ``finally``: that would discard any exception,
     # including ones unrelated to the two failures handled here.
     try:
         return self.__dic[tag(self.__palabra)[0][1]]
     except (IndexError, KeyError):
         print('Error con pattern')
         return ' '

Example #13

0

Show file

def cuentapVerbos(str):
    """Print the most frequent verbs of ``str`` with their counts.

    Each word tagged "VB" is turned into its infinitive before counting.
    Up to three "infinitive count" lines are printed, most frequent
    first; fewer lines appear when the text has fewer distinct verbs.
    """
    # tag() returns a list of (word, word type) tuples.
    verbos = [
        conjugate(word, tense=INFINITIVE)
        for word, pos in tag(str)
        if pos == "VB"
    ]

    # most_common(3) returns at most three rows, so short texts are safe.
    for verbo, cantidad in Counter(verbos).most_common(3):
        print(verbo, cantidad)

Example #14

0

Show file

def de_pattern(teclado):
    """Pair ``teclado`` with the Spanish name of its word class.

    Returns ``[teclado, name]`` where name is "Sustantivo", "Verbo" or
    "Adjetivo" for the tags NN, VB and JJ of the first token, and
    ``["", ""]`` for every other tag.
    """
    nombres = {"NN": "Sustantivo", "VB": "Verbo", "JJ": "Adjetivo"}
    etiqueta = tag(teclado, tokenize=True, encoding="utf-8")[0][1]
    if etiqueta in nombres:
        return [teclado, nombres[etiqueta]]
    # Original author's note: ask about this fallback.
    return ["", ""]

Example #15

0

Show file

File: palabras.py Project: salo2019/sopa-de-letras

 def verificar_pattern(self,palabra):
     """Classify ``palabra`` by the tag pattern.es gives its first token.

     Returns a two-item list ``[type_name, ok]``: "verbo", "sustantivo"
     or "adjetivo" with True for the tags VB, NN and JJ; otherwise
     "problema_pattern" with False.
     """
     from pattern.es import tag
     nombres = {"VB": "verbo", "NN": "sustantivo", "JJ": "adjetivo"}
     tipo = tag(palabra)[0][1]
     if tipo in nombres:
         return [nombres[tipo], True]
     return ["problema_pattern", False]

Example #16

0

Show file

File: emojier_es.py Project: beeva-labs/emojinews

	def tagLemma(self, word_old):
		"""Return a normalized form of ``word_old`` based on its POS tag.

		Tokens tagged "NNS" are singularized, verb-tagged tokens are put in
		the infinitive, and every other token is kept as it is. When
		``word_old`` produces several tokens, only the form of the last one
		is returned; when it produces none, ``word_old`` itself is returned.
		"""
		# Fallback so the return below is defined even without tokens.
		x = word_old
		for word, pos in tag(word_old):
			if pos == "NNS":  # plurals
				x = singularize(word)
			elif pos in ("VB", "VBG", "VBZ", "VBP", "VBD", "VBN", "MD"):  # verbs to infinitive
				# conjugate() sometimes gives nothing usable; keep the word then.
				x = conjugate(word, INFINITIVE) or word
			else:
				x = word
		return x

Example #17

0

Show file

def clasificar_pattern(palabra):
    """Classify a word according to pattern.es.

    The tagging result is always printed. It is also the return value
    when the word is found in ``verbs``, ``spelling`` or ``lexicon`` (the
    lexicon is tried with the lower-case, upper-case and capitalized
    form). Otherwise a notice is printed and ``[(palabra, '_no_sabe_')]``
    is returned.
    """
    palabra_tag = tag(palabra, tokenize=True, encoding='utf-8')
    print(palabra_tag)

    minuscula = palabra.lower()
    if minuscula in verbs or minuscula in spelling:
        return palabra_tag

    variantes = (minuscula, palabra.upper(), palabra.capitalize())
    if any(forma in lexicon for forma in variantes):
        return palabra_tag

    print('La palabra no se encuentra en pattern.es')
    return [(palabra, '_no_sabe_')]

Example #18

0

Show file

File: cambio_tablero_con_colores.py Project: mailen912/TrabajoFinal

 def check_pattern(palabra, nivel, conjunto_dificil):
     '''Return True when the word is acceptable for the difficulty level.

     The word is lower-cased first. Blocklisted words and words found in
     neither ``spelling`` nor ``lexicon`` are rejected. Level 'facil'
     accepts every remaining word, 'medio' requires the tag 'VB' or 'NN',
     and 'dificil' requires the tag to equal ``conjunto_dificil``.
     '''
     palabra = palabra.lower()
     palabras_no_permitidas = [
         'puto', 'puta', 'gil', 'choto', 'trolo', 'cagon', 'n***a',
         'nigger', 'cum'
     ]
     if palabra in palabras_no_permitidas:
         return False
     if not ((palabra in spelling) or (palabra in lexicon)):
         return False

     # The tag is looked up before the level is examined, as before.
     tipo_palabra = tag(palabra)[0][1]
     aceptada = (
         nivel == 'facil'
         or (nivel == 'medio' and tipo_palabra in ('VB', 'NN'))
         or (nivel == 'dificil' and tipo_palabra == conjunto_dificil)
     )
     return bool(aceptada)

Example #19

0

Show file

File: configColores.py Project: EnzoPujol/TrabajoFinalPython

                strTipo = 'VB'
        except AttributeError:
            sg.Popup('La palabra no existe.')
            strTipo = None
        finally:
            print(strTipo)

        strDef = strNuevo.split('- 1')
        strDef = strDef[1].split('2')
        if '-' in strDef[0]:
            strDef = strDef[0].split('-')
            strDef = strDef[0]
        else:
            strDef = strDef[0]

        palTag = tag(values['palabra'])

        if strTipo == palTag[0][1]:
            #Usar la definición de Wiktionary si los dos coinciden en el tipo de palabra.
            dic = {}
            dic['Palabra'] = values['palabra']
            dic['Definicion'] = strDef
            dic['Tipo'] = palTag[0][1]
            if strTipo == 'NN':
                if contSustantivos < int(listaCantPal[0]):
                    listaJSONPal.append(dic)
                    contSustantivos += 1
            if strTipo == 'JJ':
                if contAdjetivos < int(listaCantPal[1]):
                    listaJSONPal.append(dic)
                    contAdjetivos += 1

Example #20

0

Show file

File: test_es.py Project: Afey/pattern

 def test_tag(self):
     # The tagger must label determiner, noun and adjective in that order.
     esperado = [("el", "DT"), ("gato", "NN"), ("negro", "JJ")]
     self.assertEqual(es.tag("el gato negro"), esperado)
     print("pattern.es.tag()")

Example #21

0

Show file

from pattern.es import tag
from pattern.es import INFINITIVE
from pattern.es import conjugate
from collections import Counter
frase = "Este es un párrafo de prueba. El verbo ser, será el mas utilizado. El otro será crear, por eso se creó la oración de esta manera. Por último, se creará esta oración que posee el tercer verbo: poseer. Nada más que decir."
# Keep only the pairs tagged as verbs and convert each verb to its infinitive.
lista_verbos = [
    conjugate(par[0], INFINITIVE)
    for par in tag(frase)
    if par[1] == 'VB'
]
# Show the three most frequent infinitives with their counts.
print(Counter(lista_verbos).most_common(3))

Example #22

0

Show file

File: analyze_vertical_pattern_socket.py Project: fmacias64/SentimentAnalysis

def preprocess(tweet):
	"""Normalize a raw tweet and return it as POS-tagged tokens.

	``tweet`` is decoded as UTF-8 (undecodable bytes are ignored) and then
	rewritten step by step: mentions, hashtags and URLs become
	``&mention`` / ``&hashtag`` / ``&url`` markers, emoticons get an emotion
	marker appended, the text is lower-cased, punctuation is separated from
	words, repeated characters are squeezed, and the entries of
	``abrv_map`` and ``modifiers`` are substituted. The cleaned text is
	finally passed to ``tag(..., tokenize=False)`` and that result returned.

	NOTE(review): ``string.letters`` exists only in Python 2 -- confirm the
	target interpreter before porting.
	"""
	message = tweet.decode('utf-8', errors='ignore')

	# Replace the "@" of a mention with the "&mention" marker
	message = re.sub(re.escape('@')+r'(\w+)',r'&mention \g<1>',message)

	# Replace the "#" of a hashtag with the "&hashtag" marker
	message = re.sub(re.escape('#')+r'(\w+)',r'&hashtag \g<1>',message)

	# Replace URLs with the "&url" marker
	message = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','&url',message)

	# Emoticon -> emotion marker, analyzed later on
	emoticons = {':-)': '&happy', ':)': '&happy', ':o)': '&happy', ':]': '&happy', ':3': '&happy', ':c)': '&happy',
				 ':>': '&happy', '=]': '&happy', '8)': '&happy', '=)': '&happy', ':}': '&happy', ':^)': '&happy',
				 ':-))': '&happy', '|;-)': '&happy', ":'-)": '&happy', ":')": '&happy', '\\o/': '&happy',
				 '*\\0/*': '&happy', ':-D': '&laugh', ':D': '&laugh', '8-D': '&laugh', '8D': '&laugh', 'x-D': '&laugh',
				 'xD': '&laugh', 'X-D': '&laugh', 'XD': '&laugh', '=-D': '&laugh', '=D': '&laugh', '=-3': '&laugh',
				 '=3': '&laugh', 'B^D': '&laugh', '>:[': '&sad', ':-(': '&sad', ':(': '&sad', ':-c': '&sad',
				 ':c': '&sad', ':-<': '&sad', ':<': '&sad', ':-[': '&sad', ':[': '&sad', ':{': '&sad', ':-||': '&sad',
				 ':@': '&sad', ":'-(": '&sad', ":'(": '&sad', 'D:<': '&sad', 'D:': '&sad', 'D8': '&sad', 'D;': '&sad',
				 'D=': '&sad', 'DX': '&sad', 'v.v': '&sad', "D-':": '&sad', '(>_<)': '&sad', ':|': '&sad',
				 '>:O': '&surprise', ':-O': '&surprise', ':-o': '&surprise', ':O': '&surprise', '°o°': '&surprise',
				 'o_O': '&surprise', 'o_0': '&surprise', 'o.O': '&surprise', '8-0': '&surprise',
				 '|-O': '&surprise', ';-)': '&wink', ';)': '&wink', '*-)': '&wink', '*)': '&wink', ';-]': '&wink',
				 ';]': '&wink', ';D': '&wink', ';^)': '&wink', ':-,': '&wink', '>:P': '&tong', ':-P': '&tong',
				 ':P': '&tong', 'X-P': '&tong', 'x-p': '&tong', 'xp': '&tong', 'XP': '&tong', ':-p': '&tong',
				 ':p': '&tong', '=p': '&tong', ':-Þ': '&tong', ':Þ': '&tong', ':-b': '&tong', ':b': '&tong',
				 ':-&': '&tong', ':&': '&tong', '>:\\': '&annoyed', '>:/': '&annoyed', ':-/': '&annoyed',
				 ':-.': '&annoyed', ':/': '&annoyed', ':\\': '&annoyed', '=/': '&annoyed', '=\\': '&annoyed',
				 ':L': '&annoyed', '=L': '&annoyed', ':S': '&annoyed', '>.<': '&annoyed', ':-|': '&annoyed',
				 '<:-|': '&annoyed', ':-X': '&seallips', ':X': '&seallips', ':-#': '&seallips', ':#': '&seallips',
				 'O:-)': '&angel', '0:-3': '&angel', '0:3': '&angel', '0:-)': '&angel', '0:)': '&angel',
				 '0;^)': '&angel', '>:)': '&devil', '>;)': '&devil', '>:-)': '&devil', '}:-)': '&devil',
				 '}:)': '&devil', '3:-)': '&devil', '3:)': '&devil', 'o/\\o': '&highfive', '^5': '&highfive',
				 '>_>^': '&highfive', '^<_<': '&highfive', '<3': '&heart'
	}

	# A space is appended before each substitution so that an emoticon at the
	# very end still has a following non-alphanumeric character to match.
	for symbol in emoticons:
		message = re.sub(r'('+re.escape(symbol)+r')[^a-z0-9A-Z]',r' \g<1> '+emoticons[symbol]+' ',message+' ')

	message = message.lower()

	message = re.sub(re.escape('...'),'.' + ' &dots',message)

	# Normalization of punctuation repeated for emphasis (three or more times)
	for symbol in string.punctuation:
		message = re.sub(re.escape(symbol)+r'{3,}',' ' + symbol + ' &emphasis',message)

	# Separation of punctuation from words: after a word, before a word,
	# and between two words
	for symbol in ["¡","!",",",".","¿","?"]:
		message = re.sub(r'([0-9A-Za-z]+|^)' + re.escape(symbol) + r'($|\s)', r'\g<1> '+ symbol +r'\g<2>', message)
		message = re.sub(r'(\s|^)' + re.escape(symbol) + r'($|[0-9A-Za-z]+)', r'\g<1> '+ symbol +r' \g<2>', message)
		# The second group needs its opening "[" to be a character class.
		message = re.sub(r'([0-9A-Za-z]+)' + re.escape(symbol) + r'([0-9A-Za-z]+)', r'\g<1> '+ symbol +r' \g<2>', message)

	# Normalization of repeated characters (runs are cut down to two)
	for symbol in string.letters:
		message = re.sub(re.escape(symbol)+r'{2,}', symbol+symbol ,message)

	# Replace abbreviations with the full word
	for abreviatura, completa in abrv_map.items():
		message = re.sub(r'(\s+|^)'+re.escape(abreviatura)+r'(\s+|$)', r'\g<1>'+completa.decode('utf8')+r'\g<2>' , message)

	# Replace booster or negating phrases with their "_"-joined form
	for frase, unida in modifiers.items():
		message = re.sub(r'(\s|^)' + re.escape(frase) + r'(\s|$)' , r'\g<1>'+unida+r'\g<2>', message)

	message = re.sub(' +',' ' ,message)
	message = message.strip()

	# Tag each word with the corresponding POS tag
	message = tag(message, tokenize=False)
	return message

Example #23

0

Show file

File: patternES.py Project: pibytes/pythonUNLP

from pattern.es import verbs, conjugate, INFINITIVE, parse, parsetree, tokenize, tag
from pattern.search import search

string = 'papa'

# The return value of this call is discarded.
tokenize(string, replace={}, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_")

# Tag of the first token, looked up and shown twice under different labels.
bb = tag(string, encoding='utf-8', tokenize=True)[0][1]
print(string)
print('tag:', bb)

pos = tag(string, encoding='utf-8', tokenize=True)[0][1]
print('pos:', pos)

# Common part-of-speech tags are NN (noun), VB (verb), JJ (adjective),
# RB (adverb) and IN (preposition).
# Retrieve all adjectives of the sample sentence.
for word, pos in tag('I feel *happy*!'):
    if pos != "JJ":
        continue
    print(word)

Example #24

0

Show file

import json
import os
from pattern.es import tag
from pattern.es import conjugate
from pattern.es import INFINITIVE

# Read the whole input text; the file is closed as soon as it has been read.
with open(os.path.join("archivos","texto"), "r") as archivo:
    texto = archivo.read()

# Count the occurrences of every verb, keyed by its infinitive.
dicc = {}
for palabra, tipo in tag(texto):
    if tipo == 'VB':
        verbo = conjugate(palabra, INFINITIVE)
        dicc[verbo] = dicc.get(verbo, 0) + 1

# Shape the data for the JSON file: one {verb: occurrences} object per verb.
listaVerbos = [{verbo: cantidad} for verbo, cantidad in dicc.items()]

# Write the JSON; the ``with`` block guarantees the output file is closed.
with open(os.path.join("archivos","verbos.json"), "w") as verbos:
    json.dump(listaVerbos, verbos)

Example #25

0

Show file

def clasificar(palabra):
    """Print the tagging of ``palabra`` twice, then an empty line.

    The first line uses the UNIVERSAL tagset, the second the default one.
    """
    opciones = dict(tokenize=True, encoding='utf-8')
    print(tag(palabra, tagset='UNIVERSAL', **opciones))
    print(tag(palabra, **opciones))
    print()

Example #26

0

Show file

 def test_tag(self):
     # "el gato negro" must come back as determiner, noun, adjective.
     etiquetas = es.tag("el gato negro")
     esperado = [("el", "DT"), ("gato", "NN"), ("negro", "JJ")]
     self.assertEqual(etiquetas, esperado)
     print("pattern.es.tag()")

Example #27

0

Show file

def clasificar(palabra):
    """Print and return the tag of the first token of ``palabra``."""
    # With several words tag() returns a list of (word, tag) pairs;
    # only the first pair is looked at.
    etiquetas = tag(palabra, tokenize=True, encoding='utf-8')
    t = etiquetas[0][1]
    print('  tag:', t)
    return t

Example #28

0

Show file

File: lemmatizer.py Project: sebastiandev/pyragraph

 def _apply_normalizer(self, data):
     """Lemmatize the verbs in ``data``.

     A word whose first tag equals ``text.VB`` is replaced by its lemma;
     any other word is returned untouched. A list or tuple is processed
     item by item and always comes back as a list.
     """
     def lematizar(palabra):
         if tag(palabra)[0][1] == text.VB:
             return lemma(palabra)
         return palabra

     if isinstance(data, (list, tuple)):
         return [lematizar(elemento) for elemento in data]
     return lematizar(data)

Example #29

0

Show file

File: scrabbleAr.py Project: dajuam/screabbleAr

def Play():
    """Open the ScrabbleAr window and process clicks in an endless loop.

    A 10x10 board and a 7-slot rack are built from copies of
    ``initial_tablero`` and ``initial_atril``. Clicking a rack slot (int
    key) and then a board square (tuple key) moves that letter onto the
    board and appends it to the word being formed. The 'CHECK' button tags
    the word with ``tag()`` and reports whether it is a verb. Closing the
    window or an 'Exit' event calls ``exit()``.
    """
    board_tablero = copy.deepcopy(initial_tablero)
    board_atril = copy.deepcopy(initial_atril)

    # Build a 10x10 matrix of widgets (RButton, per the original note), all
    # showing the blank image
    tablero = []
    for i in range(10):
        row = []
        for j in range(10):
            piece_image = images[BLANK]
            row.append(render_square(piece_image['imagen'], key=(i, j)))
        tablero.append(row)

    # Build 7 one-widget rows showing the images of the rack letters
    # (presumably random letters -- verify against initial_atril)
    atril = []
    for i in range(7):
        row = []
        piece_image = images[board_atril[i]]
        row.append(render_square(piece_image['imagen'], key=i))
        atril.append(row)

    board_tab = [[sg.Button('CHECK')], [sg.Column(atril), sg.Column(tablero)]]
    window = sg.Window('ScrabbleAr',
                       default_button_element_size=(12, 1),
                       auto_size_buttons=False).Layout(board_tab)

    word = ''
    # -1 means "nothing selected" for both ends of a move.
    move_from = move_to = -1
    first_movement = True
    orientation = ORIENTATION_NONE
    # Temporary: rack keys already used, so that clicks on the emptied
    # ("blank") rack slots can be rejected
    keys_chosen = []

    # Every ``break`` below leaves only the inner loop; the outer loop then
    # starts it again, so the function never returns normally.
    while True:
        while True:
            button, value = window.Read()
            if button == 'CHECK':
                if len(word) >= 2 and len(word) <= 7:
                    wordType = tag(word)[0][1]
                    if wordType == 'VB':
                        sg.Popup('La palabra existe y es un verbo: ', word)
                    else:
                        sg.Popup('La palabra no es un verbo: ', word)
                    # TODO: if the word is fine, compute the score and then
                    # switch turns
                else:
                    sg.Popup(
                        'Atención: ',
                        'La palabra formada no cumple con los mínimos ni máximos'
                    )
            if button in (None, 'Exit'):
                exit()
            # Source click: rack widgets were created with int keys
            if type(button) is int:
                if button in keys_chosen:
                    sg.Popup('Atención: ',
                             'Click incorrecto, este elemento esta vacio')
                    break
                if move_from != -1:
                    sg.Popup('Atención: ',
                             'Click incorrecto, debe insistir en el tablero')
                    break
                move_from = button
                # Look up which letter number sits at the clicked position
                piece = board_atril[move_from]
                letter_choosen = images[board_atril[move_from]]['letra']
                keys_chosen.append(button)
            # Destination click: board widgets were created with tuple keys
            if type(button) is tuple:
                if move_from == -1:
                    sg.Popup('Atención: ',
                             'Click incorrecto, debe insistir en el atril')
                    break
                move_to = button
                row, col = move_to

                # From the second placement on, the new square is checked
                # against the previous one (move_to_anterior).
                if first_movement == False:
                    if orientation == ORIENTATION_NONE:
                        orientation = get_orientation(move_to,
                                                      move_to_anterior)
                    if orientation == ORIENTATION_ERROR:
                        sg.Popup('Atención: ',
                                 'No se pudo calcular el sentido')
                        orientation = ORIENTATION_NONE
                        break
                    if not correct_movement(move_to, move_to_anterior,
                                            orientation):
                        sg.Popup('Atención: ', 'Movimiento incorrecto')
                        break

                # The rack slot the letter came from is set to BLANK
                board_atril[move_from] = BLANK
                # The board square takes the moved piece's number
                board_tablero[row][col] = piece
                # Both boards then have to be redrawn
                redraw_atril(window, board_atril)
                redraw_tablero(window, board_tablero)
                word = word + letter_choosen

                # Remember this square and reset the selection for the next move.
                move_to_anterior = move_to
                move_from = move_to = -1
                first_movement = False
                break

Example #30

0

Show file

File: analyze.py Project: silvestrelosada/SentimentAnalysis

def preprocess(tweet):
    """Normalize a raw tweet and return it as POS-tagged tokens.

    ``tweet`` is decoded as UTF-8 (undecodable bytes are ignored) and then
    rewritten step by step: mentions, hashtags and URLs become
    ``&mention`` / ``&hashtag`` / ``&url`` markers, emoticons get an emotion
    marker appended, the text is lower-cased, punctuation is separated from
    words, repeated characters are squeezed, and the entries of
    ``abrv_map`` and ``modifiers`` are substituted. The cleaned text is
    finally passed to ``tag(..., tokenize=False)`` and that result returned.

    NOTE(review): ``string.letters`` exists only in Python 2 -- confirm the
    target interpreter before porting.
    """
    message = tweet.decode('utf-8', errors='ignore')

    # Replace the "@" of a mention with the "&mention" marker
    message = re.sub(re.escape('@') + r'(\w+)', r'&mention \g<1>', message)

    # Replace the "#" of a hashtag with the "&hashtag" marker
    message = re.sub(re.escape('#') + r'(\w+)', r'&hashtag \g<1>', message)

    # Replace URLs with the "&url" marker
    message = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '&url', message)

    # Emoticon -> emotion marker, analyzed later on
    emoticons = {
        ':-)': '&happy', ':)': '&happy', ':o)': '&happy', ':]': '&happy', ':3': '&happy', ':c)': '&happy',
        ':>': '&happy', '=]': '&happy', '8)': '&happy', '=)': '&happy', ':}': '&happy', ':^)': '&happy',
        ':-))': '&happy', '|;-)': '&happy', ":'-)": '&happy', ":')": '&happy', '\\o/': '&happy',
        '*\\0/*': '&happy', ':-D': '&laugh', ':D': '&laugh', '8-D': '&laugh', '8D': '&laugh', 'x-D': '&laugh',
        'xD': '&laugh', 'X-D': '&laugh', 'XD': '&laugh', '=-D': '&laugh', '=D': '&laugh', '=-3': '&laugh',
        '=3': '&laugh', 'B^D': '&laugh', '>:[': '&sad', ':-(': '&sad', ':(': '&sad', ':-c': '&sad',
        ':c': '&sad', ':-<': '&sad', ':<': '&sad', ':-[': '&sad', ':[': '&sad', ':{': '&sad', ':-||': '&sad',
        ':@': '&sad', ":'-(": '&sad', ":'(": '&sad', 'D:<': '&sad', 'D:': '&sad', 'D8': '&sad', 'D;': '&sad',
        'D=': '&sad', 'DX': '&sad', 'v.v': '&sad', "D-':": '&sad', '(>_<)': '&sad', ':|': '&sad',
        '>:O': '&surprise', ':-O': '&surprise', ':-o': '&surprise', ':O': '&surprise', '°o°': '&surprise',
        'o_O': '&surprise', 'o_0': '&surprise', 'o.O': '&surprise', '8-0': '&surprise',
        '|-O': '&surprise', ';-)': '&wink', ';)': '&wink', '*-)': '&wink', '*)': '&wink', ';-]': '&wink',
        ';]': '&wink', ';D': '&wink', ';^)': '&wink', ':-,': '&wink', '>:P': '&tong', ':-P': '&tong',
        ':P': '&tong', 'X-P': '&tong', 'x-p': '&tong', 'xp': '&tong', 'XP': '&tong', ':-p': '&tong',
        ':p': '&tong', '=p': '&tong', ':-Þ': '&tong', ':Þ': '&tong', ':-b': '&tong', ':b': '&tong',
        ':-&': '&tong', ':&': '&tong', '>:\\': '&annoyed', '>:/': '&annoyed', ':-/': '&annoyed',
        ':-.': '&annoyed', ':/': '&annoyed', ':\\': '&annoyed', '=/': '&annoyed', '=\\': '&annoyed',
        ':L': '&annoyed', '=L': '&annoyed', ':S': '&annoyed', '>.<': '&annoyed', ':-|': '&annoyed',
        '<:-|': '&annoyed', ':-X': '&seallips', ':X': '&seallips', ':-#': '&seallips', ':#': '&seallips',
        'O:-)': '&angel', '0:-3': '&angel', '0:3': '&angel', '0:-)': '&angel', '0:)': '&angel',
        '0;^)': '&angel', '>:)': '&devil', '>;)': '&devil', '>:-)': '&devil', '}:-)': '&devil',
        '}:)': '&devil', '3:-)': '&devil', '3:)': '&devil', 'o/\\o': '&highfive', '^5': '&highfive',
        '>_>^': '&highfive', '^<_<': '&highfive', '<3': '&heart'
    }

    # A space is appended before each substitution so that an emoticon at the
    # very end still has a following non-alphanumeric character to match.
    for symbol in emoticons:
        message = re.sub(r'(' + re.escape(symbol) + r')[^a-z0-9A-Z]',
                         r' \g<1> ' + emoticons[symbol] + ' ', message + ' ')

    message = message.lower()

    message = re.sub(re.escape('...'), '.' + ' &dots', message)

    # Normalization of punctuation repeated for emphasis (three or more times)
    for symbol in string.punctuation:
        message = re.sub(
            re.escape(symbol) + r'{3,}', ' ' + symbol + ' &emphasis', message)

    # Separation of punctuation from words: after a word, before a word,
    # and between two words
    for symbol in ["¡", "!", ",", ".", "¿", "?"]:
        message = re.sub(r'([0-9A-Za-z]+|^)' + re.escape(symbol) + r'($|\s)',
                         r'\g<1> ' + symbol + r'\g<2>', message)
        message = re.sub(r'(\s|^)' + re.escape(symbol) + r'($|[0-9A-Za-z]+)',
                         r'\g<1> ' + symbol + r' \g<2>', message)
        # The second group needs its opening "[" to be a character class.
        message = re.sub(
            r'([0-9A-Za-z]+)' + re.escape(symbol) + r'([0-9A-Za-z]+)',
            r'\g<1> ' + symbol + r' \g<2>', message)

    # Normalization of repeated characters (runs are cut down to two)
    for symbol in string.letters:
        message = re.sub(re.escape(symbol) + r'{2,}', symbol + symbol, message)

    # Replace abbreviations with the full word
    for abreviatura, completa in abrv_map.items():
        message = re.sub(r'(\s+|^)' + re.escape(abreviatura) + r'(\s+|$)',
                         r'\g<1>' + completa.decode('utf8') + r'\g<2>', message)

    # Replace booster or negating phrases with their "_"-joined form
    for frase, unida in modifiers.items():
        message = re.sub(r'(\s|^)' + re.escape(frase) + r'(\s|$)',
                         r'\g<1>' + unida + r'\g<2>', message)

    message = re.sub(' +', ' ', message)
    message = message.strip()

    # Tag each word with the corresponding POS tag
    message = tag(message, tokenize=False)
    return message

Example #31

0

Show file

def verificacion_palabra(palabra, jsonObj):
    '''Check how the word is classified by pattern.es and by Wiktionary.
Writes a report entry when:
  *The word is not found in Wiktionary
  *The word is found neither in Wiktionary nor in pattern.es
  *The Wiktionary and pattern.es classifications do not match

When the word is accepted, the description chosen or typed by the user is
stored at jsonObj[0][tag][palabra]. jsonObj is returned in every case.'''
    entra = True
    try:
        palabra = palabra.lower()
        tipo_pat = pat.tag(palabra)
        # Local string "tag": the first two characters of the first token's tag.
        tag = tipo_pat[0][1][0:2]
        engine = wik(language='es')
        articulo = engine.article(palabra)
        secciones = articulo.sections
        texto = articulo.plaintext()
        if const.TAGS_FUNCIONAL[tag] in texto:
            # Find the first section whose title mentions that word class.
            for x in range(len(secciones)):
                if const.TAGS_FUNCIONAL[tag] in secciones[x].title:
                    indice = x
                    break
        else:
            # Keep pattern's tag for the report, then switch to the first
            # class name that does occur in the article text.
            clasificacion = tag
            for var in const.TAGS_FUNCIONAL:
                if const.TAGS_FUNCIONAL[var] in texto:
                    tag = var
                    break
            for x in range(len(secciones)):
                if const.TAGS_FUNCIONAL[tag] in secciones[x].title:
                    indice = x
                    break
            # Report the mismatch to the pattern report file, unless the
            # word already appears in it
            lectura_pattern = open("report files/reporte_pattern", 'r')
            if not palabra in (' ').join(lectura_pattern.readlines()):
                reporte_pattern = open("report files/reporte_pattern", 'a')
                reporte = "<<" + palabra + ">> La clasificacion de pattern y wiktionary difieren. Pattern=" + clasificacion + ". Wiktionary=" + secciones[
                    indice].title + ". \n"
                reporte_pattern.write(reporte)
                reporte_pattern.close()
            lectura_pattern.close()
        # NOTE(review): "indice" is only assigned inside the loops above; if
        # no section title matches, this line raises a NameError -- confirm.
        descripciones = secciones[indice].plaintext()
        lista_descripciones = descripciones.split('\n')
        # Keep only the lines that contain at least one digit.
        desc = []
        for var in lista_descripciones:
            for n in range(10):
                if str(n) in var:
                    desc.append(var)
                    break
        layout = [
            [
                sg.Text(
                    'Por favor seleccione una de las descripciones para la palabra.'
                )
            ],
            [
                sg.Text(
                    'Utilice las flechas arriba/abajo para seleccionar en caso de ser muy larga la descripcion'
                )
            ], [sg.Combo(desc, default_value=desc[0])], [sg.Button('Aceptar')]
        ]
        window = sg.Window('Elija descripcion').Layout(layout)
        # Wait until the window is closed or 'Aceptar' is pressed.
        while True:
            event, values = window.Read()
            if event is None or event == 'Aceptar':
                descrip = values
                break
        window.Close()
    except AttributeError:
        # Report to the Wiktionary report file, unless the word is already there
        lectura_wikcionario = open("report files/reporte_wikcionario", 'r')
        if not palabra in (' ').join(lectura_wikcionario.readlines()):
            reporte_wikcionario = open("report files/reporte_wikcionario", 'a')
            reporte = "<<" + palabra + ">> No existe la palabra en el wiktionario. Se tomara la clasificacion de pattern si existe."
            reporte_wikcionario.write(reporte)
            reporte_wikcionario.close()
        lectura_wikcionario.close()
        # NOTE(review): "tag" is unbound here if the AttributeError was raised
        # before it was assigned (e.g. by palabra.lower()) -- confirm.
        if tag == 'NN':
            entra = comprobar_sustantivo(palabra)
        if entra and (tag == 'VB' or tag == 'JJ' or tag == 'NN'):
            texto1 = 'No existe un articulo en wikcionario de la palabra <<' + palabra + '>>.'
            if 'JJ' in tag:
                texto2 = 'Segun el modulo <<pattern.es>> la palabra ingresada es un adjetivo.'
            elif 'NN' in tag:
                texto2 = 'Segun el modulo <<pattern.es>> la palabra ingresada es un sustantivo.'
            elif 'VB' in tag:
                texto2 = 'Segun el modulo <<pattern.es>> la palabra ingresada es un verbo.'
            layout = [
                [sg.Text(texto1)], [sg.Text(texto2)],
                [
                    sg.Text('Ingrese una descripcion para dicha palabra:'),
                    sg.Input()
                ], [sg.OK()]
            ]
            window = sg.Window('Descripcion').Layout(layout)
            event, values = window.Read()
            # NOTE(review): for any other event "descrip" stays unassigned and
            # the window is not closed, while "entra" is still true -- confirm.
            if event is None or event == 'OK':
                descrip = values
                window.Close()
        else:
            # Report to the combined Wiktionary-and-pattern report file
            lectura_wik_pat = open("report files/reporte_wikcionario_pattern",
                                   'r')
            if not palabra in (' ').join(lectura_wik_pat.readlines()):
                reporte_wikcionario_pattern = open(
                    "report files/reporte_wikcionario_pattern", 'a')
                if entra:
                    reporte = "<<" + palabra + ">> La palabra no se encuentra en wikcionario y no clasifica como verbo, adjetivo o sustantivo en pattern.\n"
                else:
                    reporte = "<<" + palabra + ">> La palabra no se encuentra en wikcionario ni en pattern.\n"
                reporte_wikcionario_pattern.write(reporte)
                reporte_wikcionario_pattern.close()
            lectura_wik_pat.close()
            # The word is rejected, so nothing is stored below.
            entra = False
            layout = [[
                sg.Text(
                    'No se encuentra la palabra en wikcionario ni en pattern. Se incluirá en un reporte.'
                )
            ], [sg.OK()]]
            window = sg.Window('No se encuentra').Layout(layout)
            event, values = window.Read()
            if event is None or event == 'OK':
                window.Close()
    if entra:
        jsonObj[0][tag][palabra] = descrip
    return jsonObj