Code Example #1
 def __init__(self):
     self.nlp_model = es_core_news_sm.load()  # load spaCy with the Spanish model
     self.nlp_model.max_length = 4000000
     stop = stopwords.words('spanish')  # stopwords in spanish
     sw_es = self.nlp_model.Defaults.stop_words  # stopwords in spanish
     self.sw = sw_es.union(stop)
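This snippet comes from inside a class, so its imports are not shown. A minimal sketch of the setup it appears to assume (inferred from the calls above, not part of the original project):

import nltk
import es_core_news_sm
from nltk.corpus import stopwords

# one-time setup: the spaCy Spanish model package must be installed
# (e.g. `python -m spacy download es_core_news_sm`), and the NLTK stopword
# lists must be fetched once before stopwords.words('spanish') works
nltk.download('stopwords')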
Code Example #2
def respond(sentence, diccionario):
    # Load the spaCy Spanish model
    nlp = es_core_news_sm.load()
    doc = nlp(sentence)
    # To print the tags that spaCy provides:
    #for word in doc:
        #print(word.text, word.pos_, word.tag_)
    # Function to split a sentence containing a multi-part order into separate orders
    parsed = process_order_into_orders(doc)

    # Detect which order the customer wants (about 80% working: it passes the tests but has not been tested thoroughly)
    ordenes = detect_order(parsed)

    # Depending on the order, do what is requested
    # Note: 'ordenar' is the most complete, 'remover' is more or less complete, 'cambiar' is incomplete, and more order types are expected to be added
    #print("ORDENES: ", ordenes)
    response = ""
    for idx,orden in enumerate(ordenes):
        if orden == "ordenar":
            resp,diccionario = add_to_list(parsed[idx],diccionario)
            response += resp
        elif orden == "remover":
            resp,diccionario = remove_from_list(parsed[idx], diccionario)
            response += resp
        elif orden == "recomendar":
            response += recomend(parsed[idx])
        elif orden == "mostrar":
            resp = show_list(diccionario)
            response += resp

    #print(diccionario)

    return response, diccionario
Code Example #3
File: views.py  Project: pedrojsalinas/JeffersonSBC
    def limpiezaDatos(self, text):
        nlp = es_core_news_sm.load()
        text = nlp(text)
        tokenized_sentences = [sentence.text for sentence in text.sents]

        datos = []

        for sentence in tokenized_sentences:
            for entity in nlp(sentence).ents:
                consulta = 'SELECT ?s ?p ?o  WHERE { ?s ?p ?o .FILTER regex(str(?s), "%s") .}' % (
                    entity.text)
                for row in self.g.query(consulta):
                    tripleta = []
                    sujeto = row.s
                    predicado = row.p.split("/")
                    objeto = row.o.split("/")
                    objetoUri = row.o
                    predicado = predicado[len(predicado) - 1]
                    objeto = objeto[len(objeto) - 1]
                    # if '$' in objetoUri:
                    #     objetoUri = ''
                    tripleta.append(entity.text)
                    tripleta.append(sujeto)
                    tripleta.append(predicado)
                    tripleta.append(objeto)
                    tripleta.append(objetoUri)
                    datos.append(tripleta)
        datos = OrderedDict((tuple(x), x) for x in datos).values()
        lista = []
        for i in datos:
            lista.append(i)
        return lista
Code Example #4
    def limpiezaDatos(self, text):
        # spaCy library
        nlp = es_core_news_sm.load()
        text = nlp(text)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        g = rdflib.Graph()
        # name of the RDF file
        g.parse("mydataset.rdf")
        datos = []

        for sentence in tokenized_sentences:
            for entity in nlp(sentence).ents:
                consulta = 'SELECT ?s ?p ?o  WHERE { ?s ?p ?o .FILTER regex(str(?s), "%s") .}' % (
                    entity.text)
                for row in g.query(consulta):
                    tripleta = []
                    predicado = row.p.split("/")
                    objeto = row.o.split("/")
                    predicado = predicado[len(predicado) - 1]
                    objeto = objeto[len(objeto) - 1]
                    tripleta.append(entity.text)
                    tripleta.append(predicado)
                    tripleta.append(objeto)
                    datos.append(tripleta)
        # remove duplicates
        datos = OrderedDict((tuple(x), x) for x in datos).values()
        lista = []
        for i in datos:
            lista.append(i)
        return lista
Code Example #5
def preprocess(text):
    sp = es_core_news_sm.load()
    # Lowercase the text
    text_lower = text.lower()
    # Tokenize
    token_word = nltk.word_tokenize(text_lower, "spanish")
    # Some texts have strange characters at the beginning, so remove them so they do not affect later processing
    token_word[0] = clear_first_token(token_word[0])
    # Spanish stopwords
    stopword_spanish = stopwords.words("spanish")
    i = 0

    while (i < len(token_word)):
        # Remove tokens found in the stopword list
        if token_word[i] in stopword_spanish:
            token_word.remove(token_word[i])
        # Remove any token that is not purely alphabetic
        elif not (token_word[i].isalpha()):
            token_word.remove(token_word[i])
        else:
            # Since NLTK does not lemmatize Spanish text, another NLP library (spaCy) is used
            word = sp(token_word[i])[0]
            token_word[i] = word.lemma_
            i = i + 1
    return token_word
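A side note on this pattern (not part of the original code): preprocess reloads the spaCy model on every call, which is slow. A minimal sketch of loading the model once and reusing it, assuming es_core_news_sm is installed:

import es_core_news_sm

sp = es_core_news_sm.load()  # load the Spanish model once at module level

def lemmatize_word(word):
    # return the lemma of a single Spanish word using the shared model
    return sp(word)[0].lemma_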
Code Example #6
class Semantico():
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()

    def consultaVirutoso(self, texto):
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        datos = []
        entidades = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades.append(entity.text)
                # improved query
                consulta = """
                SELECT ?s ?p ?o
                    WHERE 
                        { 
                           ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                        }
                        """ % (entity.text, entity.text)
                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    lista = []
                    listaS = result["s"]["value"]
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    # if this link appears, do not add it (to review)
                    # if listaO.startswith('http://www.openlinks'):
                    lista.append(listaS)
                    lista.append(listaP)
                    lista.append(listaO)
                    datos.append(lista)
        # Remove duplicates
        # entidades = list(set(entidades))
        return datos, entidades

    def textoHtml(self, texto, entidades):
        for palabra in entidades:
            if palabra in texto:
                url = '<a href = "http://localhost:8080/negociador/page/{}">{}</a>'.format(
                    palabra, palabra)
                if url not in texto:
                    texto = texto.replace(palabra, url)
        return texto

    def consultaPorUri(self, uri):
        consulta = """
                    SELECT ?p ?o
                        WHERE
                        {
                            <%s> ?p  ?o
                        }
                """ % (uri)
        self.sbcEndpoint.setQuery(consulta)
        self.sbcEndpoint.setReturnFormat(JSON)
        results = self.sbcEndpoint.query().convert()
        return results["results"]["bindings"]
Code Example #7
File: Labeler.py  Project: rogargon/tfg-nlp
 def _load_spacy(self, ):
     # LOAD SPACY
     t1 = timeit.default_timer()
     spacy_nlp = es_core_news_sm.load()
     spacy_nlp.max_length = 1500000
     t2 = timeit.default_timer()
     print(f"Time to load spaCy: {t2 - t1}")
     return spacy_nlp
Code Example #8
    def __init__(self):
        self.dao = Dao()
        self.module_url = "./Modelos/modelo_dm0_20_2000.txt"
        self.embed = gensim.models.doc2vec.Doc2Vec.load(self.module_url)
        self.nlp = es_core_news_sm.load()
        self.stop = set(stopwords.words('spanish'))

        self.non_words = list(punctuation)
        self.non_words.extend(['¿', '¡'])
        self.non_words.extend(self.stop)
        self.non_words.extend(map(str, range(10)))
Code Example #9
File: __main__.py  Project: warpcomdev/sentiment
def _nlp(spacy_module: str) -> Optional[NLP]:
    print("Loading spacy language model for '", spacy_module, "'")
    if spacy_module == 'en':
        nlp = en_core_web_sm.load()
    elif spacy_module == 'es':
        nlp = es_core_news_sm.load()
    elif spacy_module == 'de':
        nlp = de_core_news_sm.load()
    elif spacy_module == 'fr':
        nlp = fr_core_news_sm.load()
    elif spacy_module == 'it':
        nlp = it_core_news_sm.load()
    elif spacy_module == 'pt':
        nlp = pt_core_news_sm.load()
    else:
        raise ValueError(f'Unsupported language {spacy_module}')
    return nlp
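A possible alternative to the chain of per-language imports above (a sketch, not the project's code): spacy.load also accepts the installed package name directly, so a small lookup table keeps the function short. It assumes the same model packages are installed.

import spacy

# language code -> installed model package (the same models used by _nlp above)
_MODELS = {
    'en': 'en_core_web_sm',
    'es': 'es_core_news_sm',
    'de': 'de_core_news_sm',
    'fr': 'fr_core_news_sm',
    'it': 'it_core_news_sm',
    'pt': 'pt_core_news_sm',
}

def load_model(spacy_module: str):
    # mirror _nlp: raise on unsupported codes, otherwise load the matching model
    if spacy_module not in _MODELS:
        raise ValueError(f'Unsupported language {spacy_module}')
    return spacy.load(_MODELS[spacy_module])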
Code Example #10
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''returns the spacy nlp function corresponding to the language of a document'''
    if default_lingo in supported_languages:
        if bigmodel_required == False:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        nlp = None  # otherwise `return nlp` would raise UnboundLocalError
    return nlp
Code Example #11
File: test-datos.py  Project: retorres9/websemantica
class Semantico():
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()

    def consultaVirutoso(self, texto):
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        datos = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                palabra = self.limpiarDatos(entity)
                consulta = """
                SELECT ?s ?p ?o
                    WHERE 
                        { 
                            ?s ?p ?o .FILTER regex(str(?s), "%s") .
                        }
                        """ % (palabra)
                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    lista = []
                    listaS = result["s"]["value"]
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    lista.append(listaS)
                    lista.append(listaP)
                    lista.append(listaO)
                    datos.append(lista)
        datos = OrderedDict((tuple(x), x) for x in datos).values()
        return datos
    def limpiarDatos(self,palabra):
        palabra = str(palabra)
        print('***'*10)
        print(palabra)
        palabra = palabra.replace(' ','_')
        palabra = palabra.replace('á','a')
        palabra = palabra.replace('é','e')
        palabra = palabra.replace('í','i')
        palabra = palabra.replace('ó','o')
        palabra = palabra.replace('ú','u')
        print(palabra)
        print('***'*10)
        return palabra
Code Example #12
File: recapper.py  Project: Tsadoq/rec.ap.discord
 def __init__(self, url):
     try:
         pattern = re.compile(
             "^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$"
         )
         if not pattern.match(url):
             print(f"{url} is not a valid url")
         self.url = url
         self.article = Article(self.url)
         self.article.download()
         self.article.parse()
         self.author = self.article.authors
         self.oneline = self.article.summary
         self.text = self.article.text.replace("\n", ".")
         if self.article.meta_lang == 'en' or (self.article.meta_lang == ''
                                               and url.find(
                                                   "cnn.com", 0, 10)):
             import en_core_web_sm
             self.model = en_core_web_sm.load()
         elif self.article.meta_lang == 'it':
             import it_core_news_sm
             self.model = it_core_news_sm.load()
         elif self.article.meta_lang == 'fr':
             import fr_core_news_sm
             self.model = fr_core_news_sm.load()
         elif self.article.meta_lang == 'es':
             import es_core_news_sm
             self.model = es_core_news_sm.load()
         elif self.article.meta_lang == 'pt':
             import pt_core_news_sm
             self.model = pt_core_news_sm.load()
         else:
             print(
                 f"The {self.article.meta_lang} language is not supported")
         self.data = []
         self.vectorizer = TfidfVectorizer(strip_accents='unicode')
     except article.ArticleException:
         print(
             f"The url {url} is not supported, please write to [email protected] for further help"
         )
         self.valid = False
Code Example #13
def master_tags_spanish(serie_word: pd.Series):
    """
	This function apply the above function to each news
	
	@input:
		serie_word: series object contains all news text
	@return:
		tags_pos: dictionary that contains the id of news in the keys and the 
		          list of tuples with tags_pos in the values
	"""
    # Spanish POS-tagging model from spaCy
    nlp = es_core_news_sm.load()
    # names file
    df_names = pd.read_csv('data/external/stopwords_names.txt')
    names = list(df_names['name'])

    # applying the above function to each text
    tags_pos = serie_word.apply(lambda x: process_tags_spanish(x, names, nlp))
    tags_pos = dict(tags_pos)

    return tags_pos
Code Example #14
File: app.py  Project: TopicosIA/Pytbot
def respuesta(mensaje, database):
    nlp = es_core_news_sm.load()
    parsed = nlp(str(mensaje))
    pronoun, aux, noun, adjective, verb, det = find_candidate_parts_of_speech(
        parsed)
    respuesta = check_for_comment_about_bot(pronoun, noun, adjective, det)
    #esNuevo = check_for_experience(pronoun,aux,noun,verb,det)
    print("respuesta", respuesta)
    noEntendi = [
        "Puedes ser un poco más claro", "Podrías ser mas especifico",
        "No te entiendo -.-", "Ni idea!", "Preguntas serias por favor!",
        "En que te puedo ayudar?"
    ]

    if not respuesta:
        respuesta = construir_respuesta(mensaje, pronoun, aux, noun, verb, det,
                                        database)

    if not respuesta:
        respuesta = random.choice(noEntendi)

    #logger.info("Returning phrase '%s'", resp)
    return respuesta
Code Example #15
File: views.py  Project: pedrojsalinas/semantico
    def post(self, request):
        form = BuscadorForm(request.POST)
        my_title = "Caso Arroz Verde"
        if form.is_valid():
            nlp = es_core_news_sm.load()
            text = form.cleaned_data['query']
            textoInicial = form.cleaned_data['query']
            text = nlp(text)
            tokenized_sentences = [sentence.text for sentence in text.sents]
            print(tokenized_sentences)
            g = rdflib.Graph()
            g.parse("arroz_verde.rdf")
            datos = []
            form = BuscadorForm()

            for sentence in tokenized_sentences:
                for entity in nlp(sentence).ents:
                    consulta = 'SELECT ?s ?p ?o  WHERE { ?s ?p ?o .FILTER regex(str(?s), "%s") .}' % (
                        entity.text)
                    for row in g.query(consulta):
                        tripleta = []
                        predicado = row.p.split("/")
                        objeto = row.o.split("/")
                        predicado = predicado[len(predicado) - 1]
                        objeto = objeto[len(objeto) - 1]
                        tripleta.append(entity.text)
                        tripleta.append(predicado)
                        tripleta.append(objeto)
                        datos.append(tripleta)
            print(datos)
            args = {
                "titulo": my_title,
                "datos": datos,
                "form": form,
                "textoInicial": textoInicial
            }
        return render(request, self.template_name, args)
Code Example #16
def get_data(reviews, label_and_terms):
    """
    Generate the data in proper format to train the NER model by 
    matching the search terms of the entities in the texts of 
    the reviews.
    """
    # Load the spaCy statistical model
    nlp = es_core_news_sm.load()
    # Disable unneeded pipeline components
    nlp.disable_pipes('ner', 'tagger', 'parser')
    
    data = []
    for review in tqdm(reviews):
        # Match the terms of the entities in the text
        matches_in_text = get_matches_in_proper_format(review, 
                                                       label_and_terms, 
                                                       nlp)

        matches_info_in_text = {"entities": matches_in_text}
        # Create a tuple with the text and the matches
        row_data = (review, matches_info_in_text)
        data.append(row_data)

    return data
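For reference, the rows appended by get_data follow spaCy's NER training-data format: (text, {"entities": [(start, end, label), ...]}). A hypothetical example row, assuming get_matches_in_proper_format returns character offsets with a label:

# one element of the returned list; the review text and the "FOOD" label are made up
example_row = ("La paella estaba deliciosa",
               {"entities": [(3, 9, "FOOD")]})  # "paella" spans characters 3-9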
Code Example #17
                                           infer_datetime_format=True)
    return dataframe[new_column]


def dropcolumns(dataframe, columns):
    dataframe = dataframe.drop(columns, axis=1)
    return dataframe


def modifiedDate(dataframe, column, year, month, day):
    dataframe = dataframe[(dataframe[column] > datetime.date(year, month,
                                                             day))]
    return dataframe


nlp = es_core_news_sm.load(parser=True)
nlp.Defaults.stop_words |= {
    "RT", "próx", "xd", "rt", "htt", "parir", "sobrar", "the", "and",
    "gracias", "hola", "jajaja", "jajajaja", "hablar", "comer", "personar",
    "you", "with", "casar", "was", "that", "what", "pasar", "salir"
}


def spacyTokenizer(sentence):
    sentence = re.sub(r"htt\S+", '', sentence)
    sentence = re.sub('@', '', sentence)
    tokens = nlp(sentence)
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        if lemma not in STOP_WORDS and re.search("^[a-zA-Z]{2}\w+", lemma):
Code Example #18
def preprocessing_text(text, lemmatize=True):
    '''
    INPUT: string tweet
    OUTPUT: str w/ emojis, urls, numbers, and reserved words removed
    '''
    def remove_symbols(word, symbol_set):
        return ''.join(char for char in word if char not in symbol_set)

    def fix_lemmatized_hashtags(tweet):
        '''
        Lemmatizing function separates # and word.
        This function returns string that rejoins hashtags
        '''
        tokens = []
        for i, j in enumerate(tweet.split()):
            if j == '#':
                j = tweet.split()[i] + tweet.split()[i + 1]
                tokens.append(j)
                continue
            if (tweet.split()[i - 1] == '#'):
                continue
            elif j != '#':
                tokens.append(j)

        return ' '.join(tokens)

    # define stopwords
    stop_words_sp = stopwords.words('spanish')
    stop_words_en = stopwords.words('english')
    stop_words = stop_words_sp + stop_words_en + [' ']

    # define punctuation
    punct = set('!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~¿… °¡')

    # remove laughter
    matcher = re.compile(r'(ja)\1*')
    jaja = [match.group() for match in matcher.finditer(text)]
    jaja += ['lol', 'LOL', 'Lol', 'LoL']

    text = ' '.join([word for word in text.split() if word not in jaja])

    if lemmatize == True:
        # Lemmatize and rejoin
        nlp = es_core_news_sm.load()
        nlp_text = nlp(text)
        text = ' '.join([token.lemma_ for token in nlp_text])
        text = fix_lemmatized_hashtags(text)

    else:
        # Stem and rejoin
        stemmer = SnowballStemmer('spanish')
        text = ' '.join([stemmer.stem(token) for token in text.split()])

    # remove emojis, urls, numbers, and reserved words
    p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.NUMBER, p.OPT.RESERVED)
    clean_text = p.clean(text)

    # split tweet, remove stopwords, and len(words) <= 2
    clean_text = [
        word for word in clean_text.split()
        if (remove_symbols(word, punct).lower() not in stop_words) and (
            word not in punct) and (len(remove_symbols(word, punct)) > 2) and (
                p.clean(remove_symbols(word, punct)) != '')
    ]

    clean_text = [
        word.lower()
        if word.startswith('@') else remove_symbols(word, punct).lower()
        for word in clean_text
    ]

    return clean_text
Code Example #19
for palabra in texto:
    steem = spanishStemmer.stem(palabra)
    raiz.append(steem)
raiz

# In[129]:

common_steem = Counter(raiz).most_common()
common_steem[0:10]

# # BONUS Part 4:
#
# - Count how many occurrences there are of each part of speech

# In[96]:

import es_core_news_sm
spacy_es = es_core_news_sm.load()

# In[128]:

componente = []

a = " ".join(texto)
b = spacy_es(a)

for palabra in b:
    componente.append(palabra.pos_)
Counter(componente).most_common()
Code Example #20
def loadindex(request):
    # Load the Spanish spaCy model
    nlp = es_core_news_sm.load()

    my_title = "Caso Arroz Verde"
    texto = ""
    datos = []

    # Load the sample text
    if request.method == "POST" and 'prueba' in request.POST:
        texto = request.POST["textoprueba"]
    # Get the input text
    if request.method == "POST" and 'buscar' in request.POST:
        texto = request.POST["palabraClave"]

    texto = limpiarDatos(texto)
    text = nlp(texto)
    # Tokenize the input text with spaCy
    tokenized_sentences = [sentence.text for sentence in text.sents]
    entidadSpacy = []
    # Named-entity recognition with spaCy
    for sentence in tokenized_sentences:
        for entity in nlp(sentence).ents:
            spacyEntidad = entity.text
            entidadSpacy.append(spacyEntidad)
    # Remove duplicate entities from the list
    entidadSpacy = list(set(entidadSpacy))
    # Entity counting
    etiquetaEntidad = []
    count = {}
    claves = []
    valores = []
    dicEntidades = []
    for sentence in entidadSpacy:
        for entity in nlp(sentence).ents:
            entidad = entity.text
            etiqueta = entity.label_
            etiquetaEntidad.append(etiqueta)
            dicEntidades.append({"entidad": entidad, "etiqueta": etiqueta})
    # Count entities by label
    count = countDistinct(etiquetaEntidad)
    keyes = count.keys()
    values = count.values()
    for elemento in keyes:
        claves.append(elemento)
    for elemento in values:
        valores.append(elemento)

    palabras_limpias = []
    for enti in entidadSpacy:
        # Clean the data for the query
        palabra = enti
        palabra = palabra.replace(' ', '_')
        palabra = palabra.replace('á', 'a')
        palabra = palabra.replace('é', 'e')
        palabra = palabra.replace('í', 'i')
        palabra = palabra.replace('ó', 'o')
        palabra = palabra.replace('ú', 'u')
        palabras_limpias.append(palabra)
        datos = listaEntidadesPropias(datos, palabra)
    # Remove duplicate triples
    datos = OrderedDict((tuple(x), x) for x in datos).values()
    # Value of the input text
    mis_entidades = texto
    # Print the text with spaCy entity labels
    for enti in palabras_limpias:
        # Get the index of each word in the array
        indice = palabras_limpias.index(enti)

        if indice == len(palabras_limpias):
            break
        else:
            tripletaResultante = anotacion(enti)
            print("triple Salida\t", tripletaResultante)
            if tripletaResultante is not None:
                for uri in tripletaResultante:
                    print("URI encontrada:\t", uri)
                    valorUri = uri.split("/")
                    valorUri = valorUri[len(valorUri) - 1]
                    valorUri = valorUri.replace('_', ' ')
                    if valorUri in entidadSpacy[indice]:
                        entidadEtiquetada = '<a href="' + uri + '">' + valorUri + " " + etiquetaEntidad[
                            indice] + "</a>"
                        # Annotate once the URI has been obtained
                        mis_entidades = mis_entidades.replace(
                            entidadSpacy[indice], entidadEtiquetada)
                        valorUri = displacy.render(text, style="ent")
                    else:
                        print("Texto no relacionado")
            else:
                print("No hay resultados para este entidad")

    var = charts()

    # Context dictionary for rendering in the template
    context = {
        'my_title': my_title,
        'claves': claves,
        'valores': valores,
        'dicEntidades': dicEntidades,
        'mis_entidades': mis_entidades,
        'datos': datos,
        'text': var
    }
    return render(request, "index.html", context)
Code Example #21
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in [
            'nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other'
    ]:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)
            # Other Languages
            elif lang == 'other':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('rs')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('rs')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
Code Example #22
import spacy
from spacy.lang.es.stop_words import STOP_WORDS
import es_core_news_sm

nlp = es_core_news_sm.load()
#nlp = spacy.load("es_core_news_sm", disable=['ner', 'parser', 'tagger'])


def clean(word):
    w = word
    while len(w) >= 1 and w[0] in [
            '/', '-', '¡', '¿', '.', ',', ';', ':', '\'', '"', '?', '!'
    ]:
        w = w[1:]
    while len(w) >= 1 and w[-1] in [
            '/', '-', '¡', '¿', '.', ',', ';', ':', '\'', '"', '?', '!'
    ]:
        w = w[:-1]
    return w


# Tokenizer based on spacy only
def tokenizer(doc, lowercase=True):
    return [x.orth_ for x in nlp(doc.lower() if lowercase else doc)]


# Tokenizing and deleting stopwords
def tokenizer_wo_stopwords(doc, lowercase=True):
    return [
        x.orth_ for x in nlp(doc.lower() if lowercase else doc)
        if x.orth_ not in STOP_WORDS
    ]
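Hypothetical usage of the two tokenizers above (relies on the module-level nlp object from this snippet):

print(tokenizer("¡Hola! ¿Cómo estás?"))          # all tokens, lowercased
print(tokenizer_wo_stopwords("El perro corre"))  # tokens with Spanish stopwords removed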
Code Example #23
class Semantico():
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()

    def consultaVirutoso(self, texto):
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        datos = []
        datos2 = []
        listaAux2 = []
        auxO = ""
        listaTipos2 = []
        listaAux = []
        listaCountTipos = []
        listaCountTipos2 = []
        listaTipos3 = []
        entidades = []
        lista = []
        entidades2 = []
        auxiliar = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades2.append(entity.text)
                # print ("sem")
                # print (entity.text)
                palabras = difflib.get_close_matches(entity.text, [
                    'Rafael Correa', 'Odebrecht', 'Alexis Mera', 'CWNE',
                    'SK Engeenering'
                ])
                palabras2 = ''.join(palabras)

                if len(palabras2) > 0:
                    entidades.append(palabras2)
                    consulta = """
                              SELECT ?s ?p ?o
                             WHERE 
                                { 
                                       ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                                }
                            """ % (palabras2.replace(' ', ''), palabras2)

                else:
                    entidades.append(entity.text)
                    consulta = """
                    SELECT ?s ?p ?o
                    WHERE 
                        { 
                            ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                        }
                     """ % (entity.text.replace(' ', ''), entity)

                #if len(palabras2) > 0:

                #   entidades.append(palabras2)
                #else:

                #t = entity.text.split(" ")
                #if len(t) > 1:
                #  for i in range(len(t)):
                #     auxiliar.append(entity.text.split())
                #    for palabraEn in auxiliar:
                # consulta mejorada
                #     print (palabraEn)
                #
                #   consulta = """
                #      SELECT ?s ?p ?o
                #     WHERE
                #        {
                #           ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                #      }
                # """ % (palabraEn(' ',''), palabraEn)
                #else:
                #   print ("one token")
                # consulta = """
                #     SELECT ?s ?p ?o
                #     WHERE
                #         {
                #             ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                #         }
                #      """ % (entity.text.replace(' ',''), entity)

                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:

                    listaTipos = []
                    contador = []
                    listaTipos2 = []

                    listaS = result["s"]["value"].strip()
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    # if this link appears, do not add it (to review)
                    # if listaO.startswith('http://www.openlinks'):
                    lista.append(listaS)
                    lista.append(listaP)
                    aux2 = listaP.rsplit('/', 1).pop()
                    if aux2 == "type":
                        listaTipos.append(listaO.rsplit('/', 1).pop())
                        listaTipos.append(listaS.rsplit('/', 1).pop())
                        auxO = listaO.rsplit('/', 1).pop()
                        auxO = ''.join(auxO)
                        # print (auxO)
                        listaAux.append(auxO)
                        listaTipos2.append(auxO)
                        # print (listaTipos2)

                    lista.append(listaO)

                    listaAux2 = listaAux
                    listaAux2 = list(set(listaAux2))

                    # print (listaTipos2)
                    listaTipos3.append(listaTipos2)
                    listaTipos2 = [x for x in listaTipos if x != []]
                    listaTipos3 = [x for x in listaTipos3 if x != []]
                    # print (auxO)
                    # counter = listaTipos3.count(auxO)
                    # print (counter)

                    # print (auxO, ", ", listaS)

                    datos2.append(listaTipos2)
                    datos2.sort()
                # print (listaTipos3)

                datos.append(lista)

        for tipos in listaAux2:
            # print (tipos)
            # print (listaAux)
            counter = listaAux.count(tipos)
            listCount = []
            listCount.append(counter)
            liste = [counter, tipos]
            # print (counter)
            listaCountTipos.append(liste)
            # listaCountTipos2.append(tipos)
            # listaCountTipos.append(listaCountTipos2)

        # listaCountTipos.append(listaCountTipos2)

        print(listaCountTipos)
        # print (datos2)

        # Remove duplicates
        # entidades = list(set(entidades))
        return datos, entidades, datos2, entidades2, listaCountTipos

    def textoHtml(self, texto, entidades2):
        aux2 = ""
        listaObjetos = []
        #print (entidades2)
        for palabra in entidades2:
            if palabra in texto:
                consulta2 = """
                            PREFIX cavr: <http://localhost:8080/mydataset/schema/>
                            SELECT ?s ?o
                            WHERE
                            {
                                ?s cavr:label ?o .
                            }"""
                self.sbcEndpoint.setQuery(consulta2)
                self.sbcEndpoint.setReturnFormat(JSON)
                results3 = self.sbcEndpoint.query().convert()
                for result in results3["results"]["bindings"]:
                    listaS = result["s"]["value"].strip()
                    listaO = result["o"]["value"].strip()
                    aux2 = listaO.rsplit('/', 1).pop()
                    aux6 = listaS.rsplit('/', 1).pop()
                    listaObjetos.append(aux2)
                listaObjetos = list(set(listaObjetos))

                palabraUnica = difflib.get_close_matches(palabra, listaObjetos)
                palabraUnica = ''.join(palabraUnica)
                #print (palabraUnica )
                if len(palabraUnica) > 0:
                    palabra2 = palabraUnica
                else:
                    palabra2 = palabra  # fixed: palabra2 was never assigned on this branch

                consulta = """
                            PREFIX cavr: <http://localhost:8080/mydataset/schema/>
                            SELECT ?s ?o
                            WHERE
                            {
                                ?s cavr:label ?o .FILTER (regex(str(?o), "%s")) .
                            }""" % (palabra2)
                # print (consulta)
                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results2 = self.sbcEndpoint.query().convert()
                #print (results2)
                for result in results2["results"]["bindings"]:
                    listaS = result["s"]["value"].strip()
                    aux2 = listaS.rsplit('/', 1).pop()
                #palabra = palabra.replace('í', 'i')

                url = '<a href = "{}">{}</a>'.format(listaS, palabra)
                # print (url)

                if url not in texto:
                    texto = texto.replace(palabra, url)

        return texto

    def getTipos(self, texto):
        print(texto)

    def consultaPorUri(self, uri):
        consulta = """
                    SELECT ?p ?o
                        WHERE
                        {
                            <%s> ?p  ?o
                        }
                """ % (uri)
        self.sbcEndpoint.setQuery(consulta)
        self.sbcEndpoint.setReturnFormat(JSON)
        results = self.sbcEndpoint.query().convert()
        return results["results"]["bindings"]
Code Example #24
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)

        if 'sbd' in nlp_pipelines:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            if 'sbd' not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe('sentencizer'))
Code Example #25
from html.parser import HTMLParser
import unicodedata
from tqdm.auto import tqdm
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer  # needed for WordNetLemmatizer() below

import fr_core_news_sm
nlp_fr = fr_core_news_sm.load()

import en_core_web_sm
nlp_en = en_core_web_sm.load()

import de_core_news_sm
nlp_de = de_core_news_sm.load()

import es_core_news_sm
nlp_es = es_core_news_sm.load()

import it_core_news_sm
nlp_it = it_core_news_sm.load()

import pt_core_news_sm
nlp_pt = pt_core_news_sm.load()

import nl_core_news_sm
nlp_nl = nl_core_news_sm.load()

# global variables
wnl = WordNetLemmatizer()
html_parser = HTMLParser()
stopword_list = []
language = ""
Code Example #26
File: views.py  Project: pedrojsalinas/JeffersonSBC
class Semantico():
    # SPARQL endpoint
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()
    g = rdflib.Graph()
    g.parse("datos.rdf")

    def consultaVirutoso(self, texto):
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        datos = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                consulta = """
                SELECT ?s ?p ?o
                    WHERE 
                        { 
                            ?s ?p ?o .FILTER regex(str(?s), "%s") .
                        }
                        """ % (entity.text)
                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    lista = []
                    listaS = result["s"]["value"]
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    lista.append(listaS)
                    lista.append(listaP)
                    lista.append(listaO)
                    datos.append(lista)
        return datos

    def obtenerRecursos(self, uri):
        datos = []
        consulta = """SELECT ?p ?o
                        WHERE
                        {
                            <%s> ?p  ?o
                        }""" % (uri)

        datos = []
        for row in self.g.query(consulta):
            recursos = []
            p = row.p.split("/")
            o = row.o.split("/")
            p = p[len(p) - 1]
            o = o[len(o) - 1]
            recursos.append(row.p)
            recursos.append(p)
            recursos.append(row.o)
            recursos.append(o)
            datos.append(recursos)
        return datos

    def limpiezaDatos(self, text):
        nlp = es_core_news_sm.load()
        text = nlp(text)
        tokenized_sentences = [sentence.text for sentence in text.sents]

        datos = []

        for sentence in tokenized_sentences:
            for entity in nlp(sentence).ents:
                consulta = 'SELECT ?s ?p ?o  WHERE { ?s ?p ?o .FILTER regex(str(?s), "%s") .}' % (
                    entity.text)
                for row in self.g.query(consulta):
                    tripleta = []
                    sujeto = row.s
                    predicado = row.p.split("/")
                    objeto = row.o.split("/")
                    objetoUri = row.o
                    predicado = predicado[len(predicado) - 1]
                    objeto = objeto[len(objeto) - 1]
                    # if '$' in objetoUri:
                    #     objetoUri = ''
                    tripleta.append(entity.text)
                    tripleta.append(sujeto)
                    tripleta.append(predicado)
                    tripleta.append(objeto)
                    tripleta.append(objetoUri)
                    datos.append(tripleta)
        datos = OrderedDict((tuple(x), x) for x in datos).values()
        lista = []
        for i in datos:
            lista.append(i)
        return lista
Code Example #27
import re ## regular expression
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import spacy
import es_core_news_sm ## import the model 
import unidecode
import pickle

nlp = es_core_news_sm.load() ## load the model
sb_spanish = SnowballStemmer('spanish') ## stemmer to strip plurals
nltk.download("punkt") ## Sentence tokenizer | divide a text


#################################################################
######### PREPROCESSING
#################################################################

def regex(text):
    ## delete any character that is not a word character
    new_text = re.sub("\W+", " ", text)    
    ## delete digits (+ matches one or more occurrences)
    new_text = re.sub("\d+", " ", new_text).strip()

    return new_text

def lower_case(text):
    ## convert to lowercase
    new_text = text.lower()
    
    return new_text
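Hypothetical quick check of the two preprocessing helpers above:

print(lower_case(regex("¡Hola, Mundo!")))  # prints "hola mundo"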
Code Example #28
import es_core_news_sm
from language_service.dto.word import Word

parser = es_core_news_sm.load()


def tag_spanish(text):
    return [
        Word(token=word.text, tag=word.pos_, lemma=word.lemma_)
        for word in parser(text)
    ]
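A hypothetical call to tag_spanish (it assumes the es_core_news_sm package is installed and that the Word DTO exposes token, tag and lemma attributes matching its constructor arguments):

for word in tag_spanish("El gato negro duerme en la casa."):
    print(word.token, word.tag, word.lemma)  # exact tags/lemmas depend on the model version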
Code Example #29
class Semantico():
    # SPARQL endpoint
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()
    #g = rdflib.Graph()
    #g.parse("datos.rdf")

    def consultaVirutoso(self, texto):
        # tokenize the text with spaCy
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        # render styled markup for the analyzed text
        spacyText = displacy.render(text, style="ent")
        # declare empty lists
        datos = []
        datostype = []
        datoscompani = []
        entidades = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades.append(entity.text)
                palabra = self.limpiarDatos(entity)
                consulta = """
                SELECT ?s ?p ?o
                    WHERE 
                        { 
                            ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                        }
                        """ % (palabra,palabra)
                self.sbcEndpoint.setQuery(consulta)
                # return the query results in JSON format
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    lista = []
                    listaS = result["s"]["value"]
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    lista.append(listaS)
                    lista.append(listaP)
                    lista.append(listaO)
                    datos.append(lista)
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades.append(entity.text)
                palabra = self.limpiarDatos(entity)
                consultatype = """
                PREFIX caseav: <http://localhost:8080/Data/page/>
                SELECT ?o
                    WHERE 
                        { 
                           {?s  caseav:hasNombrePersona ?o .FILTER (regex(str(?o), "%s")) .}  
                           UNION
                           {?s caseav:hashasApellidoPersona ?o .FILTER (regex(str(?o), "%s")) .}
                           UNION
                           {?s  caseav:hasCodigo ?o .FILTER (regex(Str(?o), "%s")) .} 
                           UNION
                           {?s  caseav:hasNombreCompletoPersona ?o .FILTER (regex(Str(?o), "%s")) .} 
                        }
                        """ % (palabra,palabra,palabra,palabra)
                self.sbcEndpoint.setQuery(consultatype)
                # return the query results in JSON format
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    listae = []
                    #listaSe = result["s"]["value"]
                    #listaPe = result["p"]["value"]
                    listaOe = result["o"]["value"]
                    #listae.append(listaSe)
                    #listae.append(listaPe)
                    listae.append(listaOe)
                    datostype.append(listae)
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades.append(entity.text)
                palabra = self.limpiarDatos(entity)
                consultacompani = """
                PREFIX caseav: <http://localhost:8080/Data/page/>
                SELECT ?s ?o
                    WHERE 
                        { 
                           {?s  caseav:hasNombreEmpresa  ?o .FILTER (regex(str(?o), "%s")) .}
                        }
                        """ % (palabra)
                self.sbcEndpoint.setQuery(consultacompani)
                # return the query results in JSON format
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    
                    listaec = []
                    #listaSe = result["s"]["value"]
                    #listaPe = result["p"]["value"]
                    listaOec = result["o"]["value"]
                    #listae.append(listaSe)
                    #listae.append(listaPe)
                    listaec.append(listaOec)
                    datoscompani.append(listaec)
        return datos, entidades, spacyText,datostype,datoscompani
        
    def textoHtml(self, texto, entidades):
        for palabra in entidades:
            if palabra in texto:
                # replace accented characters
                # -> NFD, then strip diacritics
                s = re.sub(
                        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1", 
                        normalize( "NFD", palabra), 0, re.I
                    )

                # -> NFC
                s = normalize( 'NFC', s)
                palabraUrl = s.replace(" ","_")
                palabraUrl = self.limpiarDatos(palabraUrl)
                url = '<a href = "http://localhost:8080/Data/page/{}">{}</a>'.format(palabraUrl,palabra)
                if url not in texto:
                    texto = texto.replace(palabra, url)
        return texto


    def limpiarDatos(self,palabra):
        palabra = str(palabra)
        palabra = palabra.replace(' ','_')
        palabra = palabra.replace('á','a')
        palabra = palabra.replace('é','e')
        palabra = palabra.replace('í','i')
        palabra = palabra.replace('ó','o')
        palabra = palabra.replace('ú','u')
        palabra = palabra.replace('Alianza_Pais','AP')
        palabra = palabra.replace('Alianza_PAIS','AP')
        palabra = palabra.replace('Hidalgo_&_Hidalgo','HIDALGO_&_HIDALGO')
        palabra = palabra.replace('Fopeca','FOPECA')
        palabra = palabra.replace('Midisa_S.A','MIDISA_S.A')
        palabra = palabra.replace('Pamela_Martinez','Maria_Pamela_Martinez_Loaiza')
        return palabra
Code Example #30
class Service:
    dao = Dao()
    module_url = "./Modelos/modelo_dm0_20_2000.txt"
    embed = gensim.models.doc2vec.Doc2Vec.load(module_url)
    nlp = es_core_news_sm.load()
    stop = set(stopwords.words('spanish'))
    non_words = list(punctuation)
    non_words.extend(['¿', '¡'])
    non_words.extend(stop)
    non_words.extend(map(str, range(10)))

    def __init__(self):
        self.dao = Dao()
        self.module_url = "./Modelos/modelo_dm0_20_2000.txt"
        self.embed = gensim.models.doc2vec.Doc2Vec.load(self.module_url)
        self.nlp = es_core_news_sm.load()
        self.stop = set(stopwords.words('spanish'))

        self.non_words = list(punctuation)
        self.non_words.extend(['¿', '¡'])
        self.non_words.extend(self.stop)
        self.non_words.extend(map(str, range(10)))

    def getModelo(self, id_documento):
        rutaModelo = "./Modelos/" + str(id_documento) + ".dat"
        modelo = Word2Vec.load(rutaModelo)
        return modelo

    def limpiar(self, text):
        self.nlp.max_length = 10000000
        text = text.lower()
        doc = self.nlp(text, disable=['ner', 'parser'])
        #lemmas = [t.norm_ for t in doc if not t.is_punct | t.is_stop and t not in stop]
        lemmas = [t.norm_ for t in doc if not t.is_punct | t.is_stop]
        words = [t.lower() for t in lemmas if len(t) > 3 and t.isalpha()]
        return words

    def GetResumenes(self):
        sql = "select * from documento_resumen"
        df = pd.read_sql(sa.text(sql), self.dao.engine)
        self.dao.engine.dispose()
        return df

    def GetResumenesD(self):
        sql = "select id_documento,resumen from documento_resumen"
        df = pd.read_sql(sa.text(sql), self.dao.engine)
        self.dao.engine.dispose()
        return df

    def proceso_lda(self, df):
        corpus = []
        stem = PorterStemmer()
        lem = WordNetLemmatizer()
        for news in df['contenido'].dropna():
            words = [w for w in word_tokenize(news) if (w not in self.stop)]

            words = [lem.lemmatize(w) for w in words if len(w) > 2]

            corpus.append(words)
        return corpus

    def GetDocumentoConResumen(self):
        sql = "select id_documento,resumen from documento"
        df = pd.read_sql(sa.text(sql), self.dao.engine)
        self.dao.engine.dispose()
        return df

    def GetDocumentosRelacionados(self, idDocumento):
        sql = "select * from public.fn_documentos_relacionados(:idDocumento)"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={'idDocumento': idDocumento})
        self.dao.engine.dispose()
        return df

    def GetResumenDocumentosRelacionados(self, idDocumento):
        sql = "select * from public.fn_documentos_resumen_relacionados(:idDocumento)"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={'idDocumento': idDocumento})
        self.dao.engine.dispose()
        return df

    def GetDocumentos(self):
        sql = "select * from documento"
        df = pd.read_sql(sa.text(sql), self.dao.engine)
        self.dao.engine.dispose()
        return df

    def GetVectors(self, texto):
        busqueda_limpia = self.limpiar(texto)
        vector = self.embed.infer_vector(busqueda_limpia)
        return vector

    def GetBusqueda(self, busqueda):
        busqueda_limpia = self.limpiar(busqueda)
        vector = self.embed.infer_vector(busqueda_limpia)
        sql = "select * from public.fn_busqueda_documentos(:v0,:v1,:v2,:v3,:v4,:v5,:v6,:v7,:v8,:v9,:v10,:v11,:v12,:v13,:v14,:v15,:v16,:v17,:v18,:v19)"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={
                             'v0': str(vector[0]),
                             'v1': str(vector[1]),
                             'v2': str(vector[2]),
                             'v3': str(vector[3]),
                             'v4': str(vector[4]),
                             'v5': str(vector[5]),
                             'v6': str(vector[6]),
                             'v7': str(vector[7]),
                             'v8': str(vector[8]),
                             'v9': str(vector[9]),
                             'v10': str(vector[10]),
                             'v11': str(vector[11]),
                             'v12': str(vector[12]),
                             'v13': str(vector[13]),
                             'v14': str(vector[14]),
                             'v15': str(vector[15]),
                             'v16': str(vector[16]),
                             'v17': str(vector[17]),
                             'v18': str(vector[18]),
                             'v19': str(vector[19])
                         })
        self.dao.engine.dispose()
        return df

    def reconocer_conceptos(self, texto):
        # Extract named entities with spaCy and return their non-stopword,
        # non-punctuation tokens as a space-separated string of concepts.
        self.nlp = spacy.load('es_core_news_sm')
        self.doc = self.nlp(texto)
        conceptos = ""
        for palabra in self.doc.ents:
            for t in palabra.text.split(' '):
                d1 = self.nlp(t.lower())
                for ent in d1:
                    if not ent.is_punct and not ent.is_stop and len(ent) > 1:
                        conceptos = str(conceptos) + " " + str(ent)
        return conceptos

    def tokenize(self, text):
        text = text.lower()
        doc1 = self.nlp(text)
        lemmas = [t.lemma_ for t in doc1 if not (t.is_punct or t.is_stop)]
        words = [t.lower() for t in lemmas if len(t) > 1 and t.isalpha()]
        return words

    def tokenizer(self, text):
        text = text.lower()
        doc1 = self.nlp(text)
        lemmas = [t.lemma_ for t in doc1 if not (t.is_punct or t.is_stop)]
        words = [t.lower() for t in lemmas if len(t) > 1 and t.isalpha()]
        # Note: tokens are concatenated with no separator.
        return ''.join(words)

    def replay(self, text):
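        # Ad-hoc normalization table: maps frequent plural/variant word forms to
        # a canonical form before they are used as concept-map labels.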
        text = text.replace("algoritmos", "algoritmo")
        text = text.replace("patrones", "patrón")
        text = text.replace("acopladas", "acoplada")
        text = text.replace("herramientas", "herramienta")
        text = text.replace("tablas", "tabla")
        text = text.replace("frecuentes", "frecuente")
        text = text.replace("sistemas", "sistema")
        text = text.replace("itemsets", "itemset")
        text = text.replace("generar", "genera")
        text = text.replace("tareas", "tarea")
        text = text.replace("paquetes", "paquete")
        text = text.replace("árboles", "árbol")
        text = text.replace("artistas", "artista")
        text = text.replace("maestros", "maestro")
        text = text.replace("carrozas", "carroza")
        text = text.replace("carnavales", "carnaval")
        text = text.replace("bosques", "bosque")
        text = text.replace("garrapatas", "garrapata")
        text = text.replace("blood", "sangre")
        text = text.replace("niños", "niño")
        text = text.replace("niñas", "niña")
        text = text.replace("juegos", "juego")
        text = text.replace("clases", "clase")
        text = text.replace("estudiantes", "estudiante")
        text = text.replace("docentes", "docente")
        text = text.replace("jugar", "juego")
        text = text.replace("jugando", "juego")
        text = text.replace("hombres", "hombre")
        text = text.replace("mujeres", "mujer")
        text = text.replace("relaciones", "relación")
        text = text.replace("casos", "caso")
        text = text.replace("suicidas", "suicida")
        text = text.replace("fríjol", "frijol")
        return text

    def getTexto(self, id_documento):
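        # Concatenate the stored abstract(s) for the given document into one string.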
        sql = "select resumen from documento where id_documento=:v0"
        #sql="select contenido as resumen from documento where id_documento=:v0"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={'v0': id_documento})
        self.dao.engine.dispose()
        contenido = ''
        for res in df.resumen:
            contenido = contenido + ' ' + res
        return contenido

    def modelo_wordwvec(self, texto):
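        # Build a Word2Vec model from the given text: split it into sentences
        # with spaCy, tokenize each sentence, detect frequent bigrams, then train.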
        self.nlp = spacy.load('es_core_news_sm')
        self.doc = self.nlp(texto)
        sent = []
        for num, oracion in enumerate(self.doc.sents):
            o = self.tokenize(str(oracion))
            sent.append(o)
        # Detect frequent collocations (phrases) from the sentence list:
        phrases = Phrases(sent, min_count=30, progress_per=10000)
        # Re-tokenize the corpus using the detected bigrams:
        bigram = Phraser(phrases)
        sentences = bigram[sent]
        # Word-frequency table (informational; not used by the training below):
        word_freq = defaultdict(int)
        for sent in sentences:
            for i in sent:
                word_freq[i] += 1

        # Train the Word2Vec model
        cores = multiprocessing.cpu_count()  # number of available CPU cores

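        # Note: the 'size' keyword follows the gensim 3.x API
        # (renamed to 'vector_size' in gensim 4+).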
        w2v_model = Word2Vec(min_count=2,
                             window=3,
                             size=300,
                             sample=6e-5,
                             alpha=0.03,
                             min_alpha=0.0007,
                             negative=20,
                             workers=cores - 1)
        t = time()
        w2v_model.build_vocab(
            sentences, progress_per=10000)  # prepare the model vocabulary
        print('Time to build vocab: {} mins'.format(round((time() - t) / 60,
                                                          2)))
        t = time()
        w2v_model.train(sentences,
                        total_examples=w2v_model.corpus_count,
                        epochs=6000,
                        report_delay=1)
        print('Time to train the model: {} mins'.format(
            round((time() - t) / 60, 2)))
        return w2v_model

    def tsne_plot(self, model):
        "Creates and TSNE model and plots it"
        labels = []
        tokens = []

        for word in model.wv.vocab:
            tokens.append(model.wv[word])
            labels.append(word)

        tsne_model = TSNE(perplexity=40,
                          n_components=2,
                          init='pca',
                          n_iter=2500,
                          random_state=23)
        new_values = tsne_model.fit_transform(tokens)

        x = []
        y = []
        for value in new_values:
            x.append(value[0])
            y.append(value[1])

        plt.figure(figsize=(16, 16))
        for i in range(len(x)):
            plt.scatter(x[i], y[i])
            plt.annotate(labels[i],
                         xy=(x[i], y[i]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        return (plt.show())

    def mapa_conceptual(self, model, conceptos):
        "Creates and TSNE model and plots it"
        labels = []
        tokens = []

        for word in conceptos:
            try:
                tokens.append(model.wv[word])
                labels.append(word)
            except KeyError:
                continue

        tsne_model = TSNE(perplexity=40,
                          n_components=2,
                          init='pca',
                          n_iter=2500,
                          random_state=23)
        new_values = tsne_model.fit_transform(tokens)

        x = []
        y = []
        for value in new_values:
            x.append(value[0])
            y.append(value[1])

        plt.figure(figsize=(7, 7))
        for i in range(len(x)):
            plt.scatter(x[i], y[i])
            plt.annotate(labels[i],
                         xy=(x[i], y[i]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        return (plt.show())

    def df_conceptual(self, w2v_model, lista_concepos):
        # For each concept found in the model vocabulary, record its two most
        # similar terms as (source, target, score) rows.
        df = pd.DataFrame()
        for concepto in lista_concepos:
            try:
                result = w2v_model.wv.most_similar(
                    positive=[concepto.lower()], topn=2)
            except KeyError:
                # Concept not present in the vocabulary: skip it.
                continue
            for r in result:
                df = df.append(
                    {
                        'source': concepto,
                        'target': r[0],
                        'score': r[1]
                    },
                    ignore_index=True)
            # Alternative 'bigram' row format expected by grafo_conceptual:
            # df = df.append({'bigram': (concepto, result[0][0]),
            #                 'score': result[0][1]}, ignore_index=True)
        return df

    def df_conceptual_json(self, w2v_model, lista_concepos):
        # Like df_conceptual, but takes the four most similar terms per concept
        # and normalizes the labels with replay() before returning them.
        df = pd.DataFrame()
        for concepto in lista_concepos:
            try:
                result = w2v_model.wv.most_similar(
                    positive=[concepto.lower()], topn=4)
            except KeyError:
                continue
            for r in result:
                df = df.append(
                    {
                        'source': self.replay(concepto),
                        'target': self.replay(r[0]),
                        'score': r[1]
                    },
                    ignore_index=True)
        return df

    def df_conceptual_jsonF(self, w2v_model, lista_concepos, tope):
        # Same as df_conceptual_json, but the number of similar terms per
        # concept is given by 'tope'.
        df = pd.DataFrame()
        for concepto in lista_concepos:
            try:
                result = w2v_model.wv.most_similar(
                    positive=[concepto.lower()], topn=tope)
            except KeyError:
                continue
            for r in result:
                df = df.append(
                    {
                        'source': self.replay(concepto),
                        'target': self.replay(r[0]),
                        'score': r[1]
                    },
                    ignore_index=True)
        return df

    def grafo_conceptual(self, df):
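        # Expects a DataFrame in the 'bigram' format: a 'bigram' column holding
        # (source, target) tuples and a 'score' column (see the alternative row
        # format noted in df_conceptual).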
        # Create network plot
        d = df.set_index('bigram').T.to_dict('records')
        G = nx.Graph()
        # Create connections between nodes
        for k, v in d[0].items():
            G.add_edge(k[0], k[1], weight=(v * 100))

        fig, ax = plt.subplots(figsize=(100, 100))
        pos = nx.spring_layout(G, k=2)
        # Plot networks
        nx.draw_networkx(G,
                         pos,
                         font_size=70,
                         width=3,
                         edge_color='grey',
                         node_color='purple',
                         with_labels=True,
                         ax=ax)
        # Create offset labels
        for key, value in pos.items():
            x, y = value[0] + .135, value[1] + .045
            ax.text(x,
                    y,
                    s=str(key),
                    bbox=dict(facecolor='red', alpha=0.25),
                    horizontalalignment='center',
                    fontsize=13)

        return (plt.show())

    def ResumirDocumento(self, texto, id_documento):
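        # Frequency-based extractive summary: score each sentence by the summed
        # frequency of its non-stopword words, keep sentences scoring above
        # 1.2x the average, and store the result in documento_resumen.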
        try:
            ##nltk.download('stopwords')
            ##nltk.download('punkt')
            SW = set(stopwords.words("spanish"))
            text = texto
            words = word_tokenize(text)

            # Frequency table of the words in the text.
            freqTable = dict()

            # Walk over the tokens and count every non-stopword word.
            for word in words:
                word = word.lower()  # lowercase before counting
                if word in SW:
                    continue  # skip stopwords
                if word in freqTable:
                    freqTable[word] += 1  # already seen: increment its count
                else:
                    freqTable[word] = 1  # first occurrence

            # Sentences to score, and a dictionary holding each sentence's score.
            sentences = sent_tokenize(text)
            sentenceValue = dict()

            # Score each sentence by the summed frequency of the words it contains.
            for sentence in sentences:
                for word, freq in freqTable.items():
                    if word in sentence.lower():
                        if sentence in sentenceValue:
                            sentenceValue[sentence] += freq
                        else:
                            sentenceValue[sentence] = freq

            # Sum of all sentence scores.
            sumValues = 0
            for sentence in sentenceValue:
                sumValues += sentenceValue[sentence]

            # Average sentence score over the original text.
            average = int(sumValues / len(sentenceValue))

            # Build the summary from the sentences scoring above 1.2x the average.
            summary = ''
            for sentence in sentences:
                if (sentence in sentenceValue) and (sentenceValue[sentence] >
                                                    (1.2 * average)):
                    summary += " " + sentence

            df = pd.DataFrame(columns=('id_documento', 'resumen'))
            df = df.append({
                'id_documento': id_documento,
                'resumen': summary
            },
                           ignore_index=True)
            df.to_sql('documento_resumen',
                      con=self.dao.engine,
                      if_exists='append',
                      index=False)
            self.dao.engine.dispose()
            return (summary)
        except Exception:
            # On failure, store an error marker instead of a summary.
            df = pd.DataFrame(columns=('id_documento', 'resumen'))
            df = df.append(
                {
                    'id_documento': id_documento,
                    'resumen': 'ERROR_RESUMEN'
                },
                ignore_index=True)
            df.to_sql('documento_resumen',
                      con=self.dao.engine,
                      if_exists='append',
                      index=False)
            self.dao.engine.dispose()
            return ("ERROR_RESUMEN")

    def getDoc2vec(self, id_documento):
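        # fn_evaluacion returns candidate documents and their distance to the
        # given document; 'grupo' flags whether the distance is under the 0.34
        # threshold (0) or not (1).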
        sql = "select vec.*,distancia,case when distancia<0.34 then 0 else 1 end grupo from fn_evaluacion(:id_documento) ml join documento d on ml.id_documento=d.id_documento join documentos_doc2_vec  vec on vec.id_documento=ml.id_documento"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={'id_documento': id_documento})
        self.dao.engine.dispose()
        return df