def __init__(self):
    self.nlp_model = es_core_news_sm.load()  # load spaCy with the Spanish model
    self.nlp_model.max_length = 4000000
    stop = stopwords.words('spanish')  # NLTK stopwords in Spanish
    sw_es = self.nlp_model.Defaults.stop_words  # spaCy stopwords in Spanish
    self.sw = sw_es.union(stop)
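A minimal usage sketch of the merged spaCy/NLTK stopword set built above; the class name `Preprocessor` is an assumption, since the original does not show the enclosing class:

# Hypothetical usage of the merged stopword set from __init__ above.
# `Preprocessor` is an assumed class name, not shown in the original snippet.
pre = Preprocessor()
doc = pre.nlp_model("Esta es una frase de ejemplo para filtrar palabras vacías")
content_tokens = [t.text for t in doc if t.text.lower() not in pre.sw]
print(content_tokens)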
def respond(sentence, diccionario):
    # Load spaCy and parse the sentence
    nlp = es_core_news_sm.load()
    doc = nlp(sentence)
    # To print the tags spaCy provides:
    # for word in doc:
    #     print(word.text, word.pos_, word.tag_)
    # Split a sentence containing a multi-part order into separate orders
    parsed = process_order_into_orders(doc)
    # Detect which order the client wants (about 80% working: it passes the
    # tests but has not been verified thoroughly)
    ordenes = detect_order(parsed)
    # Act on each detected order.
    # Note: "ordenar" is the most complete, "remover" is mostly done,
    # "cambiar" is incomplete, and more order types are expected later.
    # print("ORDENES: ", ordenes)
    response = ""
    for idx, orden in enumerate(ordenes):
        if orden == "ordenar":
            resp, diccionario = add_to_list(parsed[idx], diccionario)
            response += resp
        elif orden == "remover":
            resp, diccionario = remove_from_list(parsed[idx], diccionario)
            response += resp
        elif orden == "recomendar":
            response += recomend(parsed[idx])
        elif orden == "mostrar":
            resp = show_list(diccionario)
            response += resp
    # print(diccionario)
    return response, diccionario
def limpiezaDatos(self, text):
    nlp = es_core_news_sm.load()
    text = nlp(text)
    tokenized_sentences = [sentence.text for sentence in text.sents]
    datos = []
    for sentence in tokenized_sentences:
        for entity in nlp(sentence).ents:
            consulta = 'SELECT ?s ?p ?o WHERE { ?s ?p ?o .FILTER regex(str(?s), "%s") .}' % (
                entity.text)
            for row in self.g.query(consulta):
                tripleta = []
                sujeto = row.s
                predicado = row.p.split("/")
                objeto = row.o.split("/")
                objetoUri = row.o
                predicado = predicado[len(predicado) - 1]
                objeto = objeto[len(objeto) - 1]
                # if '$' in objetoUri:
                #     objetoUri = ''
                tripleta.append(entity.text)
                tripleta.append(sujeto)
                tripleta.append(predicado)
                tripleta.append(objeto)
                tripleta.append(objetoUri)
                datos.append(tripleta)
    datos = OrderedDict((tuple(x), x) for x in datos).values()
    lista = []
    for i in datos:
        lista.append(i)
    return lista
def limpiezaDatos(self, text):
    # spaCy library
    nlp = es_core_news_sm.load()
    text = nlp(text)
    tokenized_sentences = [sentence.text for sentence in text.sents]
    g = rdflib.Graph()
    # RDF file name
    g.parse("mydataset.rdf")
    datos = []
    for sentence in tokenized_sentences:
        for entity in nlp(sentence).ents:
            consulta = 'SELECT ?s ?p ?o WHERE { ?s ?p ?o .FILTER regex(str(?s), "%s") .}' % (
                entity.text)
            for row in g.query(consulta):
                tripleta = []
                predicado = row.p.split("/")
                objeto = row.o.split("/")
                predicado = predicado[len(predicado) - 1]
                objeto = objeto[len(objeto) - 1]
                tripleta.append(entity.text)
                tripleta.append(predicado)
                tripleta.append(objeto)
                datos.append(tripleta)
    # remove duplicates
    datos = OrderedDict((tuple(x), x) for x in datos).values()
    lista = []
    for i in datos:
        lista.append(i)
    return lista
def preprocess(text):
    sp = es_core_news_sm.load()
    # Lowercase the text
    text_lower = text.lower()
    # Tokenize
    token_word = nltk.word_tokenize(text_lower, "spanish")
    # Some texts have strange characters at the start; remove them so they do
    # not affect the rest of the pipeline
    token_word[0] = clear_first_token(token_word[0])
    # Spanish stopwords
    stopword_spanish = stopwords.words("spanish")
    i = 0
    while i < len(token_word):
        # Drop tokens that appear in the stopword list
        if token_word[i] in stopword_spanish:
            token_word.remove(token_word[i])
        # Drop any token that is not purely alphabetic
        elif not token_word[i].isalpha():
            token_word.remove(token_word[i])
        else:
            # NLTK does not lemmatize Spanish, so spaCy is used for lemmatization
            word = sp(token_word[i])[0]
            token_word[i] = word.lemma_
            i = i + 1
    return token_word
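A quick check of preprocess on a short Spanish sentence; the exact lemmas depend on the installed es_core_news_sm version, so the output shown is illustrative only:

# Illustrative call; lemmas vary with the spaCy model version.
tokens = preprocess("Los gatos están durmiendo en la casa")
print(tokens)  # e.g. ['gato', 'dormir', 'casa']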
class Semantico():
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()

    def consultaVirutoso(self, texto):
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        datos = []
        entidades = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades.append(entity.text)
                # improved query: match the entity in either subject or object
                consulta = """
                    SELECT ?s ?p ?o
                    WHERE
                    {
                       ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                    }
                """ % (entity.text, entity.text)
                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    lista = []
                    listaS = result["s"]["value"]
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    # skip results with this link (to review):
                    # if listaO.startswith('http://www.openlinks'):
                    lista.append(listaS)
                    lista.append(listaP)
                    lista.append(listaO)
                    datos.append(lista)
        # Remove duplicates
        # entidades = list(set(entidades))
        return datos, entidades

    def textoHtml(self, texto, entidades):
        for palabra in entidades:
            if palabra in texto:
                url = '<a href = "http://localhost:8080/negociador/page/{}">{}</a>'.format(
                    palabra, palabra)
                if url not in texto:
                    texto = texto.replace(palabra, url)
        return texto

    def consultaPorUri(self, uri):
        consulta = """
            SELECT ?p ?o
            WHERE
            {
              <%s> ?p ?o
            }
        """ % (uri)
        self.sbcEndpoint.setQuery(consulta)
        self.sbcEndpoint.setReturnFormat(JSON)
        results = self.sbcEndpoint.query().convert()
        return results["results"]["bindings"]
def _load_spacy(self):
    # LOAD SPACY
    t1 = timeit.default_timer()
    spacy_nlp = es_core_news_sm.load()
    spacy_nlp.max_length = 1500000
    t2 = timeit.default_timer()
    print(f"Time to load spaCy: {t2 - t1}")
    return spacy_nlp
def __init__(self):
    self.dao = Dao()
    self.module_url = "./Modelos/modelo_dm0_20_2000.txt"
    self.embed = gensim.models.doc2vec.Doc2Vec.load(self.module_url)
    self.nlp = es_core_news_sm.load()
    self.stop = set(stopwords.words('spanish'))
    self.non_words = list(punctuation)
    self.non_words.extend(['¿', '¡'])
    self.non_words.extend(self.stop)
    self.non_words.extend(map(str, range(10)))
def _nlp(spacy_module: str) -> Optional[NLP]:
    print("Loading spacy language model for '", spacy_module, "'")
    if spacy_module == 'en':
        nlp = en_core_web_sm.load()
    elif spacy_module == 'es':
        nlp = es_core_news_sm.load()
    elif spacy_module == 'de':
        nlp = de_core_news_sm.load()
    elif spacy_module == 'fr':
        nlp = fr_core_news_sm.load()
    elif spacy_module == 'it':
        nlp = it_core_news_sm.load()
    elif spacy_module == 'pt':
        nlp = pt_core_news_sm.load()
    else:
        raise ValueError(f'Unsupported language {spacy_module}')
    return nlp
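The dispatcher above selects a pipeline by two-letter language code; a hedged usage sketch for the Spanish branch:

# Load the Spanish pipeline and tokenize a sentence.
nlp = _nlp('es')
doc = nlp("El gato duerme en el sofá")
print([token.text for token in doc])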
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''returns the spacy nlp function corresponding to the language of a document'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        # Return None explicitly so the caller does not hit an UnboundLocalError
        nlp = None
    return nlp
class Semantico():
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()

    def consultaVirutoso(self, texto):
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        datos = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                palabra = self.limpiarDatos(entity)
                consulta = """
                    SELECT ?s ?p ?o
                    WHERE
                    {
                       ?s ?p ?o .FILTER regex(str(?s), "%s") .
                    }
                """ % (palabra)
                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    lista = []
                    listaS = result["s"]["value"]
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    lista.append(listaS)
                    lista.append(listaP)
                    lista.append(listaO)
                    datos.append(lista)
        datos = OrderedDict((tuple(x), x) for x in datos).values()
        return datos

    def limpiarDatos(self, palabra):
        palabra = str(palabra)
        print('***' * 10)
        print(palabra)
        palabra = palabra.replace(' ', '_')
        palabra = palabra.replace('á', 'a')
        palabra = palabra.replace('é', 'e')
        palabra = palabra.replace('í', 'i')
        palabra = palabra.replace('ó', 'o')
        palabra = palabra.replace('ú', 'u')
        print(palabra)
        print('***' * 10)
        return palabra
def __init__(self, url):
    try:
        pattern = re.compile(
            "^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$"
        )
        if not pattern.match(url):
            print(f"{url} is not a valid url")
        self.url = url
        self.article = Article(self.url)
        self.article.download()
        self.article.parse()
        self.author = self.article.authors
        self.oneline = self.article.summary
        self.text = self.article.text.replace("\n", ".")
        # str.find() returns -1 when there is no match, so compare against -1
        # instead of relying on truthiness
        if self.article.meta_lang == 'en' or (
                self.article.meta_lang == '' and url.find("cnn.com", 0, 10) != -1):
            import en_core_web_sm
            self.model = en_core_web_sm.load()
        elif self.article.meta_lang == 'it':
            import it_core_news_sm
            self.model = it_core_news_sm.load()
        elif self.article.meta_lang == 'fr':
            import fr_core_news_sm
            self.model = fr_core_news_sm.load()
        elif self.article.meta_lang == 'es':
            import es_core_news_sm
            self.model = es_core_news_sm.load()
        elif self.article.meta_lang == 'pt':
            import pt_core_news_sm
            self.model = pt_core_news_sm.load()
        else:
            print(f"The {self.article.meta_lang} language is not supported")
        self.data = []
        self.vectorizer = TfidfVectorizer(strip_accents='unicode')
    except article.ArticleException:
        print(
            f"The url {url} is not supported, please write to [email protected] for further help"
        )
        self.valid = False
def master_tags_spanish(serie_word: pd.Series):
    """
    Applies the per-document tagging function to each news item.
    @input: serie_word: Series containing the text of every news item
    @return: tags_pos: dictionary with the news id as keys and the list of
             (token, POS tag) tuples as values
    """
    # spaCy model for Spanish POS tags
    nlp = es_core_news_sm.load()
    # names file
    df_names = pd.read_csv('data/external/stopwords_names.txt')
    names = list(df_names['name'])
    # apply the per-document function to each text
    tags_pos = serie_word.apply(lambda x: process_tags_spanish(x, names, nlp))
    tags_pos = dict(tags_pos)
    return tags_pos
def respuesta(mensaje, database):
    nlp = es_core_news_sm.load()
    parsed = nlp(str(mensaje))
    pronoun, aux, noun, adjective, verb, det = find_candidate_parts_of_speech(
        parsed)
    respuesta = check_for_comment_about_bot(pronoun, noun, adjective, det)
    # esNuevo = check_for_experience(pronoun, aux, noun, verb, det)
    print("respuesta", respuesta)
    noEntendi = [
        "Puedes ser un poco más claro", "Podrías ser mas especifico",
        "No te entiendo -.-", "Ni idea!", "Preguntas serias por favor!",
        "En que te puedo ayudar?"
    ]
    if not respuesta:
        respuesta = construir_respuesta(mensaje, pronoun, aux, noun, verb,
                                        det, database)
    if not respuesta:
        respuesta = random.choice(noEntendi)
    # logger.info("Returning phrase '%s'", resp)
    return respuesta
def post(self, request):
    form = BuscadorForm(request.POST)
    my_title = "Caso Arroz Verde"
    if form.is_valid():
        nlp = es_core_news_sm.load()
        text = form.cleaned_data['query']
        textoInicial = form.cleaned_data['query']
        text = nlp(text)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        print(tokenized_sentences)
        g = rdflib.Graph()
        g.parse("arroz_verde.rdf")
        datos = []
        form = BuscadorForm()
        for sentence in tokenized_sentences:
            for entity in nlp(sentence).ents:
                consulta = 'SELECT ?s ?p ?o WHERE { ?s ?p ?o .FILTER regex(str(?s), "%s") .}' % (
                    entity.text)
                for row in g.query(consulta):
                    tripleta = []
                    predicado = row.p.split("/")
                    objeto = row.o.split("/")
                    predicado = predicado[len(predicado) - 1]
                    objeto = objeto[len(objeto) - 1]
                    tripleta.append(entity.text)
                    tripleta.append(predicado)
                    tripleta.append(objeto)
                    datos.append(tripleta)
        print(datos)
        args = {
            "titulo": my_title,
            "datos": datos,
            "form": form,
            "textoInicial": textoInicial
        }
        return render(request, self.template_name, args)
def get_data(reviews, label_and_terms):
    """
    Generate the data in the proper format to train the NER model by matching
    the search terms of the entities in the texts of the reviews.
    """
    # Load the spaCy statistical model
    nlp = es_core_news_sm.load()
    # Disable unneeded pipeline components
    nlp.disable_pipes('ner', 'tagger', 'parser')
    data = []
    for review in tqdm(reviews):
        # Match the terms of the entities in the text
        matches_in_text = get_matches_in_proper_format(review, label_and_terms, nlp)
        matches_info_in_text = {"entities": matches_in_text}
        # Create a tuple with the text and the matches
        row_data = (review, matches_info_in_text)
        data.append(row_data)
    return data
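The tuples produced by get_data follow spaCy's offset-based training format; a small hand-written illustration of what one row is expected to look like (the text, offsets, and the "FOOD" label are illustrative, not taken from the original data):

# One training example in spaCy's (text, annotations) format:
example = (
    "La paella del restaurante estaba deliciosa",
    {"entities": [(3, 9, "FOOD")]},  # (start_char, end_char, label); values are illustrative
)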
        infer_datetime_format=True)
    return dataframe[new_column]


def dropcolumns(dataframe, columns):
    dataframe = dataframe.drop(columns, axis=1)
    return dataframe


def modifiedDate(dataframe, column, year, month, day):
    dataframe = dataframe[(dataframe[column] > datetime.date(year, month, day))]
    return dataframe


nlp = es_core_news_sm.load(parser=True)
nlp.Defaults.stop_words |= {
    "RT", "próx", "xd", "rt", "htt", "parir", "sobrar", "the", "and",
    "gracias", "hola", "jajaja", "jajajaja", "hablar", "comer", "personar",
    "you", "with", "casar", "was", "that", "what", "pasar", "salir"
}


def spacyTokenizer(sentence):
    sentence = re.sub(r"htt\S+", '', sentence)
    sentence = re.sub('@', '', sentence)
    tokens = nlp(sentence)
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        if lemma not in STOP_WORDS and re.search("^[a-zA-Z]{2}\w+", lemma):
def preprocessing_text(text, lemmatize=True):
    '''
    INPUT: string tweet
    OUTPUT: str w/ emojis, urls, numbers, and reserved words removed
    '''
    def remove_symbols(word, symbol_set):
        return ''.join(char for char in word if char not in symbol_set)

    def fix_lemmatized_hashtags(tweet):
        '''
        Lemmatizing function separates # and word.
        This function returns string that rejoins hashtags
        '''
        tokens = []
        for i, j in enumerate(tweet.split()):
            if j == '#':
                j = tweet.split()[i] + tweet.split()[i + 1]
                tokens.append(j)
                continue
            if (tweet.split()[i - 1] == '#'):
                continue
            elif j != '#':
                tokens.append(j)
        return ' '.join(tokens)

    # define stopwords
    stop_words_sp = stopwords.words('spanish')
    stop_words_en = stopwords.words('english')
    stop_words = stop_words_sp + stop_words_en + [' ']

    # define punctuation
    punct = set('!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~¿… °¡')

    # remove laughter
    matcher = re.compile(r'(ja)\1*')
    jaja = [match.group() for match in matcher.finditer(text)]
    jaja += ['lol', 'LOL', 'Lol', 'LoL']
    text = ' '.join([word for word in text.split() if word not in jaja])

    if lemmatize:
        # Lemmatize and rejoin
        nlp = es_core_news_sm.load()
        nlp_text = nlp(text)
        text = ' '.join([token.lemma_ for token in nlp_text])
        text = fix_lemmatized_hashtags(text)
    else:
        # Stem and rejoin
        stemmer = SnowballStemmer('spanish')
        text = ' '.join([stemmer.stem(token) for token in text.split()])

    # remove emojis, urls, numbers, and reserved words
    p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.NUMBER, p.OPT.RESERVED)
    clean_text = p.clean(text)

    # split tweet, remove stopwords, and len(words) <= 2
    clean_text = [
        word for word in clean_text.split()
        if (remove_symbols(word, punct).lower() not in stop_words) and (
            word not in punct) and (len(remove_symbols(word, punct)) > 2) and (
                p.clean(remove_symbols(word, punct)) != '')
    ]
    clean_text = [
        word.lower() if word.startswith('@') else remove_symbols(word, punct).lower()
        for word in clean_text
    ]
    return clean_text
for palabra in texto:
    steem = spanishStemmer.stem(palabra)
    raiz.append(steem)
raiz

# In[129]:

common_steem = Counter(raiz).most_common()
common_steem[0:10]

# # BONUS Point 4:
# # - Count how many occurrences there are for each part of speech

# In[96]:

import es_core_news_sm
spacy_es = es_core_news_sm.load()

# In[128]:

componente = []
a = " ".join(texto)
b = spacy_es(a)
for palabra in b:
    componente.append(palabra.pos_)
Counter(componente).most_common()
def loadindex(request):
    # Load the Spanish spaCy language model
    nlp = es_core_news_sm.load()
    my_title = "Caso Arroz Verde"
    texto = ""
    datos = []
    # Load the sample text
    if request.method == "POST" and 'prueba' in request.POST:
        texto = request.POST["textoprueba"]
    # Get the input text
    if request.method == "POST" and 'buscar' in request.POST:
        texto = request.POST["palabraClave"]
    texto = limpiarDatos(texto)
    text = nlp(texto)
    # Tokenize the input text with spaCy
    tokenized_sentences = [sentence.text for sentence in text.sents]
    entidadSpacy = []
    # Named entity recognition with spaCy
    for sentence in tokenized_sentences:
        for entity in nlp(sentence).ents:
            spacyEntidad = entity.text
            entidadSpacy.append(spacyEntidad)
    # Remove duplicates from the list
    entidadSpacy = list(set(entidadSpacy))
    # Entity counting
    etiquetaEntidad = []
    count = {}
    claves = []
    valores = []
    dicEntidades = []
    for sentence in entidadSpacy:
        for entity in nlp(sentence).ents:
            entidad = entity.text
            etiqueta = entity.label_
            etiquetaEntidad.append(etiqueta)
            dicEntidades.append({"entidad": entidad, "etiqueta": etiqueta})
    # Count entities per label
    count = countDistinct(etiquetaEntidad)
    keyes = count.keys()
    values = count.values()
    for elemento in keyes:
        claves.append(elemento)
    for elemento in values:
        valores.append(elemento)
    palabras_limpias = []
    for enti in entidadSpacy:
        # Clean the data for the query
        palabra = enti
        palabra = palabra.replace(' ', '_')
        palabra = palabra.replace('á', 'a')
        palabra = palabra.replace('é', 'e')
        palabra = palabra.replace('í', 'i')
        palabra = palabra.replace('ó', 'o')
        palabra = palabra.replace('ú', 'u')
        palabras_limpias.append(palabra)
        datos = listaEntidadesPropias(datos, palabra)
    # Remove duplicate triples
    datos = OrderedDict((tuple(x), x) for x in datos).values()
    # Value of the input text
    mis_entidades = texto
    # Print the text with spaCy entity labels
    for enti in palabras_limpias:
        # Get the index of each word in the list
        indice = palabras_limpias.index(enti)
        if indice == len(palabras_limpias):
            break
        else:
            tripletaResultante = anotacion(enti)
            print("triple Salida\t", tripletaResultante)
            if tripletaResultante is not None:
                for uri in tripletaResultante:
                    print("URI encontrada:\t", uri)
                    valorUri = uri.split("/")
                    valorUri = valorUri[len(valorUri) - 1]
                    valorUri = valorUri.replace('_', ' ')
                    if valorUri in entidadSpacy[indice]:
                        entidadEtiquetada = '<a href="' + uri + '">' + valorUri + " " + etiquetaEntidad[
                            indice] + "</a>"
                        # Annotate after obtaining the URI
                        mis_entidades = mis_entidades.replace(
                            entidadSpacy[indice], entidadEtiquetada)
                        valorUri = displacy.render(text, style="ent")
                    else:
                        print("Texto no relacionado")
            else:
                print("No hay resultados para este entidad")
    var = charts()
    # Context dictionary for rendering the template
    context = {
        'my_title': my_title,
        'claves': claves,
        'valores': valores,
        'dicEntidades': dicEntidades,
        'mis_entidades': mis_entidades,
        'datos': datos,
        'text': var
    }
    return render(request, "index.html", context)
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other']:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)
            # Other Languages
            elif lang == 'other':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('rs')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('rs')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
import spacy
from spacy.lang.es.stop_words import STOP_WORDS
import es_core_news_sm

nlp = es_core_news_sm.load()
# nlp = spacy.load("es_core_news_sm", disable=['ner', 'parser', 'tagger'])


def clean(word):
    w = word
    while len(w) >= 1 and w[0] in [
            '/', '-', '¡', '¿', '.', ',', ';', ':', '\'', '"', '?', '!'
    ]:
        w = w[1:]
    while len(w) >= 1 and w[-1] in [
            '/', '-', '¡', '¿', '.', ',', ';', ':', '\'', '"', '?', '!'
    ]:
        w = w[:-1]
    return w


# Tokenizer based on spacy only
def tokenizer(doc, lowercase=True):
    return [x.orth_ for x in nlp(doc.lower() if lowercase else doc)]


# Tokenizing and deleting stopwords
def tokenizer_wo_stopwords(doc, lowercase=True):
    return [
        x.orth_ for x in nlp(doc.lower() if lowercase else doc)
        if x.orth_ not in STOP_WORDS
    ]
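A short demonstration of the two tokenizers defined above (assuming the stopword-filtered variant with its parameter name corrected to lowercase, as shown):

print(tokenizer("¡Hola, mundo! Esto es una prueba."))
print(tokenizer_wo_stopwords("¡Hola, mundo! Esto es una prueba."))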
class Semantico():
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()

    def consultaVirutoso(self, texto):
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        datos = []
        datos2 = []
        listaAux2 = []
        auxO = ""
        listaTipos2 = []
        listaAux = []
        listaCountTipos = []
        listaCountTipos2 = []
        listaTipos3 = []
        entidades = []
        lista = []
        entidades2 = []
        auxiliar = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades2.append(entity.text)
                # print("sem")
                # print(entity.text)
                palabras = difflib.get_close_matches(entity.text, [
                    'Rafael Correa', 'Odebrecht', 'Alexis Mera', 'CWNE',
                    'SK Engeenering'
                ])
                palabras2 = ''.join(palabras)
                if len(palabras2) > 0:
                    entidades.append(palabras2)
                    consulta = """
                        SELECT ?s ?p ?o
                        WHERE
                        {
                           ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                        }
                    """ % (palabras2.replace(' ', ''), palabras2)
                else:
                    entidades.append(entity.text)
                    consulta = """
                        SELECT ?s ?p ?o
                        WHERE
                        {
                           ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                        }
                    """ % (entity.text.replace(' ', ''), entity)
                # if len(palabras2) > 0:
                #     entidades.append(palabras2)
                # else:
                #     t = entity.text.split(" ")
                #     if len(t) > 1:
                #         for i in range(len(t)):
                #             auxiliar.append(entity.text.split())
                #         for palabraEn in auxiliar:
                #             # improved query
                #             print(palabraEn)
                #             consulta = """
                #                 SELECT ?s ?p ?o
                #                 WHERE
                #                 {
                #                    ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                #                 }
                #             """ % (palabraEn(' ', ''), palabraEn)
                #     else:
                #         print("one token")
                #         consulta = """
                #             SELECT ?s ?p ?o
                #             WHERE
                #             {
                #                ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                #             }
                #         """ % (entity.text.replace(' ', ''), entity)
                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    listaTipos = []
                    contador = []
                    listaTipos2 = []
                    listaS = result["s"]["value"].strip()
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    # skip results with this link (to review):
                    # if listaO.startswith('http://www.openlinks'):
                    lista.append(listaS)
                    lista.append(listaP)
                    aux2 = listaP.rsplit('/', 1).pop()
                    if aux2 == "type":
                        listaTipos.append(listaO.rsplit('/', 1).pop())
                        listaTipos.append(listaS.rsplit('/', 1).pop())
                        auxO = listaO.rsplit('/', 1).pop()
                        auxO = ''.join(auxO)
                        # print(auxO)
                        listaAux.append(auxO)
                        listaTipos2.append(auxO)
                        # print(listaTipos2)
                    lista.append(listaO)
                    listaAux2 = listaAux
                    listaAux2 = list(set(listaAux2))
                    # print(listaTipos2)
                    listaTipos3.append(listaTipos2)
                    listaTipos2 = [x for x in listaTipos if x != []]
                    listaTipos3 = [x for x in listaTipos3 if x != []]
                    # print(auxO)
                    # counter = listaTipos3.count(auxO)
                    # print(counter)
                    # print(auxO, ", ", listaS)
                    datos2.append(listaTipos2)
                    datos2.sort()
                    # print(listaTipos3)
                datos.append(lista)
        for tipos in listaAux2:
            # print(tipos)
            # print(listaAux)
            counter = listaAux.count(tipos)
            listCount = []
            listCount.append(counter)
            liste = [counter, tipos]
            # print(counter)
            listaCountTipos.append(liste)
            # listaCountTipos2.append(tipos)
            # listaCountTipos.append(listaCountTipos2)
            # listaCountTipos.append(listaCountTipos2)
        print(listaCountTipos)
        # print(datos2)
        # Remove duplicates
        # entidades = list(set(entidades))
        return datos, entidades, datos2, entidades2, listaCountTipos

    def textoHtml(self, texto, entidades2):
        aux2 = ""
        listaObjetos = []
        # print(entidades2)
        for palabra in entidades2:
            if palabra in texto:
                consulta2 = """
                    PREFIX cavr: <http://localhost:8080/mydataset/schema/>
                    SELECT ?s ?o
                    WHERE {
                        ?s cavr:label ?o .
                    }"""
                self.sbcEndpoint.setQuery(consulta2)
                self.sbcEndpoint.setReturnFormat(JSON)
                results3 = self.sbcEndpoint.query().convert()
                for result in results3["results"]["bindings"]:
                    listaS = result["s"]["value"].strip()
                    listaO = result["o"]["value"].strip()
                    aux2 = listaO.rsplit('/', 1).pop()
                    aux6 = listaS.rsplit('/', 1).pop()
                    listaObjetos.append(aux2)
                listaObjetos = list(set(listaObjetos))
                palabraUnica = difflib.get_close_matches(palabra, listaObjetos)
                palabraUnica = ''.join(palabraUnica)
                # print(palabraUnica)
                if len(palabraUnica) > 0:
                    palabra2 = palabraUnica
                else:
                    # fall back to the entity itself so palabra2 is always defined
                    palabra2 = palabra
                consulta = """
                    PREFIX cavr: <http://localhost:8080/mydataset/schema/>
                    SELECT ?s ?o
                    WHERE {
                        ?s cavr:label ?o .FILTER (regex(str(?o), "%s")) .
                    }""" % (palabra2)
                # print(consulta)
                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results2 = self.sbcEndpoint.query().convert()
                # print(results2)
                for result in results2["results"]["bindings"]:
                    listaS = result["s"]["value"].strip()
                    aux2 = listaS.rsplit('/', 1).pop()
                    # palabra = palabra.replace('í', 'i')
                    url = '<a href = "{}">{}</a>'.format(listaS, palabra)
                    # print(url)
                    if url not in texto:
                        texto = texto.replace(palabra, url)
        return texto

    def getTipos(self, texto):
        print(texto)

    def consultaPorUri(self, uri):
        consulta = """
            SELECT ?p ?o
            WHERE
            {
              <%s> ?p ?o
            }
        """ % (uri)
        self.sbcEndpoint.setQuery(consulta)
        self.sbcEndpoint.setReturnFormat(JSON)
        results = self.sbcEndpoint.query().convert()
        return results["results"]["bindings"]
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)

    if 'sbd' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        if 'sbd' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
from html.parser import HTMLParser
import unicodedata
from tqdm.auto import tqdm
from nltk.corpus import wordnet as wn

import fr_core_news_sm
nlp_fr = fr_core_news_sm.load()
import en_core_web_sm
nlp_en = en_core_web_sm.load()
import de_core_news_sm
nlp_de = de_core_news_sm.load()
import es_core_news_sm
nlp_es = es_core_news_sm.load()
import it_core_news_sm
nlp_it = it_core_news_sm.load()
import pt_core_news_sm
nlp_pt = pt_core_news_sm.load()
import nl_core_news_sm
nlp_nl = nl_core_news_sm.load()

# global variables
wnl = WordNetLemmatizer()
html_parser = HTMLParser()
stopword_list = []
language = ""
class Semantico():
    # SPARQL endpoint
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()
    g = rdflib.Graph()
    g.parse("datos.rdf")

    def consultaVirutoso(self, texto):
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        datos = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                consulta = """
                    SELECT ?s ?p ?o
                    WHERE
                    {
                       ?s ?p ?o .FILTER regex(str(?s), "%s") .
                    }
                """ % (entity.text)
                self.sbcEndpoint.setQuery(consulta)
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    lista = []
                    listaS = result["s"]["value"]
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    lista.append(listaS)
                    lista.append(listaP)
                    lista.append(listaO)
                    datos.append(lista)
        return datos

    def obtenerRecursos(self, uri):
        datos = []
        consulta = """SELECT ?p ?o WHERE { <%s> ?p ?o }""" % (uri)
        datos = []
        for row in self.g.query(consulta):
            recursos = []
            p = row.p.split("/")
            o = row.o.split("/")
            p = p[len(p) - 1]
            o = o[len(o) - 1]
            recursos.append(row.p)
            recursos.append(p)
            recursos.append(row.o)
            recursos.append(o)
            datos.append(recursos)
        return datos

    def limpiezaDatos(self, text):
        nlp = es_core_news_sm.load()
        text = nlp(text)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        datos = []
        for sentence in tokenized_sentences:
            for entity in nlp(sentence).ents:
                consulta = 'SELECT ?s ?p ?o WHERE { ?s ?p ?o .FILTER regex(str(?s), "%s") .}' % (
                    entity.text)
                for row in self.g.query(consulta):
                    tripleta = []
                    sujeto = row.s
                    predicado = row.p.split("/")
                    objeto = row.o.split("/")
                    objetoUri = row.o
                    predicado = predicado[len(predicado) - 1]
                    objeto = objeto[len(objeto) - 1]
                    # if '$' in objetoUri:
                    #     objetoUri = ''
                    tripleta.append(entity.text)
                    tripleta.append(sujeto)
                    tripleta.append(predicado)
                    tripleta.append(objeto)
                    tripleta.append(objetoUri)
                    datos.append(tripleta)
        datos = OrderedDict((tuple(x), x) for x in datos).values()
        lista = []
        for i in datos:
            lista.append(i)
        return lista
import re  # regular expressions
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import spacy
import es_core_news_sm  # import the model
import unidecode
import pickle

nlp = es_core_news_sm.load()  # load the model
sb_spanish = SnowballStemmer('spanish')  # strip plurals (stemming)
nltk.download("punkt")  # sentence tokenizer | divides a text

#################################################################
#########               PREPROCESSING                   #########
#################################################################


def regex(text):
    # delete any character that is not a word character
    new_text = re.sub("\W+", " ", text)
    # delete numbers ([+] means one or more occurrences)
    new_text = re.sub("\d+", " ", new_text).strip()
    return new_text


def lower_case(text):
    # convert to lowercase
    new_text = text.lower()
    return new_text
import es_core_news_sm
from language_service.dto.word import Word

parser = es_core_news_sm.load()


def tag_spanish(text):
    return [
        Word(token=word.text, tag=word.pos_, lemma=word.lemma_)
        for word in parser(text)
    ]
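tag_spanish returns one Word per token; a hedged usage sketch, assuming the Word fields mirror the constructor arguments above:

# Each Word carries the token text, coarse POS tag, and lemma.
for w in tag_spanish("Los niños juegan en el parque"):
    print(w.token, w.tag, w.lemma)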
class Semantico():
    # SPARQL endpoint
    sbcEndpoint = SPARQLWrapper("http://localhost:8890/sparql/")
    nlp = es_core_news_sm.load()
    # g = rdflib.Graph()
    # g.parse("datos.rdf")

    def consultaVirutoso(self, texto):
        # tokenize the text with spaCy
        text = self.nlp(texto)
        tokenized_sentences = [sentence.text for sentence in text.sents]
        # render the analysed text with entity styling
        spacyText = displacy.render(text, style="ent")
        # declare empty lists
        datos = []
        datostype = []
        datoscompani = []
        entidades = []
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades.append(entity.text)
                palabra = self.limpiarDatos(entity)
                consulta = """
                    SELECT ?s ?p ?o
                    WHERE
                    {
                       ?s ?p ?o .FILTER (regex(str(?s), "%s") || regex(str(?o), "%s")) .
                    }
                """ % (palabra, palabra)
                self.sbcEndpoint.setQuery(consulta)
                # return the query results in JSON format
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    lista = []
                    listaS = result["s"]["value"]
                    listaP = result["p"]["value"]
                    listaO = result["o"]["value"]
                    lista.append(listaS)
                    lista.append(listaP)
                    lista.append(listaO)
                    datos.append(lista)
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades.append(entity.text)
                palabra = self.limpiarDatos(entity)
                consultatype = """
                    PREFIX caseav: <http://localhost:8080/Data/page/>
                    SELECT ?o
                    WHERE
                    {
                        {?s caseav:hasNombrePersona ?o .FILTER (regex(str(?o), "%s")) .}
                        UNION
                        {?s caseav:hashasApellidoPersona ?o .FILTER (regex(str(?o), "%s")) .}
                        UNION
                        {?s caseav:hasCodigo ?o .FILTER (regex(Str(?o), "%s")) .}
                        UNION
                        {?s caseav:hasNombreCompletoPersona ?o .FILTER (regex(Str(?o), "%s")) .}
                    }
                """ % (palabra, palabra, palabra, palabra)
                self.sbcEndpoint.setQuery(consultatype)
                # return the query results in JSON format
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    listae = []
                    # listaSe = result["s"]["value"]
                    # listaPe = result["p"]["value"]
                    listaOe = result["o"]["value"]
                    # listae.append(listaSe)
                    # listae.append(listaPe)
                    listae.append(listaOe)
                    datostype.append(listae)
        for sentence in tokenized_sentences:
            for entity in self.nlp(sentence).ents:
                entidades.append(entity.text)
                palabra = self.limpiarDatos(entity)
                consultacompani = """
                    PREFIX caseav: <http://localhost:8080/Data/page/>
                    SELECT ?s ?o
                    WHERE
                    {
                        {?s caseav:hasNombreEmpresa ?o .FILTER (regex(str(?o), "%s")) .}
                    }
                """ % (palabra)
                self.sbcEndpoint.setQuery(consultacompani)
                # return the query results in JSON format
                self.sbcEndpoint.setReturnFormat(JSON)
                results = self.sbcEndpoint.query().convert()
                for result in results["results"]["bindings"]:
                    listaec = []
                    # listaSe = result["s"]["value"]
                    # listaPe = result["p"]["value"]
                    listaOec = result["o"]["value"]
                    # listae.append(listaSe)
                    # listae.append(listaPe)
                    listaec.append(listaOec)
                    datoscompani.append(listaec)
        return datos, entidades, spacyText, datostype, datoscompani

    def textoHtml(self, texto, entidades):
        for palabra in entidades:
            if palabra in texto:
                # replace accented characters
                # -> NFD and strip diacritics
                s = re.sub(
                    r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
                    r"\1",
                    normalize("NFD", palabra), 0, re.I)
                # -> NFC
                s = normalize('NFC', s)
                palabraUrl = s.replace(" ", "_")
                palabraUrl = self.limpiarDatos(palabraUrl)
                url = '<a href = "http://localhost:8080/Data/page/{}">{}</a>'.format(
                    palabraUrl, palabra)
                if url not in texto:
                    texto = texto.replace(palabra, url)
        return texto

    def limpiarDatos(self, palabra):
        palabra = str(palabra)
        palabra = palabra.replace(' ', '_')
        palabra = palabra.replace('á', 'a')
        palabra = palabra.replace('é', 'e')
        palabra = palabra.replace('í', 'i')
        palabra = palabra.replace('ó', 'o')
        palabra = palabra.replace('ú', 'u')
        palabra = palabra.replace('Alianza_Pais', 'AP')
        palabra = palabra.replace('Alianza_PAIS', 'AP')
        palabra = palabra.replace('Hidalgo_&_Hidalgo', 'HIDALGO_&_HIDALGO')
        palabra = palabra.replace('Fopeca', 'FOPECA')
        palabra = palabra.replace('Midisa_S.A', 'MIDISA_S.A')
        palabra = palabra.replace('Pamela_Martinez', 'Maria_Pamela_Martinez_Loaiza')
        return palabra
class Service:
    dao = Dao()
    module_url = "./Modelos/modelo_dm0_20_2000.txt"
    embed = gensim.models.doc2vec.Doc2Vec.load(module_url)
    nlp = es_core_news_sm.load()
    stop = set(stopwords.words('spanish'))
    non_words = list(punctuation)
    non_words.extend(['¿', '¡'])
    non_words.extend(stop)
    non_words.extend(map(str, range(10)))

    def __init__(self):
        self.dao = Dao()
        self.module_url = "./Modelos/modelo_dm0_20_2000.txt"
        self.embed = gensim.models.doc2vec.Doc2Vec.load(self.module_url)
        self.nlp = es_core_news_sm.load()
        self.stop = set(stopwords.words('spanish'))
        self.non_words = list(punctuation)
        self.non_words.extend(['¿', '¡'])
        self.non_words.extend(self.stop)
        self.non_words.extend(map(str, range(10)))

    def getModelo(self, id_documento):
        rutaModelo = "./Modelos/" + str(id_documento) + ".dat"
        modelo = Word2Vec.load(rutaModelo)
        return modelo

    def limpiar(self, text):
        self.nlp.max_length = 10000000
        text = text.lower()
        doc = self.nlp(text, disable=['ner', 'parser'])
        # lemmas = [t.norm_ for t in doc if not t.is_punct | t.is_stop and t not in stop]
        lemmas = [t.norm_ for t in doc if not t.is_punct | t.is_stop]
        words = [t.lower() for t in lemmas if len(t) > 3 and t.isalpha()]
        return words

    def GetResumenes(self):
        sql = "select * from documento_resumen"
        df = pd.read_sql(sa.text(sql), self.dao.engine)
        self.dao.engine.dispose()
        return df

    def GetResumenesD(self):
        sql = "select id_documento,resumen from documento_resumen"
        df = pd.read_sql(sa.text(sql), self.dao.engine)
        self.dao.engine.dispose()
        return df

    def proceso_lda(self, df):
        corpus = []
        stem = PorterStemmer()
        lem = WordNetLemmatizer()
        for news in df['contenido'].dropna():
            words = [w for w in word_tokenize(news) if (w not in self.stop)]
            words = [lem.lemmatize(w) for w in words if len(w) > 2]
            corpus.append(words)
        return corpus

    def GetDocumentoConResumen(self):
        sql = "select id_documento,resumen from documento"
        df = pd.read_sql(sa.text(sql), self.dao.engine)
        self.dao.engine.dispose()
        return df

    def GetDocumentosRelacionados(self, idDocumento):
        sql = "select * from public.fn_documentos_relacionados(:idDocumento)"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={'idDocumento': idDocumento})
        self.dao.engine.dispose()
        return df

    def GetResumenDocumentosRelacionados(self, idDocumento):
        sql = "select * from public.fn_documentos_resumen_relacionados(:idDocumento)"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={'idDocumento': idDocumento})
        self.dao.engine.dispose()
        return df

    def GetDocumentos(self):
        sql = "select * from documento"
        df = pd.read_sql(sa.text(sql), self.dao.engine)
        self.dao.engine.dispose()
        return df

    def GetVectors(self, texto):
        busqueda_limpia = self.limpiar(texto)
        vector = self.embed.infer_vector(busqueda_limpia)
        return vector

    def GetBusqueda(self, busqueda):
        busqueda_limpia = self.limpiar(busqueda)
        vector = self.embed.infer_vector(busqueda_limpia)
        sql = "select * from public.fn_busqueda_documentos(:v0,:v1,:v2,:v3,:v4,:v5,:v6,:v7,:v8,:v9,:v10,:v11,:v12,:v13,:v14,:v15,:v16,:v17,:v18,:v19)"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={
                             'v0': str(vector[0]),
                             'v1': str(vector[1]),
                             'v2': str(vector[2]),
                             'v3': str(vector[3]),
                             'v4': str(vector[4]),
                             'v5': str(vector[5]),
                             'v6': str(vector[6]),
                             'v7': str(vector[7]),
                             'v8': str(vector[8]),
                             'v9': str(vector[9]),
                             'v10': str(vector[10]),
                             'v11': str(vector[11]),
                             'v12': str(vector[12]),
                             'v13': str(vector[13]),
                             'v14': str(vector[14]),
                             'v15': str(vector[15]),
                             'v16': str(vector[16]),
                             'v17': str(vector[17]),
                             'v18': str(vector[18]),
                             'v19': str(vector[19])
                         })
        self.dao.engine.dispose()
        return df

    def reconocer_conceptos(self, texto):
        self.nlp = spacy.load('es_core_news_sm')
        self.doc = self.nlp(texto)
        conceptos = ""
        for palabra in self.doc.ents:
            p = palabra.text.replace(" ", "_")
            for t in palabra.text.split(' '):
                d1 = self.nlp(t.lower())
                for ent in d1:
                    if not ent.is_punct and not ent.is_stop and len(ent) > 1:
                        conceptos = str(conceptos) + " " + str(ent)
        return conceptos

    def tokenize(self, text):
        text = text.lower()
        doc1 = self.nlp(text)
        lemmas = [t.norm_ for t in doc1 if not t.is_punct | t.is_stop]
        lemmas = [t.lemma_ for t in doc1 if not t.is_punct | t.is_stop]
        words = [t.lower() for t in lemmas if len(t) > 1 and t.isalpha()]
        return words

    def tokenizer(self, text):
        text = text.lower()
        doc1 = self.nlp(text)
        lemmas = [t.norm_ for t in doc1 if not t.is_punct | t.is_stop]
        lemmas = [t.lemma_ for t in doc1 if not t.is_punct | t.is_stop]
        words = [t.lower() for t in lemmas if len(t) > 1 and t.isalpha()]
        return ''.join(words)

    def replay(self, text):
        text = text.replace("algoritmos", "algoritmo")
        text = text.replace("patrones", "patrón")
        text = text.replace("acopladas", "acoplada")
        text = text.replace("herramientas", "herramienta")
        text = text.replace("tablas", "tabla")
        text = text.replace("frecuentes", "frecuente")
        text = text.replace("sistemas", "sistema")
        text = text.replace("itemsets", "itemset")
        text = text.replace("generar", "genera")
        text = text.replace("tareas", "tarea")
        text = text.replace("paquetes", "paquete")
        text = text.replace("árboles", "árbol")
        text = text.replace("artistas", "artista")
        text = text.replace("maestros", "maestro")
        text = text.replace("carrozas", "carroza")
        text = text.replace("carnavales", "carnaval")
        text = text.replace("bosques", "bosque")
        text = text.replace("garrapatas", "garrapata")
        text = text.replace("blood", "sangre")
        text = text.replace("niños", "niño")
        text = text.replace("niñas", "niña")
        text = text.replace("juegos", "juego")
        text = text.replace("clases", "clase")
        text = text.replace("estudiantes", "estudiante")
        text = text.replace("docentes", "docente")
        text = text.replace("jugar", "juego")
        text = text.replace("jugando", "juego")
        text = text.replace("hombres", "hombre")
        text = text.replace("mujeres", "mujer")
        text = text.replace("relaciones", "relación")
        text = text.replace("casos", "caso")
        text = text.replace("suicidas", "suicida")
        text = text.replace("fríjol", "frijol")
        if text == 'items':
            text = text.replace("items", "items")
        return ''.join(text)

    def getTexto(self, id_documento):
        sql = "select resumen from documento where id_documento=:v0"
        # sql = "select contenido as resumen from documento where id_documento=:v0"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={'v0': id_documento})
        self.dao.engine.dispose()
        contenido = ''
        for res in df.resumen:
            contenido = contenido + ' ' + res
        return contenido

    def modelo_wordwvec(self, texto):
        self.nlp = spacy.load('es_core_news_sm')
        self.doc = self.nlp(texto)
        sent = []
        for num, oracion in enumerate(self.doc.sents):
            o = self.tokenize(str(oracion))
            sent.append(o)
        # Build the relevant phrases from the list of sentences:
        phrases = Phrases(sent, min_count=30, progress_per=10000)
        # Transform the corpus based on the detected bigrams:
        bigram = Phraser(phrases)
        sentences = bigram[sent]
        # most frequent words
        word_freq = defaultdict(int)
        for sent in sentences:
            for i in sent:
                word_freq[i] += 1
        # Train the model
        cores = multiprocessing.cpu_count()  # number of CPU cores on the machine
        w2v_model = Word2Vec(min_count=2,
                             window=3,
                             size=300,
                             sample=6e-5,
                             alpha=0.03,
                             min_alpha=0.0007,
                             negative=20,
                             workers=cores - 1)
        t = time()
        w2v_model.build_vocab(sentences, progress_per=10000)  # prepare the model vocabulary
        print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
        t = time()
        w2v_model.train(sentences,
                        total_examples=w2v_model.corpus_count,
                        epochs=6000,
                        report_delay=1)
        print('Time to train the model: {} mins'.format(
            round((time() - t) / 60, 2)))
        return w2v_model

    def tsne_plot(self, model):
        "Creates a TSNE model and plots it"
        labels = []
        tokens = []
        for word in model.wv.vocab:
            tokens.append(model[word])
            labels.append(word)
        tsne_model = TSNE(perplexity=40,
                          n_components=2,
                          init='pca',
                          n_iter=2500,
                          random_state=23)
        new_values = tsne_model.fit_transform(tokens)
        x = []
        y = []
        for value in new_values:
            x.append(value[0])
            y.append(value[1])
        plt.figure(figsize=(16, 16))
        for i in range(len(x)):
            plt.scatter(x[i], y[i])
            plt.annotate(labels[i],
                         xy=(x[i], y[i]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        return (plt.show())

    def mapa_conceptual(self, model, conceptos):
        "Creates a TSNE model and plots it"
        labels = []
        tokens = []
        for word in conceptos:
            try:
                tokens.append(model[word])
                labels.append(word)
            except KeyError:
                continue
        tsne_model = TSNE(perplexity=40,
                          n_components=2,
                          init='pca',
                          n_iter=2500,
                          random_state=23)
        new_values = tsne_model.fit_transform(tokens)
        x = []
        y = []
        for value in new_values:
            x.append(value[0])
            y.append(value[1])
        plt.figure(figsize=(7, 7))
        for i in range(len(x)):
            plt.scatter(x[i], y[i])
            plt.annotate(labels[i],
                         xy=(x[i], y[i]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        return (plt.show())

    def df_conceptual(self, w2v_model, lista_concepos):
        df = pd.DataFrame()
        i = 0
        while i < len(lista_concepos):
            a = []
            s = []
            try:
                result = w2v_model.wv.most_similar(
                    positive=[lista_concepos[i].lower()], topn=2)
                for r in result:
                    a.append(r[0])
                    s.append(r[1])
                    df = df.append(
                        {
                            'source': lista_concepos[i],
                            'target': r[0],
                            'score': r[1]
                        },
                        ignore_index=True)
                # df = df.append({'bigram': (lista_concepos[i], a[0]), 'score': s[0]}, ignore_index=True)
                i = i + 1
            except KeyError:
                i = i + 1
                continue
        return df

    def df_conceptual_json(self, w2v_model, lista_concepos):
        df = pd.DataFrame()
        i = 0
        while i < len(lista_concepos):
            a = []
            s = []
            try:
                result = w2v_model.wv.most_similar(
                    positive=[lista_concepos[i].lower()], topn=4)
                for r in result:
                    a.append(r[0])
                    s.append(r[1])
                    df = df.append(
                        {
                            'source': self.replay(lista_concepos[i]),
                            'target': self.replay(r[0]),
                            'score': r[1]
                        },
                        ignore_index=True)
                # df = df.append({'source': lista_concepos[i], 'target': r[0], 'score': r[1]}, ignore_index=True)
                # df = df.append({'source': lista_concepos[i], 'target': a[0], 'score': s[0]}, ignore_index=True)
                i = i + 1
            except KeyError:
                i = i + 1
                continue
        return df

    def df_conceptual_jsonF(self, w2v_model, lista_concepos, tope):
        df = pd.DataFrame()
        i = 0
        while i < len(lista_concepos):
            a = []
            s = []
            try:
                result = w2v_model.wv.most_similar(
                    positive=[lista_concepos[i].lower()], topn=tope)
                for r in result:
                    a.append(r[0])
                    s.append(r[1])
                    df = df.append(
                        {
                            'source': self.replay(lista_concepos[i]),
                            'target': self.replay(r[0]),
                            'score': r[1]
                        },
                        ignore_index=True)
                # df = df.append({'source': lista_concepos[i], 'target': r[0], 'score': r[1]}, ignore_index=True)
                # df = df.append({'source': lista_concepos[i], 'target': a[0], 'score': s[0]}, ignore_index=True)
                i = i + 1
            except KeyError:
                i = i + 1
                continue
        return df

    def grafo_conceptual(self, df):
        # Create network plot
        d = df.set_index('bigram').T.to_dict('records')
        G = nx.Graph()
        # Create connections between nodes
        for k, v in d[0].items():
            G.add_edge(k[0], k[1], weight=(v * 100))
        fig, ax = plt.subplots(figsize=(100, 100))
        pos = nx.spring_layout(G, k=2)
        # Plot networks
        nx.draw_networkx(G,
                         pos,
                         font_size=70,
                         width=3,
                         edge_color='grey',
                         node_color='purple',
                         with_labels=True,
                         ax=ax)
        # Create offset labels
        for key, value in pos.items():
            x, y = value[0] + .135, value[1] + .045
            ax.text(x,
                    y,
                    s=str(key),
                    bbox=dict(facecolor='red', alpha=0.25),
                    horizontalalignment='center',
                    fontsize=13)
        return (plt.show())

    def ResumirDocumento(self, texto, id_documento):
        try:
            # nltk.download('stopwords')
            # nltk.download('punkt')
            SW = set(stopwords.words("spanish"))
            text = texto
            words = word_tokenize(text)
            # Build a frequency table for the words.
            freqTable = dict()
            # Walk the text and store each word in the table.
            for word in words:
                word = word.lower()  # lowercase the word before storing it
                if word in SW:
                    continue  # skip stopwords
                if word in freqTable:
                    freqTable[word] += 1  # word already seen: add 1 to its count
                else:
                    freqTable[word] = 1  # first occurrence: initialise the count to 1
            # sentences holds the sentences of the text to be scored.
            sentences = sent_tokenize(text)
            # sentenceValue stores the score of each sentence.
            sentenceValue = dict()
            # Score every sentence of the text.
            for sentence in sentences:
                # Walk the items of the frequency table.
                for word, freq in freqTable.items():
                    if word in sentence.lower():
                        # The word appears in the (lowercased) sentence.
                        if sentence in sentenceValue:
                            sentenceValue[sentence] += freq  # add its frequency
                        else:
                            sentenceValue[sentence] = freq  # start with its frequency
            # Sum of all sentence scores.
            sumValues = 0
            for sentence in sentenceValue:
                sumValues += sentenceValue[sentence]
            # Average score of a sentence in the original text.
            average = int(sumValues / len(sentenceValue))
            # Build the summary.
            summary = ''
            for sentence in sentences:
                # Keep sentences whose score is above 1.2 times the average.
                if (sentence in sentenceValue) and (sentenceValue[sentence] >
                                                    (1.2 * average)):
                    summary += " " + sentence
            df = pd.DataFrame(columns=('id_documento', 'resumen'))
            df = df.append({
                'id_documento': id_documento,
                'resumen': summary
            }, ignore_index=True)
            df.to_sql('documento_resumen',
                      con=self.dao.engine,
                      if_exists='append',
                      index=False)
            self.dao.engine.dispose()
            return (summary)
        except:
            df = pd.DataFrame(columns=('id_documento', 'resumen'))
            df = df.append(
                {
                    'id_documento': id_documento,
                    'resumen': 'ERROR_RESUMEN'
                },
                ignore_index=True)
            df.to_sql('documento_resumen',
                      con=self.dao.engine,
                      if_exists='append',
                      index=False)
            self.dao.engine.dispose()
            return ("ERROR_RESUMEN")

    def getDoc2vec(self, id_documento):
        sql = "select vec.*,distancia,case when distancia<0.34 then 0 else 1 end grupo from fn_evaluacion(:id_documento) ml join documento d on ml.id_documento=d.id_documento join documentos_doc2_vec vec on vec.id_documento=ml.id_documento"
        df = pd.read_sql(sa.text(sql),
                         self.dao.engine,
                         params={'id_documento': id_documento})
        self.dao.engine.dispose()
        return df