def updateEventoKeyWordEntities(sender, instance, created, **kwargs):
    assunto = instance.assunto
    # instantiate the NLP model
    nlp = pt_core_news_sm.load()
    doc = nlp(assunto)
    # tokenization / pre-processing
    tokens = pre_processing(doc)
    # query Dialogflow for the existing entity types
    client = dialogflow_v2.EntityTypesClient()
    parent = client.project_agent_path(os.environ['PROJECT_ID'])
    list_entity_types_response = list(client.list_entity_types(parent))
    # extend the existing entity list with the newly processed entities
    entity_type = list_entity_types_response[2]
    entities = list(entity_type.entities)
    for token in tokens:
        entities.append({'value': token.lemma_, 'synonyms': [token.text]})
    # submit the entities to Dialogflow
    response = client.batch_update_entities(entity_type.name, entities)
    response.result()  # block until the long-running update finishes
    # train the agent
    client = dialogflow_v2.AgentsClient()
    project_parent = client.project_path(os.environ['PROJECT_ID'])
    client.train_agent(project_parent)
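A minimal sketch of how this post-save handler might be registered, assuming a Django app whose model is named Evento (the model name and module paths are inferred from the handler, not confirmed by the source):

from django.db.models.signals import post_save

from .models import Evento  # hypothetical model exposing an `assunto` field
from .handlers import updateEventoKeyWordEntities  # wherever the handler lives

# run the handler every time an Evento instance is saved
post_save.connect(updateEventoKeyWordEntities, sender=Evento)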
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''Returns the spaCy nlp object corresponding to the language of a document.'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        nlp = None  # no model available for this language
    return nlp
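A quick usage sketch, assuming the small models above are installed; the supported-languages list here is just an illustration:

SUPPORTED = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp = get_spacy_tokenizer("Portuguese", SUPPORTED, bigmodel_required=False)
doc = nlp("O gato sentou no tapete.")
print([token.text for token in doc])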
def get_pos_tags(tweet):
    """
    Takes a tweet (a single string) and returns its list of POS tags.
    """
    # note: loading the model on every call is slow; for bulk tagging,
    # load it once at module level instead
    nlp = pt_core_news_sm.load()
    doc = nlp(tweet)
    tag_list = [w.pos_ for w in doc]
    return tag_list
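For illustration, tagging a single Portuguese sentence (the exact tags depend on the model version):

tags = get_pos_tags("O Brasil venceu o jogo ontem.")
print(tags)  # something like ['DET', 'PROPN', 'VERB', 'DET', 'NOUN', 'ADV', 'PUNCT']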
def _nlp(spacy_module: str) -> NLP:
    print(f"Loading spaCy language model for '{spacy_module}'")
    if spacy_module == 'en':
        nlp = en_core_web_sm.load()
    elif spacy_module == 'es':
        nlp = es_core_news_sm.load()
    elif spacy_module == 'de':
        nlp = de_core_news_sm.load()
    elif spacy_module == 'fr':
        nlp = fr_core_news_sm.load()
    elif spacy_module == 'it':
        nlp = it_core_news_sm.load()
    elif spacy_module == 'pt':
        nlp = pt_core_news_sm.load()
    else:
        raise ValueError(f'Unsupported language {spacy_module}')
    return nlp
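Typical call pattern; note that unsupported codes raise rather than returning None:

nlp = _nlp('pt')
doc = nlp('Uma frase de exemplo.')

try:
    _nlp('ru')
except ValueError as err:
    print(err)  # Unsupported language ru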
def obtemListasPortIng(lista):
    """Splits a list of terms into Portuguese words, English words and
    unrecognized words, using lemmas, stems and WordNet lookups."""
    listaPortAux = []
    listaIngAux = []
    listaPalavrasNaoReconAux = []
    nlp = pt_core_news_sm.load()
    for termo in lista:
        doc = nlp(termo)
        w = [token.lemma_ for token in doc]
        # Portuguese if the lemma or the stem is in the Portuguese vocabulary
        # and WordNet does not recognize the term as English
        if ((w[0] in vocabPortugues)
                or (nltk.PorterStemmer().stem(termo)) in vocabPortugues) \
                and (nltk.corpus.wordnet.morphy(termo) not in vocabIngles):
            listaPortAux.append(termo)
        else:
            # unrecognized if it appears in neither vocabulary
            if ((w[0] not in vocabPortugues)
                    or (nltk.PorterStemmer().stem(termo)) not in vocabPortugues) \
                    and (nltk.corpus.wordnet.morphy(termo) not in vocabIngles):
                listaPalavrasNaoReconAux.append(termo)
            else:
                listaIngAux.append(termo)
    return listaPortAux, listaIngAux, listaPalavrasNaoReconAux
def __init__(self, url):
    try:
        pattern = re.compile(
            r"^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$"
        )
        if not pattern.match(url):
            print(f"{url} is not a valid url")
        self.url = url
        self.article = Article(self.url)
        self.article.download()
        self.article.parse()
        self.author = self.article.authors
        self.oneline = self.article.summary  # stays empty unless article.nlp() is called
        self.text = self.article.text.replace("\n", ".")
        # str.find() returns -1 (truthy) when there is no match, so test
        # membership with `in` instead
        if self.article.meta_lang == 'en' or (self.article.meta_lang == ''
                                              and "cnn.com" in url):
            import en_core_web_sm
            self.model = en_core_web_sm.load()
        elif self.article.meta_lang == 'it':
            import it_core_news_sm
            self.model = it_core_news_sm.load()
        elif self.article.meta_lang == 'fr':
            import fr_core_news_sm
            self.model = fr_core_news_sm.load()
        elif self.article.meta_lang == 'es':
            import es_core_news_sm
            self.model = es_core_news_sm.load()
        elif self.article.meta_lang == 'pt':
            import pt_core_news_sm
            self.model = pt_core_news_sm.load()
        else:
            print(f"The {self.article.meta_lang} language is not supported")
        self.data = []
        self.vectorizer = TfidfVectorizer(strip_accents='unicode')
    except ArticleException:  # assumes `from newspaper import Article, ArticleException`
        print(
            f"The url {url} is not supported, please write to [email protected] for further help"
        )
        self.valid = False
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other']:
        # discard a cached model whose pipeline no longer matches
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(disable=nlp_disable)
            # Other languages fall back to the English model
            elif lang == 'other':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin); the ISO 639-1 code is 'sr'
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
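A usage sketch: the real `main` is the application object, so a bare namespace stands in for it here (the function only touches main.__dict__ for this code path):

from types import SimpleNamespace

main = SimpleNamespace()  # stand-in for the real application object
check_spacy_models(main, 'por', 'lemmatization')
nlp = main.__dict__['spacy_nlp_por']
print(nlp.pipe_names)  # ['tagger'], with parser and ner disabled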
plt.savefig('ward_clusters.png', dpi=200)  # save figure as ward_clusters
# REFERENCE: http://brandonrose.org/clustering

"""TextRank for Tweet Summarization"""
!pip install -U spacy
!pip install -U scikit-learn
!python -m spacy download pt_core_news_sm

# import libraries
import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import pt_core_news_sm

nlp = pt_core_news_sm.load()

import os

path2 = 'drive/MyDrive/Project/Preprocess_Data'
for filename in os.listdir(path2):
    if filename == 'Pfizer.txt':
        txt_file = path2 + "/" + filename
        with open(txt_file, "r", encoding="utf-8") as f:
            text = " ".join(f.readlines())
        doc = nlp(text)
        corpus = [sent.text.lower() for sent in doc.sents]
        cv = CountVectorizer(stop_words=list(STOP_WORDS))
        cv_fit = cv.fit_transform(corpus)
        word_list = cv.get_feature_names()
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        # discard a cached model whose pipeline no longer matches
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(disable=nlp_disable)

    if 'sbd' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        if 'sbd' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
import fr_core_news_sm
nlp_fr = fr_core_news_sm.load()
import en_core_web_sm
nlp_en = en_core_web_sm.load()
import de_core_news_sm
nlp_de = de_core_news_sm.load()
import es_core_news_sm
nlp_es = es_core_news_sm.load()
import it_core_news_sm
nlp_it = it_core_news_sm.load()
import pt_core_news_sm
nlp_pt = pt_core_news_sm.load()
import nl_core_news_sm
nlp_nl = nl_core_news_sm.load()

# global variables
wnl = WordNetLemmatizer()
html_parser = HTMLParser()
stopword_list = []
language = ""


def init_lib(lang):
    global stopword_list, language
    nltk.download('stopwords')
def summarization(args):
    with open(args.original_text, "r", encoding="utf-8") as f:
        text = " ".join(f.readlines())

    if args.language == 'portuguese':
        import pt_core_news_sm
        nlp = pt_core_news_sm.load()
    else:
        import en_core_web_sm
        nlp = en_core_web_sm.load()

    doc = nlp(text)
    corpus = [sent.text.lower() for sent in doc.sents]
    cv = CountVectorizer(stop_words=list(STOP_WORDS))
    cv_fit = cv.fit_transform(corpus)
    word_list = cv.get_feature_names()
    count_list = cv_fit.toarray().sum(axis=0)

    # zip(*iterables) pairs up elements from each iterable; converting the
    # resulting tuples to a dict gives a {word: frequency} mapping
    word_frequency = dict(zip(word_list, count_list))

    val = sorted(word_frequency.values())

    # check words with higher frequencies
    higher_word_frequencies = [
        word for word, freq in word_frequency.items() if freq in val[-3:]
    ]
    print("\nWords with higher frequencies: ", higher_word_frequencies)

    # get relative frequencies of words
    higher_frequency = val[-1]
    for word in word_frequency.keys():
        word_frequency[word] = (word_frequency[word] / higher_frequency)

    # SENTENCE RANKING: rank sentences by the frequencies of their words
    sentence_rank = {}
    for sent in doc.sents:
        for word in sent:
            if word.text.lower() in word_frequency.keys():
                if sent in sentence_rank.keys():
                    sentence_rank[sent] += word_frequency[word.text.lower()]
                else:
                    sentence_rank[sent] = word_frequency[word.text.lower()]

    top_sentences = sorted(sentence_rank.values())[::-1]
    top_sent = top_sentences[:args.nb_sentences]

    # mount the summary
    summary = []
    for sent, strength in sentence_rank.items():
        if strength in top_sent:
            summary.append(sent)

    # return original text and summary
    return text, summary
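Since the function only reads attributes off `args`, a plain Namespace works for a quick run (the input file name here is hypothetical):

from argparse import Namespace

args = Namespace(original_text='Pfizer.txt',  # hypothetical input file
                 language='portuguese',
                 nb_sentences=3)
text, summary = summarization(args)
for sent in summary:
    print(sent.text)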
import pt_core_news_sm
import operator
from nltk.tokenize import sent_tokenize

dictEQ = {}  # entity -> count
dictEL = {}  # entity -> label

i = 1
link = ""
titulo = ""
data = ""
noticia = ""
parser = pt_core_news_sm.load()

with open("baseUnica.txt", "r", encoding="utf-8") as docs:
    for linha in docs:
        trx = linha.rstrip()
        if i == 1:
            link = trx
        elif i == 2:
            titulo = trx
        elif i == 3:
            data = trx
        elif trx == "YippieKiYay":  # apparently a record separator between news items
            i = 0
            lista_tknzd = sent_tokenize(titulo)
            lista_tknzd += sent_tokenize(noticia)
            noticia = ""
            for sents in lista_tknzd:
                parsedEx = parser(sents)
                ents = list(parsedEx.ents)
#!/usr/bin/python3
import spacy
from spacy import displacy
import pt_core_news_sm
import de_core_news_sm
import en_core_web_sm
from fuzzywuzzy import fuzz, process

NAMED_ENTITY_MINIMUM_LENGTH = 3
SIMILARITY_RATIO_THRESHOLD = 70

NLP_PT = pt_core_news_sm.load()
NLP_DE = de_core_news_sm.load()
NLP_EN = en_core_web_sm.load()

MODELS = {
    'de': NLP_DE,
    'en': NLP_EN,
    'pt': NLP_PT,
}


def _get_nlp_model(language):
    return MODELS[language]


def _get_named_entities(text, language):
    nlp_model = _get_nlp_model(language)
    named_entities = nlp_model(text).ents
    return named_entities
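Example lookup against the German model; the exact entities returned depend on the model version:

entities = _get_named_entities("Angela Merkel besuchte gestern Berlin.", 'de')
for ent in entities:
    print(ent.text, ent.label_)  # e.g. "Angela Merkel PER", "Berlin LOC"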