def __init__(self, url, variant="seria"):
    # inputs
    self.url = url
    if not isinstance(variant, str) or variant not in ["seria", "gamberra"]:
        raise TypeError(
            """The parameter 'variant' of the tree must be a string containing
            the name of the desired variant. Allowed values are "seria" or "gamberra".""")
    self.variant = variant
    # load Spanish model
    self._nlp = es_core_news_md.load()
    # load data
    self._load_data()
    # initialize article
    self.article = Article(self.url)
    # download and parse article
    self.article.download()
    self.article.parse()
    # store basic info
    self.text = self.article.text
    self.source_url = self.article.source_url
    if self.source_url in self.df_url_media.media_url.tolist():
        self.recognized_media = True
    else:
        self.recognized_media = False
    if self.recognized_media:
        self.media_name = self.df_url_media.loc[
            self.df_url_media.media_url == self.source_url, 'media_name'].iloc[0]
    else:
        self.media_name = self.source_url
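# Illustrative, non-executable sketch of how the constructor above is typically
# called; the class name "NoticiaTree" and the URL are hypothetical placeholders,
# not taken from the original project.
#
#   tree = NoticiaTree("https://example.com/articulo", variant="gamberra")
#   print(tree.media_name, tree.recognized_media)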
def _test_biluov_task():
    import es_core_news_md
    from scripts.utils import Sentence

    def forward(tokensxsentence, entitiesxsentence):
        labelsxsentence, _ = to_biluov(tokensxsentence, entitiesxsentence)
        return [
            from_biluov(biluov, sentence, spans=True)
            for biluov, sentence in zip(labelsxsentence, tokensxsentence)
        ]

    training = Collection().load(Path("data/training/scenario.txt"))
    nlp = es_core_news_md.load()

    def per_label(label):
        tokensxsentence = [nlp(s.text) for s in training.sentences]
        entitiesxsentence = [[k.spans for k in s.keyphrases if k.label == label]
                             for s in training.sentences]
        decoded = forward(tokensxsentence, entitiesxsentence)
        return decoded

    collection = Collection([Sentence(s.text) for s in training.sentences])
    for label in ENTITIES:
        decoded = per_label(label)
        for entities, sentence in zip(decoded, collection.sentences):
            for spans in entities:
                keyphrase = Keyphrase(sentence, label, -1, spans)
                sentence.keyphrases.append(keyphrase)
    collection.fix_ids()

    output = Path("data/submissions/forward-biluov/train/run1/scenario2-taskA/")
    output.mkdir(parents=True, exist_ok=True)
    collection.dump(output / "scenario.txt", skip_empty_sentences=False)
def __init__(self,
             counter=Counter(),
             popcounter=Counter(),
             lemmatizer=WordNetLemmatizer()):
    self.__counter = counter
    self.__popCounter = popcounter
    self.lemmatizer = lemmatizer
    self.punctuation = "¡!\"$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    self.stopWords = set(
        stopwords.words("spanish") + stopwords.words("english"))
    print("loading spaCy model: es_core_news_md")
    self.nlp = es_core_news_md.load()
def get_entities(self, document: str, language: str = 'english'):
    '''Takes a document and returns a list of extracted entities.'''
    if language == 'spanish':
        try:
            import es_core_news_md
            logger.info("Success importing es_core_news_md")
        except ImportError:
            logger.error("Error importing es_core_news_md")
            sys.exit(-1)
    else:
        try:
            import en_core_web_md
            logger.info("Success importing en_core_web_md")
        except ImportError:
            logger.error("Error importing en_core_web_md")
            sys.exit(-1)

    if isinstance(document, List):
        document = " ".join(document)

    # if the language given is not the name of a spaCy parser, try to convert it to one
    parser_name = language if language in LANG_TO_PARSER.values() \
        else LANG_TO_PARSER.get(language.lower())
    if not parser_name:
        raise Exception('language not supported')

    # if the requested parser is not already in memory, try to load it from spaCy
    if parser_name not in PARSERS:
        try:
            logger.info("Trying to load parser.")
            if language == 'spanish':
                PARSERS[parser_name] = es_core_news_md.load()
            else:
                PARSERS[parser_name] = en_core_web_md.load()
            # PARSERS[parser_name] = spacy.load(parser_name)
        except Exception:
            logger.exception("Error loading parser")
            sys.exit(-1)
    else:
        logger.info("Found parser %s in memory" % parser_name)

    def get_ents(doc):
        '''Prep, parse, then extract entities from doc text.'''
        doc = prep_text(doc)  # preprocess string
        doc = PARSERS[parser_name](doc)  # parse prepped doc
        ents = set(ent.text for ent in doc.ents
                   if not self.filter_entity(ent))  # extract entities
        return list(ents)

    return get_ents(document)
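# Illustrative call of get_entities above; "extractor" stands for an instance of
# the containing class (not shown in this snippet) and the sentence is a made-up
# example, so this is a sketch rather than runnable code.
#
#   ents = extractor.get_entities("Pedro Sánchez visitó Madrid el lunes.", language='spanish')
#   print(ents)  # a list of unique entity strings that survive filter_entity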
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''Returns the spaCy nlp function corresponding to the language of a document.'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        nlp = None  # avoid returning an unbound variable for unsupported languages
    return nlp
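# A minimal usage sketch for get_spacy_tokenizer above: ask for the medium Spanish
# pipeline and tag one sentence. The supported-language list and the sample text
# are assumptions for illustration, and es_core_news_md must be installed.
supported_demo = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp_demo = get_spacy_tokenizer("Spanish", supported_demo, bigmodel_required=True)
doc_demo = nlp_demo("El modelo etiqueta cada token con su categoría gramatical.")
print([(token.text, token.pos_) for token in doc_demo])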
def __init__(self): """ Initialize the extractor """ # Create the taxonomy from files self._create_taxonomy() # List of words self.__sources, self.__reporters, self.__entities = [], [], [] # Store the tree with the POS-Tagging, lemmata and gramatical relations self._tree = None # load the language model for spacy self.nlp = es_core_news_md.load() # Set maximum character distance that we allow between the reported verb and the recognized entity to consider that it is a font self._max_dist = 100
def classifier_spacy(text, opciones):
    palabras = text.split()
    nlp = es_core_news_md.load()
    stop = nlp.Defaults.stop_words
    rta = ''
    sim_max = 0
    i = 0
    for op in opciones:
        opcion = nlp(op.lower())
        sim = nlp(text).similarity(opcion)
        print(sim)
        for palabra in palabras:
            if (palabra not in stop) and palabra in op.split():
                sim += 0.3
                print(sim)
        if sim > sim_max:
            print(op)
            sim_max = sim
            rta = str(i + 1)
        i += 1
    if sim_max < 0.30:
        rta = str(0)
    return rta
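# Hedged usage sketch for classifier_spacy above: the intent options and the query
# are made-up examples, not taken from the original project. The function returns
# the 1-based index of the best-matching option as a string, or "0" when no option
# clears the 0.30 similarity threshold. Note that it reloads es_core_news_md on
# every call, so repeated use is expensive.
opciones_demo = ["consultar el saldo de la cuenta", "pagar una factura", "hablar con un agente"]
print(classifier_spacy("quiero pagar mi factura de luz", opciones_demo))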
import logging

from telegram.ext import CommandHandler
from telegram.ext import MessageHandler, Filters
import telegram
import telebot
import snips_nlu
from snips_nlu import SnipsNLUEngine
from SPARQLWrapper import SPARQLWrapper, JSON
import es_core_news_md
import textacy.datasets as ds
import textacy
import io
import json

snips_nlu.load_resources("es")
nlp = es_core_news_md.load()
mi_bot = telebot.TeleBot("605016230:AAEPuIIJGVVoeHaGbP6iroaYdi3uhkvJXrQ")


@mi_bot.message_handler(commands=['start', 'help'])
def send_welcome(message):
    mi_bot.reply_to(message, "Howdy, how are you doing?")


@mi_bot.message_handler(func=lambda m: True)
def imprimirmensaje(message):
    campo = ""
    predicado = ""
    bandera1 = False
    chatid = message.chat.id
    with io.open('dataset.json') as file:
def get_news_features(headline, text):
    nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    headline = re.sub(r"(?<!\n)\n(?!\n)", " ", headline)
    headline = headline.replace(r"*NUMBER*", "número")
    headline = headline.replace(r"*PHONE*", "número")
    headline = headline.replace(r"*EMAIL*", "email")
    headline = headline.replace(r"*URL*", "url")
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)
    list_tokens_h = []
    list_tags_h = []
    for sentence_h in doc_h.sents:
        for token in sentence_h:
            list_tokens_h.append(token.text)
    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)

    # headline complexity features
    avg_word_size_h = round(sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content ##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")
    # to later calculate the upper case letters ratio
    alph = list(filter(str.isalpha, text))
    text_lower = text.lower()
    doc = nlp(text_lower)
    list_tokens = []
    list_pos = []
    list_tag = []
    list_entities = []
    sents = 0
    for entity in doc.ents:
        list_entities.append(entity.label_)
    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)

    # Calculate entities, pos, tag, freq, syllables, words and quotes
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']

    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)

    # Spanish readability tests
    huerta_score = round(
        206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(
        206.835 - ((62.3 * syllables) / words) - (words / sents), 2)

    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)

    # create df_features
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })
    return df_features
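# Hedged usage sketch for get_news_features above: the headline and body are
# invented placeholders, and it assumes the module-level helpers the function
# relies on (re, FreqDist, get_nsyllables, ld, nltk, pd) are importable as in
# the original module. The result is a single-row pandas DataFrame of features.
titular_demo = "El gobierno anuncia nuevas medidas económicas"
cuerpo_demo = ("El gobierno presentó hoy un paquete de medidas económicas. "
               "Los expertos consultados consideran que su impacto será limitado.")
df_demo = get_news_features(titular_demo, cuerpo_demo)
print(df_demo[["words", "ttr", "huerta_score", "entity_ratio"]])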
def tokenizador_spacy_lemma(texto, nlp=es_core_news_md.load()):
    return [token.lemma_ for token in nlp(texto)]
def tokenizador_spacy_simple(texto, nlp=es_core_news_md.load()):
    return [token.text for token in nlp(texto)]
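# Hedged usage sketch for the two tokenizers above, e.g. as analyzers for a text
# vectorizer; the sample sentence is illustrative. Note that the nlp default
# argument is evaluated once at definition time, so both functions share a single
# es_core_news_md instance across calls.
frase_demo = "Los niños jugaban en el parque"
print(tokenizador_spacy_simple(frase_demo))  # surface forms
print(tokenizador_spacy_lemma(frase_demo))   # lemmas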
def _load_nlp(self):
    if SpacyMixin.nlp is None:
        import es_core_news_md
        print("NLP is not yet loaded")
        SpacyMixin.nlp = es_core_news_md.load()
        print("NLP loaded")
def __init__(self):
    self.nlp_model = es_core_news_md.load()