def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)
    assert [t.like_num for t in doc] == [True, True, True, True, True, True]
def init_resources(self):
    self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
    self.stemmer = None
    stopwords_path = os.path.join(
        os.path.dirname(assistant_dialog_skill_analysis.__file__),
        "resources",
        self.language_code,
        "stopwords",
    )
    if self.language_code == "en":
        from spacy.lang.en import English

        self.tokenizer = Tokenizer(English().vocab)
        self.stemmer = SnowballStemmer(language="english")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "fr":
        from spacy.lang.fr import French

        self.tokenizer = Tokenizer(French().vocab)
        self.stemmer = SnowballStemmer(language="french")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "de":
        from spacy.lang.de import German

        self.tokenizer = Tokenizer(German().vocab)
        self.stemmer = SnowballStemmer(language="german")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "it":
        from spacy.lang.it import Italian

        self.tokenizer = Tokenizer(Italian().vocab)
        self.stemmer = SnowballStemmer(language="italian")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "cs":
        from spacy.lang.cs import Czech

        # No Snowball stemmer is available for Czech, so only the tokenizer
        # and stop words are set up for this language.
        self.tokenizer = Tokenizer(Czech().vocab)
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "pt":
        from spacy.lang.pt import Portuguese

        self.tokenizer = Tokenizer(Portuguese().vocab)
        self.stemmer = SnowballStemmer(language="portuguese")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "es":
        from spacy.lang.es import Spanish

        self.tokenizer = Tokenizer(Spanish().vocab)
        self.stemmer = SnowballStemmer(language="spanish")
        self.stop_words = self.load_stop_words(stopwords_path)
    else:
        raise Exception("language code %s is not supported" % self.language_code)
def tokenize(text):
    # Tokenize a Spanish string and return the list of token texts
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    for token in doc:
        token_list.append(token.text)
    return token_list
def spacy_tokenizer(sentence):
    parser = Spanish()
    tokens = parser(sentence)
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        if lemma not in STOP_WORDS and re.search('^[a-zA-Z]+$', lemma):
            filtered_tokens.append(lemma)
    return filtered_tokens
def preprocess_test(df):
    # spaCy tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use the corresponding language tokenizer
    mask_spanish = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text, args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text, args=(nlp_pt,))
    # The test file only needs id and tokens
    return df
def main():
    nlp = English()
    doc = nlp("This is a sentence.")
    print(doc.text)

    nlp = German()
    doc = nlp('Liebe Grüße!')
    print(doc.text)

    nlp = Spanish()
    doc = nlp('¿Cómo estás?')
    print(doc.text)
def preprocess(df):
    # spaCy tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use the corresponding language tokenizer
    mask_spanish = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text, args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text, args=(nlp_pt,))
    # Training and validation dataframes need the __label__ prefix before the category
    df["label"] = df["category"].apply(lambda x: '__label__' + x)
    return df
def stopwords(self, text):
    try:
        nlp = Spanish() if self.lang == 'es' else English()
        doc = nlp(text)
        token_list = [token.text for token in doc]
        sentence = []
        for word in token_list:
            lexeme = nlp.vocab[word]
            if not lexeme.is_stop:
                sentence.append(word)
        return ' '.join(sentence)
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error stopwords: {0}'.format(e))
        return None
def preprocess(text):
    # Tokenize, remove stopwords, numbers, empty spaces and punctuation, and lemmatize
    tokenized = []
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    # Tokenize
    for token in doc:
        # Remove stopwords, numbers, empty spaces and punctuation, and lemmatize
        if (token.text not in nlp.Defaults.stop_words
                and token.text not in string.punctuation
                and token.text.isalpha()):
            token_list.append(token.lemma_)
    tokenized.append(token_list)
    return tokenized
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_en.yaml")
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_de.yaml")
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_es.yaml")
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_pt.yaml")
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_fr.yaml")
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_it.yaml")
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_nl.yaml")
    else:
        raise ValueError("language %s is not supported" % language)
    return parser, STOP_WORDS, configfile_path
def tokenize(text):
    '''
    Tokenize a string in Spanish

    Parameters
    ----------
    text : str
        Spanish text string to tokenize.

    Returns
    -------
    tokenized : list
        List of tokens (includes punctuation tokens).
    '''
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    for token in doc:
        token_list.append(token.text)
    return token_list
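# A minimal usage sketch for the tokenize() helper above, assuming
# `from spacy.lang.es import Spanish` is in scope. The blank Spanish()
# pipeline only performs rule-based tokenization, so no model download is
# needed. The sample sentence and the printed output are illustrative
# assumptions, not part of the original code.
tokens = tokenize("Hola, ¿cómo estás hoy?")
print(tokens)  # roughly: ['Hola', ',', '¿', 'cómo', 'estás', 'hoy', '?']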
def spacy_tokenizer(sentence):
    nlp = spacy.load('es')
    parser = Spanish()
    spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS
    STOPWORDS = list(spacy_stopwords)
    STOPWORDS.extend(('y', 'a', 'u', 'o', 'e', 'quiero'))
    tokens = parser(sentence)
    filtered_tokens = []
    for word in tokens:
        # Lowercase the lemma and strip Spanish diacritics before filtering
        lemma = word.lemma_.lower().strip()
        lemma = re.sub("á", "a", lemma)
        lemma = re.sub("é", "e", lemma)
        lemma = re.sub("í", "i", lemma)
        lemma = re.sub("ó", "o", lemma)
        lemma = re.sub("ú", "u", lemma)
        lemma = re.sub("ñ", "n", lemma)
        if lemma not in STOPWORDS and re.search('^[a-zA-Z]+$', lemma):
            filtered_tokens.append(lemma)
    return filtered_tokens
def tokenize(document, language, punctutation):
    if language == 'fr':
        nlp = French()
    if language == 'de':
        nlp = German()
    if language == 'en':
        nlp = English()
    if language == 'es':
        nlp = Spanish()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(document)
    if punctutation:
        sentences = [[str(word) for word in sent if str(word) != '\n']
                     for sent in doc.sents]
    else:
        sentences = [[
            str(word) for word in sent
            if ((str(word) != '\n') and (str(word).isalpha()))
        ] for sent in doc.sents]
    return sentences
def get_nlp(self, language):
    """
    Return the corresponding spaCy language object when provided with a
    language code, performing the required import on the fly. This is
    certainly not the standard approach, but as this endpoint will be
    deployed to Heroku (space limitation) and only be invoked rarely, it is
    the fastest approach.
    """
    if language == "en":
        from spacy.lang.en import English
        return English()
    elif language == "fr":
        from spacy.lang.fr import French
        return French()
    elif language == "de":
        from spacy.lang.de import German
        return German()
    elif language == "es":
        from spacy.lang.es import Spanish
        return Spanish()
    elif language == "pt":
        from spacy.lang.pt import Portuguese
        return Portuguese()
    else:
        return {"error": "invalid or not supported language entered"}
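# A brief, hedged sketch of how the deferred-import helper above might be
# called. The NlpFactory wrapper class and the sample text are assumptions,
# not part of the original code; an unsupported code falls through to the
# error dictionary instead of raising.
class NlpFactory:
    get_nlp = get_nlp  # reuse the function above as a method (hypothetical wiring)

factory = NlpFactory()
nlp = factory.get_nlp("es")              # imports spacy.lang.es lazily, returns a blank Spanish()
doc = nlp("¿Dónde está la biblioteca?")
print([t.text for t in doc])             # tokenization only, no statistical model
print(factory.get_nlp("xx"))             # {'error': 'invalid or not supported language entered'}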
import json
from spacy.matcher import Matcher
from spacy.lang.es import Spanish

with open("exercises/es/adidas.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Spanish()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "adidas" and "zx"
pattern1 = [{"LOWER": "adidas"}, {"LOWER": "zx"}]

# Token whose lowercase form matches "adidas", followed by a digit
pattern2 = [{"LOWER": "adidas"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the result
matcher.add("ROPA", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
def complete_text_analysis(text, raw_entities):
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - SingleCompleteTextAnalysis_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date + " Single Complete Text Analysis Test - Local Execution" + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
        + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # 01. Read emojis
    path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
    unicode_emoji_list_file = codecs.open(path + "list - unicode_emojis_metadata.txt", encoding='utf-8')
    emoji_list = unicode_emoji_list_file.read().splitlines()
    unicode_emoji_list_file.close()
    aux_emojis_dict = {}
    emojis_dict = {}
    for aux in emoji_list:
        aux_emoji = aux.split('\t')
        aux_emojis_dict[aux_emoji[1]] = [aux_emoji[2], aux_emoji[3]]
        emojis_dict[aux_emoji[2]] = {
            'emoji_id': aux_emoji[0],
            'unicode': aux_emoji[1],
            'name': aux_emoji[3],
            'polarity': float(aux_emoji[4]),
            'happiness': float(aux_emoji[5]),
            'anger': float(aux_emoji[6]),
            'fear': float(aux_emoji[7]),
            'replusion': float(aux_emoji[8]),
            'surprise': float(aux_emoji[9]),
            'sadness': float(aux_emoji[10]),
            'interest': aux_emoji[11]
        }
    sorted_aux_emojis_list = sorted(aux_emojis_dict.keys(), key=len, reverse=True)
    emojis_list = list()
    for aux_emoji in sorted_aux_emojis_list:
        emojis_list.append(aux_emojis_dict[aux_emoji][0])
    # print(emojis_list)
    # 02. Read complementary characters
    complementary_characters_list_file = codecs.open(
        path + "list - complementary_characters.txt", encoding='utf-8')
    complementary_characters_list = complementary_characters_list_file.read().splitlines()
    complementary_characters_list_file.close()
    complementary_characters_dict = {}
    for aux in complementary_characters_list:
        aux_char = aux.split('\t')
        complementary_characters_dict[aux_char[2]] = [aux_char[1], aux_char[3]]
    # print(complementary_characters_dict)
    # 03. Read emoticons patterns
    text_type = 'Twitter'
    emotions = ast.literal_eval(config.get(text_type, 'emotions'))
    emoticons_metadata = ast.literal_eval(config.get(text_type, 'emoticons_metadata'))
    emotions_polarity = ast.literal_eval(config.get(text_type, 'emotions_polarity'))
    # 04. Configure Google_Universal_POS_Tags
    tags = config.options("Google_Universal_POS_Tags")
    google_universal_tags = {}
    for tag in tags:
        google_universal_tags[tag.upper()] = config.get('Google_Universal_POS_Tags', tag)
    # 05. Read special characters (#, @, https, etc.)
    special_characters = ast.literal_eval(config.get('TextAnalysis', 'special_characters'))
    additional_symbols = ast.literal_eval(config.get('TextAnalysis', 'additional_symbols'))
    variation_selectors = ast.literal_eval(config.get('TextAnalysis', 'variation_selectors'))
    # 06. Configure Spanish POS tagger
    nlp = Spanish()
    tag_map = spacy.lang.es.TAG_MAP
    emoticons = []
    emojis = []
    complementary_characters = []
    texts = []
    emojis_count = 0
    emoticon_count = 0
    complementary_characters_count = 0
    original_text = text.replace('\n', ' ')
    results = identify_special_characters(
        original_text, raw_entities, nlp, tag_map, emotions, emoticons_metadata,
        emotions_polarity, emojis_dict, emojis_list, variation_selectors,
        complementary_characters_dict, emoticon_count, emojis_count,
        complementary_characters_count)
    spaced_text = results[0]
    final_clean_text = results[1]
    emoticons += copy.deepcopy(results[2])
    emojis += copy.deepcopy(results[3])
    complementary_characters += copy.deepcopy(results[4])
    emoticon_count = results[5]
    emojis_count = results[6]
    complementary_characters_count = results[7]
    special_entities = results[8]
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) + "\n")
    texts.append(spaced_text + '\t' + final_clean_text + '\t' + str(special_entities))
    p_file.write("Texts with: " + "\n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.flush()
    p_file.close()
              if len(x) > 3]  # Remove words of 3 letters or fewer.
    tokens = [stemmer.stem(token) for token in tokens]  # Stem words.
    tokens = [x for x in tokens if x not in to_avoid]
    return ' '.join(tokens)


if '__main__' == __name__:
    stemmer = SnowballStemmer('spanish')
    sys.setrecursionlimit(10000)
    cwd = os.getcwd()
    stop_words = get_stop_words('es')
    parser = Spanish()
    to_avoid = read_as_list('to_avoid.txt', 'latin-1')
    my_sheet = 'Sheet1'
    file_name = 'Proposals - PAM - Spanish.xlsx'  # name of your excel file
    df = read_excel(file_name, sheet_name=my_sheet)
    df = df[df['category/name/se'] == 'Sanidad y salud']
    txt = list(df['body'])
    text = [filter_vocabulary(txt, 0.01)][0]
    text = [prepare_text_for_ML(x, stop_words, parser, stemmer, to_avoid) for x in text]
def getSentences(text):
    nlp = Spanish()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.string.strip() for sent in document.sents]
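# A short usage sketch for getSentences() above (assumed to be defined in the
# same module); the input paragraph is an illustrative assumption. The
# rule-based sentencizer splits on punctuation, so no statistical model is needed.
paragraph = "Hoy hace sol. Mañana lloverá. ¿Salimos ahora?"
for sentence in getSentences(paragraph):
    print(sentence)
# roughly:
# Hoy hace sol.
# Mañana lloverá.
# ¿Salimos ahora?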
from rrec.model.reddit_recommender import RedditRecommender

# spaCy
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################

reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
def __init__(self):
    self.nlp_english = English()
    self.nlp_spanish = Spanish()
# Constants - Hyperparameters
interactions_scores_dict = {
    'VIEW': 1,
    'BOOKMARK': 2,
    'FOLLOW': 3,
    'LIKE': 4,
    'COMMENT CREATED': 5
}

# Global objects
interactions_df = pd.read_csv('interactions.csv')
articles_df = pd.read_csv('articles.csv')
person_le = preprocessing.LabelEncoder()
tokens_le = preprocessing.LabelEncoder()
hidden_dimensions = 250
language_objects = {"en": English(), "pt": Portuguese(), "es": Spanish()}
tokenizers = {}
summaries = {}
filter_regex = "[^A-Za-z0-9]+"
batch_size = 10000
max_iterations = 100000
l2_lambda = 0.001


# We summarize each article with spaCy's TextRank implementation. This eliminates most of the
# noisy information in the texts. Then we apply tf-idf analysis to the article summaries. For
# every unique token in the obtained corpus of summaries, we calculate the expected tf-idf score
# over all articles. Then we sort the tokens in descending order of their expected tf-idf scores.
# The first 5000 tokens constitute the representing tokens of our article corpus (see the sketch
# after this snippet).
def create_article_tokens():
    def identity_tokenizer(text):
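# A minimal, standalone sketch of the token-selection step described in the comment above,
# using scikit-learn's TfidfVectorizer. The variable names (summaries_list, top_k) and the
# tiny corpus are assumptions; the real code would feed in the TextRank summaries and keep
# the first 5000 tokens.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

summaries_list = [
    "machine learning improves article recommendations",
    "deep learning models summarize long articles",
    "recommendations depend on user interactions",
]

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(summaries_list)  # shape: (n_articles, n_tokens)

# Expected tf-idf score of each token over all articles, sorted in descending order
expected_scores = np.asarray(tfidf.mean(axis=0)).ravel()
order = expected_scores.argsort()[::-1]
top_k = 5  # the original keeps the first 5000 tokens
vocab = np.array(vectorizer.get_feature_names_out())
print(list(vocab[order][:top_k]))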
import json
from spacy.lang.es import Spanish
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/es/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Spanish()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="LOC")
        for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# The getter that looks up the span text in a dictionary of country capitals
import spacy
from spacy.matcher import PhraseMatcher
from spacy_lookup import Entity
from spacy.lang.es import Spanish

nlp = Spanish()
entity = Entity(nlp, keywords_list=['pera en Dulce', 'manzana', 'tentacion'], label='FOOD')
nlp.add_pipe(entity, name='Food')
entity2 = Entity(nlp, keywords_list=['#mora'], label='FOOD_HASHTAGS')
nlp.add_pipe(entity2, name='FoodHashtags')
text = "Me gustan mucho la manzana y tambien la pera en dulce en salsa de #mora. También me gusta la paleta tentación."
doc = nlp(text)
for e in doc:
    print(e.text, e._.is_entity, e.ent_type_)
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self, language, tokenizer_method='spacy',
from spacy.lang.es import Spanish

nlp = Spanish()

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

words = ["Me", "gusta", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a doc from the words and the spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Add the span to the doc's entities
doc.ents = [span]

# Print the text and the labels of the doc's entities
print([(ent.text, ent.label_) for ent in doc.ents])
import json
from spacy.lang.es import Spanish
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/es/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Spanish()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all matches
    matches = matcher(doc)
    doc.ents = [
        ____(____, ____, ____, label=____)
        for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
____.____(____)
print(nlp.pipe_names)

# The getter that looks up the span text in a dictionary of country capitals
# Import the language classes
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.lang.es import Spanish

# Create the nlp object
nlp = Spanish()  # or English() or German()

# Process a text (this is Spanish for: "How are you?")
doc = nlp("¿Cómo estás?")

# Print the document text
print(doc.text)  # '¿Cómo estás?'
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectRegexSpecialEntitiesRawData_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='w')
    p_file.write(date + " Detecting Special Entities with Regex Expression Test - Local Execution" + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
        + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # print(emoticons_dict)
    # 3. Configure Spanish POS tagger
    spanish_pipeline = Spanish()
    all_from_tweets = coll_from.find()
    count = 0
    stop = 100
    p_file.write("Total data to process: " + str(stop) + "\n")
    emoticons = []
    text_type = 'Twitter'
    emotions = ast.literal_eval(config.get(text_type, 'emotions'))
    emoticons_metadata = ast.literal_eval(config.get(text_type, 'emoticons_metadata'))
    emotions_polarity = ast.literal_eval(config.get(text_type, 'emotions_polarity'))
    texts = []
    no_texts = []
    emoticon_count = 0
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"]["country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            original_text = raw_data['text']
                            raw_entities = raw_data['entities']
                            original_text = original_text.replace('\n', ' ')
                            if lang == 'es':
                                results = identify_special_entities(
                                    original_text, raw_entities, spanish_pipeline,
                                    emoticon_count, emotions, emoticons_metadata,
                                    emotions_polarity)
                                text = results[0]
                                clean_text = results[1]
                                emoticon_count = results[2]
                                special_entities = results[3]
                                emoticons += copy.deepcopy(results[4])
                                if len(results[4]) != 0:
                                    texts.append(original_text + '\t' + text + '\t' +
                                                 clean_text + '\t' + str(special_entities))
                                else:
                                    no_texts.append(original_text + '\t' + text + '\t' + clean_text)
                                count += 1
                            else:
                                if len(original_text) >= 3:
                                    blob = TextBlob(original_text)
                                    detection = True
                                    detected_language = ''
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language()
                                            detection = False
                                        except:
                                            print('error while getting detected language')
                                    # print(detected_language)
                                    if detected_language == 'es':
                                        results = identify_special_entities(
                                            original_text, raw_entities, spanish_pipeline,
                                            emoticon_count, emotions, emoticons_metadata,
                                            emotions_polarity)
                                        text = results[0]
                                        clean_text = results[1]
                                        emoticon_count = results[2]
                                        special_entities = results[3]
                                        emoticons += copy.deepcopy(results[4])
                                        if len(results[4]) != 0:
                                            texts.append(original_text + '\t' + text + '\t' +
                                                         clean_text + '\t' + str(special_entities))
                                        else:
                                            no_texts.append(original_text + '\t' + text + '\t' + clean_text)
                                        count += 1
                            print(count)
                            print(emoticon_count)
                            if emoticon_count >= stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Emoticons " + str(len(emoticons)) + "\n")
    emoticons_counter = Counter(emoticons).most_common()
    emoticons_counter_sorted = sorted(emoticons_counter, key=lambda tup: tup[1])
    for emoticon in emoticons_counter_sorted:
        p_file.write(str(emoticon[0]) + "\t" + str(emoticon[1]) + "\n")
    p_file.write("Total Emoticons: " + str(emoticon_count) + ". Total data: " + str(count) +
                 ". Proportion: " + str(emoticon_count / count) + "\n")
    p_file.write("TEXTS WITH EMOTICONS: \n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.write("TEXTS WITHOUT EMOTICONS: \n")
    for text in no_texts:
        p_file.write(text + "\n")
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) + "\n")
    p_file.flush()
    p_file.close()
from spacy.lang.es import Spanish

nlp = Spanish()
people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))
import json
from spacy.lang.es import Spanish

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = Spanish()
doc = nlp("La Unión Europea fue fundada por seis países de Europa occidental "
          "(Francia, Alemania, Italia, Bélgica, Países Bajos, y Luxemburgo) y "
          "se amplió en seis ocasiones.")

# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is a faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])