def get_french_distances(dataset_fn):
    pipeline = French()
    sentencizer = pipeline.create_pipe('sentencizer')
    pipeline.add_pipe(sentencizer)
    questions_list, sentences_list, spans_list = compute_question_sentence(dataset_fn, pipeline)
    nlp_fr = spacy.load('fr_core_news_sm')
    all_distances = []
    error = 0
    error_anchor = 0
    no_pronoums = 0
    all_lexical_variation = []
    for i, question in enumerate(questions_list):
        try:
            print(questions_list[i], sentences_list[i], spans_list[i])
            distance, lexical_variation = get_anchor(questions_list[i], sentences_list[i], nlp_fr, spans_list[i])
            if distance is not None:
                if distance == -1:
                    error_anchor += 1
                elif distance == -2:
                    no_pronoums += 1
                else:
                    all_distances.append(distance)
                    all_lexical_variation.append(lexical_variation)
        except Exception:
            error += 1
            continue
    print(error, error_anchor, no_pronoums)
    return all_distances, all_lexical_variation
def __init__(self, language='en'):
    self.exclude = EXCLUDE
    self.language = language
    if language == 'fr':
        nlp = French()
    else:
        nlp = English()
    # nlp.add_pipe(nlp.create_pipe('sentencizer'))
    sbd = SentenceSegmenter(nlp.vocab, strategy=split_sents)
    nlp.add_pipe(sbd)
    self.nlp = nlp
def clean_text(txt):
    nlp = French()
    # Numeric tokens whose value is in listcode are expanded by a factor of 1000 (e.g. "75" -> "75000")
    listcode = [x + 45 for x in range(99)]
    postalcod = lambda dd, liscode: str(int(dd) * 1000) if dd.isdigit() and int(dd) in liscode else dd
    customize_remove_PUNCT = ['%']
    for w in customize_remove_PUNCT:
        nlp.vocab[w].is_punct = False
    customize_add_PUNCT = ['>', '=', '$', '™', 'eee', 'ee', 'e', "EE", "EEE", "E", ":"]
    for w in customize_add_PUNCT:
        nlp.vocab[w].is_punct = True
    # Remove the default infix pattern that splits digit-digit sequences (dd-dd-dd),
    # then re-add a variant without the hyphen so hyphenated numbers stay together
    reg = '(?<=[0-9])[+\\-\\*^](?=[0-9-])'
    list_infixes_defaults = list(nlp.Defaults.infixes)
    if reg in list_infixes_defaults:
        list_infixes_defaults.remove(reg)
    infixes = list_infixes_defaults + [r"(?<=[0-9])[\+\*^](?=[0-9-])"]
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    doc = nlp(txt)
    tokens = [
        postalcod(w.text.lower(), listcode)
        for w in doc
        if w.text != 'n'
        and not w.is_punct
        and not w.is_space
        and not (w.like_num and len(w.text) > 5)
        and not len(w.text) > 11
        and not w.is_quote
    ]
    return ' '.join(map(str, tokens))
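# Illustrative usage sketch (not part of the original snippet); it assumes the function's
# dependencies are already imported, e.g. `from spacy.lang.fr import French` and
# `from spacy.util import compile_infix_regex`. The sample sentence is made up.
sample = "Prix total : 1 200 euros, soit 15% de remise sur la commande."
print(clean_text(sample))  # prints the cleaned, lowercased tokens joined by spaces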
def get_nlp(lang: str):
    if lang == "fr":
        return French()
    elif lang == "en":
        return English()
    else:
        raise ValueError("unknown lang: {}".format(lang))
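# Illustrative usage sketch (not part of the original snippet): the blank pipelines
# returned by get_nlp only tokenize, since no trained components are attached.
nlp = get_nlp("fr")
doc = nlp("Ceci est une phrase d'exemple.")
print([token.text for token in doc])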
def read_mtl_file(domain, filename):
    X = []
    Y = []
    if domain == 'en':
        # tokenizer = WordPunctTokenizer()
        tokenizer = English().Defaults.create_tokenizer()
    elif domain == 'fr':
        # tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
        tokenizer = French().Defaults.create_tokenizer()
    elif domain == 'de':
        # tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
        tokenizer = German().Defaults.create_tokenizer()
    with open(filename, 'r', encoding='utf-8') as inf:
        for line in inf.readlines():
            parts = line.split('\t')
            if len(parts) == 3:  # labeled
                Y.append(int(float(parts[1])))
            elif len(parts) == 2:  # unlabeled
                Y.append(0)
            else:
                raise Exception('Unknown format')
            clean = clean_sentence(parts[-1])
            # if domain is 'en':
            #     words = word_tokenize(clean, language='english')
            # elif domain is 'fr':
            #     words = word_tokenize(clean, language='french')
            # elif domain is 'de':
            #     words = word_tokenize(clean, language='german')
            words = [str(e) for e in tokenizer(clean)]
            tmp = {}
            tmp['tokens'] = words
            tmp['sent'] = clean
            X.append(tmp)
    # Y = torch.LongTensor(Y).to(opt.device)
    return (X, Y)
def init_resources(self):
    self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
    self.stemmer = None
    stopwords_path = os.path.join(
        os.path.dirname(assistant_dialog_skill_analysis.__file__),
        "resources",
        self.language_code,
        "stopwords",
    )
    if self.language_code == "en":
        from spacy.lang.en import English
        self.tokenizer = Tokenizer(English().vocab)
        self.stemmer = SnowballStemmer(language="english")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "fr":
        from spacy.lang.fr import French
        self.tokenizer = Tokenizer(French().vocab)
        self.stemmer = SnowballStemmer(language="french")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "de":
        from spacy.lang.de import German
        self.tokenizer = Tokenizer(German().vocab)
        self.stemmer = SnowballStemmer(language="german")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "it":
        from spacy.lang.it import Italian
        self.tokenizer = Tokenizer(Italian().vocab)
        self.stemmer = SnowballStemmer(language="italian")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "cs":
        from spacy.lang.cs import Czech
        self.tokenizer = Tokenizer(Czech().vocab)
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "pt":
        from spacy.lang.pt import Portuguese
        self.tokenizer = Tokenizer(Portuguese().vocab)
        self.stemmer = SnowballStemmer(language="portuguese")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "es":
        from spacy.lang.es import Spanish
        self.tokenizer = Tokenizer(Spanish().vocab)
        self.stemmer = SnowballStemmer(language="spanish")
        self.stop_words = self.load_stop_words(stopwords_path)
    else:
        raise Exception("language code %s is not supported" % self.language_code)
def RecupererTextTokenSansPonctuation(fichier):
    # Load the French language package
    Langue = French()
    f = fichier
    tokenizer = RegexpTokenizer(r'\w+')
    doc = Langue(f.read())
    filtered_sent = []
    for word in doc:
        if word.text:
            filtered_sent.append(word)
    # Keep only the text and drop everything else
    return str(tokenizer.tokenize(str(filtered_sent)))
def define_spacy_tokenizer(language):
    # Construction 1
    from spacy.tokenizer import Tokenizer
    if language == 'french':
        from spacy.lang.fr import French
        nlp = French()
    if language == 'english':
        from spacy.lang.en import English
        nlp = English()
    # Create a blank Tokenizer with just the language vocab
    tokenizer = Tokenizer(nlp.vocab)
    return tokenizer
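# Illustrative usage sketch (not part of the original snippet). Note that a bare
# Tokenizer(nlp.vocab) has no prefix/suffix/infix rules, so it effectively splits
# on whitespace only, unlike the language's default tokenizer.
tokenizer = define_spacy_tokenizer('french')
doc = tokenizer("L'exemple est très simple !")
print([t.text for t in doc])  # e.g. ["L'exemple", "est", "très", "simple", "!"]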
def initGlobal():
    global parser
    global fr_stop
    print("INITIALIZATION")
    print("Check downloads for nltk libs...")
    nltk.download('wordnet')
    nltk.download('stopwords')
    print("Parse into French")
    parser = French()
    fr_stop = set(nltk.corpus.stopwords.words('french'))
    print("DONE")
def tokenize(document, language, punctutation):
    if language == 'fr':
        nlp = French()
    if language == 'de':
        nlp = German()
    if language == 'en':
        nlp = English()
    if language == 'es':
        nlp = Spanish()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(document)
    if punctutation:
        sentences = [[str(word) for word in sent if str(word) != '\n'] for sent in doc.sents]
    else:
        sentences = [
            [str(word) for word in sent if str(word) != '\n' and str(word).isalpha()]
            for sent in doc.sents
        ]
    return sentences
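# Illustrative usage sketch (not part of the original snippet): returns a list of
# sentences, each one a list of token strings; passing False for the last argument
# keeps only purely alphabetic tokens.
sents = tokenize("Bonjour le monde. Il fait beau, non ?", 'fr', True)
print(sents)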
def get_tokenizers(self, lang):
    os.environ['TOKENIZERS_PARALLELISM'] = "True"
    if lang == 'de':
        spacy = German()
        bert = "deepset/gbert-base"
    elif lang == 'fr':
        spacy = French()
        bert = "camembert/camembert-base-ccnet"
    elif lang == 'it':
        spacy = Italian()
        bert = "dbmdz/bert-base-italian-cased"
    else:
        raise ValueError(
            f"Please choose one of the following languages: {self.languages}"
        )
    return spacy.tokenizer, AutoTokenizer.from_pretrained(bert)
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        file = r"\config_files\config_spacy_en.yaml"
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        file = r"\config_files\config_spacy_de.yaml"
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        file = r"\config_files\config_spacy_es.yaml"
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        file = r"\config_files\config_spacy_pt.yaml"
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        file = r"\config_files\config_spacy_fr.yaml"
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        file = r"\config_files\config_spacy_it.yaml"
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        file = r"\config_files\config_spacy_nl.yaml"
    else:
        raise ValueError("unsupported language: {}".format(language))
    configfile_path = os.getcwd() + file
    return parser, STOP_WORDS, configfile_path
def get_nlp(self, language):
    """
    This method returns the corresponding spaCy language model when provided with
    a language. To do so it also does the required import. This is certainly not
    the standard approach, but as this endpoint will be deployed to Heroku (space
    limitation) and only be invoked rarely, it is the fastest approach.
    """
    if language == "en":
        from spacy.lang.en import English
        return English()
    elif language == "fr":
        from spacy.lang.fr import French
        return French()
    elif language == "de":
        from spacy.lang.de import German
        return German()
    elif language == "es":
        from spacy.lang.es import Spanish
        return Spanish()
    elif language == "pt":
        from spacy.lang.pt import Portuguese
        return Portuguese()
    else:
        return {"error": "invalid or not supported language entered"}
def tokenize(self, dataset, language):
    """
    Articles will be processed in parallel
    """
    articles_iter = chunk(dataset, size=self.chunks)
    length = int(len(dataset) / self.chunks)
    if language == 'english':
        nlp_iter = repeat(English())
    else:
        nlp_iter = repeat(French())
    tokenized_questions = []
    with ProcessPoolExecutor() as executor:
        chunksize = int(max(length / (self.processes * self.parallelism), 1))
        i = 0
        for result in executor.map(_tokenize_questions, articles_iter, nlp_iter, chunksize=chunksize):
            for article in result:
                tokenized_questions.append(article)
                i += 1
                if i % 10000 == 0:
                    print('Processed {} articles'.format(i))
    return tokenized_questions
def preprocess_file(file_path):
    json_data = []
    with open(file_path, encoding="utf8") as json_file:
        json_data = json.load(json_file)
    # Filters the questions to only take into account the ones that have answers
    response_data = []
    for contrib in json_data:
        for response in contrib["responses"]:
            # If the response is non-empty
            if response["value"] and response["formattedValue"]:
                # Flattens the response and adds it to the response data
                response_obj = dict(contrib)
                del response_obj["responses"]
                response_obj.update(response)
                response_data.append(response_obj)
    df_response_data = pd.DataFrame.from_records(response_data)
    df_response_data.to_json(
        os.path.join(data_dir, "response_" + os.path.basename(file_path)))
    # Loads the French model of spaCy and adds some new stop words (could be extended)
    nlp = fr_core_news_md.load()
    tokenizer = French().Defaults.create_tokenizer(nlp)
    additional_stopwords = ["de", "le", "que", "ce", "l"]
    for stopword in additional_stopwords:
        nlp.Defaults.stop_words.add(stopword)
    # Creates a new column in the dataframe that contains each token lemma.
    # Punctuation, spaces and stopwords are removed
    df_response_data["lemmatizedValue"] = df_response_data["formattedValue"].apply(
        lambda t: [token.lemma_ for token in tokenizer(t.lower())
                   if not token.is_stop and not token.is_punct and not token.is_space])
    df_response_data.to_json(
        os.path.join(data_dir, "response_lemmatized_" + os.path.basename(file_path)))
def __init__(self):
    self.nlp = French()
def fr_nlp():
    return French()
import json

from spacy.lang.fr import French
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/fr/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/fr/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = French()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create a Span entity with the label "GPE" for all matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span's text in the dictionary
# of country capitals
import json

from spacy.matcher import Matcher
from spacy.lang.fr import French

with open("exercises/fr/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = French()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# A token whose lowercase form matches "iphone", followed by a number token
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the result
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
get_stop_words("en") + STOP_LIST + stopwords.words('english')) elif lang == "nl": return set( get_stop_words("nl") + stopwords.words('dutch') + STOP_LIST_NL) except: print("warning: no stopwords were downloaded. check nltk corpora") print(format_exc()) return set() # load resources _stop_words = load_stoplist() print("Loading spacy model...") _spacy = English() _spacy_fr = French() _spacy_nl = Dutch() _spacy_it = Italian() def get_stoplist(): return _stop_words def lemmatize(text, lowercase=True, lang="en"): """ Return lemmatized text """ if lang == "en": tokens = _spacy(text) elif lang == "fr": tokens = _spacy_fr(text)
""" import re from dataclasses import dataclass, field from typing import Iterable, List, Set, Tuple, Dict from spacy.lang.fr import French SPLITTER_CHAR = {"(", ")", ",", ";", "[", "]", "-", "{", "}"} # Food additives (EXXX) may be mistaken from one another, because of their edit distance proximity ADDITIVES_REGEX = re.compile("(?:E ?\d{3,5}[a-z]*)", re.IGNORECASE) OffsetType = Tuple[int, int] FR_NLP = French() class TokenLengthMismatchException(Exception): pass def normalize_ingredients(ingredients: str) -> str: normalized = ingredients.lower() normalized = normalized.replace("œu", "oeu") normalized = normalized.replace("’", "'") return normalized def normalize_item_ingredients(item: Dict) -> Dict: item = item.copy()
# Spacy
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################
reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
                password='******',
import torch.nn.functional as F
import torch.optim as optim

import spacy
from spacy.lang.fr import French

# %%
# python -m spacy download fr_core_news_sm
spacy_fr = spacy.load("fr_core_news_sm")

# %% [markdown]
# ## Tokenizing the corpus

# %%
# Create a tokenizer for the french language
tokenizer = French().Defaults.create_tokenizer()

with open("data/20_000_lieues_sous_les_mers.txt", "r", encoding="utf-8") as f:
    document = tokenizer(f.read())

# Define a filtered set of tokens by iterating on `document`
tokens = ...

# Make a list of unique tokens and dictionary that maps tokens to
# their index in that list.
idx2tok = []
tok2idx = {}
...

# %% [markdown]
# ## The continuous bag of words model
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self, language, tokenizer_method='spacy', remove_stopwords=True,
def __init__(self, lang="en"):
    if lang == "fr":
        self.nlp = French()
    else:
        self.nlp = English()
    self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
def split_into_lemmas_spacy(desc):
    nlp = French()
    doc = nlp(desc)
    return [w.lemma_ for w in doc]
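# Illustrative usage sketch (not part of the original snippet). Note that French()
# is a blank pipeline without a trained lemmatizer, so depending on the spaCy version
# and available lookup data, lemma_ may just be the lowercased form or even empty.
print(split_into_lemmas_spacy("Les chats mangent des souris."))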
from spacy.lang.fr import French

nlp = French()

# Import the Doc class
from ____ import ____

# Desired text: "spaCy est cool."
words = ["spaCy", "est", "cool", "."]
spaces = [True, True, False, False]

# Create a Doc from the words and spaces
doc = ____(____, words=words, spaces=spaces)
print(doc.text)
import json from spacy.lang.fr import French with open("exercises/fr/countries.json", encoding="utf8") as f: COUNTRIES = json.loads(f.read()) nlp = French() doc = nlp("La Tchéquie pourrait aider la Slovaquie à protéger son espace aérien") # Importe le PhraseMatcher et initialise-le from spacy.____ import ____ matcher = ____(____) # Crée des motifs objets Doc et ajoute-les au matcher # C'est la version rapide de : [nlp(country) for country in COUNTRIES] patterns = list(nlp.pipe(COUNTRIES)) matcher.add("COUNTRY", None, *patterns) # Appelle le matcher sur le document de test et affiche le résultat matches = ____(____) print([doc[start:end] for match_id, start, end in matches])
import numpy as np
from scipy import spatial
import sys
import unidecode
# from sklearn.decomposition import PCA

# QUERY Neighbours Ids_and_Score_bool
directory = '../'
argv = sys.argv

nlp = spacy.load("fr_core_news_lg")
pca = pickle.load(open(directory + 'models/pca_30.pkl', 'rb'))
pca_space = np.load(directory + 'models/vectors_pca_30.npy', allow_pickle=True)
id_table = list(np.load(directory + '../data/id_table.npy', allow_pickle=True))
tree = spatial.KDTree(pca_space)

from spacy.lang.fr.stop_words import STOP_WORDS
from spacy.lang.fr import French

parser = French()
stopwords = list(STOP_WORDS)


def process_query(search_query):
    query = str(search_query).lower()
    clean_query = unidecode.unidecode(query)
    tokens = parser(clean_query)
    tokens = [word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stopwords]
    tokens = " ".join([i for i in tokens])
    return tokens


def query2vec(search_query):
    x = nlp(search_query).vector  # spaCy 300d
import json

from spacy.lang.fr import French
from spacy.tokens import Doc

with open("exercises/fr/bookquotes.json", encoding="utf8") as f:
    DATA = json.loads(f.read())

nlp = French()

# Register the Doc extension "author" (default None)
Doc.set_extension("author", default=None)

# Register the Doc extension "book" (default None)
Doc.set_extension("book", default=None)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context["book"]
    doc._.author = context["author"]

    # Print the text and the custom attribute data
    print(f"{doc.text}\n — '{doc._.book}' par {doc._.author}\n")