def spacy_analyze(fulltext, source_lang):
    """Run the spaCy pipeline for ``source_lang`` over ``fulltext``.

    Only French (``'fr'``) and Italian (``'it'``) are handled. The model is
    loaded with the ``parser`` and ``ner`` components disabled.

    Parameters:
        fulltext (string): text to analyze
        source_lang (string): language code of the input text

    Returns:
        The object produced by calling the loaded pipeline on ``fulltext``,
        or ``None`` when the language is not handled or when loading or
        processing raised an exception (the exception type is printed in
        that case).
    """
    doc = None
    if source_lang in ('fr', 'it'):
        try:
            # The module lookup stays inside the ``try`` so a missing model
            # package is reported the same way as a processing failure.
            model = fr_core_news_sm if source_lang == 'fr' else it_core_news_sm
            nlp = model.load(disable=['parser', 'ner'])
            doc = nlp(fulltext)
        # ``Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        except Exception:
            print(sys.exc_info()[0])
    return doc
def lemmatize(text):
    """Return ``text`` with every token replaced by its lemma.

    The lemmas are joined with single spaces. A token whose lemma is the
    placeholder ``'-PRON-'`` keeps its surface text instead. The French
    model is loaded on every call.
    """
    pipeline = fr_core_news_sm.load()
    pieces = []
    for token in pipeline(text):
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return " ".join(pieces)
def main():
    """Filter a copula log down to lines whose subject and attribute are nouns.

    Command-line arguments:
        log: input file; every line must split on whitespace into exactly
            three fields, ``subject copula attribute`` (any other field
            count raises ``ValueError`` from the unpacking).
        out: output file; kept lines are written as the three fields
            separated by single spaces.

    A line is dropped when the subject or the attribute has two characters
    or fewer, or when the first token spaCy produces for either of them is
    not tagged ``NOUN`` or ``PROPN``.
    """
    parser = argparse.ArgumentParser(description="Prune the copula results")
    parser.add_argument("log")
    parser.add_argument("out")
    args = parser.parse_args()

    nlp = fr_core_news_sm.load()
    noun_tags = ("NOUN", "PROPN")

    # Both files live in one ``with`` so the output is closed (and flushed)
    # even when reading or tagging raises part-way through the log.
    with open(args.out, "w", encoding='UTF-8') as out, \
            open(args.log, "r", encoding='UTF-8') as file:
        for line in file:
            subject, copula, attribute = line.split()

            # remove small words like c', ce, l', la
            if len(subject) <= 2 or len(attribute) <= 2:
                continue

            s = nlp(subject, disable=["ner", "dep"])[0]
            if s.pos_ not in noun_tags:
                continue

            # The attribute is only tagged once the subject has passed.
            a = nlp(attribute, disable=["ner", "dep"])[0]
            if a.pos_ not in noun_tags:
                continue

            print(subject, copula, attribute, file=out)
def __init__(self):
    """Load the language model and prepare the token filters."""
    self.nlp = spacy_lang.load()
    # One entry per ASCII punctuation character, per digit, plus the
    # degree sign.
    self.symbols = set(string.punctuation + '0123456789' + '°')
    # French stop-word list, except that the words below stay accepted.
    self.stopwords = set(stopwords.words('french'))
    self.accepted_words = {'pas'}
    # Part-of-speech tags listed for removal.
    self.pos_to_remove = ['PUNCT', 'SPACE', 'NUM', 'DET', 'PROPN']
def LemmatizeWords(listWords):
    '''Lemmatization: return the lemma of every token found in the words.

    The words are converted to strings, joined with single spaces and run
    through the French model as one text, so the result has one lemma per
    token the model produces.
    '''
    joined = " ".join(map(str, listWords))
    pipeline = fr_core_news_sm.load()
    return [token.lemma_ for token in pipeline(joined)]
def __init__(self, BATCH_SIZE, DEVICE):
    """Load the French and English spaCy models and store the settings."""
    # Language models (French first, then English).
    self.spacy_fr = fr_core_news_sm.load()
    self.spacy_en = en_core_web_sm.load()

    # Special tokens.
    self.init_token, self.eos_token = "<sos>", "<eos>"
    self.pad_token, self.unk_token = "<pad>", "<unk>"

    # Caller-supplied settings, stored unchanged.
    self.BATCH_SIZE, self.DEVICE = BATCH_SIZE, DEVICE
def LemmatizeWords(listWords):
    '''Lemmatization

    :param listWords: A list of tokens
    :return: A list of the lemmatized tokens
    :rtype: List
    '''
    text = " ".join([str(item) for item in listWords])
    model = fr_core_news_sm.load()
    return [tok.lemma_ for tok in model(text)]
def profile(path):
    """Return the profile names from ``profile.csv`` that occur in a PDF.

    The text extracted by ``pdf2txt(path)`` is run through the French
    model. Non-stop-word tokens and noun chunks are compared, lower-cased,
    against the column names of ``profile.csv``. The matches are
    de-duplicated case-insensitively and returned capitalized, in no
    particular order.
    """
    nlp = fr_core_news_sm.load()
    doc = nlp(pdf2txt(path))
    chunks = list(doc.noun_chunks)
    words = [tok.text for tok in doc if not tok.is_stop]

    csv_path = os.path.join(os.path.dirname('data'), 'profile.csv')
    table = panda.read_csv(csv_path, encoding='latin1')
    profiles = list(table.columns.values)

    # Single tokens keep their original casing here; it is normalized below.
    matches = [word for word in words if word.lower() in profiles]
    for chunk in chunks:
        phrase = chunk.text.lower().strip()
        if phrase in profiles:
            matches.append(phrase)

    unique = {match.lower() for match in matches}
    return [name.capitalize() for name in unique]
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''Return the spaCy pipeline matching the language of a document.

    :param default_lingo: language name, e.g. ``"German"``, ``"English"``,
        ``"Spanish"``, ``"French"`` or ``"Portuguese"``; any other name
        that is listed in ``supported_languages`` gets the Italian model.
    :param supported_languages: container of accepted language names.
    :param bigmodel_required: ``False`` selects the ``*_sm`` models; any
        other value selects the ``*_md`` models where one is listed
        (Portuguese and Italian always use their ``*_sm`` model).
    :return: the loaded pipeline, or ``None`` (after printing a message)
        when ``default_lingo`` is not in ``supported_languages``.
    '''
    import importlib

    if default_lingo not in supported_languages:
        print("NOT A SUPPORTED LANGUAGE!")
        # This path used to fall through to ``return nlp`` with ``nlp``
        # never assigned, which raised UnboundLocalError.
        return None

    small_models = {
        "German": "de_core_news_sm",
        "English": "en_core_web_sm",
        "Spanish": "es_core_news_sm",
        "French": "fr_core_news_sm",
        "Portuguese": "pt_core_news_sm",
    }
    medium_models = {
        "German": "de_core_news_md",
        "English": "en_core_web_md",
        "Spanish": "es_core_news_md",
        "French": "fr_core_news_md",
        # there is no pt_md model
        "Portuguese": "pt_core_news_sm",
    }
    # Every other supported language falls back to Italian
    # (there is no it_md model either).
    fallback = "it_core_news_sm"

    # ``== False`` is kept on purpose: values such as ``None`` must keep
    # selecting the medium models, as they did before.
    models = small_models if bigmodel_required == False else medium_models  # noqa: E712
    module_name = models.get(default_lingo, fallback)
    # Imported lazily so only the requested model package has to be installed.
    return importlib.import_module(module_name).load()
def _nlp(spacy_module: str) -> Optional[NLP]:
    """Load the small spaCy model for a two-letter language code.

    Accepted codes are ``en``, ``es``, ``de``, ``fr``, ``it`` and ``pt``.
    A progress message is printed before the code is checked.

    :raises ValueError: if ``spacy_module`` is not one of the accepted codes.
    """
    print("Loading spacy language model for '", spacy_module, "'")

    # Each loader is wrapped in a lambda so that only the requested model
    # module is looked up and loaded.
    loaders = {
        'en': lambda: en_core_web_sm.load(),
        'es': lambda: es_core_news_sm.load(),
        'de': lambda: de_core_news_sm.load(),
        'fr': lambda: fr_core_news_sm.load(),
        'it': lambda: it_core_news_sm.load(),
        'pt': lambda: pt_core_news_sm.load(),
    }
    if spacy_module not in loaders:
        raise ValueError(f'Unsupported language {spacy_module}')
    return loaders[spacy_module]()
def __init__(self, url):
    """Download and parse the article at ``url`` and load a matching model.

    The URL is checked against a pattern first; a mismatch is only
    reported, and processing continues. After download and parse, the
    authors, summary and text (with newlines replaced by ``"."``) are
    stored. The spaCy model is chosen from the article's ``meta_lang``
    (``en``, ``it``, ``fr``, ``es``, ``pt``). An article with an empty
    ``meta_lang`` is treated as English only when the URL contains
    ``cnn.com``. For any other language a message is printed and
    ``self.model`` is left unset.

    If the article library raises ``ArticleException``, a message is
    printed and ``self.valid`` is set to ``False``; ``self.valid`` is not
    set on the success path.
    """
    try:
        # Raw string: the pattern is full of backslash sequences that are
        # regex escapes, not valid Python string escapes.
        pattern = re.compile(
            r"^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$"
        )
        if not pattern.match(url):
            print(f"{url} is not a valid url")

        self.url = url
        self.article = Article(self.url)
        self.article.download()
        self.article.parse()
        self.author = self.article.authors
        self.oneline = self.article.summary
        self.text = self.article.text.replace("\n", ".")

        lang = self.article.meta_lang
        # The previous test used ``url.find("cnn.com", 0, 10)`` as a
        # boolean. ``find`` returns -1 (truthy) when the text is absent and
        # 0 (falsy) when it is at the very start, so the check was
        # effectively inverted.
        if lang == 'en' or (lang == '' and "cnn.com" in url):
            import en_core_web_sm
            self.model = en_core_web_sm.load()
        elif lang == 'it':
            import it_core_news_sm
            self.model = it_core_news_sm.load()
        elif lang == 'fr':
            import fr_core_news_sm
            self.model = fr_core_news_sm.load()
        elif lang == 'es':
            import es_core_news_sm
            self.model = es_core_news_sm.load()
        elif lang == 'pt':
            import pt_core_news_sm
            self.model = pt_core_news_sm.load()
        else:
            print(
                f"The {self.article.meta_lang} language is not supported")

        self.data = []
        self.vectorizer = TfidfVectorizer(strip_accents='unicode')
    except article.ArticleException:
        print(
            f"The url {url} is not supported, please write to [email protected] for further help"
        )
        self.valid = False
def __init__(self, train_path, sequence_length=70):
    """Load the training CSV and fit the label encoder and TF-IDF vocabulary.

    :param train_path: path of a CSV file read with ``pd.read_csv``; it
        must provide a ``Label`` column and a ``Texte`` column.
    :param sequence_length: stored as ``self.seq_len``.

    Texts are cleaned by ``self.cleaner``: each text is run through the
    French model and reduced to the lower-cased lemmas of its alphabetic,
    non-stop-word tokens. The TF-IDF vectorizer is fitted on the cleaned
    texts, then the vocabulary dictionaries come from
    ``self.get_vocab_dicts()``.
    """
    # Storing some variables
    self.seq_len = sequence_length
    self.train_path = train_path
    self.pad_idx = 0  # putting padding index at 0

    print('Loading Spacy ....')
    # disable=['ner', 'tagger', 'parser'] for faster tokenization
    self.nlp = fr_core_news_sm.load(disable=['ner', 'tagger', 'parser'])
    self.cleaner = lambda x: [
        str(a.lemma_).lower() for a in self.nlp(x)
        if a.is_alpha and not a.is_stop
    ]

    self.train = pd.read_csv(train_path)
    labels = self.train['Label'].values
    self.n_classes = len(np.unique(labels))
    self.label_encode = LabelEncoder()
    self.label_encode.fit(labels)
    print(f'Number of classes: {self.n_classes}')
    # A bare ``print`` expression used to sit here. It is a Python 2
    # leftover that does nothing in Python 3, so it was dropped.

    self.vectorizer = TfidfVectorizer()
    # Reuse the cleaner instead of repeating its comprehension inline.
    self.vectorizer.fit(
        self.train['Texte'].apply(lambda x: ' '.join(self.cleaner(x))))

    self.id2word, self.word2id = self.get_vocab_dicts()
    self.num_words = len(self.id2word)
    print(f'Number of unique words: {len(self.id2word)}')
def check_spacy_models(main, lang, pipeline):
    """Ensure ``main`` holds a spaCy pipeline for ``lang`` suited to ``pipeline``.

    Pipelines are stored on ``main`` under the attribute
    ``spacy_nlp_<lang>``. For languages with a packaged model, a stored
    pipeline is reused only when its ``pipe_names`` equal the components
    required by ``pipeline``; otherwise it is dropped and the model is
    loaded again. Other languages get a blank pipeline on every call, and
    both Serbian variants are always assigned together. When the task
    needs sentence splitting, a ``sentencizer`` component is added if
    missing. Nothing is returned.
    """
    # Components the stored pipeline must expose, and components to
    # switch off when a packaged model is loaded.
    if pipeline == 'word_tokenization':
        nlp_pipelines, nlp_disable = [], ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines, nlp_disable = ['sentencizer'], ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines, nlp_disable = ['tagger'], ['parser', 'ner']

    store = main.__dict__
    key = f'spacy_nlp_{lang}'

    # Languages with models
    if lang in [
            'nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other'
    ]:
        # A stored pipeline with different components is discarded.
        if key in store and store[key].pipe_names != nlp_pipelines:
            del store[key]

        if key not in store:
            if lang == 'nld':  # Dutch
                import nl_core_news_sm as model_package
            elif lang in ('eng', 'other'):  # English, also for other languages
                import en_core_web_sm as model_package
            elif lang == 'fra':  # French
                import fr_core_news_sm as model_package
            elif lang == 'deu':  # German
                import de_core_news_sm as model_package
            elif lang == 'ell':  # Greek (Modern)
                import el_core_news_sm as model_package
            elif lang == 'ita':  # Italian
                import it_core_news_sm as model_package
            elif lang == 'por':  # Portuguese
                import pt_core_news_sm as model_package
            elif lang == 'spa':  # Spanish
                import es_core_news_sm as model_package

            store[key] = model_package.load(disable=nlp_disable)
    # Languages without models
    elif lang in ['srp_cyrl', 'srp_latn']:
        # Serbian (Cyrillic) & Serbian (Latin)
        store['spacy_nlp_srp_cyrl'] = spacy.blank('rs')
        store['spacy_nlp_srp_latn'] = spacy.blank('rs')
    else:
        store[key] = spacy.blank(wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = store[key]

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import fr_core_news_sm
from dateutil.parser import parse
import spacy
from spacy import displacy
from collections import Counter
import random

# Tag strings to avoid (how they are applied is not visible in this part
# of the file).
avoid_tags = ["ADV_", "ADP_", "VERB", "PRON"]
# Location of the binary word2vec-format vectors loaded below.
filepath_wac_fr_data = "data/frWac_no_postag_no_phrase_700_skip_cut50.bin"
# Both the spaCy model and the word vectors are loaded at import time.
nlp_fr = fr_core_news_sm.load()
french_model = KeyedVectors.load_word2vec_format(filepath_wac_fr_data, binary=True, unicode_errors="ignore")


def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    """
    try:
        parse(string, fuzzy=fuzzy)
        return True
################# Preprocessing #####################
# lowercase strings
X_train['designation'] = X_train['designation'].str.lower()


# remove non-alphanumeric characters
def remove_characters(string):
    """Replace each run of non-word characters, digits and underscores with one space."""
    # Raw string: "\w" and "\d" are regex escapes, not valid Python string
    # escapes, so the non-raw literal triggered an invalid-escape warning.
    string = re.sub(r"([^\w]|[\d_])+", " ", string)
    return string


X_train['designation'] = X_train['designation'].apply(remove_characters)

# define language detectors
language_detector = LanguageDetector()
# Every listed component is disabled on load; only the sentencizer and the
# language detector are added back.
nlp_fr = fr_core_news_sm.load(disable=["tagger", "parser", "ner", "entity_linker", "textcat", "entity_ruler", "sentencizer", "merge_noun_chunks", "merge_entities", "merge_subtokens"])
nlp_fr.add_pipe(nlp_fr.create_pipe('sentencizer'))
nlp_fr.add_pipe(language_detector)

# add a column for languages
X_train['language'] = X_train['designation'].str[:].apply(lambda row: nlp_fr(row)._.language['language'])

# plot the different languages of the dataset
fig, axes = plt.subplots(1, 1, figsize=(10, 5))
ax = sns.countplot(x="language", data=X_train, order=['fr', 'en', 'it', 'ca'])
def check_spacy_models(main, lang, pipeline):
    """Ensure ``main`` holds a spaCy pipeline for ``lang`` suited to ``pipeline``.

    ``'other'`` is handled as English. Pipelines are stored on ``main``
    under the attribute ``spacy_nlp_<lang>``. A stored pipeline is reused
    only when its ``pipe_names`` equal the components required by
    ``pipeline``; otherwise it is dropped and the model is loaded again.
    Languages outside the model list are not loaded here. When the task
    needs sentence splitting, a sentencizer is added unless a component
    named ``sbd`` is already present. Nothing is returned.
    """
    if lang == 'other':
        lang = 'eng'

    # Components the stored pipeline must expose, and components to
    # switch off when a packaged model is loaded.
    if pipeline == 'word_tokenization':
        nlp_pipelines, nlp_disable = [], ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines, nlp_disable = ['sbd'], ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines, nlp_disable = ['tagger'], ['parser', 'ner']

    store = main.__dict__
    key = f'spacy_nlp_{lang}'

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        # A stored pipeline with different components is discarded.
        if key in store and store[key].pipe_names != nlp_pipelines:
            del store[key]

        if key not in store:
            if lang == 'nld':  # Dutch
                import nl_core_news_sm as model_package
            elif lang == 'eng':  # English
                import en_core_web_sm as model_package
            elif lang == 'fra':  # French
                import fr_core_news_sm as model_package
            elif lang == 'deu':  # German
                import de_core_news_sm as model_package
            elif lang == 'ell':  # Greek (Modern)
                import el_core_news_sm as model_package
            elif lang == 'ita':  # Italian
                import it_core_news_sm as model_package
            elif lang == 'por':  # Portuguese
                import pt_core_news_sm as model_package
            elif lang == 'spa':  # Spanish
                import es_core_news_sm as model_package

            store[key] = model_package.load(disable=nlp_disable)

    if 'sbd' in nlp_pipelines:
        nlp = store[key]

        if 'sbd' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
import re
from selenium import webdriver
from selenium.webdriver.common import keys
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import en_core_web_sm
import fr_core_news_sm
# Import summarize from gensim
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
# Import the library
# to convert MSword doc to txt for processing.
import docx2txt

# Two separate instances of the French model are loaded at import time,
# one per document kind. The English model is imported above but not
# loaded here.
nlp_job = fr_core_news_sm.load()
nlp_resume = fr_core_news_sm.load()

from dotenv import load_dotenv, dotenv_values
from getpass import getpass

# Settings are taken from the local ".env" file.
load_dotenv()
config = dotenv_values(".env")


def get_jobs_links(job_query, user, pwd):
    """[summary]

    Args:
        job_query ([type]): [description]
    """
either from texts or from documents. '''

# importing necessary modules
from flask import Flask, render_template, url_for, request
from flask_bootstrap import Bootstrap
from collections import Counter
from docx2python import docx2python
from tika import parser
import spacy
import en_core_web_sm as en
import fr_core_news_sm as fr

# load the languages (both models are loaded once, at import time)
nlp_fr = fr.load()
nlp_en = en.load()

# part-of-speech tags of interest (how they are applied is not visible in
# this part of the file)
pos_tag = ['NOUN', 'PROPN', 'VERB', 'ADJ']

''' Define a function that checks the type of file uploaded by the client and read it accordingly Define another function that does the counting of the words '''


def DocType(source):
    result = source.filename
    # split the filename on "." to get the file extension (the last piece)
    result_splitted = result.split('.')
    file_extension = result_splitted[-1]
    # check the extension type and use the appropriate method to read
    if file_extension == "docx":
def main():
    """Load the French spaCy model and print the loaded pipeline object."""
    # Loading the model takes 10-20 seconds.
    print(fr_core_news_sm.load())
# -*- coding: utf-8 -*-
"""
@author: LOX
"""
import spacy
from spacy import displacy
import requests
from bs4 import BeautifulSoup
import random
import fr_core_news_sm

# =======================================================
# Sentence Analysis
# =======================================================

# Alternative way to load the same model: spacy.load("fr_core_news_sm")
nlp = fr_core_news_sm.load()

# The sentence to analyse is read from standard input. The hard-coded
# sample sentences that used to be commented out here were removed.
sentence = input()
doc = nlp(sentence)

# Text Preprocessing | Lemmatization
# Plain literal: the previous f-string had no placeholders and the
# .format() call chained onto it had no effect.
print("\n" + "Token\t\tLemma\t\tStopword\tDEP\t\tPOS")
print("-" * 70)
for token in doc:
    print(
        f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}\t\t{token.dep_}\t\t{token.pos_}"
    )
def load_spacy():
    """Load the French model with an extra tokenizer suffix for percentages.

    The model is loaded with the ``parser`` and ``tagger`` components
    disabled. The tokenizer's suffix search is rebuilt from the default
    suffix patterns plus one more: optional digits, an optional ``.`` or
    ``,``, optional digits, then ``%``.

    :return: the configured pipeline.
    """
    nlp = fr_core_news_sm.load(disable=["parser", "tagger"])
    # ``Defaults.suffixes`` can be a tuple or a list depending on the
    # language and spaCy version. Converting to a list keeps the
    # concatenation valid in both cases.
    suffixes = list(nlp.Defaults.suffixes) + [r"\d*?[\.,]?\d*\%"]
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search
    return nlp
def _spacy_agreement_counts(df_stanford, df_spacy):
    """Return ``(agreeing_rows, total_rows)`` for one language's parse tables.

    The two frames are concatenated column-wise. The combined frame must
    provide the columns ``upos``, ``deprel``, ``sp_upos`` and
    ``sp_deprel`` (presumably the first two come from the Stanford frame
    and the ``sp_`` ones from the spaCy frame).
    """
    combined = pd.concat([df_stanford, df_spacy], axis=1, sort=False)
    combined['spacy_eval_upos'] = (
        combined['upos'].str.lower() == combined['sp_upos'].str.lower())
    combined['spacy_eval_deprel'] = (
        combined['deprel'].str.lower() == combined['sp_deprel'].str.lower())
    # NOTE(review): a row counts as agreeing when the two flags are EQUAL,
    # which includes rows where both the tag and the relation differ. Kept
    # as it was — confirm whether a logical AND was intended.
    combined['spacy_eval'] = (
        combined['spacy_eval_upos'] == combined['spacy_eval_deprel'])
    # Count the True rows directly. ``value_counts()`` orders by frequency,
    # so its first entry is the most common value (not necessarily True),
    # and it has a single entry when all rows agree or all disagree.
    return int(combined['spacy_eval'].sum()), len(combined)


def evaulate_parsers(article_de, article_en, article_fr):
    """Score the spaCy parses of three articles against the Stanford parses.

    Each article is parsed with a stanza pipeline (processors
    ``tokenize,mwt,pos,lemma,depparse``) through ``stanford.parse_stan``
    and with the small spaCy model of its language through
    ``spacyparser.parse_spacy``. For each language, the number of agreeing
    rows and the total number of rows are passed to ``calculate_score``.

    :param article_de: German article, passed to both parsers.
    :param article_en: English article, passed to both parsers.
    :param article_fr: French article, passed to both parsers.
    :return: dict with the keys ``<lang>_stan``, ``<lang>_spacy`` and
        ``<lang>_allen`` for ``de``, ``en`` and ``fr``. The Stanford
        entries are fixed at 1 and the AllenNLP entries at 0.
    """
    # TODO Create AllenNLP Parsing Function, and call the proper values....
    allen_scores = [0, 0, 0]

    # The Stanford parser is the baseline for this application: every
    # other parser is compared to its output.
    config = "tokenize,mwt,pos,lemma,depparse"
    nlp_en = stanza.Pipeline(lang='en', processors=config)
    nlp_de = stanza.Pipeline(lang='de', processors=config)
    nlp_fr = stanza.Pipeline(lang='fr', processors=config)
    df_stanford_en = stanford.parse_stan(article_en, nlp_en)
    df_stanford_de = stanford.parse_stan(article_de, nlp_de)
    df_stanford_fr = stanford.parse_stan(article_fr, nlp_fr)

    # spaCy parser: one small model per language.
    df_spacy_de = spacyparser.parse_spacy(article_de, de_core_news_sm.load())
    df_spacy_en = spacyparser.parse_spacy(article_en, en_core_web_sm.load())
    df_spacy_fr = spacyparser.parse_spacy(article_fr, fr_core_news_sm.load())

    # Evaluate the parsers against the Stanford parse (order: de, en, fr).
    stanford_scores = [1, 1, 1]
    spacy_scores = [
        calculate_score(*_spacy_agreement_counts(df_stanford, df_spacy))
        for df_stanford, df_spacy in (
            (df_stanford_de, df_spacy_de),
            (df_stanford_en, df_spacy_en),
            (df_stanford_fr, df_spacy_fr),
        )
    ]

    # The Stanford entries are constants because it is the parser the
    # others are compared against; the other entries come from the
    # comparison above.
    report_data = {
        'de_stan': stanford_scores[0],
        'en_stan': stanford_scores[1],
        'fr_stan': stanford_scores[2],
        'de_spacy': spacy_scores[0],
        'en_spacy': spacy_scores[1],
        'fr_spacy': spacy_scores[2],
        'de_allen': allen_scores[0],
        'en_allen': allen_scores[1],
        'fr_allen': allen_scores[2]
    }
    return report_data
#!/usr/bin/env python3 import sys import re import os import argparse import requests from bs4 import BeautifulSoup, Comment from random import shuffle from utils import splitIntoWords, filter_numbers, maybe_normalize, extract_sentences, check_output_dir, set_custom_boundaries import spacy try: import fr_core_news_sm #if it doesn't work, an alternative is: nlp = spacy.load('fr_core_news_sm') https://spacy.io/models/fr. See also line nlp = fr_core_news_sm.load(), at the bottom of the page nlp = fr_core_news_sm.load( ) #if it doesn't work, try: nlp = spacy.load('fr_core_news_sm'). See imports, and https://spacy.io/models/fr, https://spacy.io/models/fr, etc. except ModuleNotFoundError: from spacy.cli import download as spacy_model_download spacy_model_download('fr_core_news_sm') nlp = spacy.load('fr_core_news_sm') import nltk nltk.download('punkt') # - prose # - 19è + 20è siècle LIBRETHEATRE_URL = 'https://data.libretheatre.fr/ajax?__fromnavigation=1&rql=DISTINCT+Any+X%2CA%2CX%2CG%2CX%2CF%2CM%2CW+ORDERBY+XAT+WHERE+X+genre+G%2C+A+author_of+X%2C+X+preferred_form+XA%2C+X+text_form+F%2C+XA+title+XAT%2C+X+nb_men+M%2C+X+nb_women+W%2C+X+text_form+%22Prose%22%2C+X+timespan+B%2C+B+eid+IN(1742%2C+3181)&__force_display=1&vid=table.work.no-filter&divid=table_work_no_filter_28fab344fb3a4775b10b359c84710a16&fname=view&pageid=1403154733050406ce179a062b74023961c80756d6f8349' WORK_TEMPLATE = 'https://data.libretheatre.fr/work/%(workid)d' PD_LICENCE = 'https://data.libretheatre.fr/license/1747' mapping_specific = [
from resources.config_provider import get_config_default
from viewer.spacy_viewer import view_spacy_docs
import fr_core_news_sm  # added by piter

warnings.filterwarnings('ignore')

# Settings read from the default configuration.
config_training = get_config_default()
model_dir_path = config_training["model_dir_path"]
xml_dev_path = config_training["xml_dev_path"]
number_of_paragraph_to_display = int(
    config_training["number_of_paragraph_to_display"])

# The stock French model is used; loading the project's own model from
# ``model_dir_path`` is left disabled below.
# nlp = get_empty_model(load_labels_for_training=False)
# nlp = nlp.from_disk(model_dir_path)
nlp = fr_core_news_sm.load()  # added by piter

DEV_DATA = get_paragraph_from_file(xml_dev_path,
                                   keep_paragraph_without_annotation=True)

all_docs_to_view: List[Doc] = list()
# last_case_spans = dict()
last_case_docs: List[Doc] = list()
former_case_id = None
entity_typename_builder = EntityTypename()

# Only the first ``number_of_paragraph_to_display`` entries are processed.
with tqdm(total=len(DEV_DATA[:number_of_paragraph_to_display]),
          unit=" paragraphs",
          desc="Find entities") as progress_bar:
    for (case_id, original_text, _, _) in DEV_DATA[:number_of_paragraph_to_display]: