def lemmatize(serie):
    """Lower-case, clean and lemmatize every post of a series.

    Each post is lower-cased, passed through ``remove_hyperlink`` and then
    run through a French spaCy pipeline extended with the spacy-lefff POS
    tagger and lemmatizer.

    NOTE(review): the lemmas are read from spaCy's ``lemma_`` attribute;
    spacy-lefff appears to expose its own result as ``token._.lefff_lemma``
    -- confirm which one is intended.

    Parameters
    ----------
    serie : pandas.Series
        The column of posts to process.

    Returns
    -------
    pandas.Series
        For every post, the list of the ``lemma_`` values of its tokens.
    """
    tagger = POSTagger()
    lefff = LefffLemmatizer(after_melt=True)
    pipeline = spacy.load('fr_core_news_sm')
    pipeline.add_pipe(tagger, name='pos', after='parser')
    pipeline.add_pipe(lefff, name='lefff', after='pos')

    def _lower(post):
        return post.lower()

    def _lemmas(post):
        return [token.lemma_ for token in pipeline(post)]

    lowered = serie.map(_lower)
    without_links = lowered.map(remove_hyperlink)
    return without_links.map(_lemmas)
def _get_lang(self):
    """Detect the page language from the ``lang`` attribute of ``<html>``.

    The first two characters of the attribute are stored in ``self.lang``
    and the spaCy model of that name is loaded into ``self.nlp``.  For
    French pages the Lefff lemmatizer is added to the pipeline under the
    name ``lefff``.
    """
    html_tag = self.soup.find('html')
    self.lang = html_tag['lang'][:2]
    self.nlp = spacy.load(self.lang)
    if self.lang != 'fr':
        return
    # Only the French pipeline gets the extra Lefff component.
    self.nlp.add_pipe(LefffLemmatizer(), name='lefff')
# Lazily-built French pipeline shared by every call to ``lemmatization``.
_LEFFF_PIPELINE = None


def _get_lefff_pipeline():
    """Return the shared French pipeline, building it on first use."""
    global _LEFFF_PIPELINE
    if _LEFFF_PIPELINE is None:
        # spacy_lefff is used to lemmatize French (not available in NLTK).
        # The French model must be installed beforehand
        # (e.g. ``python -m spacy download fr_core_news_md``).
        import spacy
        from spacy_lefff import LefffLemmatizer

        pipeline = spacy.load('fr_core_news_md')
        pipeline.add_pipe(LefffLemmatizer(), name='lefff')
        _LEFFF_PIPELINE = pipeline
    return _LEFFF_PIPELINE


def lemmatization(x):
    """Return the lemmas of the tokens of ``x`` joined by single spaces.

    The spaCy model is loaded and the Lefff component registered only once;
    later calls reuse the same pipeline instead of rebuilding it for every
    input text.

    NOTE(review): the lemmas are read from spaCy's ``lemma_`` attribute;
    spacy-lefff appears to expose its own result as ``token._.lefff_lemma``
    -- confirm which one is intended.

    Parameters
    ----------
    x : str
        The text to lemmatize.

    Returns
    -------
    str
        The ``lemma_`` of every token, separated by spaces.
    """
    doc = _get_lefff_pipeline()(x)
    return " ".join(token.lemma_ for token in doc)
def lemmatize(serie):
    """Map every text of ``serie`` to the list of its token lemmas.

    A French spaCy pipeline is built with the spacy-lefff POS tagger placed
    after the parser and the Lefff lemmatizer placed after the tagger, then
    applied to each element through ``serie.map``.

    NOTE(review): the lemmas are read from spaCy's ``lemma_`` attribute;
    spacy-lefff appears to expose its own result as ``token._.lefff_lemma``
    -- confirm which one is intended.

    Parameters
    ----------
    serie : object exposing ``map`` (presumably a pandas Series -- confirm)
        The texts to lemmatize.

    Returns
    -------
    The result of ``serie.map``: one list of ``lemma_`` values per text.
    """
    tagger = POSTagger()
    lefff = LefffLemmatizer(after_melt=True)
    pipeline = spacy.load('fr_core_news_sm')
    pipeline.add_pipe(tagger, name='pos', after='parser')
    pipeline.add_pipe(lefff, name='lefff', after='pos')

    def _lemmas(text):
        return [token.lemma_ for token in pipeline(text)]

    return serie.map(_lemmas)
import re
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from spacy_lefff import LefffLemmatizer, POSTagger
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import pprint

# ---------------------------- Constants ----------------------------
# Module-wide French pipeline; the Lefff lemmatizer is inserted before "ner".
nlp_french = spacy.load('fr')
french_lemmatizer = LefffLemmatizer()
nlp_french.add_pipe(french_lemmatizer, name='lefff', before="ner")


class Site(object):
    """A site: keywords, internal URLs, external URLs, domain name,
    document_matrix.

    The constructor shown here stores the domain (``root_url``), the URL the
    site was entered through (``entry_point``), the ``scheme://domain``
    prefix (``site_url``) and the page built for the entry URL
    (``home_page``).
    """

    def __init__(self, url):
        """Split ``url`` into its parts and build the home page from it."""
        parts = urlparse(url)
        self.root_url = parts.netloc
        self.entry_point = url
        self.site_url = "://".join((parts.scheme, self.root_url))
        self.home_page = self.factory_page(url)

    def factory_page(self, page_url):
        """Factory method: build a ``Page`` for ``page_url`` on this site."""
        return Page(page_url, self.root_url, self.site_url)
app = Flask(__name__)

# One pickled model per language, read from paths relative to the working
# directory (presumably trained classifiers -- confirm).
# NOTE(review): the file objects handed to pickle.load are never closed
# explicitly.
model_fr = pickle.load(open('model_fr.pkl', 'rb'))
model_en = pickle.load(open('model_en.pkl', 'rb'))

# Labels of the review classes.
class_review = ["neutral", "positive", "negative"]

sws_fr = stopwords.words('french')  # French stop words
sws_en = stopwords.words('english')  # English stop words
# Negation tokens appended to the English stop-word list.
list_sw_en_more = ["n't", "not", "no"]
sws_en = sws_en + list_sw_en_more

FrenchStemmer = SnowballStemmer("french")  # French stemmer
porter = PorterStemmer()  # English stemmer
WNlemmatizer = WordNetLemmatizer()  # English lemmatizer

nlp = spacy.load("fr_core_news_sm")  # French lemmatization pipeline
pos = POSTagger()
french_lemmatizer = LefffLemmatizer(after_melt=True)
# The tagger goes after the parser and the Lefff lemmatizer after the tagger.
nlp.add_pipe(pos, name='pos', after='parser')
nlp.add_pipe(french_lemmatizer, name='lefff', after='pos')


@app.route('/')
def home():
    """Render ``home.html`` with a fixed ``name`` value."""
    name = "nao"
    return render_template('home.html', name=name)


@app.route('/test', methods=['POST'])
def test():
    """Handle the review form submitted via POST to ``/test``."""
    result = request.form
    # Value of the submitted form's 'review' field.
    r = result['review']
    #prediction = "positive"
def nlp():
    """Build and return a French spaCy pipeline with the Lefff lemmatizer.

    The ``'fr'`` model is loaded and a ``LefffLemmatizer`` is inserted after
    the ``parser`` component.
    """
    pipeline = spacy.load('fr')
    pipeline.add_pipe(LefffLemmatizer(), after='parser')
    return pipeline
def add_lefff_lemma_nlp(nlp_pos):
    """Add a Lefff lemmatizer to ``nlp_pos`` and return the same pipeline.

    The lemmatizer is created with ``after_melt=True`` and inserted after
    the component named ``'POSTagger'``; ``nlp_pos`` is modified in place.
    """
    nlp_pos.add_pipe(LefffLemmatizer(after_melt=True), after='POSTagger')
    return nlp_pos
def create_french_lemmatizer(nlp, name):
    """Create a ``LefffLemmatizer`` with ``after_melt`` and ``default`` on.

    ``nlp`` and ``name`` are accepted but not used here (presumably to match
    a spaCy component-factory signature -- confirm).
    """
    lemmatizer = LefffLemmatizer(after_melt=True, default=True)
    return lemmatizer
def test_lemmatizer_default():
    """With ``default=True``, lemmatizing "Apple" as a NOUN gives "apple"."""
    lemma = LefffLemmatizer(default=True).lemmatize(u"Apple", u"NOUN")
    assert lemma == u"apple"
def test_lemmatizer_exception():
    """Lemmatizing "unknow34" with the tag "unknown" returns ``None``."""
    outcome = LefffLemmatizer().lemmatize(u"unknow34", u"unknown")
    assert outcome is None
def __init__(self):
    """Load the ``'fr'`` spaCy model and register the Lefff lemmatizer.

    The pipeline is kept in ``self.nlp`` and the lemmatizer in
    ``self.lemmatizer``; the lemmatizer is added to the pipeline under the
    name ``lefff``.
    """
    self.nlp = pipeline = spacy.load('fr')
    self.lemmatizer = lefff = LefffLemmatizer()
    pipeline.add_pipe(lefff, name='lefff')
def create_french_lemmatizer(nlp, name):
    """Create a ``LefffLemmatizer`` with its default settings.

    ``nlp`` and ``name`` are accepted but not used here (presumably to match
    a spaCy component-factory signature -- confirm).
    """
    lemmatizer = LefffLemmatizer()
    return lemmatizer