def _get_wordnet():
    try:
        wn = wordnet.WordNetLemmatizer()
        wn.lemmatize("this is a test")
    except LookupError:
        # The lemmatizer is lazy: the first lemmatize() call raises LookupError
        # when the WordNet corpus has not been downloaded yet.
        print("Missing wordnet data -- attempting to download")
        import nltk
        nltk.download("wordnet")
        wn = wordnet.WordNetLemmatizer()
    return wn
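# Minimal usage sketch for _get_wordnet(), assuming "wordnet" is the
# nltk.stem.wordnet module imported at module level (the original import
# block is not shown in these snippets).
from nltk.stem import wordnet

lemmatizer = _get_wordnet()
print(lemmatizer.lemmatize("corpora"))  # -> "corpus"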
def preprocess_tweets(df):
    corpus = []
    stem = PorterStemmer()
    lem = wordnet.WordNetLemmatizer()
    for tweet in df['message']:
        words = [w for w in word_tokenize(tweet)
                 if w.lower() not in stop and username_filter(w)]
        words = [lem.lemmatize(w) for w in words if len(w) > 2]
        corpus.append(words)
    return corpus
def lemmatize1(essay):
    lemma = wn.WordNetLemmatizer()
    pos_list = pos_tag(essay)
    cleaned_essay = []
    for word, tag in pos_list:
        # Skip user mentions; lemmatize everything else with its mapped POS.
        if '@' not in word:
            cleaned_essay.append(lemma.lemmatize(word, get_wordnet_pos(tag)))
    return cleaned_essay
def passive_voice_detection(sentence):
    # "sentence" is a POS-tagged sentence: a list of (word, tag) tuples.
    lemma = wn.WordNetLemmatizer()
    error = 0
    for i in range(len(sentence) - 1):
        if sentence[i][1][0] == 'V':
            check = lemma.lemmatize(sentence[i][0], pos="v")
            # A form of "be" followed by another verb that is not a gerund
            # (e.g. "was thrown") is flagged as passive voice.
            if (check == 'be' and sentence[i + 1][1][0] == 'V'
                    and sentence[i + 1][1] != 'VBG'):
                error = 1
                break
    return error
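# Usage sketch for passive_voice_detection, assuming "wn" aliases the
# nltk.stem.wordnet module and the input comes from nltk.pos_tag.
from nltk import pos_tag, word_tokenize
from nltk.stem import wordnet as wn

tagged = pos_tag(word_tokenize("The ball was thrown by the boy."))
print(passive_voice_detection(tagged))  # expected: 1 ("was thrown" looks passive)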
def lemmatize(essay):
    lemma = wn.WordNetLemmatizer()
    lemma_word = []
    for word in essay:
        # Skip user mentions; otherwise lemmatize as noun, then verb, then adjective.
        if '@' not in word:
            word1 = lemma.lemmatize(word, pos="n")
            word2 = lemma.lemmatize(word1, pos="v")
            word3 = lemma.lemmatize(word2, pos="a")
            lemma_word.append(word3)
    return lemma_word
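# Usage sketch for lemmatize, assuming "wn" aliases nltk.stem.wordnet and the
# essay is already tokenized.
from nltk.stem import wordnet as wn

print(lemmatize(["cats", "running", "@user"]))  # expected: ['cat', 'run']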
def nlp_preprocess_text(text_proc):
    lemmatizer = wordnet.WordNetLemmatizer()
    text_proc = standardize_new_text(text_proc)
    cleaned_tokens = []
    tokens = word_tokenize(text_proc.lower())
    for token in tokens:
        if 0 < len(token) < 20:  # drop empty strings and over-long non-words
            if not token[0].isdigit() and not token[-1].isdigit():  # drop numbers
                token = spell(token)  # spell() is assumed to be a spell-correction helper defined elsewhere
                lemmed_token = lemmatizer.lemmatize(token)
                cleaned_tokens.append(lemmed_token)
    return ' '.join(cleaned_tokens)
def token_lemma_stem(text):
    # 1. tokenize words and drop pass words
    tokenized_words = word_tokenize(preprocess(text))
    tokenized_words = [w.lower() for w in tokenized_words if w.lower() not in pass_words]
    # 2. lemmatize words (tokens are already lowercase)
    lemmatizer = wordnet.WordNetLemmatizer()
    tokenized_words = [lemmatizer.lemmatize(w) for w in tokenized_words]
    # 3. stem words with PorterStemmer
    porter = PorterStemmer()
    tokenized_words = [porter.stem(w) for w in tokenized_words]
    return tokenized_words
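# token_lemma_stem chains lemmatization and Porter stemming; a self-contained
# sketch of that two-step reduction (preprocess and pass_words are helpers
# assumed to be defined elsewhere in the original code):
from nltk.stem import PorterStemmer, wordnet

_lem = wordnet.WordNetLemmatizer()
_porter = PorterStemmer()
for w in ["studies", "running", "corpora"]:
    print(w, "->", _lem.lemmatize(w), "->", _porter.stem(_lem.lemmatize(w)))
# e.g. "studies" -> "study" -> "studi"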
def text_normalization(text):
    text = str(text).lower()
    spl_char_text = re.sub(r'[^ a-z]', '', text)  # keep only letters and spaces
    tokens = nltk.word_tokenize(spl_char_text)
    lema = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens, tagset=None)
    lema_words = []
    for token, pos_token in tags_list:
        # Map Penn Treebank tags to WordNet POS: verb, adjective, adverb, else noun.
        if pos_token.startswith('V'):
            pos_val = 'v'
        elif pos_token.startswith('J'):
            pos_val = 'a'
        elif pos_token.startswith('R'):
            pos_val = 'r'
        else:
            pos_val = 'n'
        lema_words.append(lema.lemmatize(token, pos_val))
    return " ".join(lema_words)
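# Usage sketch for text_normalization, assuming "wordnet" is nltk.stem.wordnet
# and pos_tag comes from nltk.
import re
import nltk
from nltk import pos_tag
from nltk.stem import wordnet

print(text_normalization("The cats were running quickly!"))
# expected roughly: "the cat be run quickly"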
def fct_nltk(text, stop_words):
    """
    Remove:
        stop words
        punctuation
        uppercase letters
        plurals
    """
    # Create the lemmatizer object
    lemma = wordnet.WordNetLemmatizer()
    # Tokenize and lowercase
    words = word_tokenize(text.lower())
    # Keep alphabetic, non-stop-word tokens and lemmatize them (removes plurals)
    new_sentence = [lemma.lemmatize(x) for x in words
                    if x not in stop_words and x.isalpha()]
    # Output
    return new_sentence
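# Usage sketch for fct_nltk with the NLTK English stop word list (an
# assumption; any iterable of stop words works).
from nltk.corpus import stopwords
from nltk.stem import wordnet
from nltk.tokenize import word_tokenize

print(fct_nltk("The cats are sleeping on the mats!", set(stopwords.words("english"))))
# expected: ['cat', 'sleeping', 'mat']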
def reduz_ao_radical_lem(lista_palavras):
    lemmatizer = wordnet.WordNetLemmatizer()
    for i in range(len(lista_palavras)):
        lista_palavras[i] = lemmatizer.lemmatize(lista_palavras[i])
    return lista_palavras