import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer


class Token_Preprocessing_Engine(object):
    def __init__(self):
        # USE_LEMMATIZER is assumed to be a module-level boolean flag.
        if USE_LEMMATIZER:
            # WordNetLemmatizer (lemmatization)
            nltk.download('wordnet')
            self.engine = WordNetLemmatizer()
        else:
            # PorterStemmer (stemming)
            self.engine = PorterStemmer()

    def process_token(self, token):
        if USE_LEMMATIZER:
            term = self.engine.lemmatize(token)
        else:
            term = self.engine.stem(token)
        return term.lower()
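A minimal usage sketch for the class above, assuming USE_LEMMATIZER is set at module level before instantiation (the flag value and tokens here are illustrative):

USE_LEMMATIZER = True  # hypothetical setting; normally defined elsewhere in the module

engine = Token_Preprocessing_Engine()
print(engine.process_token('geese'))  # 'goose' (dictionary form from WordNet)
print(engine.process_token('cars'))   # 'car'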
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer


def stem(tokens, option='WORDNET'):
    # option is one of 'PORTER', 'SNOWBALL', 'WORDNET'
    # reference:
    # http://stackoverflow.com/questions/1787110/what-is-the-true-difference-between-lemmatization-vs-stem
    stem_tks = []
    if option == 'PORTER':
        stem_tool = PorterStemmer()
    elif option == 'SNOWBALL':
        stem_tool = SnowballStemmer('english')
    elif option == 'WORDNET':
        stem_tool = WordNetLemmatizer()
    # tokens is expected to be a list of token lists (one per sentence/document).
    for t_lst in tokens:
        if option in ['PORTER', 'SNOWBALL']:
            stem_tks.append([stem_tool.stem(t) for t in t_lst])
        else:
            stem_tks.append([stem_tool.lemmatize(t) for t in t_lst])
    return stem_tks
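A quick check of the helper above on a nested token list (the input sentences are illustrative; the outputs are what NLTK's Porter stemmer and WordNet lemmatizer produce):

docs = [['running', 'flies', 'easily'], ['increases', 'studies']]
print(stem(docs, option='PORTER'))   # [['run', 'fli', 'easili'], ['increas', 'studi']]
print(stem(docs, option='WORDNET'))  # [['running', 'fly', 'easily'], ['increase', 'study']]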
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer, WordNetLemmatizer


def stem_lem_text(s, type='Lancaster'):
    # Normalize each whitespace-separated word with the chosen stemmer/lemmatizer.
    words = s.split()
    if type == 'Porter':
        choice = PorterStemmer()
        reformed = [choice.stem(word) for word in words]
    elif type == 'Snowball':
        choice = SnowballStemmer('english')
        reformed = [choice.stem(word) for word in words]
    elif type == 'Lemmatize':
        choice = WordNetLemmatizer()
        reformed = [choice.lemmatize(word) for word in words]
    else:
        choice = LancasterStemmer()
        reformed = [choice.stem(word) for word in words]
    return " ".join(reformed)
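A short demonstration of stem_lem_text (the sentence is illustrative):

s = 'the cats are running quickly'
print(stem_lem_text(s, type='Porter'))     # 'the cat are run quickli'
print(stem_lem_text(s, type='Lemmatize'))  # 'the cat are running quickly'
print(stem_lem_text(s))                    # Lancaster default: the most aggressive stemmer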
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize


class NLTKTokenizer(object):
    """
    http://scikit-learn.org/stable/modules/feature_extraction.html
    http://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
    http://nltk.org/api/nltk.tokenize.html
    """
    def __init__(self):
        # Alternatives that were tried:
        # self.wnl = LancasterStemmer()
        self.wnl = PorterStemmer()  # best so far
        # self.wnl = GermanStemmer()
        # self.wnl = EnglishStemmer(ignore_stopwords=True)
        # self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        # Split into sentences, tokenize each, then flatten to one token list.
        words = [word_tokenize(t) for t in sent_tokenize(doc)]
        words = [item for sublist in words for item in sublist]
        # Stemmers expose .stem(); the WordNet lemmatizer exposes .lemmatize().
        if hasattr(self.wnl, 'stem'):
            words = [self.wnl.stem(t) for t in words]
        else:
            words = [self.wnl.lemmatize(t) for t in words]
        return words
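The docstring points at scikit-learn's feature-extraction guide, so the likely use is passing an instance as a custom tokenizer. A minimal sketch (the corpus is illustrative; note that punctuation tokens survive word_tokenize, which is what the Stack Overflow link above addresses):

from sklearn.feature_extraction.text import CountVectorizer

corpus = ['The cats are running.', 'A cat ran.']
vec = CountVectorizer(tokenizer=NLTKTokenizer())
X = vec.fit_transform(corpus)
print(vec.get_feature_names_out())  # stemmed terms such as 'cat' and 'run', plus '.'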
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize


def process_text(self, text_corpus):
    # Tokenize and normalize textual phrases.
    processed_corpus = dict()
    # Define the word normalizer using NLTK (self.norm is 'stem', 'lemma', or falsy).
    if self.norm:
        text_normalizer = PorterStemmer() if self.norm == 'stem' else WordNetLemmatizer()
    # Loop over the whole corpus.
    for key, sentences in text_corpus.items():
        processed_list = list()
        for sent in sentences:
            # Tokenize the sentence.
            tokens = word_tokenize(sent)
            # Normalize the sentence (if required).
            if self.norm == 'stem':
                tokens = [text_normalizer.stem(word) for word in tokens]
            elif self.norm == 'lemma':
                tokens = [text_normalizer.lemmatize(word) for word in tokens]
            processed_list.append(tokens)
        processed_corpus[key] = processed_list
    return processed_corpus
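A sketch of how this method might be driven; the owning class here is hypothetical and only supplies the self.norm attribute the method expects ('stem', 'lemma', or None):

class TextPreprocessor:  # hypothetical owner class for the method above
    def __init__(self, norm='lemma'):
        self.norm = norm
    process_text = process_text  # bind the module-level function as a method

corpus = {'docA': ['The cats are running.'], 'docB': ['It increases daily.']}
print(TextPreprocessor().process_text(corpus)['docA'])
# [['The', 'cat', 'are', 'running', '.']]  (requires nltk.download('wordnet'))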
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 23 08:50:02 2019

@author: student
"""

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
print(ps.stem('working'))  # 'work'

# Does not understand context and just removes 'es' from the end.
print(ps.stem('increases'))  # 'increas'

# Considers the context of the word and thus gives better output.
nltk.download('wordnet')
lem = WordNetLemmatizer()
print(lem.lemmatize('increases'))  # 'increase'

# Stemming is faster but less accurate;
# lemmatization is slower but much more accurate because it considers context.