import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer


class Token_Preprocessing_Engine(object):
    def __init__(self):
        # USE_LEMMATIZER is expected to be a module-level boolean flag.
        if USE_LEMMATIZER:
            # WordNetLemmatizer (dictionary-based lemmatization)
            nltk.download('wordnet')
            self.engine = WordNetLemmatizer()
        else:
            # PorterStemmer (rule-based suffix stripping)
            self.engine = PorterStemmer()

    def process_token(self, token):
        if USE_LEMMATIZER:
            term = self.engine.lemmatize(token)
        else:
            term = self.engine.stem(token)
        return term.lower()
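A minimal usage sketch (USE_LEMMATIZER is assumed here; the snippet reads it as a global but never defines it):

USE_LEMMATIZER = True  # assumed flag, not part of the original snippet

engine = Token_Preprocessing_Engine()
print(engine.process_token('increases'))  # -> 'increase' with the lemmatizer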
Example No. 2
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer


def stem(tokens, option='WORDNET'):
    """Stem or lemmatize nested token lists; option is 'PORTER', 'SNOWBALL', or 'WORDNET'."""
    # reference:
    # http://stackoverflow.com/questions/1787110/what-is-the-true-difference-between-lemmatization-vs-stem
    stem_tks = []
    if option == 'PORTER':
        stem_tool = PorterStemmer()
    elif option == 'SNOWBALL':
        stem_tool = SnowballStemmer('english')
    elif option == 'WORDNET':
        stem_tool = WordNetLemmatizer()
    else:
        raise ValueError("option must be 'PORTER', 'SNOWBALL', or 'WORDNET'")

    for t_lst in tokens:
        if option in ['PORTER', 'SNOWBALL']:
            stem_tks.append([stem_tool.stem(t) for t in t_lst])
        else:
            stem_tks.append([stem_tool.lemmatize(t) for t in t_lst])
    return stem_tks
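A brief usage sketch (tokens is a list of token lists, one per sentence; outputs shown are indicative):

sentences = [['the', 'ponies', 'were', 'running'], ['it', 'increases']]
print(stem(sentences, option='PORTER'))   # e.g. [['the', 'poni', 'were', 'run'], ['it', 'increas']]
print(stem(sentences, option='WORDNET'))  # e.g. [['the', 'pony', 'were', 'running'], ['it', 'increase']]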
Example No. 3
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer


def stem_lem_text(s, method='Lancaster'):
    """Stem or lemmatize each word in a string; method is 'Porter', 'Snowball', 'Lemmatize', or 'Lancaster'."""
    words = s.split()

    if method == 'Porter':
        choice = PorterStemmer()
        reformed = [choice.stem(word) for word in words]
    elif method == 'Snowball':
        choice = SnowballStemmer('english')
        reformed = [choice.stem(word) for word in words]
    elif method == 'Lemmatize':
        choice = WordNetLemmatizer()
        reformed = [choice.lemmatize(word) for word in words]
    else:
        # any other value falls back to the Lancaster stemmer (the default)
        choice = LancasterStemmer()
        reformed = [choice.stem(word) for word in words]

    return " ".join(reformed)
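For example (exact stems vary by algorithm, so the outputs below are indicative):

text = "the ponies were running faster"
print(stem_lem_text(text, method='Porter'))     # e.g. "the poni were run faster"
print(stem_lem_text(text, method='Lemmatize'))  # e.g. "the pony were running faster"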
Example No. 4
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer


class NLTKTokenizer(object):
    """
    Tokenizer callable for scikit-learn vectorizers; see:
    http://scikit-learn.org/stable/modules/feature_extraction.html
    http://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
    http://nltk.org/api/nltk.tokenize.html
    """
    def __init__(self):
        # Alternatives tried: LancasterStemmer(), GermanStemmer(),
        # EnglishStemmer(ignore_stopwords=True), WordNetLemmatizer()
        self.wnl = PorterStemmer()  # best so far

    def __call__(self, doc):
        # tokenize sentence by sentence, then flatten into one token list
        words = [word_tokenize(t) for t in sent_tokenize(doc)]
        words = [item for sublist in words for item in sublist]
        # stemmers expose .stem(); the WordNet lemmatizer exposes .lemmatize()
        if hasattr(self.wnl, 'stem'):
            words = [self.wnl.stem(t) for t in words]
        else:
            words = [self.wnl.lemmatize(t) for t in words]
        return words
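A usage sketch with scikit-learn, following the feature_extraction docs linked in the docstring (CountVectorizer accepts any callable as its tokenizer; get_feature_names_out requires scikit-learn >= 1.0, and nltk.download('punkt') may be needed once for the NLTK tokenizers):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=NLTKTokenizer())
X = vectorizer.fit_transform(["The ponies were running.", "It increases speed."])
print(vectorizer.get_feature_names_out())  # stemmed vocabulary, e.g. 'poni', 'run', 'increas'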
Example No. 6
# requires: from nltk import word_tokenize
#           from nltk.stem import PorterStemmer, WordNetLemmatizer
def process_text(self, text_corpus):
    # tokenize and normalize textual phrases
    processed_corpus = dict()
    # build the word normalizer with NLTK (self.norm is 'stem', 'lemma', or falsy)
    if self.norm:
        text_normalizer = (PorterStemmer() if self.norm == 'stem'
                           else WordNetLemmatizer())
    # loop over the whole corpus
    for key in text_corpus.keys():
        processed_list = list()
        for sent in text_corpus[key]:
            # tokenize sentence
            tokens = word_tokenize(sent)
            # normalize sentence (if required)
            if self.norm == 'stem':
                tokens = [text_normalizer.stem(word) for word in tokens]
            elif self.norm == 'lemma':
                tokens = [text_normalizer.lemmatize(word) for word in tokens]
            processed_list.append(tokens)
        processed_corpus[key] = processed_list
    return processed_corpus
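A minimal sketch of how the method might be hosted, assuming a hypothetical TextProcessor class that stores the norm setting (the original enclosing class is not shown in the snippet):

from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer


class TextProcessor(object):  # hypothetical host class, not from the original source
    def __init__(self, norm='lemma'):
        self.norm = norm

    process_text = process_text  # reuse the function above as a method


corpus = {'doc1': ['The ponies were running.', 'It increases speed.']}
print(TextProcessor(norm='stem').process_text(corpus))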
Example No. 7
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 23 08:50:02 2019

@author: student
"""
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
print(ps.stem('working'))    # -> 'work'

# The stemmer does not understand context; it just strips the 'es' suffix.
print(ps.stem('increases'))  # -> 'increas'

# The lemmatizer maps the word to its dictionary form and gives better output.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('increases'))  # -> 'increase'

# Stemming is faster but less accurate.
# Lemmatization is slower but more accurate because it takes context into account.
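One caveat worth noting: WordNetLemmatizer treats words as nouns by default, so verbs like 'working' pass through unchanged unless a part-of-speech tag is supplied via the pos argument of lemmatize:

print(lemmatizer.lemmatize('working'))           # -> 'working' (treated as a noun)
print(lemmatizer.lemmatize('working', pos='v'))  # -> 'work' (treated as a verb)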