def tokeniseText(self, doc, isFile, stemFlag):
    stemmer = PorterStemmer()
    tokens = dict()
    stopWords = self.loadStopWords()
    fh = list()
    if isFile is True:
        fh = open(doc)
    else:
        fh.append(doc)
    for line in fh:
        line = re.sub('(<.*>)', '', line)           # strip markup tags
        line = re.sub('[^0-9a-zA-Z]+', ' ', line)   # keep only alphanumeric characters
        line = line.strip().lower()
        words = line.split()
        if stemFlag is True:
            for word in words:
                if word not in stopWords:
                    word = stemmer.stem(word, 0, len(word) - 1)
                    if len(word) > 1 and word not in stopWords:
                        tokens[word] = tokens.get(word, 0) + 1
        else:
            for word in words:
                if len(word) > 1:
                    tokens[word] = tokens.get(word, 0) + 1
    return tokens
def group_stems(total_count, individual_counts, occurence_per_doc):
    """Use the Porter Stemmer algorithm to take only the stems of words and
    then group them together as a single count. For instance, run and running
    might both be in the counts, hence we reduce this to just run."""
    stemmer = PorterStemmer()
    new_individual_counts = {}
    new_total_counts = Counter()
    new_occurences_per_doc = Counter()
    for file_name, counts in individual_counts.iteritems():
        file_counts = Counter()
        for word, count in counts.iteritems():
            word_stem = stemmer.stem(word, 0, len(word) - 1)
            file_counts[word_stem] += count
        new_individual_counts[file_name] = file_counts
    for word, count in total_count.iteritems():
        word_stem = stemmer.stem(word, 0, len(word) - 1)
        new_total_counts[word_stem] += count
    for word, count in occurence_per_doc.iteritems():
        word_stem = stemmer.stem(word, 0, len(word) - 1)
        new_occurences_per_doc[word_stem] += count
    print "Finished grouping words by their stems."
    return new_total_counts, new_individual_counts, new_occurences_per_doc
def load_data(stem=True):
    ps = PorterStemmer()
    positive = []
    negative = []
    directory = os.fsencode("./NEG")
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        f = open('./NEG/' + filename, 'r')
        text = f.read()
        if stem:
            text = ps.stem(text, 0, len(text) - 1)
        negative.append(text)
    directory = os.fsencode("./POS")
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        f = open('./POS/' + filename, 'r')
        text = f.read()
        if stem:
            text = ps.stem(text, 0, len(text) - 1)
        positive.append(text)
    target_pos = []
    target_neg = []
    # label 0 marks a positive review, label 1 a negative review
    for i in range(0, 1000):
        target_pos.append(0)
    for i in range(0, 1000):
        target_neg.append(1)
    X = positive + negative
    y = target_pos + target_neg
    return X, y
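# A minimal usage sketch, assuming load_data() above is in scope with its
# os/PorterStemmer imports, that ./POS and ./NEG each contain 1000 review
# files, and that scikit-learn is available; the vectorizer and split
# settings are illustrative choices, not part of the original pipeline.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

X, y = load_data(stem=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)  # learn the vocabulary on the training texts
X_test_bow = vectorizer.transform(X_test)        # reuse that vocabulary for the held-out texts
print(len(X_train), "training documents,", X_train_bow.shape[1], "features")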
def __init__(self, stopwords_io_stream=None):
    self.stemmer = PorterStemmer()
    if not stopwords_io_stream:
        stopwords_io_stream = open(Parser.STOP_WORDS_FILE, 'r')
    self.stopwords = stopwords_io_stream.read().split()
def processTweet(tweet):
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    # Remove @username
    tweet = re.sub('@[^\s]+', '', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove punctuation and split contractions like 's, 't, 've with a space so they can be filtered
    tweet = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Collapse whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove a single space remaining at the front of the tweet
    tweet = tweet.lstrip(' ')
    # Remove stopwords using sklearn.feature_extraction
    split_list = tweet.split(" ")
    tweet = [word for word in split_list if word not in stop_words.ENGLISH_STOP_WORDS]
    # Stem the remaining tokens
    ps = PorterStemmer()
    tweet = [ps.stem(word) for word in tweet]
    tweet = ' '.join(tweet)
    return tweet
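# A minimal usage sketch, assuming the imports processTweet() relies on
# (re, string, PorterStemmer, and sklearn's stop_words.ENGLISH_STOP_WORDS)
# are already in scope; the sample tweet is invented.
raw = "@jane Loving the new #Python release!!! https://example.com/post $PY &amp;"
clean = processTweet(raw)
print(clean)  # roughly "love new releas", depending on the stemmer and stop-word list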
def stem(words: List[str]):
    p = PorterStemmer()
    output = []
    for word in words:
        if word.isalpha():
            output.append(p.stem(word, 0, len(word) - 1))
    return output
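# A small usage sketch, assuming the three-argument PorterStemmer used by
# stem() above is importable (e.g. from porter_stemmer import PorterStemmer);
# the word list is only an example.
words = ["running", "flies", "85", "classification"]
print(stem(words))  # non-alphabetic tokens such as "85" are skipped, the rest are stemmed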
def stem():
    for line in sys.stdin:
        line = line.strip('\n')
        if line:
            token = line.split('\t')[1]
            ps = PorterStemmer().stem(token, 0, len(token) - 1)
            print line + '\t' + ps
class Tokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, tweet):
        # TODO: This function takes in a single tweet (just the text part),
        # processes/cleans it, and returns a list of tokens (words).
        # For example, if the tweet was 'I eat', the function returns ['i', 'eat'].

        # Lowercase the tweet and strip punctuation
        parsed = (tweet.lower()
                  .replace(',', ' ').replace('.', ' ').replace('?', ' ')
                  .replace('!', ' ').replace('-', ' ').replace('#', ' ')
                  .replace(':', ' ').replace('(', ' ').replace(')', ' ')
                  .replace('=', ' ').replace('...', ' ')
                  .split())
        for i, word in enumerate(parsed):
            # do replacement of mentions and URLs with placeholder tokens
            if '@' in word:
                parsed[i] = 'AT_USER'
            if ('www.' in word) or ('.com' in word) or ('http://' in word) or ('http(s)://' in word):
                parsed[i] = 'URL'
            # Replace three or more occurrences of the same character with one occurrence
            parsed[i] = re.sub(r'(.)\1{2,}', r'\1', parsed[i])
        # Apply stemming to reduce each term to its stem
        res = [self.stemmer.stem(kw, 0, len(kw) - 1) for kw in parsed]
        # De-duplicate the term list so that it contains only distinct terms
        res = list(set(res))

        # You will not need to call this function explicitly.
        # Once you initialize your vectorizer with this tokenizer,
        # 'vectorizer.fit_transform()' will implicitly call this function to
        # extract features from the training set, which is a list of tweet texts.
        # So once you call 'fit_transform()', '__call__' will be applied
        # to each tweet text in the training set.
        return res
def __call__(self, tweet):
    # TODO: This function takes in a single tweet (just the text part),
    # processes/cleans it, and returns a list of tokens (words).
    # For example, if the tweet was 'I eat', the function returns ['i', 'eat'].

    # You will not need to call this function explicitly.
    # Once you initialize your vectorizer with this tokenizer,
    # 'vectorizer.fit_transform()' will implicitly call this function to
    # extract features from the training set, which is a list of tweet texts.
    # So once you call 'fit_transform()', '__call__' will be applied
    # to each tweet text in the training set.

    tweet = tweet.lower()  # 1. Lowercase all letters
    for i in string.punctuation:
        tweet = tweet.replace(i, " ")
    words = tweet.split()
    result = []
    for word in words:
        if word[0] == "@":  # 7. Removing user references
            word = "AT_USER"
        if word[0] == "#":  # 5. Removing hashtags
            word = word.lstrip("#")
        if word and word[0].isalpha():  # Ignoring words that don't start with a letter
            if word.startswith("www.") or word.startswith("https://") or word.startswith("http://"):
                word = "URL"
            word = PorterStemmer().stem(word, 0, len(word) - 1)  # 2. Applying stemming
            word = re.sub(r'([a-z])\1+', r'\1\1', word)
            result.append(word)
    return result
class Tokenizer:

    def __init__(self, ngrams=1):
        self.stemmer = PorterStemmer()
        self.ngrams = ngrams
        print ' -- initializing tokenizer with maximum ngram = %d' % ngrams

    def tokenize(self, tweet):
        words = set()
        ng = []

        # fix the most common HTML escapes
        tweet = tweet.replace('&quot;', '').replace('&nbsp;', ' ').replace('&amp;', ' ')

        # replace certain word-splitting characters with spaces (e.g. 'foo...bar'
        # should be split). do not include '.' because this would incorrectly split URLs
        for c in ('...', '..', '&', ','):
            tweet = tweet.replace(c, ' ')

        for w in tweet.split():
            w = w.lower()  # convert to lowercase

            # get rid of links and user mentions
            if any(s in w for s in links):
                continue
            if w.startswith('@'):
                continue

            w = w.strip(punct)  # strip punctuation (including hashtag)
            if len(w) == 0:
                continue  # ignore now-blank words
            if w in stopwords:
                continue  # ignore stopwords

            # collapse three or more occurrences of the same char down to two
            notrip = ''
            for i, x in enumerate(w):
                if i < 2 or x != w[i - 2] or x != w[i - 1]:
                    notrip += x

            # apply stemming using the Porter Stemmer
            notrip = self.stemmer.stem(notrip, 0, len(notrip) - 1)
            if not notrip[0].isalpha():
                continue  # ignore words that don't start with a letter
            words.add(notrip)
            if self.ngrams > 1:
                ng.append(notrip)

        # Generate all ngrams and add them
        if self.ngrams > 1 and len(words) > 1:
            if self.ngrams > len(words):
                self.ngrams = len(words)
            for i in range(len(ng) - self.ngrams + 1):
                ngram = ng[i]
                for j in range(1, self.ngrams):
                    ngram += ' ' + ng[i + j]
                if ngram == 'madison squar garden':
                    print 'msg'
                words.add(ngram)

        return words
class Tokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()

    # only admit non-numbers with length > 2
    def qualify(self, word):
        return len(word) > 2 and not word.isdigit()

    # TODO: Change the text processing here
    # You only need to edit the function below
    def process_tweet(self, tweet):
        tweet = tweet.lower()
        exclude = set(string.punctuation)
        tweet = ''.join(ch for ch in tweet if ch not in exclude)
        words = tweet.split()
        for i in range(len(words)):
            words[i] = self.stemmer.stem(words[i], 0, len(words[i]) - 1)
        tweet = ' '.join(words)
        return tweet

    def __call__(self, tweet):
        # This function takes in a single tweet (just the text part),
        # processes/cleans it, and returns a list of tokens (words).
        # For example, if the tweet was 'I eat', the function returns ['i', 'eat'].
        # You will not need to call this function explicitly.
        # Once you initialize your vectorizer with this tokenizer,
        # 'vectorizer.fit_transform()' will implicitly call this function to
        # extract features from the training set, which is a list of tweet texts.
        # So once you call 'fit_transform()', '__call__' will be applied
        # to each tweet text in the training set.
        features = []
        for word in self.process_tweet(tweet).split():
            if self.qualify(word):
                # Stem
                word = self.stemmer.stem(word, 0, len(word) - 1)
                features.append(word)
        return features
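# A hedged integration sketch: the comments in __call__ above describe handing
# this tokenizer to a vectorizer, and this shows one way that wiring could look
# with scikit-learn. The tweet texts are invented, and the PorterStemmer and
# string imports required by Tokenizer are assumed to be in scope.
from sklearn.feature_extraction.text import CountVectorizer

tweets = ["I really eat good food", "eating food is good"]
vectorizer = CountVectorizer(tokenizer=Tokenizer())  # __call__ supplies the tokens
X = vectorizer.fit_transform(tweets)                 # implicitly calls Tokenizer.__call__ per tweet
print(vectorizer.get_feature_names_out())
print(X.toarray())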
class Tokenizer(object):

    def __init__(self, stopwords_file):
        self.stemmer = PorterStemmer()
        self.stop_words = set(line.strip() for line in open(stopwords_file))

    def tokenize(self, text):
        tokens = []
        words = text.lower().split()
        for word in words:
            word = re.sub(r'[^\w\s]', '', word)  # remove punctuation
            word = self.stemmer.stem(word, 0, len(word) - 1)  # stem
            # throw out words in stop words and those starting with a non-alphabetic character
            if word and word not in self.stop_words and word[0].isalpha():
                tokens.append(word)
        return tokens
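# A brief usage sketch, assuming a plain-text stop-word file with one word per
# line and the three-argument PorterStemmer import used by the class above;
# the file name is only an example.
tok = Tokenizer("english.stop")
print(tok.tokenize("The committees were meeting, discussing proposals."))
# -> lowercased, stemmed tokens with punctuation and stop words removed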
class Parser:

    STOP_WORDS_FILE = '%s/../data/english.stop' % os.path.dirname(os.path.realpath(__file__))

    stemmer = None
    stopwords = []

    def __init__(self, stopwords_io_stream=None):
        self.stemmer = PorterStemmer()
        if not stopwords_io_stream:
            stopwords_io_stream = open(Parser.STOP_WORDS_FILE, 'r')
        self.stopwords = stopwords_io_stream.read().split()

    def tokenise_and_remove_stop_words(self, document_list):
        if not document_list:
            return []
        vocabulary_string = " ".join(document_list)
        tokenised_vocabulary_list = self._tokenise(vocabulary_string)
        clean_word_list = self._remove_stop_words(tokenised_vocabulary_list)
        return clean_word_list

    def _remove_stop_words(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def _tokenise(self, string):
        """ break string up into tokens and stem words """
        string = self._clean(string)
        words = string.split(" ")
        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]

    def _clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = "".join(l for l in string if l not in stg.punctuation)
        #string = string.replace("\""," ")
        #string = string.replace("\'"," ")
        #string = string.replace(":","")
        string = re.sub(r"\s+", " ", string)  # collapse runs of whitespace
        string = string.lower()
        return string
class Parser:

    STOP_WORDS_FILE = '%s/../data/english.stop' % os.path.dirname(os.path.realpath(__file__))

    stemmer = None
    stopwords = []

    def __init__(self, stopwords_io_stream=None):
        self.stemmer = PorterStemmer()
        if not stopwords_io_stream:
            stopwords_io_stream = open(Parser.STOP_WORDS_FILE, 'r')
        self.stopwords = stopwords_io_stream.read().split()

    def tokenise_and_remove_stop_words(self, document_list):
        if not document_list:
            return []
        vocabulary_string = " ".join(document_list)
        tokenised_vocabulary_list = self._tokenise(vocabulary_string)
        clean_word_list = self._remove_stop_words(tokenised_vocabulary_list)
        return clean_word_list

    def _remove_stop_words(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def _tokenise(self, string):
        """ break string up into tokens and stem words """
        string = self._clean(string)
        words = string.split(" ")
        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]

    def _clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = string.replace(".", "")
        string = re.sub(r"\s+", " ", string)  # collapse runs of whitespace
        string = string.lower()
        return string
class Tokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()

    def process_review(self, tweet):
        # TODO: pre-process tweet
        # this is a helper function for __call__
        tweet = tweet.lower()  # to lower case
        tweet = re.sub('[^\x20-\x7E]*', '', tweet)  # drop characters outside printable ASCII
        # replace two or more occurrences of the same character with two occurrences
        tweet = re.sub(r'([a-z0-9])\1+', r'\1\1', tweet)

        stopword_file = open("stopwords.txt")
        stopwords = set()
        stopword_content = stopword_file.readlines()
        for word in stopword_content:
            stopwords.add(word.strip().lower())

        tweet_words = tweet.split()
        tweet = ""
        for w in tweet_words:
            if len(w) < 1 or w in stopwords:
                continue
            elif not w[0].isalpha():
                continue
            else:
                tweet += " " + w
        tweet = tweet.strip()
        tweet = ''.join(ch for ch in tweet if ch not in string.punctuation)
        tweet_words = [self.stemmer.stem(word, 0, len(word) - 1) for word in tweet.split(" ")]
        return tweet_words

    def __call__(self, doc):
        # this function will tokenize the given document and return a list of extracted features (tokens)
        processed_doc = self.process_review(doc)
        # TODO: return a list of features extracted from processed_doc
        return processed_doc
def __init__(self):
    self.stemmer = PorterStemmer()
# dic = {"joy": 0, "surprise": 0, "sad": 0, "angry": 0}
def get_text_fromcsv(filename):
    with open(filename, "r") as file:
        f = csv.reader(file)
        d = []
        next(file)  # skip the header row
        for r in f:
            l = get_label(r[10:])
            if l != "None":
                cleaned = ' '.join(clean_words(special_split(r[1])))
                # dic[l] = dic[l] + 1
                d.append(cleaned + " __label__" + l)
                # d.append(cleaned)
        return d


delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '#', '$', '[', ']', '(', ')',
              '-', '=', '@', '%', '&', '*', '_', '>', '<', '{', '}', '|', '/', '\\',
              '\'', '"', '\t', '+', '~', '^']
stop_words = load_stop_words()
porter = PorterStemmer()

data = get_text_fromcsv("combined.csv")
fi = open("preprocess_combined.txt", "w")
for line in data:
    fi.write(line + "\n")
fi.close()
from porter_stemmer import PorterStemmer

stemmer = PorterStemmer()
resume = open('assets/resume.txt', 'r').read()
print(stemmer.stem_document(resume))
def __init__(self, ngrams=1):
    self.stemmer = PorterStemmer()
    self.ngrams = ngrams
    print ' -- initializing tokenizer with maximum ngram = %d' % ngrams
from porter_stemmer import PorterStemmer

stemmer = PorterStemmer()
print stemmer.stem_word("adoption")
print stemmer.stem_word("controll")
print stemmer.stem_word("roll")
print stemmer.stem_word("agreed")
def __init__(self, stopwords_file):
    self.stemmer = PorterStemmer()
    self.stop_words = set(line.strip() for line in open(stopwords_file))
# custom porter stemmer
from porter_stemmer import PorterStemmer

stemmer = PorterStemmer()
document = "day"
print(stemmer.stem_document(document))

# using the nltk library
# from nltk.stem import PorterStemmer
#
# stemmer = PorterStemmer()
# print(stemmer.stem('day'))
def extract_sentiment_for_movies(self, preprocessed_input):
    """Creative Feature: Extracts the sentiments from a line of pre-processed
    text that may contain multiple movies. Note that the sentiments toward the
    movies may be different.

    You should use the same sentiment values as extract_sentiment, described
    above. Hint: feel free to call previously defined functions to implement
    this.

    Example:
      sentiments = chatbot.extract_sentiment_for_text(
                       chatbot.preprocess(
                       'I liked both "Titanic (1997)" and "Ex Machina".'))
      print(sentiments) // prints [("Titanic (1997)", 1), ("Ex Machina", 1)]

    :param preprocessed_input: a user-supplied line of text that has been
    pre-processed with preprocess()
    :returns: a list of tuples, where the first item in the tuple is a movie
    title, and the second is the sentiment in the text toward that movie
    """
    # We don't need to consider the case where some movies are in the database
    # while the rest are not.
    title_array = self.extract_titles(preprocessed_input)
    if len(title_array) == 1:
        return [(title_array[0], self.extract_sentiment(preprocessed_input))]

    stemmer = PorterStemmer()
    split_input = preprocessed_input.lower().split()
    negate = 1
    num_conjunctions = 0
    count = 0
    in_quotes = False
    power = 1
    conjunctions = ['and', 'nor', 'but', 'or', 'yet']
    neg_list = [
        "no", "not", "rather", "couldn't", "wasn't", "didn't", "wouldn't",
        "shouldn't", "weren't", "don't", "doesn't", "haven't", "hasn't",
        "won't", "wont", "hadn't", "never", "none", "nobody", "nothing",
        "neither", "nowhere", "isn't", "can't", "cannot", "mustn't",
        "mightn't", "shan't", "without", "needn't"
    ]
    power_list = [
        "really", "reeally", "loved", "love", "hate", "hated", "terrible",
        "amazing", "fantastic", "incredible", "dreadful", "horrible",
        "horrid", "horrendous"
    ]
    sentiment_list = []

    for word in split_input:
        word = word.strip()
        word_no_punc = word.rstrip(",.")
        stem = stemmer.stem(word_no_punc, 0, len(word_no_punc) - 1)
        if stem.endswith('i'):
            stem = stem[:-1] + 'y'

        # Skip everything inside a quoted movie title.
        if word.startswith("\""):
            in_quotes = True
        if word.endswith("\""):
            in_quotes = False
            continue
        if in_quotes:
            continue

        # if word is in neg_list but ends in a comma, negation is reset below
        if word in neg_list and not word.endswith(","):
            negate = -1  # or have negate * -1
        else:
            has_comma = False  # maybe include other punctuation?
            if word.endswith(","):
                has_comma = True
            if self.creative:
                if word_no_punc in power_list or stem in power_list or word.endswith("!"):
                    power = 2
            # On a conjunction, close out the running count for the previous
            # clause and start a new one.
            if word_no_punc in conjunctions or stem in conjunctions:
                if count == 0:
                    if num_conjunctions != 0:
                        sentiment_list.append(sentiment_list[num_conjunctions - 1])
                    else:
                        sentiment_list.append(0)
                else:
                    sentiment_list.append(count)
                count = 0
                num_conjunctions += 1
            if word_no_punc in self.sentiment:
                if self.sentiment[word_no_punc] == "pos":
                    count += 1 * negate
                else:
                    count += -1 * negate
            elif stem in self.sentiment:
                if self.sentiment[stem] == "pos":
                    count += 1 * negate
                else:
                    count += -1 * negate
            if has_comma:
                negate = 1

    # Close out the final clause.
    if count == 0:
        sentiment_list.append(sentiment_list[num_conjunctions - 1])
    else:
        sentiment_list.append(count)

    res = []
    i = 0
    for title in title_array:
        curr_count = 0
        if sentiment_list[i] > 0:
            curr_count = 1 * power
        elif sentiment_list[i] < 0:
            curr_count = -1 * power
        res.append((title, curr_count))
        i += 1
    return res
class Tokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()

    # only admit non-numbers with length > 2
    def qualify(self, word):
        return len(word) > 2 and not word.isdigit()

    def process_desc(self, desc):
        ndesc = []
        for word in desc.split():
            # lowercase all characters
            word = word.lower()
            # replace hashtagged words with just the word
            if word[0] == "#":
                word = word[1:]
            # replace @-mentions with "AT_USER"
            elif word[0] == "@":
                word = "AT_USER"
            # replace words with URL beginnings with "URL"
            elif len(word) > 4:
                if word[:4] == "www.":
                    word = "URL"
                elif len(word) > 10:
                    if word[:10] == "http(s)://":
                        word = "URL"
            # strip punctuation using the str.translate method
            translator = str.maketrans('', '', string.punctuation)
            word = word.translate(translator)
            # use the stop words list to filter out low-value words
            if word not in stop:
                # ignore words that are one letter long
                if len(word) > 1:
                    # check that the first letter of the word is an alphabetic character
                    if word[0].isalpha():
                        # finally, check for duplicates
                        if word not in ndesc:
                            ndesc.append(word)
        return ' '.join(ndesc)

    def __call__(self, desc):
        # This function takes in a single desc (just the text part),
        # processes/cleans it, and returns a list of tokens (words).
        # For example, if the desc was 'I eat', the function returns ['i', 'eat'].
        # You will not need to call this function explicitly.
        # Once you initialize your vectorizer with this tokenizer,
        # 'vectorizer.fit_transform()' will implicitly call this function to
        # extract features from the training set, which is a list of desc texts.
        # So once you call 'fit_transform()', '__call__' will be applied
        # to each desc text in the training set.
        features = []
        for word in self.process_desc(desc).split():
            if self.qualify(word):
                # Stem
                word = self.stemmer.stem(word, 0, len(word) - 1)
                features.append(word)
        return features
def tokenise(self, string):
    """ break string up into tokens and stem words """
    stemmer = PorterStemmer()
    string = self.clean(string)
    words = string.split(" ")
    return [stemmer.stem(word, 0, len(word) - 1) for word in words]
def extract_sentiment(self, preprocessed_input):
    """Extract a sentiment rating from a line of pre-processed text.

    You should return -1 if the sentiment of the text is negative, 0 if the
    sentiment of the text is neutral (no sentiment detected), or +1 if the
    sentiment of the text is positive.

    As an optional creative extension, return -2 if the sentiment of the text
    is super negative and +2 if the sentiment of the text is super positive.

    Example:
      sentiment = chatbot.extract_sentiment(chatbot.preprocess(
                      'I liked "The Titanic"'))
      print(sentiment) // prints 1

    :param preprocessed_input: a user-supplied line of text that has been
    pre-processed with preprocess()
    :returns: a numerical value for the sentiment of the text
    """
    stemmer = PorterStemmer()
    split_input = preprocessed_input.lower().split()
    negate = 1
    count = 0
    in_quotes = False
    power = 1
    neg_list = [
        "no", "not", "rather", "couldn't", "wasn't", "didn't", "wouldn't",
        "shouldn't", "weren't", "don't", "doesn't", "haven't", "hasn't",
        "won't", "wont", "hadn't", "never", "none", "nobody", "nothing",
        "neither", "nor", "nowhere", "isn't", "can't", "cannot", "mustn't",
        "mightn't", "shan't", "without", "needn't"
    ]
    power_list = [
        "really", "reeally", "loved", "love", "hate", "hated", "terrible",
        "amazing", "fantastic", "incredible", "dreadful", "horrible",
        "horrid", "horrendous"
    ]

    for word in split_input:
        word = word.strip()
        word_no_punc = word.rstrip(",.")
        stem = stemmer.stem(word_no_punc, 0, len(word_no_punc) - 1)
        if stem.endswith('i'):
            stem = stem[:-1] + 'y'

        # Skip everything inside a quoted movie title.
        if word.startswith("\""):
            in_quotes = True
        if word.endswith("\""):
            in_quotes = False
            continue
        if in_quotes:
            continue

        # if word is in neg_list but ends in a comma, negation is reset below
        if word in neg_list and not word.endswith(","):
            negate = -1  # or have negate * -1
        else:
            has_comma = False  # maybe include other punctuation?
            if word.endswith(","):
                has_comma = True
            if self.creative:
                if word_no_punc in power_list or stem in power_list or word.endswith("!"):
                    power = 2
            if word_no_punc in self.sentiment:
                if self.sentiment[word_no_punc] == "pos":
                    count += 1 * negate
                else:
                    count += -1 * negate
            elif stem in self.sentiment:
                if self.sentiment[stem] == "pos":
                    count += 1 * negate
                else:
                    count += -1 * negate
            if has_comma:
                negate = 1

    if count > 0:
        return 1 * power
    elif count < 0:
        return -1 * power
    return 0