def __init__(self, stop_words):
    self.stemmer = porter_stemmer.PorterStemmer()
    self.stops = []
    if stop_words is not None:
        with open(stop_words, 'r') as file:
            for line in file:
                # Strip the trailing newline so stop words compare cleanly later.
                self.stops.append(line.strip())
import string

import porter_stemmer


def get_words(text):
    exclude = set(string.punctuation)
    words = text.split()
    proc_words = []
    # Create the stemmer once instead of once per word.
    stemmer = porter_stemmer.PorterStemmer()
    for word in words:
        # Keep only alphanumeric characters, dropping punctuation.
        word = ''.join(ch for ch in word if ch not in exclude and ch.isalnum())
        word = word.lower()
        # if word not in stop_words:
        word = stemmer.stem(word, 0, len(word) - 1)
        if len(word) > 0 and word != "\n" and word != "\r":
            proc_words.append(word)
    return proc_words
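# A minimal usage sketch for get_words(), assuming the porter_stemmer module
# above is importable; the sample sentence and the expected output shown in
# the comment are illustrative, not taken from the original code.
if __name__ == "__main__":
    sample = "The ponies were hopping and falling, plastered in mud!"
    print(get_words(sample))
    # Roughly expected: ['the', 'poni', 'were', 'hop', 'and', 'fall', 'plaster', 'in', 'mud']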
from typing import List


def stemmer(wordList: List[str]) -> List[str]:
    '''
    Take a list of tokens and convert each token to its stemmed form.
    This uses the Porter Stemmer; the reference implementation can be found at
    https://tartarus.org/martin/PorterStemmer/
    '''
    myStemmer = ps.PorterStemmer()
    stemmedWordList = []
    for word in wordList:
        if word.isalpha():  # only stem purely alphabetic (English) tokens
            word = myStemmer.stem(word, 0, len(word) - 1)
        stemmedWordList.append(word)
    return stemmedWordList
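# Illustrative call to the stemmer() helper above; the token list is made up,
# and `ps` is assumed to be an alias for the project's Porter stemmer module
# (e.g. `import porter_stemmer as ps`).
sample_tokens = ["caresses", "ponies", "cats", "42"]
print(stemmer(sample_tokens))  # non-alphabetic tokens such as "42" pass through unstemmed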
def stem(word_list):
    stemmed_list = []
    p = porter_stemmer.PorterStemmer()
    if len(word_list) > 0:
        for x in word_list:
            output = ''
            word = ''
            line = x + "\n"
            # Accumulate alphabetic characters into a word, stem the word when a
            # non-alphabetic character is reached, and copy everything else through.
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        output += p.stem(word, 0, len(word) - 1)
                        word = ''
                    output += c.lower()
            stemmed_list.append(output.strip("\n"))
    # print(stemmed_list)
    return stemmed_list
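# Hypothetical example for stem(); the input strings are invented for
# illustration and are not part of the original code.
example = ["the cats were motoring", "a troubled, sized thing"]
print(stem(example))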
import os

import porter_stemmer

p = porter_stemmer.PorterStemmer()

positiveFileList = sorted(
    list(filter(lambda s: s.endswith(".tag"), os.listdir("./POS"))))
negativeFileList = sorted(
    list(filter(lambda s: s.endswith(".tag"), os.listdir("./NEG"))))

for f in positiveFileList:
    infile = open("./POS/" + f, 'r')
    outputString = ''
    while 1:
        output = ''
        word = ''
        line = infile.readline()
        if line == '':
            break
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
        outputString += output
def run_samples():
    """
    Run sample words through each step of the Porter stemmer and print the
    input word, the expected stem, and the stemmer's actual output.
    """
    stemmer = porter_stemmer.PorterStemmer()

    # Each pair is (input word, expected stem).
    words_1 = [
        ("caresses", "caress"), ("ponies", "poni"), ("ties", "ti"),
        ("caress", "caress"), ("cats", "cat"), ("feed", "feed"),
        ("agreed", "agree"), ("plastered", "plaster"), ("bled", "bled"),
        ("motoring", "motor"), ("sing", "sing"), ("conflated", "conflate"),
        ("troubled", "trouble"), ("sized", "size"), ("hopping", "hop"),
        ("tanned", "tan"), ("falling", "fall"), ("hissing", "hiss"),
        ("fizzed", "fizz"), ("failing", "fail"), ("filing", "file"),
        ("happy", "happi"), ("sky", "sky"),
    ]
    words_2 = [
        ("relational", "relate"), ("conditional", "condition"),
        ("rational", "rational"), ("valenci", "valence"),
        ("hesitanci", "hesitance"), ("digitizer", "digitize"),
        ("conformabli", "conformable"), ("radicalli", "radical"),
        ("differentli", "different"), ("vileli", "vile"),
        ("analogousli", "analogous"), ("vietnamization", "vietnamize"),
        ("predication", "predicate"), ("operator", "operate"),
        ("feudalism", "feudal"), ("decisiveness", "decisive"),
        ("hopefulness", "hopeful"), ("callousness", "callous"),
        ("formaliti", "formal"), ("sensitiviti", "sensitive"),
        ("sensibiliti", "sensible"),
    ]
    words_3 = [
        ("triplicate", "triplic"), ("formative", "form"),
        ("formalize", "formal"), ("electriciti", "electric"),
        ("electrical", "electric"), ("hopeful", "hope"),
        ("goodness", "good"),
    ]
    words_4 = [
        ("revival", "reviv"), ("allowance", "allow"), ("inference", "infer"),
        ("airliner", "airlin"), ("gyroscopic", "gyroscop"),
        ("adjustable", "adjust"), ("defensible", "defens"),
        ("irritant", "irrit"), ("replacement", "replac"),
        ("adjustment", "adjust"), ("dependent", "depend"),
        ("adoption", "adopt"), ("homologou", "homolog"),
        ("communism", "commun"), ("activate", "activ"),
        ("angulariti", "angular"), ("homologous", "homolog"),
        ("effective", "effect"), ("bowdlerize", "bowdler"),
    ]
    words_5 = [
        ("probate", "probat"), ("rate", "rate"), ("cease", "ceas"),
        ("controll", "control"), ("roll", "roll"),
    ]

    for w1, w2 in words_1:
        print(w1, w2, stemmer.step_1(w1))
    for w1, w2 in words_2:
        print(w1, w2, stemmer.step_2(w1))
    for w1, w2 in words_3:
        print(w1, w2, stemmer.step_3(w1))
    for w1, w2 in words_4:
        print(w1, w2, stemmer.step_4(w1))
    for w1, w2 in words_5:
        print(w1, w2, stemmer.step_5(w1))
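# Optional entry point so the sample runner above can be executed directly;
# it assumes the PorterStemmer class exposes the per-step methods
# (step_1 ... step_5) called above.
if __name__ == "__main__":
    run_samples()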
def __init__(self):
    self.inverted_index = load_data()
    self.myStemmer = ps.PorterStemmer()
import os

import numpy as np
from bs4 import BeautifulSoup

import porter_stemmer


def preprocess(dir, stoplist_filename):
    # Create Porter stemmer object for stemming words
    p = porter_stemmer.PorterStemmer()

    with open(stoplist_filename) as file:
        stoplist = file.read().split()

    # First pass: count how often each single-topic article's topic occurs.
    topic_counts = dict()
    for dirname, subdirnames, filenames in os.walk(dir):
        #filenames = ['reut2-000.sgm']  # DON'T FORGET TO REMOVE THIS WHEN USING ALL ARTICLES!!!!!!!!!!!!!!!!!!!!!
        for filename in filenames:
            if os.path.splitext(filename)[-1].lower() == '.sgm':  # Only pick out .sgm files
                with open(dir + '/' + filename) as file:
                    soup = BeautifulSoup(file, 'html.parser')
                    articles = soup.find_all('reuters')
                    for article in articles:
                        # Only count articles with a single topic and a body
                        if (len(article.topics.contents) == 1) and (article.body is not None):
                            topic_counts[article.topics.text] = topic_counts.get(article.topics.text, 0) + 1

    sorted_topics = sorted(topic_counts, key=topic_counts.get, reverse=True)
    popular_topics = sorted_topics[0:20]  # Pick out the 20 most frequent topics

    filtered_articles = dict()
    article_tokens = dict()

    # Second pass: collect token counts from articles whose topic is popular.
    for dirname, subdirnames, filenames in os.walk(dir):
        #filenames = ['reut2-000.sgm']  # DON'T FORGET TO REMOVE THIS WHEN USING ALL ARTICLES!!!!!!!!!!!!!!!!!!!!!
        for filename in filenames:
            if os.path.splitext(filename)[-1].lower() == '.sgm':
                with open(dir + '/' + filename) as file:
                    soup = BeautifulSoup(file, 'html.parser')
                    articles = soup.find_all('reuters')
                    for article in articles:
                        if article.body is not None:  # Don't consider articles with no body
                            if (len(article.topics.contents) == 1) and (article.topics.text in popular_topics):
                                text = replace_non_alphanumer(ascii_only(article.body.text).lower())
                                text_set = set(text.split(' '))
                                stemmed_text = stem_text(text_set, stoplist, p)
                                add_tokens(stemmed_text, article_tokens)

    # Create a new dictionary containing only frequent tokens.
    frequent_tokens = dict()
    for word in article_tokens:
        if article_tokens[word] >= 5:
            frequent_tokens[word] = article_tokens[word]

    # Sort tokens alphabetically and replace the values with each word's index.
    alphabetic_tokens = sorted(frequent_tokens)
    i = 0
    for word in alphabetic_tokens:
        frequent_tokens[word] = i
        i += 1

    # Open output files for writing.
    norm_freq_file = open('freq.csv', 'w')
    norm_sqrt_freq_file = open('sqrtfreq.csv', 'w')
    norm_log_freq_file = open('log2freq.csv', 'w')
    class_file = open('reuters21578.class', 'w')
    label_file = open('reuters21578.clabel', 'w')

    write_label_file(alphabetic_tokens, label_file)

    # Third pass: determine how many times each token is used in each article.
    for dirname, subdirnames, filenames in os.walk(dir):
        #filenames = ['reut2-000.sgm']  # DON'T FORGET TO REMOVE THIS WHEN USING ALL ARTICLES!!!!!!!!!!!!!!!!!!!!!
        for filename in filenames:
            if os.path.splitext(filename)[-1].lower() == '.sgm':  # Only pick out .sgm files
                with open(dir + '/' + filename) as file:
                    soup = BeautifulSoup(file, 'html.parser')
                    articles = soup.find_all('reuters')
                    for article in articles:
                        if article.body is not None:
                            if (len(article.topics.contents) == 1) and (article.topics.text in popular_topics):
                                text = replace_non_alphanumer(ascii_only(article.body.text).lower())
                                [freq, ind] = sparse_article(text, frequent_tokens, stoplist, p)
                                norm_freq = normalize(freq)
                                norm_sqrt_freq = normalize(1 + np.sqrt(freq))
                                norm_log_freq = normalize(1 + np.log2(freq))
                                write_freq_file(norm_freq, ind, article['newid'], norm_freq_file)
                                write_freq_file(norm_sqrt_freq, ind, article['newid'], norm_sqrt_freq_file)
                                write_freq_file(norm_log_freq, ind, article['newid'], norm_log_freq_file)
                                write_article_class(article.topics.text, article['newid'], class_file)

    norm_freq_file.close()
    norm_sqrt_freq_file.close()
    norm_log_freq_file.close()
    class_file.close()
    label_file.close()
    return
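# A hypothetical invocation of preprocess(); the data directory and stoplist
# filename below are placeholders, not paths from the original project.
if __name__ == "__main__":
    preprocess('./reuters21578', 'stoplist.txt')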