Example #1
	def __init__(self, stop_words):
		self.stemmer = porter_stemmer.PorterStemmer()
		self.stops = []
		# Load one stop word per line, stripping the newline so that later
		# membership tests match the bare word.
		if stop_words is not None:
			with open(stop_words, 'r') as file:
				for line in file:
					self.stops.append(line.strip())
Example #2
import string

import porter_stemmer


def get_words(text):
	exclude = set(string.punctuation)
	words = text.split()
	proc_words = []
	stemmer = porter_stemmer.PorterStemmer()
	for word in words:
		# Keep only alphanumeric characters, then lower-case and stem.
		word = ''.join(ch for ch in word if ch not in exclude and ch.isalnum())
		word = word.lower()
		#if not word in stop_words:
		word = stemmer.stem(word, 0, len(word) - 1)
		if len(word) > 0 and word != "\n" and word != "\r":
			proc_words.append(word)
	return proc_words
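A quick usage sketch for get_words; the expected output in the comment assumes the reference Porter rules from tartarus.org:

print(get_words("The cats were hopping, singing and agreed!"))
# Roughly: ['the', 'cat', 'were', 'hop', 'sing', 'and', 'agree']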
Example #3
from typing import List

import porter_stemmer as ps  # module alias inferred from this snippet's use of "ps"


def stemmer(wordList: List[str]) -> List[str]:
    '''
    This stemmer takes a list of tokens and converts each token in the list
    to its stemmed form, using the Porter stemmer.
    The source code can be found at https://tartarus.org/martin/PorterStemmer/
    '''
    myStemmer = ps.PorterStemmer()
    stemmedWordList = []
    for word in wordList:
        if word.isalpha():  # only stem purely alphabetic tokens
            word = myStemmer.stem(word, 0, len(word) - 1)
        stemmedWordList.append(word)
    return stemmedWordList
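A minimal usage sketch for the function above; the expected stems in the comment assume the standard Porter rules:

tokens = ["caresses", "ponies", "running", "R2D2"]
print(stemmer(tokens))
# Roughly: ['caress', 'poni', 'run', 'R2D2']
# "R2D2" passes through unchanged because it is not purely alphabetic.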
Example #4
import porter_stemmer


def stem(word_list):

    stemmed_list = []
    p = porter_stemmer.PorterStemmer()
    for x in word_list:
        output = ''
        word = ''
        # Append a newline so the trailing word is always flushed by the
        # non-alphabetic branch below.
        line = x + "\n"
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
        stemmed_list.append(output.strip("\n"))

    #print(stemmed_list)
    return stemmed_list
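A small usage sketch for stem(); the expected output assumes the standard Porter rules and shows that punctuation survives while everything is lower-cased:

print(stem(["Motoring quickly", "Cats!"]))
# Roughly: ['motor quickli', 'cat!']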
Example #5
import os
import porter_stemmer

p = porter_stemmer.PorterStemmer()

positiveFileList = sorted(
    list(filter(lambda s: s.endswith(".tag"), os.listdir("./POS"))))
negativeFileList = sorted(
    list(filter(lambda s: s.endswith(".tag"), os.listdir("./NEG"))))

for f in positiveFileList:
    infile = open("./POS/" + f, 'r')

    outputString = ''

    # Read the tagged file line by line, lower-casing everything and
    # stemming each run of alphabetic characters.
    while True:
        output = ''
        word = ''
        line = infile.readline()
        if line == '':
            break
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()

        outputString += output

    infile.close()
Example #6
def run_samples():
    """
    Run the Porter stemmer's individual steps over the sample (word,
    expected result) pairs below and print each word, its expected value,
    and the actual output of the corresponding step.
    """
    stemmer = porter_stemmer.PorterStemmer()

    words_1 = [
        ("caresses", "caress"),
        ("ponies", "poni"),
        ("ties", "ti"),
        ("caress", "caress"),
        ("cats", "cat"),
        ("feed", "feed"),
        ("agreed", "agree"),
        ("plastered", "plaster"),
        ("bled", "bled"),
        ("motoring", "motor"),
        ("sing", "sing"),
        ("conflated", "conflate"),
        ("troubled", "trouble"),
        ("sized", "size"),
        ("hopping", "hop"),
        ("tanned", "tan"),
        ("falling", "fall"),
        ("hissing", "hiss"),
        ("fizzed", "fizz"),
        ("failing", "fail"),
        ("filing", "file"),
        ("happy", "happi"),
        ("sky", "sky"),
    ]
    words_2 = [
        ("relational", "relate"),
        ("conditional", "condition"),
        ("rational", "rational"),
        ("valenci", "valence"),
        ("hesitanci", "hesitance"),
        ("digitizer", "digitize"),
        ("conformabli", "conformable"),
        ("radicalli", "radical"),
        ("differentli", "different"),
        ("vileli", "vile"),
        ("analogousli", "analogous"),
        ("vietnamization", "vietnamize"),
        ("predication", "predicate"),
        ("operator", "operate"),
        ("feudalism", "feudal"),
        ("decisiveness", "decisive"),
        ("hopefulness", "hopeful"),
        ("callousness", "callous"),
        ("formaliti", "formal"),
        ("sensitiviti", "sensitive"),
        ("sensibiliti", "sensible"),
    ]
    words_3 = [
        ("triplicate", "triplic"),
        ("formative", "form"),
        ("formalize", "formal"),
        ("electriciti", "electric"),
        ("electrical", "electric"),
        ("hopeful", "hope"),
        ("goodness", "good"),
    ]
    words_4 = [
        ("revival", "reviv"),
        ("allowance", "allow"),
        ("inference", "infer"),
        ("airliner", "airlin"),
        ("gyroscopic", "gyroscop"),
        ("adjustable", "adjust"),
        ("defensible", "defens"),
        ("irritant", "irrit"),
        ("replacement", "replac"),
        ("adjustment", "adjust"),
        ("dependent", "depend"),
        ("adoption", "adopt"),
        ("homologou", "homolog"),
        ("communism", "commun"),
        ("activate", "activ"),
        ("angulariti", "angular"),
        ("homologous", "homolog"),
        ("effective", "effect"),
        ("bowdlerize", "bowdler"),
    ]
    words_5 = [
        ("probate", "probat"),
        ("rate", "rate"),
        ("cease", "ceas"),
        ("controll", "control"),
        ("roll", "roll"),
    ]

    for w1, w2 in words_1:
        print(w1, w2, stemmer.step_1(w1))
    for w1, w2 in words_2:
        print(w1, w2, stemmer.step_2(w1))
    for w1, w2 in words_3:
        print(w1, w2, stemmer.step_3(w1))
    for w1, w2 in words_4:
        print(w1, w2, stemmer.step_4(w1))
    for w1, w2 in words_5:
        print(w1, w2, stemmer.step_5(w1))
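A minimal way to exercise the sample runner above, assuming the module is run as a script and that this PorterStemmer variant exposes the step_1 through step_5 methods used here:

if __name__ == "__main__":
    run_samples()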
Example #7
	def __init__(self):
		self.inverted_index = load_data()
		self.myStemmer = ps.PorterStemmer()
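A hypothetical sketch of how a class built around this constructor might use the two attributes together: stem each query term with the same Porter stemmer used at index time, then look the stems up in the inverted index. The search method, and the assumption that inverted_index maps stems to collections of document ids, are illustrations rather than the original code.

	def search(self, query):
		doc_ids = None
		for term in query.lower().split():
			stem = self.myStemmer.stem(term, 0, len(term) - 1)
			postings = set(self.inverted_index.get(stem, []))
			# Intersect postings across all query terms (boolean AND).
			doc_ids = postings if doc_ids is None else doc_ids & postings
		return sorted(doc_ids) if doc_ids else []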
Example #8
import os

import numpy as np
import porter_stemmer
from bs4 import BeautifulSoup

# Helper functions such as replace_non_alphanumer, ascii_only, stem_text,
# add_tokens, sparse_article, normalize and the write_* routines are assumed
# to be defined elsewhere in the original module.


def preprocess(dir, stoplist_filename):
    # Create Porter stemmer object for stemming words
    p = porter_stemmer.PorterStemmer()

    with open(stoplist_filename) as file:
        stoplist = file.read().split()

    topic_counts = dict()

    for dirname, subdirnames, filenames in os.walk(dir):
        #filenames = ['reut2-000.sgm']      # DON'T FORGET TO REMOVE THIS WHEN USING ALL ARTICLES!!!!!!!!!!!!!!!!!!!!!
        for filename in filenames:
            if os.path.splitext(filename)[-1].lower() == '.sgm':   # Only pick out .sgm files
                with open(dir + '/' + filename) as file:
                    soup = BeautifulSoup(file, 'html.parser')
                    articles = soup.find_all('reuters')
                    for article in articles:
                        # Only count articles with a single topic and a body
                        # ("and" short-circuits, unlike the bitwise "&").
                        if len(article.topics.contents) == 1 and article.body is not None:
                            topic_counts[article.topics.text] = topic_counts.get(article.topics.text, 0) + 1

    sorted_topics = sorted(topic_counts, key=topic_counts.get, reverse=True)
    popular_topics = sorted_topics[0:20]      # Pick out top 20 occurring topics

    filtered_articles = dict()
    article_tokens = dict()

    # Walk back through files and add files with appropriate topics to dictionary
    for dirname, subdirnames, filenames in os.walk(dir):
        #filenames = ['reut2-000.sgm']      # DON'T FORGET TO REMOVE THIS WHEN USING ALL ARTICLES!!!!!!!!!!!!!!!!!!!!!
        for filename in filenames:
            if os.path.splitext(filename)[-1].lower() == '.sgm':
                with open(dir + '/' + filename) as file:
                    soup = BeautifulSoup(file, 'html.parser')
                    articles = soup.find_all('reuters')
                    for article in articles:
                        if article.body is not None:       # Don't consider articles with no body
                            if len(article.topics.contents) == 1 and article.topics.text in popular_topics:
                                text = replace_non_alphanumer(ascii_only(article.body.text).lower())
                                text_set = set(text.split(' '))
                                stemmed_text = stem_text(text_set, stoplist, p)
                                add_tokens(stemmed_text, article_tokens)

    # Create new dictionary of frequent tokens
    frequent_tokens = dict()
    for word in article_tokens:
        if article_tokens[word] >= 5:
            frequent_tokens[word] = article_tokens[word]

    # Sort tokens alphabetically and change values to be index of words
    alphabetic_tokens = sorted(frequent_tokens)
    for i, word in enumerate(alphabetic_tokens):
        frequent_tokens[word] = i

    # Open files for writing
    norm_freq_file = open('freq.csv', 'w')
    norm_sqrt_freq_file = open('sqrtfreq.csv', 'w')
    norm_log_freq_file = open('log2freq.csv', 'w')

    class_file = open('reuters21578.class', 'w')
    label_file = open('reuters21578.clabel', 'w')

    write_label_file(alphabetic_tokens, label_file)

    # Walk through files again to determine number of times each token is used in articles
    for dirname, subdirnames, filenames in os.walk(dir):
        #filenames = ['reut2-000.sgm']       # DON'T FORGET TO REMOVE THIS WHEN USING ALL ARTICLES!!!!!!!!!!!!!!!!!!!!!
        for filename in filenames:
            if os.path.splitext(filename)[-1].lower() == '.sgm':     # Only pick out .sgm files
                with open(dir + '/' + filename) as file:
                    soup = BeautifulSoup(file, 'html.parser')
                    articles = soup.find_all('reuters')
                    for article in articles:
                        if article.body is not None:
                            if len(article.topics.contents) == 1 and article.topics.text in popular_topics:
                                text = replace_non_alphanumer(ascii_only(article.body.text).lower())
                                [freq, ind] = sparse_article(text, frequent_tokens, stoplist, p)
                                norm_freq = normalize(freq)
                                norm_sqrt_freq = normalize(1 + np.sqrt(freq))
                                norm_log_freq = normalize(1 + np.log2(freq))

                                write_freq_file(norm_freq, ind, article['newid'], norm_freq_file)
                                write_freq_file(norm_sqrt_freq, ind, article['newid'], norm_sqrt_freq_file)
                                write_freq_file(norm_log_freq, ind, article['newid'], norm_log_freq_file)

                                write_article_class(article.topics.text, article['newid'], class_file)

    norm_freq_file.close()
    norm_sqrt_freq_file.close()
    norm_log_freq_file.close()
    class_file.close()
    label_file.close()

    return
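The helper functions used by preprocess() are not shown in this example. Below is a minimal sketch of two of them, stem_text and normalize, under the assumption that stem_text drops stop words and stems the remaining tokens, while normalize rescales a frequency vector to unit L2 length. These are hypothetical reconstructions, not the original code.

import numpy as np


def stem_text(tokens, stoplist, stemmer):
    # Hypothetical: stem every non-stop-word token with the Porter stemmer.
    stemmed = []
    for word in tokens:
        if word and word not in stoplist:
            stemmed.append(stemmer.stem(word, 0, len(word) - 1))
    return stemmed


def normalize(freq):
    # Hypothetical: scale a frequency vector to unit L2 length.
    freq = np.asarray(freq, dtype=float)
    norm = np.linalg.norm(freq)
    return freq / norm if norm > 0 else freq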