Esempio n. 1
0
def main():
    """Tokenise, filter and stem each line of the input file and print the result.

    Command-line arguments:
        -i  Path to the input file (required). Every line is treated as one
            document.
        -s  Path to the stopword list (default ``stopword_list.txt``). For
            each line only the text before the first tab is used.

    For every input line the list of stemmed tokens is printed to stdout.

    Raises:
        OSError: if either file cannot be opened. ``open`` raises on failure,
            so no separate "could not read" return path is needed.
    """
    # construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", type=str, required=True,
                    help="path to inputfile for the first task")
    ap.add_argument("-s", type=str, default="stopword_list.txt",
                    help="The path to the stopword list used for the task")
    args = vars(ap.parse_args())

    # A set gives constant-time membership tests in the per-token filter below.
    with open(args["s"], "r") as stopword_file:
        stopwords = {
            entry.split("\n")[0].split("\t")[0] for entry in stopword_file
        }

    p = stemmer.PorterStemmer()

    # Each token is truncated at the first occurrence of any of these.
    separators = (".", ",", "!", ":", "?", ";", "/", "\n")

    with open(args["i"], "r") as input_file:
        for line in input_file:
            words = []
            for token in line.split(" "):
                for separator in separators:
                    token = token.split(separator)[0]
                words.append(token.lower())

            # Drop stopwords and purely numeric tokens, then stem what is left.
            word_list = [
                p.stem(word, 0, len(word) - 1)
                for word in words
                if word not in stopwords and not word.isnumeric()
            ]
            print(word_list)
Esempio n. 2
0
def stemWords(wordlist):
    """Stem every word in *wordlist* in place and return that same list.

    Each word is passed to ``porterstemmer.PorterStemmer.stem`` together with
    the index range ``0 .. len(word) - 1``.

    Args:
        wordlist: list of word strings; it is modified in place.

    Returns:
        The same list object, with each entry replaced by its stem. An empty
        list is returned unchanged.
    """
    # Guard the empty case: indexing wordlist[0] unconditionally would raise
    # IndexError, and there is no reason to build a stemmer for no words.
    if len(wordlist) == 0:
        return wordlist

    # using Porter Stemmer for a start
    porterStemmer = porterstemmer.PorterStemmer()

    for index, word in enumerate(wordlist):
        wordlist[index] = porterStemmer.stem(word, 0, len(word) - 1)

    return wordlist
Esempio n. 3
0
# -*- coding: utf-8 -*-
"""
Created on Tue Nov  5 17:56:19 2019

@author: Peter
"""
import math
import pandas as pd
#This may need to be changed if you do not have porterstemmer imported locally
#It will be included in my zip folder
import porterstemmer

# Module-level Porter stemmer instance, constructed once when the module is imported.
p = porterstemmer.PorterStemmer()
# NOTE(review): the string below is a bare expression statement at module level.
# It is not attached to tokenize() as a docstring, so help(tokenize) will not show it.
'''Method to turn a sentence string into a list of tokens'''


def tokenize(anArray):
    """Lower-case each string in *anArray* and split it into word tokens.

    For every input string, each character that is neither alphabetic nor a
    space is deleted, every entry of ``nums`` is deleted as a substring, and
    the remainder is split on whitespace.

    NOTE(review): ``nums`` is not defined in this function; presumably it is a
    module-level collection of digit strings defined elsewhere - confirm.

    Args:
        anArray: sequence of strings to tokenize. (This argument is now
            actually used; the loop previously read a global ``sentences``.)

    Returns:
        A list with one token list per input string, in input order.
    """
    splitWordsArray = []
    for sentence in anArray:
        lowered = sentence.lower()
        cleaned = lowered
        # Walk the characters of the untouched lower-cased text while editing
        # a separate working copy.
        for c in lowered:
            if not c.isalpha() and c != " ":
                cleaned = cleaned.replace(c, "")
            # Kept inside the character loop so the order of removals stays
            # the same as before.
            for x in nums:
                cleaned = cleaned.replace(x, "")
        # Splitting once after all removals gives the same tokens as
        # re-splitting after every character.
        splitWordsArray.append(cleaned.split())
    return splitWordsArray

Esempio n. 4
0
def main():
    """Preprocess a tab-separated ``title<TAB>document`` file and write the result.

    Command-line arguments:
        -i  Path to the input file (required). Every line must contain exactly
            one tab separating the title from the document text.
        -o  Path to the output file (required); created or overwritten.
        -s  Path to the stopword list (default ``stopword_list.txt``). For
            each line only the text before the first tab is used.

    Each document is split on spaces; every token is truncated at the first
    punctuation separator, reduced to the text between its first pair of
    double quotes if it contains one, and lower-cased. Stopwords and purely
    numeric tokens are dropped and the rest are stemmed. One line
    ``title<TAB>stem stem ...`` is written per input line.

    Raises:
        OSError: if a file cannot be opened. ``open`` raises on failure, so no
            separate "could not read" return path is needed.
        ValueError: if an input line does not split into exactly two
            tab-separated fields.
    """
    # construct the argument parser and parse the arguments (input, output path and stopwords)
    ap = argparse.ArgumentParser()
    ap.add_argument("-i",
                    type=str,
                    required=True,
                    help="path to inputfile for the first task")
    ap.add_argument(
        "-o",
        type=str,
        required=True,
        help="path to output file if it exists, if not, it will be created")
    ap.add_argument("-s",
                    type=str,
                    default="stopword_list.txt",
                    help="The path to the stopword list used for the task")
    args = vars(ap.parse_args())

    # Load the stopwords into a set for constant-time membership tests.
    with open(args["s"], "r") as stopword_file:
        stopwords = {
            entry.split("\n")[0].split("\t")[0] for entry in stopword_file
        }

    # Initializing the stemmer tool
    p = stemmer.PorterStemmer()

    # Each token is truncated at the first occurrence of any of these.
    separators = (".", ",", "!", ":", "?", ";", "/")

    titles = []
    output_doc = []

    # The context manager closes the input file even if a malformed line raises.
    with open(args["i"], "r") as input_file:
        for line in input_file:
            title, document = line.split("\t")
            titles.append(title)

            words = []
            for token in document.split(" "):
                for separator in separators:
                    token = token.split(separator)[0]
                # Keep only the text between the first pair of double quotes.
                pieces = token.split('"')
                token = pieces[1] if len(pieces) > 1 else pieces[0]
                token = token.split("\n")[0]
                words.append(token.lower())

            # Eliminate stopwords and numerical values, then stem the rest.
            output_doc.append([
                p.stem(word, 0, len(word) - 1)
                for word in words
                if word not in stopwords and not word.isnumeric()
            ])

    # The output is opened only after all input has been read, as before.
    with open(args["o"], "w+") as output_file:
        for title, doc in zip(titles, output_doc):
            output_file.write(title + "\t" + " ".join(doc) + "\n")
Esempio n. 5
0
def stem(sword):
    """Run a fresh ``porterstemmer.PorterStemmer`` over the whole of *sword*.

    Args:
        sword: the word to stem.

    Returns:
        Whatever ``PorterStemmer.stem`` returns for the index range
        ``0 .. len(sword) - 1``.
    """
    stemmer_instance = porterstemmer.PorterStemmer()
    last_index = len(sword) - 1
    return stemmer_instance.stem(sword, 0, last_index)