def _stem_text(stemmer, text):
    """Lower-case *text* and replace every alphabetic run with its stem.

    Non-alphabetic characters are copied through (lower-cased) in place, so
    spacing and punctuation of the input are preserved.

    Args:
        stemmer: Object exposing ``stem(word, start, end)`` with an inclusive
            end index, as called by the original code.
        text: The string to process.

    Returns:
        The stemmed, lower-cased string.
    """
    pieces = []
    letters = []

    def flush():
        # Emit the pending word (if any) through the stemmer.
        if letters:
            word = "".join(letters)
            pieces.append(stemmer.stem(word, 0, len(word) - 1))
            del letters[:]

    for ch in text:
        if ch.isalpha():
            letters.append(ch.lower())
        else:
            flush()
            pieces.append(ch.lower())
    # A trailing word with no delimiter after it must not be dropped.
    flush()
    return "".join(pieces)


def file_processing(file, root, stop_words):
    """Write one stop-word-filtered, stemmed text file per CSV data row.

    The first row of *file* is treated as a header and skipped. For every
    other row, column 0 is tokenized with ``word_tokenize``, tokens found in
    *stop_words* are removed, the remainder is joined with single spaces,
    stemmed, and written (with a trailing newline) to
    ``root + row[1] + '/' + row[2] + '.txt'``. The directory ``root + row[1]``
    is created when missing. Progress is shown with an ``IncrementalBar``.

    Args:
        file: Path of the input CSV file.
        root: Prefix prepended verbatim to the directory name in column 1
            (no separator is inserted, so include a trailing one if needed).
        stop_words: Iterable of hashable tokens to drop before stemming.

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
        OSError: If the input cannot be read or an output cannot be written.
    """
    stemmer = PorterStemmer()
    # Build the lookup once; membership on a list would be O(n) per token.
    stop_set = set(stop_words)

    # Count physical lines (minus the header) to size the progress bar.
    with open(file) as f:
        total = max(sum(1 for _ in f) - 1, 0)
    bar = IncrementalBar('In progress', max=total)

    try:
        # newline='' is what the csv module documents for reader inputs.
        with open(file, 'r', newline='') as csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # skip header; tolerate an empty file
            for row in reader:
                if not row:
                    # Blank line in the CSV: nothing to write, keep the bar
                    # in step with the line count.
                    bar.next()
                    continue

                target_dir = root + row[1]
                # exist_ok avoids the check-then-create race and also
                # creates missing parent directories.
                os.makedirs(target_dir, exist_ok=True)

                # Remove stop words first
                kept = [w for w in word_tokenize(row[0]) if w not in stop_set]

                # Do stemming
                output = _stem_text(stemmer, " ".join(kept) + '\n')

                # Write file
                with open(target_dir + '/' + row[2] + '.txt', "w") as out:
                    out.write(output)
                bar.next()
    finally:
        # Always release the progress bar, even when a row fails.
        bar.finish()
def _stem_text(stemmer, text):
    """Lower-case *text* and replace every alphabetic run with its stem.

    Non-alphabetic characters are copied through (lower-cased) in place, so
    spacing and punctuation of the input are preserved.

    Args:
        stemmer: Object exposing ``stem(word, start, end)`` with an inclusive
            end index, as called by the original code.
        text: The string to process.

    Returns:
        The stemmed, lower-cased string.
    """
    pieces = []
    letters = []

    def flush():
        # Emit the pending word (if any) through the stemmer.
        if letters:
            word = "".join(letters)
            pieces.append(stemmer.stem(word, 0, len(word) - 1))
            del letters[:]

    for ch in text:
        if ch.isalpha():
            letters.append(ch.lower())
        else:
            flush()
            pieces.append(ch.lower())
    # A trailing word with no delimiter after it must not be dropped.
    flush()
    return "".join(pieces)


def file_processing(file, stop_words, output_path="new_test.csv"):
    """Build a stemmed, stop-word-filtered CSV from column 1 of *file*.

    The first row of *file* is treated as a header and skipped. For every
    other row, column 1 is tokenized with ``word_tokenize``, tokens found in
    *stop_words* are removed, the remainder is joined with single spaces and
    stemmed. The result is written to *output_path* as a CSV with the header
    ``text,class`` where every ``class`` value is the placeholder ``'?'``.

    Args:
        file: Path of the input CSV file.
        stop_words: Iterable of hashable tokens to drop before stemming.
        output_path: Destination CSV path. Defaults to ``"new_test.csv"``,
            the location the function has always written to.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
        OSError: If the input cannot be read or the output cannot be written.
    """
    stemmer = PorterStemmer()
    # Build the lookup once; membership on a list would be O(n) per token.
    stop_set = set(stop_words)
    rows = []

    # newline='' is what the csv module documents for reader inputs.
    with open(file, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip header; tolerate an empty file
        for row in reader:
            if not row:
                # Blank line in the CSV carries no text to process.
                continue

            # Remove stop words first
            kept = [w for w in word_tokenize(row[1]) if w not in stop_set]

            # Do stemming
            stemmed = _stem_text(stemmer, " ".join(kept))
            rows.append([stemmed.rstrip('\n'), '?'])

    # Write file. newline='' stops the csv writer from emitting an extra
    # blank line after every row on platforms that translate '\n'.
    with open(output_path, "w", newline='') as out_file:
        csvwriter = csv.writer(out_file)
        csvwriter.writerow(['text', 'class'])
        csvwriter.writerows(rows)
import math
import sys
import re
from stemming import PorterStemmer

# Single stemmer instance created at import time and shared module-wide.
p = PorterStemmer()

# control values
# NOTE(review): presumably switches for stemming and stop-word removal; the
# code that reads them is outside this view — confirm before relying on it.
stem = False
stopwords = False

# Module-level containers, all empty at import time. Whatever populates and
# reads them lies beyond the visible part of the file.
stopList = []
wordList = []
numberList = []
docInfo = {}
documentList={}
contextList = {}
frequencyList = {}
locationList = {}
totalList = {}

# check if number to remove numbers and titles
def contains_digits(s):
    """Return True if at least one character of *s* satisfies ``str.isdigit``."""
    return any(char.isdigit() for char in s)

# read doc
def readDoc(doc, context):
    """Split *doc* on single spaces and walk the resulting pieces by index.

    NOTE(review): only the start of this function is visible here; what is
    done with each piece, and how *context* is used, cannot be told from
    this view.
    """
    # Splitting on " " (not on arbitrary whitespace) keeps empty strings for
    # consecutive spaces.
    lists = doc.split(" ")
    for w in range(len(lists)):
        i = lists[w]
def __init__(self, stop_list_filename):
    """Load the stop list from a file and create the stemmer.

    Args:
        stop_list_filename: Path of a text file with one stop word per line.

    Raises:
        OSError: If the stop-list file cannot be opened or read.
    """
    # Use a context manager so the file handle is closed deterministically,
    # and materialize a list: on Python 3 a bare ``map`` object is a one-shot
    # iterator that would be empty after the first membership test or loop.
    with open(stop_list_filename, 'r') as stop_file:
        # rstrip() removes the trailing newline (and any other trailing
        # whitespace) from each word.
        self.stop_list = [line.rstrip() for line in stop_file]
    self.stemmer = PorterStemmer()