def pre_process():
    """Tokenize, stem, and stopword-filter every file under ``path1``.

    Mirrors the directory layout of ``path1`` into ``path2``: for each
    sub-directory of ``path1`` a same-named directory is created under
    ``path2``, and each file is rewritten there with one processed
    (lower-cased, Porter-stemmed, non-stopword) token per line.

    Relies on module-level ``path1``/``path2`` and on the nltk
    ``stopwords`` corpus being downloaded already.
    """
    # Hoist loop invariants: the original constructed a new PorterStemmer
    # and re-fetched the stopword list for EVERY word. A set also makes
    # the membership test O(1) instead of O(n).
    stemmer = PorterStemmer()
    stop_set = set(stopwords.words('english'))
    for category in listdir(path1):
        out_dir = path2 + '/' + category
        if not path.exists(out_dir):
            mkdir(out_dir)
        src_dir = path1 + '/' + category
        for fname in listdir(src_dir):
            source_path = src_dir + '/' + fname
            target_path = out_dir + '/' + fname
            # 'with' guarantees both handles are closed even on error;
            # the original leaked the input file handle entirely.
            with open(source_path, 'rb') as src, \
                    open(target_path, 'w') as dst:
                for raw in src:  # stream line-by-line, no readlines()
                    line = raw.decode("utf-8", errors='ignore')
                    # Keep only alphabetic runs, split on whitespace.
                    for word in re.sub("[^a-zA-Z]", " ", line).split():
                        stemmed = stemmer.stem(word.lower())
                        # Stopword check is applied to the stemmed form,
                        # matching the original behavior.
                        if stemmed not in stop_set:
                            dst.write('%s\n' % stemmed)
def read_file():
    """Read every file under ``srcPath`` and write a processed copy under ``toPath``.

    For each sub-folder of ``srcPath`` a same-named folder is created
    under ``toPath``; each file is rewritten there with one processed
    (lower-cased, Porter-stemmed, non-stopword) token per line.

    Relies on module-level ``srcPath``/``toPath`` and on the nltk
    ``stopwords`` corpus being downloaded already.
    """
    print('Read data:')
    # Hoist loop invariants: the original rebuilt the stemmer and the
    # stopword list for every single word. A set gives O(1) lookups.
    stemmer = PorterStemmer()
    stop_set = set(stopwords.words('english'))
    for folder in os.listdir(srcPath):
        out_dir = toPath + '/' + folder
        if not path.exists(out_dir):
            mkdir(out_dir)
        folder_path = srcPath + '/' + folder + '/'
        for file_name in os.listdir(folder_path):
            file_path = folder_path + file_name
            target_path = out_dir + '/' + file_name
            # 'with' closes both handles even on error; the original
            # leaked the input file handle.
            with open(file_path, 'rb') as src, \
                    open(target_path, 'w') as dst:
                for raw in src:  # stream line-by-line, no readlines()
                    line = raw.decode("utf-8", errors='ignore')
                    for word in re.sub("[^a-zA-Z]", " ", line).split():
                        stemmed = stemmer.stem(word.lower())
                        # Stopword check applies to the stemmed form,
                        # matching the original behavior.
                        if stemmed not in stop_set:
                            dst.write('%s\n' % stemmed)