/
stem_processed_stories.py
executable file
·51 lines (42 loc) · 1.81 KB
/
stem_processed_stories.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/python2.5
import sys, re, time
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.regexp import WordPunctTokenizer
from utilities import DELIMITER, check_num_arguments, report_time_elapsed, \
open_safely, write_2d_iterable
from process_data import NEW_STORIES_TITLE_INDEX
NUM_ARGUMENTS = 2
"""
The expected number of arguments to this module when executed as a script. The
path to this file is included in this count.
"""
PROGRAM_USAGE = "Usage: %s <stories_file_path>" % __file__
# A description of how to run execute this program from the command-line.
STEMMED_STORIES_EXTENSION = ".stemmed"
############################################
def stem_processed_stories(input_file_path):
"""
"""
start_time = time.time()
if not isinstance(input_file_path, str):
raise TypeError("Expected input_file_path to be of type str.")
stemmer = PorterStemmer()
stories_list = []
prog = re.compile('\W+')
story_stream = open_safely(input_file_path)
for story_as_str in story_stream:
story_as_list = story_as_str[:-1].lower().split(DELIMITER)
story_title = story_as_list[NEW_STORIES_TITLE_INDEX]
tok_contents = WordPunctTokenizer().tokenize(story_title)
stem_contents = [stemmer.stem(word) for word in tok_contents if \
prog.match(word) is None]
story_as_list[NEW_STORIES_TITLE_INDEX] = " ".join(stem_contents)
stories_list.append(story_as_list)
story_stream.close()
output_file_path = input_file_path + STEMMED_STORIES_EXTENSION
write_2d_iterable(stories_list, output_file_path)
print("Output stemmed stories to %s" % output_file_path)
report_time_elapsed(start_time)
if __name__ == "__main__":
check_num_arguments(NUM_ARGUMENTS, PROGRAM_USAGE)
stem_processed_stories(sys.argv[1])