/
fetch_story_contents.py
171 lines (150 loc) · 8 KB
/
fetch_story_contents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/python2.5
"""Create processed Pulse log files with story contents when run as a program.
The only public function is fetch_story_contents, which behaves like the
program.
"""
import sys, os, time, socket, html2text
from os import path
from utilities import DELIMITER, check_num_arguments, write_iterable, \
write_2d_iterable
from process_data import TIMEOUT_LENGTH, MIN_STORY_LENGTH, STORIES_FILENAME, \
READS_FILENAME, CLICKTHROUGHS_FILENAME, USER_IDS_FILENAME, \
STORIES_DESCRIPTOR, READS_DESCRIPTOR, CLICKTHROUGHS_DESCRIPTOR, \
NEW_STORIES_URL_INDEX, NEW_STORIES_TITLE_INDEX, EVENTS_STORY_ID_INDEX, \
open_safely, report_time_elapsed, get_user_ids
NUM_ARGUMENTS = 2
"""The expected number of arguments to this module when executed as a script.
The path to this file is included in this count.
"""
PROGRAM_USAGE = "Usage: %s <input_directory>" % __file__
# A description of how to run execute this program from the command-line.
SUB_DIRECTORY_NAME = "Processed Data with Story Contents/"
# The name of the directory in which to place the output log files.
def _read_stories(input_file_path):
"""Return a list of stories with full contents and a dict with new IDs.
Generate list elements in tuple form, where each element corresponds to a
single line of the given processed stories log file. Do not trim the
newline off the end of the last element of the tuple. Append a space
followed by the full story contents to the title of each story. Omit
stories for which the full story contents could not be fetched. Generate
dict entries mapping from story IDs in the input file to story IDs in the
output file. Maintain the ordering of the input file in the list. This
ordering is equivalent to ascending order of both old story IDs and new
story IDs. Be warned that fetching full story contents is quite slow and
consumes a great deal of bandwidth, so you or other users on your network
may experience connectivity problems while executing this function.
input_file_path, a str, is the file path to the processed Pulse stories log
file that contains story URLs and titles but not the full contents of the
stories themselves.
"""
start_time = time.time()
old_story_id = 0
new_story_id = 0
stories_list = []
story_id_dict = {}
story_contents_dict = {}
socket.setdefaulttimeout(TIMEOUT_LENGTH)
input_stream = open_safely(input_file_path)
for story_as_str in input_stream:
story_as_list = story_as_str.split(DELIMITER)
story_url = story_as_list[NEW_STORIES_URL_INDEX]
if story_url in story_contents_dict:
story_contents = story_contents_dict[story_url]
else:
story_contents = html2text.extractFromURL(story_url)
if (story_contents is not None) and \
(len(story_contents) <= MIN_STORY_LENGTH):
story_contents = None
story_contents_dict[story_url] = story_contents
if story_contents is not None:
story_as_list[NEW_STORIES_TITLE_INDEX] += " " + story_contents
stories_list.append(tuple(story_as_list))
story_id_dict[old_story_id] = new_story_id
new_story_id += 1
old_story_id += 1
input_stream.close()
num_stories_discarded = old_story_id - new_story_id
discard_rate = float(100 * num_stories_discarded) / float(old_story_id)
print(("Read a total of %d %s, %d (%.2f%%) of which were discarded " + \
"because their full contents could not be fetched.") % \
(old_story_id, STORIES_DESCRIPTOR, num_stories_discarded,
discard_rate))
report_time_elapsed(start_time)
return (stories_list, story_id_dict)
def _read_events(story_id_dict, input_file_path, event_descriptor):
""""Return a list of the events in the given file with the given story IDs.
Generate list elements in [user_id, new_story_id, time_occurred] form, and
maintain the ordering of the input file.
inpurt_file_path, a str, is the file path to the Pulse event log file to
read in.
story_id_dict, a dict, maps from old story IDs to new story IDs. Only
events with story IDs that are keys in story_id_dict are retained in the
output, but these old story IDs are replaced with the corresponding new
story IDs.
event_descriptor, a str, briefly describes the events in the plural form and
is used to notify the user upon completion.
"""
events_list = []
num_events = 0
num_events_kept = 0
input_stream = open_safely(input_file_path)
for event_as_str in input_stream:
event_as_list = map(int, event_as_str[:-1].split(DELIMITER))
old_user_id = event_as_list[EVENTS_STORY_ID_INDEX]
if old_user_id in story_id_dict:
event_as_list[EVENTS_STORY_ID_INDEX] = story_id_dict[old_user_id]
events_list.append(event_as_list)
num_events_kept += 1
num_events += 1
input_stream.close()
num_events_discarded = num_events - num_events_kept
discard_rate = float(100 * num_events_discarded) / float(num_events)
print(("Read a total of %d %s, %d (%.2f%%) of which were discarded " + \
"because the full contents of the associated story could not be " + \
"fetched.") % (num_events, event_descriptor, num_events_discarded,
discard_rate))
return events_list
def fetch_story_contents(input_directory):
"""Write processed Pulse log files with full story contents.
Output files by the same name in a sub-directory of the given directory
named SUB_DIRECTORY_NAME. Append a space followed by the full story
contents to the story titles. Remove stories for which no content could not
be fetched and events involving these stories. Reassign story and user IDs
to 0, 1, 2, etc. to fill the resulting gaps in the ID sequences. Output a
user IDs log file in which row numbers correspond to the new user IDs, and
row values correspond to the old user IDs.
input_directory, a str, is the file path to a directory containing processed
Pulse log files lacking full story contents.
"""
if not isinstance(input_directory, str):
raise TypeError("Expected input_directory to be of type str.")
if not path.isdir(input_directory):
raise ValueError("Could not find given directory: %s" % input_directory)
input_stories_path = path.join(input_directory, STORIES_FILENAME)
stories_list, story_id_dict = _read_stories(input_stories_path)
input_reads_path = path.join(input_directory, READS_FILENAME)
reads_list = _read_events(story_id_dict, input_reads_path, READS_DESCRIPTOR)
input_clickthroughs_path = path.join(input_directory,
CLICKTHROUGHS_FILENAME)
clickthroughs_list = _read_events(story_id_dict, input_clickthroughs_path,
CLICKTHROUGHS_DESCRIPTOR)
user_ids_list = get_user_ids(reads_list, clickthroughs_list)
reads_list = [map(str, read) for read in reads_list]
clickthroughs_list = [map(str, clickthrough) for clickthrough in \
clickthroughs_list]
user_ids_list = map(str, user_ids_list)
output_directory = path.join(input_directory, SUB_DIRECTORY_NAME)
if not path.exists(output_directory):
os.mkdir(output_directory)
output_stories_path = path.join(output_directory, STORIES_FILENAME)
write_2d_iterable(stories_list, output_stories_path, "")
output_reads_path = path.join(output_directory, READS_FILENAME)
write_2d_iterable(reads_list, output_reads_path)
output_clickthroughs_path = path.join(output_directory,
CLICKTHROUGHS_FILENAME)
write_2d_iterable(clickthroughs_list, output_clickthroughs_path)
output_users_path = path.join(output_directory, USER_IDS_FILENAME)
write_iterable(user_ids_list, output_users_path)
if __name__ == "__main__":
check_num_arguments(NUM_ARGUMENTS, PROGRAM_USAGE)
fetch_story_contents(sys.argv[1])