forked from awwong1/topic-traceability
/
load_and_preprocess.py
executable file
·424 lines (366 loc) · 15.9 KB
/
load_and_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
#!/usr/bin/env python3
"""
Load necessary values from the coursera sql extract into a sqlite3 database.
"""
import json
import os
import sys
from csv import reader, field_size_limit
from datetime import datetime
from sqlite3 import connect, Error
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import preprocess_string
# Absolute directory containing this script; all paths below are resolved
# relative to it so the script works regardless of the current working dir.
DIR_PATH = os.path.dirname(os.path.realpath(__file__))
# Target sqlite3 database file (created next to this script).
DB_NAME = "dump_coursera_partial.sqlite3"
DB_FILE = os.path.join(DIR_PATH, DB_NAME)
# Root directory holding one sub-directory of CSV extracts per course.
DATA_PATH = os.path.join(DIR_PATH, "data")
# Course directory/zip names (underscored); note that course_slug values in
# the database use hyphens instead (see the .replace("_", "-") call sites).
COURSES = [
    "agile_planning_for_software_products",
    "client_needs_and_software_requirements",
    "design_patterns",
    "introduction_to_software_product_management",
    "object_oriented_design",
    "reviews_and_metrics_for_software_improvements",
    "service_oriented_architecture",
    "software_architecture",
    "software_processes_and_agile_practices",
    "software_product_management_capstone"
]
# csv.reader dialect options for the Coursera exports
# (comma-delimited, double-quoted, backslash-escaped).
CSV_KWARGS = {
    "delimiter": ",",
    "quotechar": "\"",
    "escapechar": "\\"
}
def create_database(conn):
    """Create every table required by the loader in *conn*.

    All tables are created with ``IF NOT EXISTS`` so this function is
    idempotent: running it against an already-initialized database is a
    no-op.  The single commit at the end makes the schema durable.
    """
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS courses (
            course_id VARCHAR(50),
            course_slug VARCHAR(2000),
            course_name VARCHAR(2000),
            course_launch_ts DATETIME,
            course_update_ts DATETIME,
            course_deleted BOOLEAN,
            course_graded BOOLEAN,
            course_desc VARCHAR(10000),
            course_restricted BOOLEAN,
            course_verification_enabled_at_ts DATETIME,
            primary_translation_equivalent_course_id VARCHAR(50),
            course_preenrollment_ts DATETIME,
            course_workload VARCHAR(100),
            course_session_enabled_ts DATETIME,
            course_promo_photo_s3_bucket VARCHAR(255),
            course_promo_photo_s3_key VARCHAR(10000),
            course_level VARCHAR(50),
            course_planned_launch_date_text VARCHAR(255),
            course_header_image_s3_bucket VARCHAR(255),
            course_header_image_s3_key VARCHAR(10000),
            PRIMARY KEY (course_id)
        )""",
        """
        CREATE TABLE IF NOT EXISTS course_branches (
            course_id VARCHAR(50),
            course_branch_id VARCHAR(50),
            course_branch_changes_description VARCHAR(65535),
            authoring_course_branch_name VARCHAR(255),
            authoring_course_branch_created_ts DATETIME,
            PRIMARY KEY (course_id, course_branch_id)
        )""",
        """
        CREATE TABLE IF NOT EXISTS course_branch_modules (
            course_branch_id VARCHAR(50),
            course_module_id VARCHAR(50),
            course_branch_module_order INT8,
            course_branch_module_name VARCHAR(2000),
            course_branch_module_desc VARCHAR(10000)
        )""",
        """
        CREATE TABLE IF NOT EXISTS course_branch_lessons (
            course_branch_id VARCHAR(50),
            course_lesson_id VARCHAR(50),
            course_module_id VARCHAR(50),
            course_branch_lesson_order INT8,
            course_branch_lesson_name VARCHAR(10000)
        )""",
        """
        CREATE TABLE IF NOT EXISTS course_item_types (
            course_item_type_id INT8,
            course_item_type_desc VARCHAR(255),
            course_item_type_category VARCHAR(255),
            course_item_type_graded BOOLEAN,
            atom_content_type_id INT8,
            PRIMARY KEY (course_item_type_id)
        )""",
        """
        CREATE TABLE IF NOT EXISTS course_branch_items (
            course_branch_id VARCHAR(255),
            course_item_id VARCHAR(255),
            course_lesson_id VARCHAR(255),
            course_branch_item_order INT8,
            course_item_type_id INT8,
            course_branch_item_name VARCHAR(255),
            course_branch_item_optional BOOLEAN,
            atom_id VARCHAR(255),
            atom_version_id INT8,
            course_branch_atom_is_frozen BOOLEAN,
            PRIMARY KEY (course_branch_id, course_item_id)
        )""",
        """
        CREATE TABLE IF NOT EXISTS discussion_course_forums (
            discussion_forum_id VARCHAR(50),
            course_branch_id VARCHAR(50),
            discussion_course_forum_title VARCHAR(20000),
            discussion_course_forum_description VARCHAR(20000),
            discussion_course_forum_order INT8
        )""",
        """
        CREATE TABLE IF NOT EXISTS discussion_questions (
            discussion_question_id VARCHAR(50),
            ualberta_user_id VARCHAR(50) NOT NULL,
            discussion_question_title VARCHAR(20000),
            discussion_question_details VARCHAR(20000),
            discussion_question_context_type VARCHAR(50),
            course_id VARCHAR(50),
            course_module_id VARCHAR(50),
            course_item_id VARCHAR(50),
            discussion_forum_id VARCHAR(50),
            country_cd VARCHAR(2),
            group_id VARCHAR(50),
            discussion_question_created_ts DATETIME,
            discussion_question_updated_ts DATETIME
        )""",
        """
        CREATE TABLE IF NOT EXISTS discussion_answers (
            discussion_answer_id VARCHAR(50),
            ualberta_user_id VARCHAR(50) NOT NULL,
            course_id VARCHAR(50),
            discussion_answer_content VARCHAR(20000),
            discussion_question_id VARCHAR(50),
            discussion_answer_parent_discussion_answer_id VARCHAR(50),
            discussion_answer_created_ts DATETIME,
            discussion_answer_updated_ts DATETIME
        )""",
    )
    cursor = conn.cursor()
    for ddl in ddl_statements:
        cursor.execute(ddl)
    conn.commit()
def load_data_from_csv(csv_path, conn, tbl_name):
    """Bulk-insert every data row of a Coursera CSV export into *tbl_name*.

    The first CSV row (the header) is discarded.  Rows are inserted with
    INSERT OR REPLACE, so reloading the same file is idempotent for tables
    that declare a primary key.  One commit is issued after the whole file.

    NOTE: tbl_name is interpolated directly into the SQL statement; it must
    only ever come from trusted, hard-coded callers (see load_course_data),
    never from user input.
    """
    c = conn.cursor()
    # newline="" is required by the csv module so that quoted fields
    # containing embedded newlines are parsed correctly.
    with open(csv_path, newline="") as csvfile:
        csv_reader = reader(csvfile, **CSV_KWARGS)
        next(csv_reader)  # skip the header row
        for line in csv_reader:
            # one "?" placeholder per column; rows may vary in width
            q_s = ",".join(["?"] * len(line))
            c.execute(
                f"INSERT OR REPLACE INTO {tbl_name} VALUES ({q_s})", line)
    conn.commit()
def load_course_data(course_data_path, conn):
    """Load every recognized CSV file in a course export directory.

    Each recognized filename maps onto the table of the same stem; files
    not listed below (e.g. auxiliary content dumps) are ignored.  Files
    are processed in sorted order for deterministic load behavior.
    """
    # filename stem == destination table name for every known export;
    # keep an explicit whitelist so unexpected files are skipped.
    known_tables = (
        "courses",
        "course_branches",
        "course_branch_modules",
        "course_branch_lessons",
        "course_branch_items",
        "course_item_types",
        "discussion_course_forums",
        "discussion_questions",
        "discussion_answers",
    )
    csv_to_table = {name + ".csv": name for name in known_tables}
    for course_file in sorted(os.listdir(course_data_path)):
        tbl_name = csv_to_table.get(course_file)
        if tbl_name is not None:
            csv_path = os.path.join(course_data_path, course_file)
            load_data_from_csv(csv_path, conn, tbl_name)
def parse_and_load_course_branch_item(course_data_path, conn, course_zip_name):
    """Take all of the course branch item content and create vocabulary.

    For every (course_branch_id, course_item_id) pair belonging to the
    course named by *course_zip_name*, load the raw JSON dump from
    course_branch_item_content/, extract its readable text (either the
    HTML "definition" of an open course asset, or a video's subtitle
    text), normalize it with gensim's preprocess_string, and accumulate
    the tokens into a nested dict:

        module name > lesson name > item name > list of words

    The result is dumped to vocabulary.<course-slug>.json one directory
    above course_data_path.

    Raises sqlite3.Error when a JSON file matches none of the known
    shapes (see NOTE near the bottom of the loop).
    """
    content_path = os.path.join(course_data_path, "course_branch_item_content")
    # zip/directory names use underscores; course_slug in the DB uses hyphens
    course_slug = course_zip_name.replace("_", "-")
    sql_select_course_id = (
        "SELECT DISTINCT course_branch_items.course_branch_id, " +
        "course_item_id, course_branch_module_name, " +
        "course_branch_lesson_name, course_branch_item_name FROM " +
        "course_branch_modules, course_branch_lessons, course_branch_items, " +
        "course_branches, courses WHERE course_slug = (?) " +
        "AND courses.course_id == course_branches.course_id " +
        "AND course_branches.course_branch_id == course_branch_items.course_branch_id " +
        "AND course_branch_items.course_lesson_id == course_branch_lessons.course_lesson_id " +
        "AND course_branch_lessons.course_module_id == course_branch_modules.course_module_id"
    )
    c = conn.cursor()
    c.execute(sql_select_course_id, (course_slug,))
    # module name > lesson name > item name > to processed vocabulary (list of words)
    course_vocabulary = {}
    rows = c.fetchmany()
    while rows:
        for row in rows:
            (course_branch_id, course_item_id, course_branch_module_name,
             course_branch_lesson_name, course_branch_item_name,) = row
            # load the raw json file for branch item
            course_branch_item_path = os.path.join(
                content_path, "{}-{}.json".format(course_branch_id, course_item_id))
            with open(course_branch_item_path, "r") as cbif:
                # attempt to load the json file, otherwise continue
                try:
                    raw_cbi = json.load(cbif)
                except Exception as e:
                    print(e)
                    continue
                # The dumps contain several known "no content" shapes;
                # each one is probed EAFP-style and skipped when matched.
                try:
                    # scraper marker: assessments were deliberately not fetched
                    if raw_cbi["message"] == "" and raw_cbi["statusCode"] == 204 and raw_cbi["reason"] == "ignore assesments":
                        continue
                except KeyError:
                    pass
                try:
                    # content was missing upstream (HTTP 404)
                    if raw_cbi["message"] == "" and raw_cbi["statusCode"] == 404:
                        continue
                except KeyError:
                    pass
                try:
                    # content the scraping account was not allowed to read
                    if raw_cbi["message"] == None and raw_cbi["errorCode"] == "Not Authorized":
                        continue
                except KeyError:
                    pass
                try:
                    # item id that no longer resolves on the platform
                    if raw_cbi["message"].startswith("No item ItemId(") and raw_cbi["errorCode"] == None:
                        continue
                except KeyError:
                    pass
                normalized_processed_text = None
                try:
                    # try to get the definition value of the item
                    definition_raw_html = raw_cbi["linked"]["openCourseAssets.v1"][0]["definition"]["value"]
                    # flatten the HTML to whitespace-joined visible text
                    definition_text = " ".join(BeautifulSoup(
                        definition_raw_html, "html.parser").stripped_strings)
                    normalized_processed_text = preprocess_string(
                        definition_text)
                    update_course_vocabulary(
                        course_vocabulary, course_branch_module_name,
                        course_branch_lesson_name, course_branch_item_name,
                        normalized_processed_text)
                    continue
                except KeyError:
                    pass
                try:
                    # check if the branch item is a video with subtitles, get subtitles
                    subtitles_lookup = raw_cbi["linked"]["onDemandVideos.v1"][0]["subtitlesTxt"]
                    if not subtitles_lookup.keys():
                        continue  # no subtitles for the video
                    # subtitles were pre-downloaded next to the json dump
                    subtitle_filepath = course_branch_item_path + ".subtitles.txt"
                    with open(subtitle_filepath, "r") as subfp:
                        subtitle_raw_text = "".join(subfp.readlines())
                    normalized_processed_text = preprocess_string(
                        subtitle_raw_text)
                    update_course_vocabulary(
                        course_vocabulary, course_branch_module_name,
                        course_branch_lesson_name, course_branch_item_name,
                        normalized_processed_text)
                    continue
                except KeyError:
                    pass
                # NOTE(review): sqlite3.Error is reused here as a generic
                # "unrecognized content shape" signal; a dedicated exception
                # class would be clearer.
                raise Error("unhandled cbi")
        # fetchmany() with no argument returns cursor.arraysize rows
        # (1 by default); the loop ends on an empty list.
        rows = c.fetchmany()
    # save the course_vocabulary to disk
    vocab_filepath = os.path.join(
        course_data_path, "..", "vocabulary.{}.json".format(course_slug))
    with open(vocab_filepath, "w") as vocab_file:
        json.dump(course_vocabulary, vocab_file)
def update_course_vocabulary(course_vocabulary, course_branch_module_name,
                             course_branch_lesson_name,
                             course_branch_item_name,
                             normalized_processed_text):
    """Append tokens into the module > lesson > item nested vocabulary.

    Missing intermediate dicts and the per-item token list are created on
    demand; an existing token list is extended in place.
    """
    module = course_vocabulary.setdefault(course_branch_module_name, {})
    lesson = module.setdefault(course_branch_lesson_name, {})
    item_tokens = lesson.setdefault(course_branch_item_name, [])
    item_tokens.extend(normalized_processed_text)
def parse_and_load_discussion_questions(course_data_path, conn, course_zip_name):
    """Load, parse, and process discussion questions for one course.

    Each question's title and details are tokenized with gensim's
    preprocess_string and concatenated; the mapping of question id to
    token list is written to questions.<course-slug>.json one directory
    above course_data_path.
    """
    # zip/directory names use underscores; course_slug in the DB uses hyphens
    course_slug = course_zip_name.replace("_", "-")
    sql_select_discussion_question = (
        "SELECT discussion_question_id, discussion_question_title, "
        "discussion_question_details "
        "FROM discussion_questions, courses WHERE "
        "discussion_questions.course_id == courses.course_id AND "
        "courses.course_slug == (?)"
    )
    cursor = conn.cursor()
    cursor.execute(sql_select_discussion_question, (course_slug,))
    course_questions = {
        question_id: preprocess_string(title) + preprocess_string(details)
        for question_id, title, details in cursor
    }
    # save the course_questions to disk
    questions_filepath = os.path.join(
        course_data_path, "..", "questions.{}.json".format(course_slug))
    with open(questions_filepath, "w") as questions_file:
        json.dump(course_questions, questions_file)
def parse_and_load_discussion_answers(course_data_path, conn, course_zip_name):
    """Load, parse, and process discussion answers for one course.

    Each answer's content is tokenized with gensim's preprocess_string;
    the mapping of answer id to token list is written to
    answers.<course-slug>.json one directory above course_data_path.
    """
    # zip/directory names use underscores; course_slug in the DB uses hyphens
    course_slug = course_zip_name.replace("_", "-")
    sql_select_discussion_answer = (
        "SELECT discussion_answer_id, discussion_answer_content "
        "FROM discussion_answers, courses WHERE "
        "discussion_answers.course_id == courses.course_id AND "
        "courses.course_slug == (?)"
    )
    cursor = conn.cursor()
    cursor.execute(sql_select_discussion_answer, (course_slug,))
    course_answers = {
        answer_id: preprocess_string(answer_content)
        for answer_id, answer_content in cursor
    }
    # save the course_answers to disk
    answers_filepath = os.path.join(
        course_data_path, "..", "answers.{}.json".format(course_slug))
    with open(answers_filepath, "w") as answers_file:
        json.dump(course_answers, answers_file)
def main():
    """Create the sqlite database and load/process every configured course.

    For each course directory under DATA_PATH: load its CSV extracts into
    the database, then derive the vocabulary, question, and answer JSON
    files.  sqlite3 errors are printed rather than propagated; the
    connection is always closed.
    """
    conn = None
    try:
        # some exported CSV fields exceed the csv module's default field
        # size limit ("GHMatches csv threw error")
        field_size_limit(sys.maxsize)
        conn = connect(DB_FILE)
        sc_start = datetime.now()
        # BUG FIX: was sc_start.now(), which re-evaluated the current time
        # via the classmethod instead of printing the recorded start time.
        print(f"Started {sc_start}")
        create_database(conn)
        for course in COURSES:
            print(course)
            course_data_path = os.path.join(DATA_PATH, course)
            load_course_data(course_data_path, conn)
            parse_and_load_course_branch_item(course_data_path, conn, course)
            parse_and_load_discussion_questions(course_data_path, conn, course)
            parse_and_load_discussion_answers(course_data_path, conn, course)
            conn.commit()
        sc_end = datetime.now()
        print(f"Ended {sc_end}")
        print(f"Elapsed: {sc_end - sc_start}")
    except Error as e:
        print(e)
    finally:
        if conn:
            conn.close()


if __name__ == "__main__":
    main()