def process_item(self, item, spider):
    """Persist a scraped CoursesItem as a Course row, unless one exists.

    Non-CoursesItem items pass through untouched. Always returns the item
    so later pipeline stages can keep processing it.
    """
    if isinstance(item, CoursesItem):
        # Scrapy extracts field values as lists; collapse to plain strings.
        course_code = ''.join(item['code'])
        year = item['year']
        semester = item['semester']
        # Check if entry already exists for this (code, year, semester).
        course = Course.select().where(Course.code == course_code,
                                       Course.year == year,
                                       Course.semester == semester)
        if not course.exists():
            print("course record not found, creating")
            with db.atomic():
                try:
                    Course.create(
                        code=course_code,
                        name=''.join(item['title']),
                        year=item['year'],
                        semester=semester,
                        url=''.join(item['link']),
                        # BUGFIX: was 'raw_data'.join(item['link']), which uses
                        # 'raw_data' as a *separator* (a no-op prefix for the
                        # usual single-element list). The data pipeline builds
                        # paths as 'raw_data' + ''.join(...), so match that.
                        path='raw_data' + ''.join(item['link']))
                except peewee.OperationalError as e:
                    print("Could not create a record for {} due to {}".format(
                        course_code, e))
    return item
def persist_lecture_dict(self, lecture_data, rem_words):
    # Debug-instrumented variant: strip rem_words from every lecture's word
    # dict, then bulk-insert the remaining (lecture, word, count) rows.
    # NOTE(review): `initial` is an alias of lecture_data, not a copy — the
    # deletions below mutate it too, so the value returned on the empty path
    # is the already-filtered data. Confirm whether a copy was intended.
    initial = lecture_data
    for w in rem_words:
        # lecture_data iterates as (lecture, word->count dict) pairs.
        for lecture, lecture_dict in lecture_data:
            if w in lecture_dict:
                del lecture_dict[w]
                print "lecture_dict: {}".format(lecture_dict)
    print "lecture_data: {}".format(lecture_data)
    # Compose data set for mass insert
    persistent_tokens = [
        self.__compose_lecture_rows(entry) for entry in lecture_data
    ]
    # One atomic bulk insert for faster performance; flatten the per-lecture
    # row lists into a single list first.
    res = [x for y in persistent_tokens for x in y]
    print "len(res) {}".format(len(res))
    if (len(res) == 0):
        print "returning initial"
        return initial
    else:
        with db.atomic():
            # Insert in chunks of 500 rows to stay under SQL parameter limits.
            for idx in range(0, len(res), 500):
                # Debug only: counting the whole table before every batch is
                # an extra full query per 500 rows.
                n = len(LectureWord.select())
                print "LectureWords: {}".format(n)
                LectureWord.insert_many(
                    res[idx:(len(res) if idx + 500 > len(res)
                             else idx + 500)]).execute()
def lda_over_courses(self):
    """
    Perform LDA over all courses, no material/lecture level details.
    """
    courses = Course.select()
    # One topic per distinct course code.
    courses_size = Course.select(Course.code).distinct().count()
    # Build one word->count document dict per course.
    courses_dict = []
    for course in courses:
        course_words = CourseWord.select().where(
            CourseWord.course == course)
        courses_dict.append(dict([(x.word, x.count) for x in course_words]))
    print "Performing LDA over all courses.."
    model, vocab = self.__perform_lda_default(courses_dict, courses_size)
    # Model log-likelihood trace; the sampler appears to record one value
    # every 10 iterations — TODO confirm against __perform_lda_default.
    log_likelihoods = []
    for i, x in enumerate(model.loglikelihoods_):
        row_dict = {'iteration': i * 10, 'loglikelihood': round(x, 2)}
        log_likelihoods.append(row_dict)
    # Persist both normalized (kind 2) and raw (kind 1) topic-word rows.
    norm_topic_word_rows = self.__resolve_topic_words(
        self.__normalize(model.topic_word_), vocab, 2)
    topic_word_rows = self.__resolve_topic_words(model.topic_word_,
                                                 vocab, 1)
    # Document-topic distributions
    doc_topic = model.doc_topic_
    course_topic_rows = []
    for i in range(courses.count()):
        # Top n_top_topic topic indices, highest probability first.
        top_topics = np.argsort(doc_topic[i])[:-self.n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]
        for top_topic, top_weight in zip(top_topics, topic_probs):
            row_dict = {
                'course': courses[i],
                'topic': top_topic,
                'weight': round(top_weight * 100, 2)  # percentage
            }
            course_topic_rows.append(row_dict)
        if self.debug:
            doc_topic_str = ", ".join([
                str(x) + "(" + str(round(y * 100, 2)) + "%)"
                for x, y in zip(top_topics, topic_probs)
            ])
            print("{} (top {} topics: {})".format(
                courses[i].name.encode('utf-8'), self.n_top_topic,
                doc_topic_str))
    # Persist everything in one transaction.
    with db.atomic():
        self.__insert_rows(LDALogLikelihood, log_likelihoods)
        self.__insert_rows(TopicWord, norm_topic_word_rows)
        self.__insert_rows(TopicWord, topic_word_rows)
        self.__insert_rows(CourseTopic, course_topic_rows)
def __resolve_topic_names(self, table, topic_bucket):
    """Name every topic in topic_bucket, persist the rows into table,
    and return them."""
    rows = [{
        'topic': topic_id,
        'name': self.__resolve_topic_name(topics)
    } for topic_id, topics in topic_bucket.items()]
    with db.atomic():
        table.insert_many(rows).execute()
    return rows
def persist_course_dict(self, courses_data, rem_words):
    """Remove the globally-pruned words from every course's word dict,
    then bulk-insert the remaining course/word rows in batches."""
    for word in rem_words:
        for course_id, course_info in courses_data.items():
            # course_info[1] is the word->count dict for the course.
            course_info[1].pop(word, None)
    # Flatten the per-course row lists into one list for bulk insert.
    rows = []
    for entry in courses_data.items():
        rows.extend(self.__compose_course_rows(entry))
    batch = 500  # keep each INSERT under SQL parameter limits
    with db.atomic():
        for start in range(0, len(rows), batch):
            # Slicing clamps at the end of the list automatically.
            CourseWord.insert_many(rows[start:start + batch]).execute()
def persist_lecture_dict(self, lecture_data, rem_words):
    """Drop the pruned words from each lecture's word dict, then
    bulk-insert the remaining lecture/word rows in one transaction."""
    for word in rem_words:
        # lecture_data iterates as (lecture, word->count dict) pairs.
        for _lecture, word_counts in lecture_data:
            word_counts.pop(word, None)
    # Compose data set for mass insert, flattened into a single row list.
    rows = []
    for entry in lecture_data:
        rows.extend(self.__compose_lecture_rows(entry))
    # One atomic bulk insert for faster performance, chunked by 500 rows.
    with db.atomic():
        for start in range(0, len(rows), 500):
            # Slicing clamps at len(rows), so no explicit end check needed.
            LectureWord.insert_many(rows[start:start + 500]).execute()
def name_topics(self):
    """For every topic, keep its highest-weight CourseTopic entry, derive a
    name from it, persist the names, and return the inserted rows."""
    best_by_topic = {}
    for entry in CourseTopic.select():
        tid = entry.topic
        current = best_by_topic.get(tid)
        # Track the entry with the largest weight per topic id.
        if current is None or current.weight < entry.weight:
            best_by_topic[tid] = entry
    rows = [{
        'topic': tid,
        'name': self.__resolve_topic_name(entry)
    } for tid, entry in best_by_topic.items()]
    with db.atomic():
        CourseTopicInfo.insert_many(rows).execute()
    return rows
def __persist(self, results):
    # Persist SIS payloads as Lecture rows attached to their courses.
    # results maps a key with .code/.year/.semester attributes to the raw
    # byte payload for that course.
    rows = []
    for k, v in results.items():
        course = Course.select().where(Course.code == k.code,
                                       Course.year == k.year,
                                       Course.semester == k.semester)
        if not course.exists():
            # Skip payloads whose course was never scraped.
            print "Non-existing course in SIS data: {}".format(k)
            continue
        rows.append({
            # NOTE(review): this stores the SelectQuery itself rather than a
            # Course instance (cf. lecture.first() in the data pipeline);
            # peewee should treat it as a subquery — confirm it resolves to
            # the intended foreign key.
            'course': course,
            'url': '',
            'path': self.filename,
            'name': 'SISdata',
            # Payload arrives latin-1 encoded; store as UTF-8 bytes (Py2).
            'content': v.decode('latin-1').encode('utf-8'),
            'time': datetime.datetime.now(),
            'size': 0
        })
    with db.atomic():
        Lecture.insert_many(rows).execute()
def create_corpus_tokens(self, courses_data):
    """Aggregate word counts across all courses, prune rare words, and
    persist the surviving corpus words.

    A word is pruned when its total count is below 5 or it occurs in fewer
    than 3 distinct courses. Returns the list of pruned words so callers
    can filter their own dicts with it.
    """
    # word -> [total count, set of course codes it appears in]
    corpus = {}
    for course_id, course_info in courses_data.items():
        course, word_counts = course_info[0], course_info[1]
        for word, count in word_counts.items():
            stats = corpus.get(word)
            if stats is None:
                corpus[word] = [count, {course.code}]
            else:
                stats[0] += count
                stats[1].add(course.code)
    # Collect removals first; deleting while iterating a dict is unsafe.
    rem_words = [
        word for word, (total, codes) in corpus.items()
        if total < 5 or len(codes) < 3
    ]
    for word in rem_words:
        del corpus[word]
    result_corpus = [
        self.__compose_corpus_rows(item) for item in corpus.items()
    ]
    with db.atomic():
        # Chunked insert; slicing clamps at the list end by itself.
        for start in range(0, len(result_corpus), 500):
            CorpusWord.insert_many(
                result_corpus[start:start + 500]).execute()
    return rem_words
def lda_over_lectures(self):
    """
    Perform LDA over lectures within the scope of an individual course.
    Basically we perform as many LDA modellings as there are courses.
    """
    lectures = []
    for course in Course.select():
        course_lectures = list(
            Lecture.select().where(Lecture.course == course))
        # Fresh vectorizer + model per course; topic count equals the
        # number of lectures in that course. random_state pins the seed
        # so runs are reproducible.
        lda_tools = [
            DictVectorizer(),
            lda.LDA(n_topics=len(course_lectures),
                    n_iter=1000,
                    random_state=1)
        ]
        # LectureWord is passed as the model class the worker queries.
        lectures.append((course, course_lectures, LectureWord, lda_tools))
    # Run one modelling task per course in parallel.
    res = self.pool.map(self.__lda_for_course_material, lectures)
    with db.atomic():
        # Each result appears to be a pair of row lists:
        # y[0] topic-word rows, y[1] lecture-topic rows — TODO confirm
        # against __lda_for_course_material.
        LectureTopicWord.insert_many([x for y in res
                                      for x in y[0]]).execute()
        LectureTopic.insert_many([x for y in res for x in y[1]]).execute()
def process_item(self, item, spider):
    # Pipeline stage for DataItem: download course material when needed and
    # create/refresh the matching Lecture row. Returns the item so further
    # stages can run.
    if isinstance(item, DataItem):
        # Scrapy extracts field values as lists; collapse to strings.
        url = ''.join(item['link'])
        dir_name = 'raw_data' + ''.join(item['path']) + '/'
        course_code = ''.join(item['course_code'])
        content = ''.join(item['content'])
        path = ''
        year = ''.join(item['year'])
        semester = ''.join(item['semester'])
        # Project root (two levels above this file), used for stored paths.
        prefix = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__))) + '/'
        course = Course.select().where(Course.code == course_code,
                                       Course.year == year,
                                       Course.semester == semester)
        if not course.exists():
            print "Non-existing course: {}".format(course_code)
            return
        # Empty content means the material is a file to download; make sure
        # the target directory exists (best-effort).
        if len(content) == 0 and not os.path.exists(dir_name):
            try:
                os.makedirs(dir_name)
            except OSError as e:
                print "Could not create directory: {} due to {}".format(
                    dir_name, e)
        lecture = Lecture.select().where(Lecture.course == course,
                                         Lecture.url == url)
        file_size = 0
        # if no lecture record and no content, then download data (pdf, pptx, etc.) according to url
        if len(content) == 0:
            try:
                # HEAD-ish probe: fetch remote size to detect updates.
                info = urllib.urlopen(url).info()
                if 'Content-Length' in info:
                    file_size = float(info['Content-Length'])
            except Exception as e:
                print "Failed to retrieve file size for {} due to {}".format(
                    url, e)
            if not lecture.exists():
                path = self.__download(url, dir_name)
            else:
                lecture_instance = lecture.first()
                # Re-download only if the file has been updated
                # (size 0 means the size was never recorded).
                if lecture_instance.size == 0 or \
                        lecture_instance.size != file_size:
                    os.remove(prefix + lecture_instance.path)
                    self.__download(url, dir_name)
                else:
                    # No need to re-extract content later
                    content = lecture_instance.content
        if not lecture.exists():
            print "Lecture record not found, creating ..."
            title = self.__get_title(url)
            with db.atomic():
                try:
                    Lecture.create(course=course,
                                   url=url,
                                   path=path,
                                   name=title,
                                   content=content,
                                   size=file_size,
                                   time=datetime.datetime.now())
                except peewee.OperationalError as e:
                    print "Could not create a record for course {} lecture {} due to {}".format(
                        course_code, url, e)
        else:
            # Existing record: refresh content and timestamp.
            with db.atomic():
                try:
                    lecture_instance = lecture.first()
                    lecture_instance.content = content
                    lecture_instance.time = datetime.datetime.now()
                    lecture_instance.save()
                except peewee.OperationalError as e:
                    print e
    return item
def lda_over_all_material(self):
    """
    Perform LDA over all material without any course limitations.
    The topic count is 1/10 of the material count.
    """
    lectures = Lecture.select()
    # One word->count document dict per lecture.
    lectures_dict = []
    for lecture in lectures:
        lecture_words = LectureWord.select().where(
            LectureWord.lecture == lecture)
        lectures_dict.append(
            dict([(x.word, x.count) for x in lecture_words]))
    topic_count = int(len(lectures_dict) / 10)
    print "Performing LDA over all material.."
    model, vocab = self.__perform_lda_default(lectures_dict, topic_count)
    topic_word_rows = []
    # Iterate over topic word distributions
    for i, topic_dist in enumerate(model.topic_word_):
        # Top n_top_words words of this topic and their probabilities,
        # highest first.
        top_topic_words = np.array(vocab)[self.__max_values(
            topic_dist, self.n_top_words)]
        top_word_probs = topic_dist[np.argsort(
            topic_dist)][:-self.n_top_words - 1:-1]
        for top_word, top_weight in zip(top_topic_words, top_word_probs):
            row_dict = {
                'topic': i,
                'word': top_word,
                'weight': round(top_weight * 100, 2)  # percentage
            }
            topic_word_rows.append(row_dict)
        if self.debug:
            top_word_str = ", ".join([
                x.encode('utf-8') + "(" + str(round(y * 100, 2)) + "%)"
                for x, y in zip(top_topic_words, top_word_probs)
            ])
            print('Topic {}: {}'.format(i, top_word_str))
    # Document-topic distributions
    doc_topic = model.doc_topic_
    lecture_topic_rows = []
    for i in range(lectures.count()):
        top_topics = np.argsort(doc_topic[i])[:-self.n_top_topic - 1:-1]
        topic_probs = doc_topic[i][top_topics]
        for top_topic, top_weight in zip(top_topics, topic_probs):
            rounded_weight = round(top_weight * 100, 2)
            # Discard weak (<10%) topic assignments.
            if rounded_weight < 10:
                continue
            row_dict = {
                'lecture': lectures[i],
                'topic': top_topic,
                'weight': rounded_weight
            }
            lecture_topic_rows.append(row_dict)
        if self.debug:
            doc_topic_str = ", ".join([
                str(x) + "(" + str(round(y * 100, 2)) + "%)"
                for x, y in zip(top_topics, topic_probs)
            ])
            print("{} (top {} topics: {})".format(
                lectures[i].name.encode('utf-8'), self.n_top_topic,
                doc_topic_str))
    # Persist both row sets in one transaction.
    with db.atomic():
        self.__insert_rows(MaterialTopicWord, topic_word_rows)
        self.__insert_rows(MaterialTopic, lecture_topic_rows)
import sys


def is_valid_semester(course_entry, allowed):
    """Return True when course_entry's (year, semester) appears in allowed.

    allowed is an iterable of (year, semester) pairs.
    """
    # Generator form short-circuits on the first match; the original built
    # a full list before calling any().
    return any(x[0] == course_entry.year and x[1] == course_entry.semester
               for x in allowed)


if __name__ == '__main__':
    # Project root (two levels above this file); lecture paths are relative
    # to it. NOTE(review): `os` and the project names below are assumed to
    # be imported earlier in this file.
    prefix = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/'
    # Lectures without a timestamp are stale: remove their on-disk files.
    lectures = Lecture.select().where(Lecture.time.is_null(True))
    for lec in lectures:
        path = prefix + lec.path
        if lec.path and os.path.exists(path):
            os.remove(path)
    # Optional CLI argument: the semesters to keep.
    semesters = []
    if len(sys.argv) == 2:
        semesters = parse_semesters(sys.argv[1])
    courses = Course.select()
    with db.atomic():
        # Drop stale lecture rows, then every course (with its children)
        # outside the allowed semesters.
        for lec in lectures:
            lec.delete_instance()
        for course in courses:
            if is_valid_semester(course, semesters):
                continue
            course.delete_instance(recursive=True)