Example #1
0
    def process_item(self, item, spider):
        if isinstance(item, CoursesItem):
            course_code = ''.join(item['code'])
            year = item['year']
            semester = item['semester']

            #Check if entry already exists
            course = Course.select().where(Course.code == course_code,
                                           Course.year == year,
                                           Course.semester == semester)

            if not course.exists():
                print "course record not found, creating"
                with db.atomic():
                    try:
                        Course.create(code=course_code,
                                      name=''.join(item['title']),
                                      year=item['year'],
                                      semester=semester,
                                      url=''.join(item['link']),
                                      path='raw_data'.join(item['link']))
                    except peewee.OperationalError as e:
                        print "Could not create a record for {} due to {}".format(
                            course_code, e)

        return item
Example #2
0
    def persist_lecture_dict(self, lecture_data, rem_words):
        initial = lecture_data
        for w in rem_words:
            for lecture, lecture_dict in lecture_data:
                if w in lecture_dict:
                    del lecture_dict[w]
                print "lecture_dict: {}".format(lecture_dict)

        print "lecture_data: {}".format(lecture_data)
        # Compose data set for mass insert
        persistent_tokens = [
            self.__compose_lecture_rows(entry) for entry in lecture_data
        ]

        # One atomic bulk insert for faster performance
        res = [x for y in persistent_tokens for x in y]
        print "len(res) {}".format(len(res))
        if (len(res) == 0):
            print "returning initial"
            return initial
        else:
            with db.atomic():
                for idx in range(0, len(res), 500):
                    n = len(LectureWord.select())
                    print "LectureWords: {}".format(n)
                    LectureWord.insert_many(
                        res[idx:(len(res) if idx + 500 > len(res) else idx +
                                 500)]).execute()
Example #3
0
    def lda_over_courses(self):
        """
        Perform LDA over all courses, no material/lecture level details.

        Builds one bag-of-words dict per course, fits a single LDA model
        sized by the number of distinct course codes, then stores the
        log-likelihood trace, topic-word weights (normalized and raw) and
        each course's top-topic assignments in one transaction.
        """

        courses = Course.select()
        # One topic per distinct course code.
        courses_size = Course.select(Course.code).distinct().count()
        courses_dict = []
        for course in courses:
            course_words = CourseWord.select().where(
                CourseWord.course == course)
            # word -> count document for this course
            courses_dict.append(dict([(x.word, x.count)
                                      for x in course_words]))

        print "Performing LDA over all courses.."
        model, vocab = self.__perform_lda_default(courses_dict, courses_size)

        log_likelihoods = []
        for i, x in enumerate(model.loglikelihoods_):
            # i * 10: assumes the model records the log-likelihood every 10
            # iterations — TODO confirm the model's refresh interval.
            row_dict = {'iteration': i * 10, 'loglikelihood': round(x, 2)}
            log_likelihoods.append(row_dict)

        norm_topic_word_rows = self.__resolve_topic_words(
            self.__normalize(model.topic_word_), vocab, 2)
        topic_word_rows = self.__resolve_topic_words(model.topic_word_, vocab,
                                                     1)

        # Document-topic distributions
        doc_topic = model.doc_topic_
        course_topic_rows = []
        for i in range(courses.count()):
            # Indices of the n_top_topic highest-probability topics,
            # strongest first.
            top_topics = np.argsort(doc_topic[i])[:-self.n_top_topic - 1:-1]
            topic_probs = doc_topic[i][top_topics]

            for top_topic, top_weight in zip(top_topics, topic_probs):
                row_dict = {
                    'course': courses[i],
                    'topic': top_topic,
                    'weight': round(top_weight * 100, 2)  # percent
                }
                course_topic_rows.append(row_dict)

            if self.debug:
                doc_topic_str = ", ".join([
                    str(x) + "(" + str(round(y * 100, 2)) + "%)"
                    for x, y in zip(top_topics, topic_probs)
                ])
                print("{} (top {} topics: {})".format(
                    courses[i].name.encode('utf-8'), self.n_top_topic,
                    doc_topic_str))

        # Persist everything atomically.
        with db.atomic():
            self.__insert_rows(LDALogLikelihood, log_likelihoods)
            self.__insert_rows(TopicWord, norm_topic_word_rows)
            self.__insert_rows(TopicWord, topic_word_rows)
            self.__insert_rows(CourseTopic, course_topic_rows)
    def __resolve_topic_names(self, table, topic_bucket):
        """Resolve a display name for every topic bucket, insert one row per
        topic into *table*, and return the inserted rows."""
        # Build all rows up front so the insert happens as a single batch.
        rows = [
            {'topic': tid, 'name': self.__resolve_topic_name(bucket)}
            for tid, bucket in topic_bucket.items()
        ]

        with db.atomic():
            table.insert_many(rows).execute()

        return rows
Example #5
0
    def persist_course_dict(self, courses_data, rem_words):
        """Strip rem_words from every course's word dict (in place), then
        bulk-insert the remaining rows into CourseWord.

        courses_data maps course_id -> (course, word->count dict).
        """
        for w in rem_words:
            for course_id, course_info in courses_data.items():
                # course_info[1] is the word->count dict; prune it in place.
                if w in course_info[1]:
                    del course_info[1][w]

        result_courses = [
            self.__compose_course_rows(entry)
            for entry in courses_data.items()
        ]

        # Flatten and insert atomically in batches of 500 to stay under SQL
        # variable limits; slices clamp at the list end, so the original
        # explicit bounds arithmetic was redundant.
        res = [x for y in result_courses for x in y]
        with db.atomic():
            for idx in range(0, len(res), 500):
                CourseWord.insert_many(res[idx:idx + 500]).execute()
Example #6
0
    def persist_lecture_dict(self, lecture_data, rem_words):
        """Strip rem_words from every lecture's word dict (in place), then
        bulk-insert the remaining rows into LectureWord.
        """
        for w in rem_words:
            for lecture, lecture_dict in lecture_data:
                if w in lecture_dict:
                    del lecture_dict[w]

        # Compose data set for mass insert
        persistent_tokens = [
            self.__compose_lecture_rows(entry) for entry in lecture_data
        ]

        # One atomic bulk insert for faster performance
        res = [x for y in persistent_tokens for x in y]
        with db.atomic():
            # Batches of 500 keep each statement under SQL variable limits;
            # slices clamp at the list end, so the original explicit bounds
            # arithmetic was redundant.
            for idx in range(0, len(res), 500):
                LectureWord.insert_many(res[idx:idx + 500]).execute()
    def name_topics(self):
        """Pick the heaviest CourseTopic entry for every topic id, resolve a
        name for it, persist the names into CourseTopicInfo and return the
        inserted rows."""
        # Keep only the highest-weight entry seen per topic id.
        best = {}
        for entry in CourseTopic.select():
            tid = entry.topic
            current = best.get(tid)
            if current is None or current.weight < entry.weight:
                best[tid] = entry

        rows = [
            {'topic': tid, 'name': self.__resolve_topic_name(entry)}
            for tid, entry in best.items()
        ]

        with db.atomic():
            CourseTopicInfo.insert_many(rows).execute()

        return rows
Example #8
0
    def __persist(self, results):
        """Store scraped SIS content as Lecture rows in one bulk insert.

        results maps a course key (with .code/.year/.semester attributes) to
        latin-1 encoded content. Keys with no matching Course are skipped.
        """
        rows = []
        for k, v in results.items():
            course = Course.select().where(Course.code == k.code,
                                           Course.year == k.year,
                                           Course.semester == k.semester)
            if not course.exists():
                print "Non-existing course in SIS data: {}".format(k)
                continue

            # NOTE(review): 'course' here is a SelectQuery, not a Course
            # instance — presumably peewee coerces it to the matching row;
            # verify the FK is stored correctly.
            rows.append({
                'course': course,
                'url': '',
                'path': self.filename,
                'name': 'SISdata',
                # Re-encode the latin-1 source bytes as UTF-8 for storage.
                'content': v.decode('latin-1').encode('utf-8'),
                'time': datetime.datetime.now(),
                'size': 0
            })

        with db.atomic():
            Lecture.insert_many(rows).execute()
Example #9
0
    def create_corpus_tokens(self, courses_data):
        """Aggregate per-course word counts into corpus-wide totals, drop
        rare words, persist the survivors into CorpusWord and return the
        list of removed words so callers can prune their own dicts.

        courses_data maps course_id -> (course, word->count dict).
        """
        corpus_dict = {}

        for course_id, course_info in courses_data.items():
            course = course_info[0]
            course_dict = course_info[1]
            for word, count in course_dict.items():
                if word in corpus_dict:
                    corpus_dict[word][0] += count
                    corpus_dict[word][1].add(course.code)
                else:
                    # [total count, set of course codes containing the word]
                    corpus_dict[word] = [count, {course.code}]

        # A word survives only with a total count >= 5 spread over >= 3
        # distinct courses.
        rem_words = [
            word for word, (count, word_courses) in corpus_dict.items()
            if count < 5 or len(word_courses) < 3
        ]

        for word in rem_words:
            del corpus_dict[word]

        result_corpus = [
            self.__compose_corpus_rows(item) for item in corpus_dict.items()
        ]

        # Insert atomically in batches of 500 to stay under SQL variable
        # limits; slices clamp at the list end, so the original explicit
        # bounds arithmetic was redundant.
        with db.atomic():
            for idx in range(0, len(result_corpus), 500):
                CorpusWord.insert_many(
                    result_corpus[idx:idx + 500]).execute()

        return rem_words
Example #10
0
    def lda_over_lectures(self):
        """Run one LDA modelling per course over that course's lectures.

        Fans the per-course jobs out to the worker pool, then bulk-inserts
        the resulting topic-word and lecture-topic rows in one transaction.
        """
        jobs = []
        for course in Course.select():
            course_lectures = list(
                Lecture.select().where(Lecture.course == course))
            # One vectorizer and one LDA model per course; the topic count
            # equals the number of lectures in the course.
            tools = [
                DictVectorizer(),
                lda.LDA(n_topics=len(course_lectures),
                        n_iter=1000,
                        random_state=1)
            ]
            jobs.append((course, course_lectures, LectureWord, tools))

        results = self.pool.map(self.__lda_for_course_material, jobs)

        with db.atomic():
            topic_word_rows = [x for y in results for x in y[0]]
            lecture_topic_rows = [x for y in results for x in y[1]]
            LectureTopicWord.insert_many(topic_word_rows).execute()
            LectureTopic.insert_many(lecture_topic_rows).execute()
Example #11
0
    def process_item(self, item, spider):
        """Download/refresh the material referenced by a DataItem and keep a
        matching Lecture record. Returns the item for later pipeline stages.
        """
        if isinstance(item, DataItem):
            # Scrapy fields arrive as lists of string fragments; flatten them.
            url = ''.join(item['link'])
            dir_name = 'raw_data' + ''.join(item['path']) + '/'
            course_code = ''.join(item['course_code'])
            content = ''.join(item['content'])
            path = ''
            year = ''.join(item['year'])
            semester = ''.join(item['semester'])
            # Project root (two levels up from this file).
            prefix = os.path.dirname(os.path.dirname(
                os.path.abspath(__file__))) + '/'

            course = Course.select().where(Course.code == course_code,
                                           Course.year == year,
                                           Course.semester == semester)
            if not course.exists():
                print "Non-existing course: {}".format(course_code)
                return

            # No inline content means a file will be downloaded; make sure
            # the target directory exists.
            if len(content) == 0 and not os.path.exists(dir_name):
                try:
                    os.makedirs(dir_name)
                except OSError as e:
                    print "Could not create directory: {} due to {}".format(
                        dir_name, e)

            # NOTE(review): 'course' is a SelectQuery, not a Course instance;
            # presumably peewee resolves it in the comparison and in the FK
            # assignment below — verify.
            lecture = Lecture.select().where(Lecture.course == course,
                                             Lecture.url == url)
            file_size = 0
            # if no lecture record and no content, then download data (pdf, pptx, etc.) according to url
            if len(content) == 0:
                try:
                    # Only the response headers are read here, to learn the
                    # remote file size.
                    info = urllib.urlopen(url).info()
                    if 'Content-Length' in info:
                        file_size = float(info['Content-Length'])
                except Exception as e:
                    print "Failed to retrieve file size for {} due to {}".format(
                        url, e)
                if not lecture.exists():
                    path = self.__download(url, dir_name)
                else:
                    lecture_instance = lecture.first()

                    # Re-download only if the file has been updated
                    if lecture_instance.size == 0 or lecture_instance.size != file_size:
                        os.remove(prefix + lecture_instance.path)
                        # NOTE(review): the new download path is discarded
                        # here, so the stored 'path' is never refreshed —
                        # confirm __download reuses the old filename.
                        self.__download(url, dir_name)
                    else:
                        content = lecture_instance.content  # No need to re-extract content later

            if not lecture.exists():
                print "Lecture record not found, creating ..."
                title = self.__get_title(url)
                with db.atomic():
                    try:
                        Lecture.create(course=course,
                                       url=url,
                                       path=path,
                                       name=title,
                                       content=content,
                                       size=file_size,
                                       time=datetime.datetime.now())
                    except peewee.OperationalError as e:
                        print "Could not create a record for course {} lecture {} due to {}".format(
                            course_code, url, e)
            else:
                # Existing record: refresh only the content and timestamp.
                with db.atomic():
                    try:
                        lecture_instance = lecture.first()
                        lecture_instance.content = content
                        lecture_instance.time = datetime.datetime.now()
                        lecture_instance.save()
                    except peewee.OperationalError as e:
                        print e
        return item
Example #12
0
    def lda_over_all_material(self):
        """
        Perform LDA over all material without any course limitations. The topic count is 1/10 of the material count.

        Builds one bag-of-words dict per lecture, fits a single LDA model,
        then stores the top words per topic and the top topics per lecture
        (assignments below 10% are skipped) in one transaction.
        """

        lectures = Lecture.select()
        lectures_dict = []
        for lecture in lectures:
            lecture_words = LectureWord.select().where(
                LectureWord.lecture == lecture)
            # word -> count document for this lecture
            lectures_dict.append(
                dict([(x.word, x.count) for x in lecture_words]))

        topic_count = int(len(lectures_dict) / 10)

        print "Performing LDA over all material.."
        model, vocab = self.__perform_lda_default(lectures_dict, topic_count)

        topic_word_rows = []
        # Iterate over topic word distributions
        for i, topic_dist in enumerate(model.topic_word_):
            # The n_top_words highest-probability words for this topic,
            # strongest first.
            top_topic_words = np.array(vocab)[self.__max_values(
                topic_dist, self.n_top_words)]
            top_word_probs = topic_dist[np.argsort(
                topic_dist)][:-self.n_top_words - 1:-1]

            for top_word, top_weight in zip(top_topic_words, top_word_probs):
                row_dict = {
                    'topic': i,
                    'word': top_word,
                    'weight': round(top_weight * 100, 2)  # percent
                }
                topic_word_rows.append(row_dict)

            if self.debug:
                top_word_str = ", ".join([
                    x.encode('utf-8') + "(" + str(round(y * 100, 2)) + "%)"
                    for x, y in zip(top_topic_words, top_word_probs)
                ])
                print('Topic {}: {}'.format(i, top_word_str))

        # Document-topic distributions
        doc_topic = model.doc_topic_
        lecture_topic_rows = []
        for i in range(lectures.count()):
            # The n_top_topic highest-probability topics, strongest first.
            top_topics = np.argsort(doc_topic[i])[:-self.n_top_topic - 1:-1]
            topic_probs = doc_topic[i][top_topics]

            for top_topic, top_weight in zip(top_topics, topic_probs):
                rounded_weight = round(top_weight * 100, 2)
                # Skip weak topic assignments (below 10%).
                if rounded_weight < 10:
                    continue
                row_dict = {
                    'lecture': lectures[i],
                    'topic': top_topic,
                    'weight': rounded_weight
                }
                lecture_topic_rows.append(row_dict)

            if self.debug:
                doc_topic_str = ", ".join([
                    str(x) + "(" + str(round(y * 100, 2)) + "%)"
                    for x, y in zip(top_topics, topic_probs)
                ])
                print("{} (top {} topics: {})".format(
                    lectures[i].name.encode('utf-8'), self.n_top_topic,
                    doc_topic_str))

        with db.atomic():
            self.__insert_rows(MaterialTopicWord, topic_word_rows)
            self.__insert_rows(MaterialTopic, lecture_topic_rows)
Example #13
0
import sys


def is_valid_semester(course_entry, allowed):
    """Return True if course_entry's (year, semester) pair appears in
    *allowed*, an iterable of (year, semester) tuples.
    """
    # Generator expression instead of a list: no intermediate allocation,
    # and any() short-circuits on the first match.
    return any(x[0] == course_entry.year and x[1] == course_entry.semester
               for x in allowed)


if __name__ == '__main__':
    # Project root (two levels up from this script), used to resolve the
    # relative paths stored on Lecture rows.
    prefix = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/'

    # Lectures with a NULL timestamp are treated as stale: delete their
    # downloaded files from disk.
    lectures = Lecture.select().where(Lecture.time.is_null(True))
    for lec in lectures:
        path = prefix + lec.path
        if lec.path and os.path.exists(path):
            os.remove(path)

    semesters = []
    # NOTE(review): DB rows are only purged when a semester argument is
    # given, while the files above are removed unconditionally — confirm
    # this asymmetry is intended.
    if len(sys.argv) == 2:
        semesters = parse_semesters(sys.argv[1])
        courses = Course.select()
        with db.atomic():
            # Drop the stale lecture rows whose files were just removed.
            for lec in lectures:
                lec.delete_instance()
            # Drop every course (and, recursively, its dependents) outside
            # the allowed semesters.
            for course in courses:
                if is_valid_semester(course, semesters):
                    continue
                course.delete_instance(recursive=True)