コード例 #1
0
ファイル: course_spider.py プロジェクト: KitFung/aims_crawler
    def parse_course_item(self, response):
        """Yield one CourseItem scraped from a course detail response.

        The header, offering unit, requirement text and exclusive text are
        read from fixed positions inside the page's form, the rows of the
        form's table are handed to ``parse_course_table``, and the term is
        taken from ``response.meta['term']``.
        """
        page = response.selector
        full_header = page.xpath(
            '//html/body/div[5]/form/b[1]/text()'
        ).extract_first()
        unit = page.xpath(
            '//html/body/div[5]/form/b[2]/text()'
        ).extract_first()
        a_requirement = page.xpath(
            '//html/body/div[5]/form/b[4]/font/text()'
        ).extract_first()
        a_exclusive = page.xpath(
            '//html/body/div[5]/form/b[6]/font/text()'
        ).extract_first()

        rows = page.xpath('//html/body/div[5]/form/table/tr')
        # Discard the first row before parsing (presumably the column
        # headings -- confirm against the page markup).
        rows.pop(0)
        details = self.parse_course_table(rows)

        # Requirement text is tokenised and then converted to reverse Polish
        # notation using the operator table below; the exclusive text is
        # only tokenised.
        requirement_text = custom_escape(a_requirement)
        requirement_tokens = string_to_element_array(requirement_text)
        requirement_formula = convert_to_reverse_polish_notation(
            requirement_tokens,
            {'and': 1, 'or': 2}
        )
        exclusive_text = custom_escape(a_exclusive)
        exclusive_formula = string_to_element_array(exclusive_text)

        item = CourseItem()
        item['term'] = response.meta['term']
        header = full_header.replace('Course : ', '')
        item['full_header'] = header
        # The course code is whatever precedes the first space of the header.
        item['code'] = header.split(' ')[0]
        item['unit'] = unit.replace('Offering Academic Unit: ', '')
        item['requirement_text'] = requirement_text
        item['requirement_formula'] = requirement_formula
        item['exclusive_text'] = exclusive_text
        item['exclusive_formula'] = exclusive_formula
        item['details'] = details
        yield item
コード例 #2
0
ファイル: course_spider.py プロジェクト: KitFung/aims_crawler
    def parse_course_table(self, table):
        """Convert the rows of a course offering table into class dicts.

        Rows with more than two ``<td>`` cells are class/lesson rows: a
        non-empty first cell (the CRN, after removing spaces) starts a new
        class, and every such row contributes one lesson entry to the
        current class.  All other rows are restriction rows whose second
        cell is matched against the known restriction prefixes.

        Args:
            table: iterable of row selectors; each row must support
                ``.xpath(...)`` returning something with ``extract_first()``.

        Returns:
            A list of dicts, one per class.  Each dict holds the class
            attributes (``CRN``, ``Section``, ...), a ``lessons`` list of
            dicts (``Date``, ``Day``, ...) and, when present, restriction
            lists such as ``only_majors``.  The list is empty when no row
            produced any data.
        """
        # (output key, converter) for the leading cells: one value per class.
        attributes = [('CRN', None), ('Section', None),
                      ('Credit', str_to_float), ('Campus', None),
                      ('WEB', str_to_bool), ('Level', None),
                      ('Avail', str_to_int), ('Cap', str_to_int),
                      ('Waitlist_Avail', str_to_bool)]
        num_of_attr = len(attributes)
        # (output key, converter) for the cells that follow the class
        # attributes; one class may span several rows of these.
        sub_attributes = [('Date', str_to_daterange),
                          ('Day', None), ('Time', str_to_timerange),
                          ('Bldg', None), ('Room', None),
                          ('Instructor', str_to_list)]
        # (text prefix found in a restriction row, output key).
        restriction_pair = [('only for Major: ', 'only_majors'),
                            ('not for Major: ', 'not_allow_majors'),
                            ('only for College: ', 'only_colleges'),
                            ('not for College: ', 'not_allow_colleges'),
                            ('only for Degree: ', 'only_degrees'),
                            ('not for Degree: ', 'not_allow_degrees'),
                            ('only for Programme: ', 'only_programmes'),
                            ('not for Programme: ', 'not_allow_programmes')]

        def read_cells(row, specs, offset):
            """Parse consecutive cells of ``row`` starting after ``offset``."""
            parsed = {}
            # XPath positions are 1-based, hence ``offset + 1``.
            for position, (key, parser) in enumerate(specs, offset + 1):
                value = row.xpath('./td[{}]/text()'.format(position))\
                           .extract_first()
                # NOTE(review): the original code marked this step with
                # 'Special Case "FULL"'; presumably custom_escape or the
                # converters deal with that cell value -- confirm.
                value = custom_escape(value)
                parsed[key] = parser(value) if parser else value
            return parsed

        details = []
        classes = {}
        for tr in table:
            if len(tr.xpath('./td')) > 2:
                crn = custom_escape(
                    tr.xpath('./td[1]/text()').extract_first()
                ).replace(' ', '')
                if crn:
                    # A new CRN closes the class collected so far.
                    if classes:
                        details.append(classes)
                    classes = {'lessons': []}
                    classes.update(read_cells(tr, attributes, 0))
                # setdefault keeps a lesson row that shows up before any CRN
                # row from failing with KeyError.
                classes.setdefault('lessons', []).append(
                    read_cells(tr, sub_attributes, num_of_attr)
                )
                continue

            text = tr.xpath('./td[2]/text()').extract_first()
            if text is None:
                # No text in the second cell: nothing to match against.
                continue
            for start_word, key in restriction_pair:
                if start_word in text:
                    names = text.replace(start_word, '').split(',')
                    classes[key] = [name.strip() for name in names]

        # Same guard as inside the loop: never emit an empty class dict
        # (previously an empty table produced ``[{}]``).
        if classes:
            details.append(classes)
        return details