Beispiel #1
0
    def parse_item(self, response):
        """Start building a ``HooliItem`` for a Keele programme page.

        Only the opening statements of this callback are present: it creates
        the item, records the page URL and university name, and extracts the
        programme text.  Nothing is stored on the item or yielded here.
        """
        # Debug banner showing which page is being parsed.
        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        university = "Keele"
        # NOTE(review): the XPath expression is an empty string -- presumably a
        # placeholder that still needs the real selector; confirm before use.
        programme = response.xpath('').extract()
        # Concatenate the extracted strings, separated by single spaces.
        programme = ' '.join(programme)
Beispiel #2
0
    def parse_item(self, response):
        """Scrape one University of Exeter postgraduate programme page.

        Each scraped field is read with a fixed XPath expression; the strings
        returned by the selector are concatenated without a separator and,
        for some fields, stripped of line breaks.  Every field the page is
        not queried for is stored as the literal string ``'NULL'``.

        Args:
            response: Response for the programme page.  Its ``url``
                attribute and ``xpath()`` method are used.

        Yields:
            One ``HooliItem`` with all fields assigned.
        """
        def joined(query):
            """Return every string matched by *query*, concatenated."""
            return ''.join(response.xpath(query).extract())

        def without_crlf(text):
            """Remove ``\\r\\n`` sequences only; lone ``\\n`` are kept."""
            return text.replace('\r\n', '')

        def without_breaks(text):
            """Remove ``\\r\\n`` sequences and then any remaining ``\\n``."""
            return text.replace('\r\n', '').replace('\n', '')

        print('==================================', response.url)

        item = HooliItem()

        url = response.url
        print(1, url)

        university = "EXETER postgraduate study and research"
        print(2, university)

        programme = joined('//div[@id="left-col"]//h1//text()')
        print(3, programme)

        degree_type = joined('//div[@id="left-col"]/h1/text()')
        print(4, degree_type)

        duration = joined(
            '//div[@class="panel-padding"]/table/tbody/tr/td/span[1]/text()')
        print(5, duration)

        start_date = without_crlf(joined(
            '//div[@class="panel-padding"]//table//tbody//tr[4]//td//text()'))
        print(6, start_date)

        location = without_crlf(joined('//div[@class="panel-padding"]//text()'))
        print(7, location)

        overview = without_breaks(joined('//div[@id="Overview"]//text()'))
        print(8, overview)

        mode = joined('//td[@class="exeter-course-duration"]/span/text()')
        print(9, mode)

        modules = without_breaks(joined('//div[@id="myTabContent"]//text()'))
        print(10, modules)

        assessment = without_crlf(joined('//div[@id="Learning"]//text()'))
        print(11, assessment)

        career = without_breaks(joined('//div[@id="Careers"]//text()'))
        print(12, career)

        entry_requirements = joined(
            '//div[@id="Entry-requirements"]//p[1]//text()')
        print(13, entry_requirements)

        TOEFL = joined('//div[@id="Entry-requirements"]//p[6]//text()')
        print(14, TOEFL)

        IELTS = joined('//div[@id="Entry-requirements"]//p[5]//text()')
        print(15, IELTS)

        tuition_fee = without_crlf(joined(
            '//div[@class="highlight-panel-fees"]//ul//li[2]//text()'))
        print(16, tuition_fee)

        other = without_breaks(joined('//div[@class="span9"]//text()'))
        print(17, other)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(18, create_time)

        # Values that differ from the 'NULL' placeholder.  GPA previously
        # carried the misspelt placeholder 'UNLL'; it now falls through to
        # the shared 'NULL' default like every other unscraped field.
        scraped = {
            "url": url,
            "university": university,
            "country": 'UK',
            "website": 'https://www.exeter.ac.uk',
            "programme": programme,
            "degree_level": '1',
            "degree_type": degree_type,
            "start_date": start_date,
            "overview": overview,
            "mode": mode,
            "duration": duration,
            "modules": modules,
            "assessment": assessment,
            "career": career,
            "tuition_fee": tuition_fee,
            "location": location,
            "IELTS": IELTS,
            "TOEFL": TOEFL,
            "entry_requirements": entry_requirements,
            "other": other,
            "create_time": create_time,
        }

        # Fields are assigned in this fixed order so the item's key order
        # stays the same as before.
        field_order = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB",
            "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W",
            "TOEFL", "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W",
            "GRE", "GMAT", "LSAT", "MCAT", "working_experience", "interview",
            "portfolio", "application_documents", "how_to_apply",
            "entry_requirements", "chinese_requirements", "school_test",
            "SATI", "SATII", "SAT_code", "ACT", "ACT_code", "other",
            "create_time",
        )
        for field in field_order:
            item[field] = scraped.get(field, 'NULL')

        yield item
Beispiel #3
0
    def parse_item(self, response):
        """Scrape one Goldsmiths, University of London programme page.

        Scraped fields are read with fixed XPath expressions and the matched
        strings are concatenated without a separator.  Every field the page
        is not queried for is stored as the literal string ``'NULL'``.

        Args:
            response: Response for the programme page.  Its ``url``
                attribute and ``xpath()`` method are used.

        Yields:
            One ``HooliItem`` with all fields assigned.
        """
        def joined(query):
            """Return every string matched by *query*, concatenated."""
            return ''.join(response.xpath(query).extract())

        hero = '//div[@class="hero__content"]'
        hero_list = hero + '/ul[@class="split-list split-list--hero"]/li'
        rich = '//div[@class="rich-content rich-content-section full-wrap"]'

        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        print(1, url)

        university = 'Goldsmiths UNIVERSITY OF LONDON'
        print(2, university)

        # Department: at most 50 characters starting at the first occurrence
        # of the word "Department" in the hero list text.
        department_str = joined(hero_list + '//text()')
        marker = department_str.find("Department")
        if marker == -1:
            department = 'NULL'
        else:
            department = department_str[marker:][:50]
        print(3, department)

        # The joined heading is what gets stored; previously the un-joined
        # list of strings ended up in item["programme"].
        programme = joined(hero + '/h1/text()')
        print(4, programme)

        degree_type = joined(hero + '/h1/text()')
        print(5, degree_type)

        # overview, modules, assessment and career all read the same block,
        # so the selector is evaluated once and reused.
        rich_text = joined(rich + '//text()')

        overview = rich_text.replace('\r\n', '')
        print(6, overview)

        duration = joined(hero_list + '/text()').replace('\r\n', '')
        print(7, duration)

        modules = rich_text.replace('\r\n', '').replace('\n', '')
        print(8, modules)

        assessment = rich_text.replace('\r\n', '')
        print(9, assessment)

        career = rich_text.replace('\r\n', '')
        print(10, career)

        # IELTS: text from the first "IELTS" up to the next "If you need".
        # The end marker is searched only after the start, and a missing
        # marker means "to the end of the text" (a raw -1 from find() would
        # otherwise silently cut off the final character).
        ielts_text = joined(rich + '/p//text()').replace('\r\n', '')
        start = ielts_text.find("IELTS")
        if start == -1:
            IELTS = 'NULL'
        else:
            stop = ielts_text.find("If you need", start)
            IELTS = ielts_text[start:] if stop == -1 else ielts_text[start:stop]
        print(11, IELTS)

        entry_requirements = joined(rich + '/ul/li/text()').replace('\n', '')
        print(12, entry_requirements)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        # Values that differ from the 'NULL' placeholder.
        scraped = {
            "url": url,
            "university": university,
            "country": 'UK',
            "website": 'https://www.gold.ac.uk/pg',
            "department": department,
            "programme": programme,
            "degree_level": '1',
            "degree_type": degree_type,
            "overview": overview,
            "duration": duration,
            "modules": modules,
            "assessment": assessment,
            "career": career,
            "IELTS": IELTS,
            "entry_requirements": entry_requirements,
            "create_time": create_time,
        }

        # Fixed assignment order for the item's keys.
        field_order = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB",
            "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W",
            "TOEFL", "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W",
            "GRE", "GMAT", "LSAT", "MCAT", "working_experience", "interview",
            "portfolio", "application_documents", "how_to_apply",
            "entry_requirements", "chinese_requirements", "school_test",
            "SATI", "SATII", "SAT_code", "ACT", "ACT_code", "other",
            "create_time",
        )
        for field in field_order:
            item[field] = scraped.get(field, 'NULL')

        yield item
Beispiel #4
0
    def parse_item(self, response):
        """Scrape one University of Worcester programme page.

        Scraped fields are read with fixed XPath expressions and the matched
        strings are concatenated without a separator; the heading text is
        additionally passed through ``self.getDegree_type`` to obtain the
        degree type.  Every field the page is not queried for is stored as
        the literal string ``'NULL'``.

        Args:
            response: Response for the programme page.  Its ``url``
                attribute and ``xpath()`` method are used.

        Yields:
            One ``HooliItem`` with all fields assigned.
        """
        def joined(query):
            """Return every string matched by *query*, concatenated."""
            return ''.join(response.xpath(query).extract())

        heading_query = '//section[@class="pageHead"]/h1/text()'
        # career, how_to_apply and entry_requirements all read this selector.
        accordion_query = '//*[@id="#content"]/div/div/dl/dd/div//text()'

        print('==================================', response.url)
        item = HooliItem()

        page_url = response.url
        print(1, page_url)

        uni_name = 'University of Worcester'
        print(2, uni_name)

        # An empty heading is recorded as the 'NULL' placeholder.
        programme_name = joined(heading_query) or 'NULL'
        print(3, programme_name)

        degree = self.getDegree_type(joined(heading_query))
        print(4, degree)

        summary = joined('//div[@class="body-copy"]/ul/li/text()')
        print(5, summary)

        module_text = joined('//dd[@class="accordion__content rte"]//text()')
        print(6, module_text)

        career_text = joined(accordion_query)
        print(8, career_text)

        apply_text = joined(accordion_query)
        print(11, apply_text)

        requirements = joined(accordion_query)
        print(12, requirements)

        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, stamp)

        # Values that differ from the 'NULL' placeholder.
        scraped = {
            "url": page_url,
            "university": uni_name,
            "country": 'UK',
            "city": 'worcester',
            "website": 'https://www.worcester.ac.uk',
            "programme": programme_name,
            "degree_level": '1',
            "degree_type": degree,
            "overview": summary,
            "modules": module_text,
            "career": career_text,
            "location": 'worcester',
            "how_to_apply": apply_text,
            "entry_requirements": requirements,
            "create_time": stamp,
        }

        # Fixed assignment order for the item's keys.
        field_order = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB",
            "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W",
            "TOEFL", "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W",
            "GRE", "GMAT", "LSAT", "MCAT", "working_experience", "interview",
            "portfolio", "application_documents", "how_to_apply",
            "entry_requirements", "chinese_requirements", "school_test",
            "SATI", "SATII", "SAT_code", "ACT", "ACT_code", "other",
            "create_time",
        )
        for field in field_order:
            item[field] = scraped.get(field, 'NULL')

        yield item
Beispiel #5
0
    def parse_item(self, response):
        """Scrape one programme page into a ``HooliItem``.

        Only the programme heading, the module container and one assessment
        paragraph are read from the page; every other field is stored as a
        constant, mostly the literal string ``'NULL'``.

        Args:
            response: Response for the programme page.  Its ``url``
                attribute and ``xpath()`` method are used.

        Yields:
            One ``HooliItem`` with all fields assigned.
        """
        print('==================================', response.url)
        item = HooliItem()
        page_url = response.url
        print(page_url)

        # Heading fragments are separated by single spaces.
        heading_parts = response.xpath(
            '//div[@class="hero__content"]//h1//text()').extract()
        programme_name = ' '.join(heading_parts)
        print(programme_name, 1)

        # No text() step here: the selector yields the matched elements
        # themselves, which are concatenated as-is.
        module_blocks = response.xpath(
            '//div[@class="toggle_container"]').extract()
        module_text = str(''.join(module_blocks))
        print(2, module_text)

        assessment_parts = response.xpath(
            '//div[@class="content-wrapper"]/p[3]/text()').extract()
        assessment_text = str(' '.join(assessment_parts))
        print(assessment_text, 3)

        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, stamp)

        # Values that differ from the 'NULL' placeholder.
        # NOTE(review): the university literal looks misspelt; left untouched
        # because it is stored data -- confirm before correcting.
        scraped = {
            "url": page_url,
            "university": 'Loughbrough University',
            "country": 'UK',
            "programme": programme_name,
            "degree_level": '0',
            "modules": module_text,
            "assessment": assessment_text,
            "create_time": stamp,
        }

        # Fixed assignment order for the item's keys.
        field_order = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB",
            "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W",
            "TOEFL", "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W",
            "GRE", "GMAT", "LSAT", "MCAT", "working_experience", "interview",
            "portfolio", "application_documents", "how_to_apply",
            "entry_requirements", "chinese_requirements", "school_test",
            "SATI", "SATII", "SAT_code", "ACT", "ACT_code", "other",
            "create_time",
        )
        for field in field_order:
            item[field] = scraped.get(field, 'NULL')

        yield item
Beispiel #6
0
    def parse_item(self, response):
        """Scrape one Queen Mary University of London course page.

        Scraped fields are read with fixed XPath expressions and the matched
        strings are concatenated without a separator.  The degree type is
        derived from the ``h2`` header text by substring matching, and the
        fee text is passed through ``self.getTuition_fee``.  Every field the
        page is not queried for is stored as the literal string ``'NULL'``.

        Args:
            response: Response for the course page.  Its ``url`` attribute
                and ``xpath()`` method are used.

        Yields:
            One ``HooliItem`` with all fields assigned.
        """
        def joined(query):
            """Return every string matched by *query*, concatenated."""
            return ''.join(response.xpath(query).extract())

        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        print(1, url)

        # NOTE(review): the university literal looks misspelt; left untouched
        # because it is stored data -- confirm before correcting.
        university = 'Queen Mary Universty of London'
        print(2, university)

        programme = joined('//section[@id="count"]/article/header//text()')
        print(3, programme)

        # degree_type, mode and duration are all derived from this header,
        # so the selector is evaluated once.
        header_h2 = joined('//section[@id="count"]/article/header/h2/text()')

        # First label found in the header wins.  Labels that contain another
        # label must come first: "MSci" before "MSc" and "MBA" before "BA",
        # otherwise the longer label can never be selected.
        known_degrees = (
            'BSc', 'MSci', 'MSc', 'MBA', 'BA', 'MNSW', 'PGCert', 'MA',
            'MComp', 'PhD', 'FdA', 'PGCE', 'IFP', 'LLB', 'MHealth Res',
            'MRes', 'MMed', 'MCh', 'LLM', 'Y2QF', 'Y2QG',
        )
        degree_type = next(
            (label for label in known_degrees if label in header_h2),
            'Ordinary degree',
        )
        print(4, degree_type)

        overview = joined('//div[@id="first"]//text()').replace('\n', '')
        print(6, overview)

        mode = header_h2.replace('\r\n', '').replace('   ', '')
        print(7, mode)

        duration = header_h2.replace('\r\n', '').replace('   ', '')
        print(8, duration)

        modules = joined('//div[@id="second"]//text()')
        modules = modules.replace('\r\n', '').replace('\n', '')
        print(9, modules)

        assessment = joined('//div[@id="fourth"]//text()')
        print(10, assessment)

        fee_text = joined('//div[@id="fifth"]//text()')
        fee_text = fee_text.replace('\r\n', '').replace('   ', '')
        fee = self.getTuition_fee(fee_text)
        try:
            tuition_fee = fee if fee > 0 else 'NULL'
        except TypeError:
            # The helper's result could not be compared with an integer;
            # keep the existing error marker for that case.
            tuition_fee = '报错!'
        print(12, tuition_fee)

        entry_requirements = joined(
            '//div[@id="third"]//text()').replace('\n', '')
        print(15, entry_requirements)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        # Values that differ from the 'NULL' placeholder.
        scraped = {
            "url": url,
            "university": university,
            "country": 'UK',
            "website": 'http://search.qmul.ac.uk',
            "programme": programme,
            "degree_level": '1',
            "degree_type": degree_type,
            "overview": overview,
            "mode": mode,
            "duration": duration,
            "modules": modules,
            "assessment": assessment,
            "tuition_fee": tuition_fee,
            "entry_requirements": entry_requirements,
            "create_time": create_time,
        }

        # Fixed assignment order for the item's keys.
        field_order = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB",
            "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W",
            "TOEFL", "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W",
            "GRE", "GMAT", "LSAT", "MCAT", "working_experience", "interview",
            "portfolio", "application_documents", "how_to_apply",
            "entry_requirements", "chinese_requirements", "school_test",
            "SATI", "SATII", "SAT_code", "ACT", "ACT_code", "other",
            "create_time",
        )
        for field in field_order:
            item[field] = scraped.get(field, 'NULL')

        yield item
Beispiel #7
0
    def parse_item(self, response):
        """Scrape one Goldsmiths course page into a ``HooliItem``.

        Every item field is populated; fields this callback does not
        extract carry the literal string ``'NULL'``.

        Args:
            response: page response. Only ``response.url`` and
                ``response.xpath(...).extract()`` are used.

        Yields:
            The populated ``HooliItem``.
        """

        def text_of(xpath, sep=''):
            # Join every text node matched by ``xpath`` with ``sep``.
            return sep.join(response.xpath(xpath).extract())

        def section(text, start_marker, end_marker):
            # Slice ``text`` from ``start_marker`` up to the next
            # ``end_marker``. str.find returns -1 for a missing marker, and
            # -1 used as a slice bound would silently drop the last
            # character, so a missing end marker means "to the end".
            start = text.find(start_marker)
            if start == -1:
                return 'NULL'
            end = text.find(end_marker, start)
            return text[start:] if end == -1 else text[start:end]

        item = HooliItem()
        url = response.url
        print(url)
        print('----------------------------------------------------')

        university = 'Goldsmiths University of London'
        print(1, university)

        # Keep at most 50 characters starting at the word "Department".
        header_text = text_of(
            '//*[@id="maincontent"]/article/header/div/div/div/div/div//text()',
            ' ')
        dept_at = header_text.find("Department")
        if dept_at == -1:
            department = 'NULL'
        else:
            department = header_text[dept_at:dept_at + 50]
        print(2, department)

        programme = text_of('//div[@class="hero__content"]//h1//text()')
        print(3, programme)

        # NOTE(review): ucas_code and duration are both filled from the same
        # hero list text (as in the original code); splitting them apart
        # needs knowledge of the page layout.
        hero_list_text = text_of(
            '//ul[@class="split-list split-list--hero"]/li/text()')
        ucas_code = hero_list_text
        print(4, ucas_code)

        degree_type = text_of('//div[@class="hero__content"]/h1/text()')
        print(5, degree_type)

        overview = text_of(
            '//div[@class="rich-content rich-content-section full-wrap"]/p/text()'
        )
        print(6, overview)

        duration = hero_list_text
        print(7, duration)

        modules = text_of(
            '//div[@class="grid-push grid-push--two"]/div[@class="rich-content rich-content-section full-wrap"]/p/text()',
            ' ')
        print(8, modules)

        assessment = text_of(
            '//div[@class="rich-content rich-content-section full-wrap"]/p[11]/text()',
            ' ')
        print(9, assessment)

        career = section(
            text_of('//section[@class="section section--accordion"]//text()',
                    ' '),
            "Skills & careers", "Fees & funding")
        print(10, career)

        # NOTE(review): Alevel and IB likewise share one source text.
        hero_requirements = text_of(
            '//div[@class="hero__content"]/ul/li/text()', ' ')
        Alevel = hero_requirements
        print(11, Alevel)
        IB = hero_requirements
        print(12, IB)

        IELTS = section(
            text_of(
                '//div[@class="rich-content rich-content-section full-wrap"]//p//text()',
                ' '),
            "IELTS", "If you need")
        print(13, IELTS)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        # Values actually obtained for this page; every other field falls
        # back to 'NULL' below (including LSAT, previously misspelt 'UNLL').
        scraped = {
            'url': url,
            'university': university,
            'country': 'UK',
            'website': 'https://www.gold.ac.uk',
            'department': department,
            'programme': programme,
            'ucas_code': ucas_code,
            'degree_level': '0',
            'degree_type': degree_type,
            'overview': overview,
            'duration': duration,
            'modules': modules,
            'assessment': assessment,
            'career': career,
            'Alevel': Alevel,
            'IB': IB,
            'IELTS': IELTS,
            'create_time': create_time,
        }

        field_order = (
            'url', 'university', 'country', 'city', 'website', 'department',
            'programme', 'ucas_code', 'degree_level', 'degree_type',
            'start_date', 'degree_description', 'overview', 'mode',
            'duration', 'modules', 'teaching', 'assessment', 'career',
            'application_date', 'deadline', 'application_fee', 'tuition_fee',
            'location', 'ATAS', 'GPA', 'average_score',
            'accredited_university', 'Alevel', 'IB', 'IELTS', 'IELTS_L',
            'IELTS_S', 'IELTS_R', 'IELTS_W', 'TOEFL', 'TOEFL_L', 'TOEFL_S',
            'TOEFL_R', 'TOEFL_W', 'GRE', 'GMAT', 'LSAT', 'MCAT',
            'working_experience', 'interview', 'portfolio',
            'application_documents', 'how_to_apply', 'entry_requirements',
            'chinese_requirements', 'school_test', 'SATI', 'SATII',
            'SAT_code', 'ACT', 'ACT_code', 'other', 'create_time',
        )
        for field in field_order:
            item[field] = scraped.get(field, 'NULL')

        yield item
Beispiel #8
0
    def parse_item(self, response):
        """Scrape one Loughborough University course page into a ``HooliItem``.

        Every item field is populated; fields this callback does not
        extract carry the literal string ``'NULL'``.

        Args:
            response: page response. Only ``response.url`` and
                ``response.xpath(...).extract()`` are used.

        Yields:
            The populated ``HooliItem``.
        """

        def text_of(xpath):
            # Concatenate every text node matched by ``xpath``.
            return ''.join(response.xpath(xpath).extract())

        def section(text, start_marker, end_marker):
            # Slice ``text`` from ``start_marker`` up to the next
            # ``end_marker``. str.find returns -1 for a missing marker, and
            # -1 used as a slice bound would silently drop the last
            # character, so a missing end marker means "to the end" and a
            # missing start marker means "no such section".
            start = text.find(start_marker)
            if start == -1:
                return 'NULL'
            end = text.find(end_marker, start)
            return text[start:] if end == -1 else text[start:end]

        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        print(1, url)

        university = 'Loughborough University'
        print(2, university)

        programme = text_of(
            '//div[@class="programme-column programme-details"]/h1[@id="top"]//text()'
        )
        print(3, programme)

        degree_type = text_of(
            '//div[@class="programme-column programme-details"]/h1[@id="top"]/span/text()'
        )
        print(4, degree_type)

        start_date = text_of(
            '//div[@class="list__content icon icon--calendar"]/dd/text()')
        print(5, start_date)

        overview = text_of(
            '//div[@class="content-type content-type--main"]/div[@class="content-type__container"]/div[@class="editor"]//text()'
        )
        print(6, overview)

        # NOTE(review): mode and duration are both filled from the same
        # "clock" block (as in the original code).
        clock_text = text_of(
            '//div[@class="list__content icon icon--clock"]//text()')
        clock_text = clock_text.replace('\r\n', '').replace('   ', '')
        mode = clock_text
        print(7, mode)
        duration = clock_text
        print(8, duration)

        # One pass over the page body serves modules, career and entry
        # requirements; each is cut out between two headings.
        body_text = text_of('//div[@class="container"]//text()').replace(
            '\r\n', '')

        modules = section(body_text, "Modules", "How you'll be assessed")
        print(9, modules)

        assessment = text_of(
            '//div[@class="content-type__container"]/div[@class="editor"]/p/span/text()'
        )
        print(10, assessment)

        career = section(body_text,
                         "Your personal and professional development",
                         "Fees and funding")
        print(11, career)

        tuition_fee = text_of(
            '//div[@class="list__content icon icon--money"]//text()')
        tuition_fee = tuition_fee.replace('\r\n', '').replace('   ', '')
        print(12, tuition_fee)

        location = text_of(
            '//dd[@class="list__item list__item--definition"]/a/text()')
        print(13, location)

        # The section is only trusted when the page has an "Entry
        # requirements" heading; the slice itself starts at a different
        # marker, which may be absent (then the field is 'NULL').
        if "Entry requirements" in body_text:
            entry_requirements = section(body_text,
                                         "Who should study this programme?",
                                         "English Language requirements")
        else:
            entry_requirements = 'NULL'
        print(15, entry_requirements, '==================================')

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        # Values actually obtained for this page; every other field falls
        # back to 'NULL' below.
        scraped = {
            'url': url,
            'university': university,
            'country': 'UK',
            'website': 'http://www.lboro.ac.uk',
            'programme': programme,
            'degree_level': '1',
            'degree_type': degree_type,
            'start_date': start_date,
            'overview': overview,
            'mode': mode,
            'duration': duration,
            'modules': modules,
            'assessment': assessment,
            'career': career,
            'tuition_fee': tuition_fee,
            'location': location,
            'entry_requirements': entry_requirements,
            'create_time': create_time,
        }

        field_order = (
            'url', 'university', 'country', 'city', 'website', 'department',
            'programme', 'ucas_code', 'degree_level', 'degree_type',
            'start_date', 'degree_description', 'overview', 'mode',
            'duration', 'modules', 'teaching', 'assessment', 'career',
            'application_date', 'deadline', 'application_fee', 'tuition_fee',
            'location', 'ATAS', 'GPA', 'average_score',
            'accredited_university', 'Alevel', 'IB', 'IELTS', 'IELTS_L',
            'IELTS_S', 'IELTS_R', 'IELTS_W', 'TOEFL', 'TOEFL_L', 'TOEFL_S',
            'TOEFL_R', 'TOEFL_W', 'GRE', 'GMAT', 'LSAT', 'MCAT',
            'working_experience', 'interview', 'portfolio',
            'application_documents', 'how_to_apply', 'entry_requirements',
            'chinese_requirements', 'school_test', 'SATI', 'SATII',
            'SAT_code', 'ACT', 'ACT_code', 'other', 'create_time',
        )
        for field in field_order:
            item[field] = scraped.get(field, 'NULL')

        yield item
Beispiel #9
0
    def parse_item(self, response):
        """Scrape one University of Bath course page into a ``HooliItem``.

        Every item field is populated; fields this callback does not
        extract carry the literal string ``'NULL'``.

        Args:
            response: page response. Only ``response.url`` and
                ``response.xpath(...).extract()`` are used.

        Yields:
            The populated ``HooliItem``. (Previously the item was built but
            never emitted.)
        """

        def text_of(xpath):
            # Concatenate every text node matched by ``xpath``.
            return ''.join(response.xpath(xpath).extract())

        def section(text, start_marker, end_marker):
            # Slice ``text`` from ``start_marker`` up to the next
            # ``end_marker``. str.find returns -1 for a missing marker, and
            # -1 used as a slice bound would silently drop the last
            # character, so a missing end marker means "to the end" and a
            # missing start marker means "no such section".
            start = text.find(start_marker)
            if start == -1:
                return 'NULL'
            end = text.find(end_marker, start)
            return text[start:] if end == -1 else text[start:end]

        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        print(1, url)

        university = 'BATH UNIVERSITY'
        print(2, university)

        programme = text_of(
            '//h1[@class="page-heading text-center reverse"]/text()')
        print(3, programme)

        ucas_code = text_of('//div[@class="sidebar"]/dl/dd/text()').replace(
            '\n', '')
        print(4, ucas_code)

        degree_type = text_of(
            '//h1[@class="page-heading text-center reverse"]/span/text()'
        ).replace('\n', '')
        print(5, degree_type)

        # NOTE(review): start_date, mode and duration are all filled from
        # the same sub-heading text (as in the original code); start_date
        # keeps its newlines, the other two do not.
        subheading_text = text_of(
            '//h2[@class="page-subheading text-center reverse"]/text()')
        start_date = subheading_text
        print(6, start_date)

        # NOTE(review): overview and career share one source text as well.
        lead_text = text_of(
            '//div[@class="column medium-15 end"]/p/text()').replace('\n', '')
        overview = lead_text
        print(7, overview)

        mode = subheading_text.replace('\n', '')
        print(8, mode)

        duration = mode
        print(9, duration)

        modules = text_of('//div[@class="row medium-up-2"]//text()').replace(
            '\n', '')
        print(10, modules)

        assessment = text_of(
            '//div[@class="column medium-12"]//text()').replace('\n', '')
        print(11, assessment)

        career = lead_text
        print(12, career)

        deadline = text_of(
            '//div[@class="aside style-dark-red key-information"]/ul/li/p/text()'
        )
        print(13, deadline)

        fee_text = text_of(
            '//div[@class="aside style-dark-red key-information"]/ul/li[3]/p[2]/text()'
        ).replace('\r\n', '')
        # presumably getTuition_fee returns a number -- TODO confirm; a
        # result that cannot be compared with 0 is treated as missing.
        fee = self.getTuition_fee(fee_text)
        try:
            tuition_fee = fee if fee > 0 else 'NULL'
        except TypeError:
            tuition_fee = 'NULL'
        print(14, tuition_fee)

        location = text_of('//div[@class="sidebar"]/dl/dd/a/text()')
        print(15, location)

        # Both language scores live in the same sidebar block and are only
        # trusted when the block has the requirements heading.
        sidebar_text = text_of(
            '//div[@class="sidebar seperator reverse"]//text()')
        has_language_block = "English language requirements" in sidebar_text

        if has_language_block:
            IELTS = section(
                sidebar_text, "IELTS:",
                "The Pearson Test of English Academic (PTE Academic):")
        else:
            IELTS = 'NULL'
        print(17, IELTS)

        # Keep at most 100 characters starting at the TOEFL label; without
        # the label there is nothing to report.
        toefl_at = sidebar_text.find("TOEFL IBT:") if has_language_block else -1
        if toefl_at == -1:
            TOEFL = 'NULL'
        else:
            TOEFL = sidebar_text[toefl_at:toefl_at + 100]
        print(18, TOEFL)

        entry_requirements = text_of(
            '//div[@class="column medium-15 end"]/p//text()')
        print(19, entry_requirements)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        # Values actually obtained for this page; every other field falls
        # back to 'NULL' below.
        scraped = {
            'url': url,
            'university': university,
            'country': 'UK',
            'website': 'http://www.bath.ac.uk',
            'programme': programme,
            'ucas_code': ucas_code,
            'degree_level': "1",
            'degree_type': degree_type,
            'start_date': start_date,
            'overview': overview,
            'mode': mode,
            'duration': duration,
            'modules': modules,
            'assessment': assessment,
            'career': career,
            'deadline': deadline,
            'tuition_fee': tuition_fee,
            'location': location,
            'IELTS': IELTS,
            'TOEFL': TOEFL,
            'entry_requirements': entry_requirements,
            'create_time': create_time,
        }

        field_order = (
            'url', 'university', 'country', 'city', 'website', 'department',
            'programme', 'ucas_code', 'degree_level', 'degree_type',
            'start_date', 'degree_description', 'overview', 'mode',
            'duration', 'modules', 'teaching', 'assessment', 'career',
            'application_date', 'deadline', 'application_fee', 'tuition_fee',
            'location', 'ATAS', 'GPA', 'average_score',
            'accredited_university', 'Alevel', 'IB', 'IELTS', 'IELTS_L',
            'IELTS_S', 'IELTS_R', 'IELTS_W', 'TOEFL', 'TOEFL_L', 'TOEFL_S',
            'TOEFL_R', 'TOEFL_W', 'GRE', 'GMAT', 'LSAT', 'MCAT',
            'working_experience', 'interview', 'portfolio',
            'application_documents', 'how_to_apply', 'entry_requirements',
            'chinese_requirements', 'school_test', 'SATI', 'SATII',
            'SAT_code', 'ACT', 'ACT_code', 'other', 'create_time',
        )
        for field in field_order:
            item[field] = scraped.get(field, 'NULL')

        # Hand the item to the framework, as the sibling callbacks do.
        yield item
Beispiel #10
0
    def parse_item(self, response):
        """Scrape one Swansea University course page into a ``HooliItem``.

        Every item field is populated; fields this callback does not
        extract carry the literal string ``'NULL'``.

        Args:
            response: page response. Only ``response.url`` and
                ``response.xpath(...).extract()`` are used.

        Yields:
            The populated ``HooliItem``.
        """

        def text_of(xpath):
            # Concatenate every text node matched by ``xpath``.
            return ''.join(response.xpath(xpath).extract())

        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        print(1, url)

        university = 'Swansea University Prifysgol Abertawe'
        print(2, university)

        heading = text_of('//div[@id="contentHeader"]/h1/text()')
        programme = heading
        print(3, programme)

        # The degree type is derived from the same heading by a helper
        # defined elsewhere on the spider.
        degree_type = self.getDegree_type(heading)
        print(4, degree_type)

        overview = text_of(
            '//div[@id="description-contents"]/p/text()').replace('\n', '')
        print(6, overview)

        # The fees block feeds both the study mode and the tuition fee.
        fees_text = text_of('//div[@id="tuition-fees-contents"]//text()')
        fees_text = fees_text.replace('\r\n', '').replace('\n', '')

        # NOTE(review): any page that does not mention "Full-time" is
        # recorded as "Part-time", even when no mode is stated at all.
        if "Full-time" in fees_text.replace('      ', ''):
            mode = "Full-time"
        else:
            mode = "Part-time"
        print(7, mode)

        duration = text_of('//div[@id="content-items"]/div/div/ol/li//text()')
        duration = duration.replace('\r\n', '').replace('\n', '')
        duration = duration.replace('    ', '')
        print(8, duration)

        modules = text_of(
            '//div[@id="modules"]/div[@id="modules-contents"]//text()'
        ).replace('\r\n', '')
        print(9, modules)

        assessment = text_of('//*[@id="teaching-assessment"]//text()')
        print(10, assessment)

        # presumably getTuition_fee returns a number -- TODO confirm; a
        # result that cannot be compared with 0 is flagged with the same
        # error marker the original code used.
        fee = self.getTuition_fee(fees_text.replace('    ', ''))
        try:
            tuition_fee = fee if fee > 0 else 'NULL'
        except TypeError:
            tuition_fee = '报错!'
        print(11, tuition_fee)

        # The entry-requirements block feeds both IELTS and the full text.
        entry_text = text_of(
            '//div[@id="entry-requirements-contents"]//text()')

        # Keep at most 100 characters starting at the first "IELTS".
        ielts_source = entry_text.replace('\r\n', '')
        ielts_at = ielts_source.find("IELTS")
        if ielts_at == -1:
            IELTS = 'NULL'
        else:
            IELTS = ielts_source[ielts_at:ielts_at + 100]
        print(12, IELTS)

        how_to_apply = text_of('//div[@id="how-to-apply"]//text()').replace(
            '\n', '')
        print(13, how_to_apply)

        entry_requirements = entry_text.replace('\n', '')
        print(14, entry_requirements)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        # Values actually obtained for this page; every other field
        # (including website) falls back to 'NULL' below.
        scraped = {
            'url': url,
            'university': university,
            'country': 'UK',
            'programme': programme,
            'degree_level': '1',
            'degree_type': degree_type,
            'overview': overview,
            'mode': mode,
            'duration': duration,
            'modules': modules,
            'assessment': assessment,
            'tuition_fee': tuition_fee,
            'IELTS': IELTS,
            'how_to_apply': how_to_apply,
            'entry_requirements': entry_requirements,
            'create_time': create_time,
        }

        field_order = (
            'url', 'university', 'country', 'city', 'website', 'department',
            'programme', 'ucas_code', 'degree_level', 'degree_type',
            'start_date', 'degree_description', 'overview', 'mode',
            'duration', 'modules', 'teaching', 'assessment', 'career',
            'application_date', 'deadline', 'application_fee', 'tuition_fee',
            'location', 'ATAS', 'GPA', 'average_score',
            'accredited_university', 'Alevel', 'IB', 'IELTS', 'IELTS_L',
            'IELTS_S', 'IELTS_R', 'IELTS_W', 'TOEFL', 'TOEFL_L', 'TOEFL_S',
            'TOEFL_R', 'TOEFL_W', 'GRE', 'GMAT', 'LSAT', 'MCAT',
            'working_experience', 'interview', 'portfolio',
            'application_documents', 'how_to_apply', 'entry_requirements',
            'chinese_requirements', 'school_test', 'SATI', 'SATII',
            'SAT_code', 'ACT', 'ACT_code', 'other', 'create_time',
        )
        for field in field_order:
            item[field] = scraped.get(field, 'NULL')

        yield item
 def parse_item(self, response):
     """Print the URL of *response* and create an empty ``HooliItem``.

     NOTE(review): this definition looks truncated -- the item is created
     but never populated or yielded within the visible body.
     """
     # Banner line so each processed page is easy to spot in the output.
     print('==================================', response.url)
     item = HooliItem()
Beispiel #12
0
    def parse_item(self, response):
        """Scrape one University of Bath course page into a ``HooliItem``.

        Text is pulled from the page with XPath.  Start date, mode, duration,
        deadline and tuition fee are cut out of the sidebar text as the span
        between two heading markers.  Every field that is not scraped is
        filled with the placeholder string ``'NULL'``.

        Fields are assigned exactly once, in a fixed order, so the key order
        of the item does not depend on which markers were found on the page.

        Args:
            response: The downloaded page; it must provide ``url`` and
                ``xpath()``.

        Yields:
            The populated ``HooliItem``.
        """
        print('==================================', response.url)
        item = HooliItem()

        def joined_text(query):
            """Concatenate all strings extracted by the XPath *query*."""
            return ''.join(response.xpath(query).extract())

        def section(text, start_marker, end_marker):
            """Return *text* from *start_marker* up to *end_marker*.

            Gives ``'NULL'`` when *start_marker* is absent.  The end marker
            is only searched for after the start marker, and when it is
            missing the section runs to the end of *text*: using a ``find()``
            result of -1 as a slice bound would drop the last character.
            """
            begin = text.find(start_marker)
            if begin == -1:
                return 'NULL'
            stop = text.find(end_marker, begin + len(start_marker))
            return text[begin:] if stop == -1 else text[begin:stop]

        url = response.url
        print(1, url)

        university = 'BATH UNIVERSITY'
        print(2, university)

        programme = joined_text(
            '//h1[@class="page-title"]/text()|//*[@id="content"]/div/div/h2//text()'
        )
        print(3, programme)

        ucas_code = joined_text(
            '//div[@class="sidebar"]/dl/dd/text()').replace('\n', '')
        print(4, ucas_code)

        degree_type = joined_text(
            '//h1[@class="page-title"]/text()').replace('\n', '')
        print(5, degree_type)

        # The sidebar is queried once; each field below strips a different
        # width of space runs from it before cutting out its section.
        sidebar = joined_text(
            '//div[@class="columns small-12 medium-4 content-aside"]//text()'
        ).replace('\n', '')

        start_date = section(
            sidebar.replace(' ' * 4, ''), 'Course start date', 'Fees')
        print(6, start_date)

        overview = joined_text(
            '//div[@class="columns small-12 medium-7 content-area"]/p/text()'
        ).replace('\n', '')
        print(7, overview)

        mode = section(
            sidebar.replace(' ' * 4, ''), 'Mode of attendance', 'Events')
        print(8, mode)

        duration = section(
            sidebar.replace(' ' * 3, ''), 'Length of course',
            'Mode of attendance')
        print(9, duration)

        modules = joined_text(
            '//div[@class="columns small-12 medium-7"]//text()'
        ).replace('\n', '')
        print(10, modules)

        # NOTE(review): assessment and career are read from the same
        # container, so both hold the full content-area text -- confirm
        # whether more specific selectors were intended.
        content_area = joined_text(
            '//div[@class="columns small-12 medium-7 content-area"]//text()'
        ).replace('\n', '')
        assessment = content_area
        print(11, assessment)

        career = content_area
        print(12, career)

        deadline = section(
            sidebar.replace(' ' * 5, ''), 'Application deadline',
            'Please note applications may close earlier')
        print(13, deadline)

        tuition_fee = section(
            sidebar.replace(' ' * 3, ''), 'Fees', 'Entry requirements')
        print(14, tuition_fee)

        location = joined_text(
            '//div[@class="sidebar"]/dl/dd/a/text()').replace('\n', '')
        print(15, location)

        average_score = joined_text(
            '//div[@class="column medium-15 end"]/p/text()')
        print(16, average_score)

        language_text = joined_text(
            '//*[@id="content"]/section/div/div/section/div/div/p/text()')
        ielts_at = language_text.find("IELTS")
        # Require the IELTS marker itself: slicing from a -1 index would
        # return only the last character of the text.
        if "English language" in language_text and ielts_at != -1:
            IELTS = language_text[ielts_at:ielts_at + 150]
        else:
            IELTS = 'NULL'
        print(17, IELTS)

        TOEFL = joined_text(
            '//div[@class="sidebar seperator reverse"]/ul/li/text()')
        print(18, TOEFL)

        entry_requirements = joined_text(
            '//section[@class="entry-requirement"]//text()').replace('\n', '')
        print(19, entry_requirements)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        field_names = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB", "IELTS", "IELTS_L",
            "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL", "TOEFL_L", "TOEFL_S",
            "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT", "MCAT",
            "working_experience", "interview", "portfolio",
            "application_documents", "how_to_apply", "entry_requirements",
            "chinese_requirements", "school_test", "SATI", "SATII",
            "SAT_code", "ACT", "ACT_code", "other", "create_time",
        )
        # Start from the 'NULL' placeholder for every field, then overlay the
        # values that are fixed or scraped for this site.
        values = dict.fromkeys(field_names, 'NULL')
        values.update(
            url=url,
            university=university,
            country='UK',
            website='http://www.bath.ac.uk',
            programme=programme,
            ucas_code=ucas_code,
            degree_level='1',
            degree_type=degree_type,
            start_date=start_date,
            overview=overview,
            mode=mode,
            duration=duration,
            modules=modules,
            assessment=assessment,
            career=career,
            deadline=deadline,
            tuition_fee=tuition_fee,
            location=location,
            average_score=average_score,
            IELTS=IELTS,
            TOEFL=TOEFL,
            entry_requirements=entry_requirements,
            # NOTE(review): empty string rather than the 'NULL' placeholder
            # used for the other unscraped fields -- confirm it is intended.
            chinese_requirements='',
            create_time=create_time,
        )
        for name in field_names:
            item[name] = values[name]

        yield item
Beispiel #13
0
    def parse_item(self, response):
        """Scrape one Edge Hill University course page into a ``HooliItem``.

        Most values come from the element with id ``overview``: UCAS code,
        start date and duration are cut out of its text as the span between
        two label markers, and the degree type is derived from the page
        title.  Every field that is not scraped is filled with the
        placeholder string ``'NULL'``.

        Fields are assigned exactly once, in a fixed order, so the key order
        of the item does not depend on which markers were found on the page.

        Args:
            response: The downloaded page; it must provide ``url`` and
                ``xpath()``.

        Yields:
            The populated ``HooliItem``.
        """
        print('==================================', response.url)
        item = HooliItem()

        def joined_text(query):
            """Concatenate all strings extracted by the XPath *query*."""
            return ''.join(response.xpath(query).extract())

        def section(text, start_marker, end_marker):
            """Return *text* from *start_marker* up to *end_marker*.

            Gives ``'NULL'`` when *start_marker* is absent.  The end marker
            is only searched for after the start marker, and when it is
            missing the section runs to the end of *text*: using a ``find()``
            result of -1 as a slice bound would drop the last character.
            """
            begin = text.find(start_marker)
            if begin == -1:
                return 'NULL'
            stop = text.find(end_marker, begin + len(start_marker))
            return text[begin:] if stop == -1 else text[begin:stop]

        url = response.url
        print(1, url)

        university = 'Edge Hill University'
        print(2, university)

        # The overview block feeds several fields, so query it once.
        overview_cells = response.xpath(
            '//*[@id="overview"]//tr//text()').extract()
        overview_rows_text = ''.join(overview_cells)
        overview_text = joined_text('//*[@id="overview"]//text()')

        # NOTE(review): the department is taken from fixed positions 6-7 of
        # the table's text nodes -- presumably the "Department" row; confirm
        # against a live page.
        department = '  '.join(overview_cells[6:8])
        print(3, department)

        title = joined_text('//*[@id="primary"]/header/h1//text()')
        programme = title
        print(4, programme)

        # The guard keeps the leading space of the original check, while the
        # section itself starts at the bare "Code:" label.
        if " Code:" in overview_text:
            ucas_code = section(overview_text, "Code:", "Course Length:")
        else:
            ucas_code = 'NULL'
        print(5, ucas_code)

        # (substring to look for in the title, label to store), in priority
        # order.  "MBA" must be tested before "BA": every title containing
        # "MBA" also contains "BA".
        # NOTE(review): 'Bsc' casing is kept as stored by the original code;
        # confirm whether 'BSc' was intended.
        degree_labels = (
            ("BSc", 'Bsc'),
            ("MBA", 'MBA'),
            ("BA", 'BA'),
            ("MNSW", 'MNSW'),
            ("PGCert", 'PGCert'),
            ("MA", 'MA'),
            ("MComp", 'MComp'),
            ("PhD", 'PhD'),
            ("FdA", 'FdA'),
            ("PGCE", 'PGCE'),
            ("IFP", 'IFP'),
            ("LLB", 'LLB'),
            ("MHealth Res", 'MHealth Res'),
            ("MRes", 'MRes'),
            ("MMed", 'MMed'),
            ("MSci", 'MSci'),
            ("MCh", 'MCh'),
        )
        degree_type = next(
            (label for token, label in degree_labels if token in title),
            'Ordinary degree')
        print(5, degree_type)

        start_date = section(overview_text, "Start Dates:", "Department:")
        print(6, start_date)

        overview = overview_text
        print(7, overview)

        # Anything not explicitly marked "Full-Time" is treated as part-time.
        mode = "Full-Time" if "Full-Time" in overview_rows_text else "Part-Time"
        print(8, mode)

        duration = section(overview_text, "Length:", "Start Dates:")
        print(9, duration)

        modules = joined_text('//*[@id="modules"]//text()')
        print(10, modules)

        assessment = joined_text('//*[@id="course-in-depth"]//text()')
        print(11, assessment)

        career = joined_text('//*[@id="careers-and-employability"]//text()')
        print(12, career)

        # Hard-coded rather than scraped from the page.
        tuition_fee = '£12,750'
        print(13, tuition_fee)

        # NOTE(review): location receives the whole overview table text, not
        # a dedicated location value -- confirm this is intended.
        location = overview_rows_text
        print(14, location)

        how_to_apply = joined_text('//*[@id="next-steps"]//text()')
        print(15, how_to_apply)

        entry_requirements = joined_text('//*[@id="entry-criteria"]//text()')
        print(16, entry_requirements)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        field_names = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB", "IELTS", "IELTS_L",
            "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL", "TOEFL_L", "TOEFL_S",
            "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT", "MCAT",
            "working_experience", "interview", "portfolio",
            "application_documents", "how_to_apply", "entry_requirements",
            "chinese_requirements", "school_test", "SATI", "SATII",
            "SAT_code", "ACT", "ACT_code", "other", "create_time",
        )
        # Start from the 'NULL' placeholder for every field, then overlay the
        # values that are fixed or scraped for this site.
        values = dict.fromkeys(field_names, 'NULL')
        values.update(
            url=url,
            university=university,
            country='UK',
            website='https://www.edgehill.ac.uk',
            department=department,
            programme=programme,
            ucas_code=ucas_code,
            degree_level='1',
            degree_type=degree_type,
            start_date=start_date,
            overview=overview,
            mode=mode,
            duration=duration,
            modules=modules,
            assessment=assessment,
            career=career,
            tuition_fee=tuition_fee,
            location=location,
            how_to_apply=how_to_apply,
            entry_requirements=entry_requirements,
            create_time=create_time,
        )
        for name in field_names:
            item[name] = values[name]

        yield item
    def parse_item(self, response):
        """Scrape one University of Worcester course page into a ``HooliItem``.

        Modules, assessment, how-to-apply and entry requirements are cut out
        of larger text containers as the span between two heading markers;
        when the start marker is missing the whole container text is kept.
        The degree type and tuition fee are post-processed by
        ``self.getDegree_type`` and ``self.getTuition_fee``.  Every field
        that is not scraped is filled with the placeholder string ``'NULL'``.

        Fields are assigned exactly once, in a fixed order, so the key order
        of the item does not depend on which markers were found on the page.

        Args:
            response: The downloaded page; it must provide ``url`` and
                ``xpath()``.

        Yields:
            The populated ``HooliItem``.
        """
        print('==================================', response.url)
        item = HooliItem()

        def joined_text(query):
            """Concatenate all strings extracted by the XPath *query*."""
            return ''.join(response.xpath(query).extract())

        def section(text, start_marker, end_marker=None):
            """Return *text* from *start_marker* up to *end_marker*.

            Returns *text* unchanged when *start_marker* is absent.  With no
            *end_marker* the section runs to the end of *text*.  The end
            marker is only searched for after the start marker, and when it
            is missing the section also runs to the end: using a ``find()``
            result of -1 as a slice bound would drop the last character.
            """
            begin = text.find(start_marker)
            if begin == -1:
                return text
            if end_marker is None:
                return text[begin:]
            stop = text.find(end_marker, begin + len(start_marker))
            return text[begin:] if stop == -1 else text[begin:stop]

        url = response.url
        print(1, url)

        university = 'University of Worcester'
        print(2, university)

        programme = joined_text('//section[@class="pageHead"]//text()')
        print(3, programme)

        degree_type = self.getDegree_type(
            joined_text('//section[@class="pageHead"]/h1/text()'))
        print(4, degree_type)

        overview = joined_text('//div[@class="body-copy"]/p//text()')
        print(5, overview)

        # Modules and assessment are both sections of the same column text.
        column_text = joined_text('//div[@class="columns__column"]//text()')
        modules = section(column_text, "Modules", "Assessment")
        print(6, modules)

        assessment = section(column_text, "Assessment")
        print(7, assessment)

        # Career, how-to-apply and entry requirements share the accordion.
        accordion_text = joined_text(
            '//dd[@class="accordion__content rte"]//text()')
        career = accordion_text
        print(8, career)

        fee_amount = self.getTuition_fee(
            joined_text('//div[@class="columns"]//text()'))
        try:
            tuition_fee = fee_amount if fee_amount > 0 else "NULL"
        except TypeError:
            # presumably the helper returned a value that cannot be compared
            # with 0 (e.g. None or a string) -- verify against getTuition_fee.
            tuition_fee = "报错!"
        print(9, tuition_fee)

        ielts_text = joined_text('//div[@class="right equal-height"]//text()')
        ielts_at = ielts_text.find("IELTS")
        # Slice from the IELTS marker; the located position was previously
        # ignored and the first 100 characters of the container were stored.
        if ielts_at != -1:
            IELTS = ielts_text[ielts_at:ielts_at + 100]
        else:
            IELTS = "NULL"
        print(10, IELTS)

        how_to_apply = section(
            accordion_text, "How to Apply", "Entry requirements")
        print(11, how_to_apply)

        entry_requirements = section(
            accordion_text, "Entry requirements", "Study options")
        print(12, entry_requirements)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        field_names = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB", "IELTS", "IELTS_L",
            "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL", "TOEFL_L", "TOEFL_S",
            "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT", "MCAT",
            "working_experience", "interview", "portfolio",
            "application_documents", "how_to_apply", "entry_requirements",
            "chinese_requirements", "school_test", "SATI", "SATII",
            "SAT_code", "ACT", "ACT_code", "other", "create_time",
        )
        # Start from the 'NULL' placeholder for every field, then overlay the
        # values that are fixed or scraped for this site.
        values = dict.fromkeys(field_names, 'NULL')
        values.update(
            url=url,
            university=university,
            country='UK',
            programme=programme,
            degree_level='1',
            degree_type=degree_type,
            overview=overview,
            modules=modules,
            assessment=assessment,
            career=career,
            tuition_fee=tuition_fee,
            location='worcester',
            IELTS=IELTS,
            how_to_apply=how_to_apply,
            entry_requirements=entry_requirements,
            create_time=create_time,
        )
        for name in field_names:
            item[name] = values[name]

        yield item
Beispiel #15
0
    def parse_item(self, response):
        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        print(1, url)

        university = 'NORWICH UNIVERSITY OF THE ARTS'
        print(2, university)

        department = 'NULL'
        country = 'UK'
        city = 'NULL'
        website = 'NULL'

        programme = response.xpath('').extract()
        programme = ''.join(programme)
        print(3, programme)

        ucas_code = 'NULL'
        degree_level = '1'

        degree_type = response.xpath('').extract()
        degree_type = ''.join(degree_type)
        print(4, degree_type)

        start_date = 'NULL'
        # start_date = ''.join(start_date)
        # print(5,start_date)

        overview = response.xpath('').extract()
        overview = ''.join(overview).replace('\n', '')
        print(6, overview)

        mode = response.xpath('').extract()
        mode = ''.join(mode).replace('\r\n', '')
        mode = mode.replace('\n', '')
        mode = mode.replace('      ', '')
        print(7, mode)

        duration = response.xpath('').extract()
        duration = ''.join(duration).replace('\r\n', '')
        duration = duration.replace('\n', '')
        duration = duration.replace('    ', '')
        print(8, duration)

        modules = response.xpath('').extract()
        modules = ''.join(modules).replace('\r\n', '')
        # modules = modules.replace('\n','')
        print(9, modules)

        teaching = 'NULL'

        assessment = response.xpath('').extract()
        assessment = ''.join(assessment)
        print(10, assessment)

        career = 'NULL'
        # career = ''.join(career).replace('\n', '')
        # print(11, career)

        application_date = 'NULL'

        deadline = 'NULL'

        application_fee = 'NULL'

        tuition_fee = response.xpath('').extract()
        tuition_fee = ''.join(tuition_fee).replace('\r\n', '')
        tuition_fee = tuition_fee.replace('\n', '')
        tuition_fee = tuition_fee.replace('    ', '')
        print(11, tuition_fee)

        location = 'NULL'
        # location = ''.join(location)
        # print(13,location)

        GPA = 'NULL'
        ATAS = 'NULL'

        average_score = 'NULL'

        accredited_university = 'NULL'

        Alevel = 'NULL'

        IB = 'NULL'

        IELTS = 'UNLL'
        # IELTS = ''.join(IELTS).replace('\r\n','')
        # # IELTS = re.findall('(IELTS:|IELTS)? (.*){0,5} \d?.\d? .{0,70}',IELTS)
        # print(12, IELTS)

        IELTS_L = 'NULL'
        IELTS_S = 'NULL'
        IELTS_R = 'NULL'
        IELTS_W = 'NULL'

        TOEFL = 'NULL'
        TOEFL_L = 'NULL'
        TOEFL_S = 'NULL'
        TOEFL_R = 'NULL'
        TOEFL_W = 'NULL'

        GRE = 'NULL'

        GMAT = 'NULL'
        LSAT = 'NULL'
        MCAT = 'NULL'

        chinese_requirements = 'NULL'

        working_experience = 'NULL'

        interview = 'NULL'

        portfolio = 'NULL'

        application_documents = 'NULL'

        how_to_apply = response.xpath('').extract()
        how_to_apply = ''.join(how_to_apply).replace('\n', '')
        print(13, how_to_apply)

        entry_requirements = response.xpath('').extract()
        entry_requirements = ''.join(entry_requirements).replace('\n', '')
        # EntryRequirements = EntryRequirements.replace(' ','')
        print(14, entry_requirements)

        school_test = 'NULL'

        degree_description = 'NULL'

        SATI = 'NULL'

        SATII = 'NULL'

        SAT_code = 'NULL'

        ACT = 'NULL'

        ACT_code = 'NULL'

        other = 'NULL'

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        item["url"] = url
        item["university"] = university
        item["country"] = country
        item["city"] = city
        item["website"] = website
        item["department"] = department
        item["programme"] = programme
        item["ucas_code"] = ucas_code
        item["degree_level"] = degree_level
        item["degree_type"] = degree_type
        item["start_date"] = start_date
        item["degree_description"] = degree_description
        item["overview"] = overview
        item["mode"] = mode
        item["duration"] = duration
        item["modules"] = modules
        item["teaching"] = teaching
        item["assessment"] = assessment
        item["career"] = career
        item["application_date"] = application_date
        item["deadline"] = deadline
        item["application_fee"] = application_fee
        item["tuition_fee"] = tuition_fee
        item["location"] = location
        item["ATAS"] = ATAS
        item["GPA"] = GPA
        item["average_score"] = average_score
        item["accredited_university"] = accredited_university
        item["Alevel"] = Alevel
        item["IB"] = IB
        item["IELTS"] = IELTS
        item["IELTS_L"] = IELTS_L
        item["IELTS_S"] = IELTS_S
        item["IELTS_R"] = IELTS_R
        item["IELTS_W"] = IELTS_W
        item["TOEFL"] = TOEFL
        item["TOEFL_L"] = TOEFL_L
        item["TOEFL_S"] = TOEFL_S
        item["TOEFL_R"] = TOEFL_R
        item["TOEFL_W"] = TOEFL_W
        item["GRE"] = GRE
        item["GMAT"] = GMAT
        item["LSAT"] = LSAT
        item["MCAT"] = MCAT
        item["working_experience"] = working_experience
        item["interview"] = interview
        item["portfolio"] = portfolio
        item["application_documents"] = application_documents
        item["how_to_apply"] = how_to_apply
        item["entry_requirements"] = entry_requirements
        item["chinese_requirements"] = chinese_requirements
        item["school_test"] = school_test
        item["SATI"] = SATI
        item["SATII"] = SATII
        item["SAT_code"] = SAT_code
        item["ACT"] = ACT
        item["ACT_code"] = ACT_code
        item["other"] = other
        item["create_time"] = create_time

        yield item
Beispiel #16
0
    def parse_item(self, response):
        """Build a ``HooliItem`` from one University of Wales, Trinity St David page.

        Every item field starts out as the placeholder string ``'NULL'``; the
        fields this page exposes are then overwritten with text pulled out of
        ``response`` via XPath.  Intermediate values are echoed with ``print``
        for debugging, numbered as in the original spider.

        Args:
            response: the downloaded page; only ``response.url`` and
                ``response.xpath(...)`` are used.

        Yields:
            One populated ``HooliItem``.
        """
        def joined_text(query):
            # All text nodes matched by ``query``, concatenated without separator.
            return ''.join(response.xpath(query).extract())

        print('==================================', response.url)
        item = HooliItem()

        # Keys are written to the item in exactly this order.
        field_order = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB",
            "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W",
            "TOEFL", "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W",
            "GRE", "GMAT", "LSAT", "MCAT",
            "working_experience", "interview", "portfolio",
            "application_documents", "how_to_apply", "entry_requirements",
            "chinese_requirements", "school_test",
            "SATI", "SATII", "SAT_code", "ACT", "ACT_code",
            "other", "create_time",
        )
        # Anything not scraped below keeps the 'NULL' placeholder.
        record = dict.fromkeys(field_order, 'NULL')

        record["url"] = response.url
        print(1, record["url"])

        record["university"] = 'University of Wales, Trinity St David'
        print(2, record["university"])

        record["country"] = 'UK'
        record["website"] = 'http://www.uwtsd.ac.uk'
        record["degree_level"] = '0'

        record["department"] = joined_text(
            '/html/body/div/div/div/div/div/div/p/a/strong//text()')
        print(3, record["department"])

        # The course title feeds both the programme and the degree type.
        course_title = joined_text('//h1[@class="t4-course-title"]/text()')
        record["programme"] = course_title
        print(4, record["programme"])

        # The sidebar paragraphs are queried once and reused for the UCAS
        # code, study mode, duration and location fields.
        sidebar_parts = response.xpath(
            '//div[@class="span3"]/p//text()').extract()
        sidebar_text = ''.join(sidebar_parts)

        record["ucas_code"] = ''.join(sidebar_parts[:7])
        print(5, record["ucas_code"])

        record["degree_type"] = course_title
        print(6, record["degree_type"])

        record["overview"] = joined_text('//div[@class="span6"]//text()')
        print(8, record["overview"])

        # Report "Part Time" only when the sidebar says so and does not also
        # mention "Full Time"; otherwise keep "Full Time" as the default.
        # (Previously both branches produced "Full Time" and the "Part Time"
        # fallback sat in an exception handler that could never run.)
        if "Part Time" in sidebar_text and "Full Time" not in sidebar_text:
            record["mode"] = "Part Time"
        else:
            record["mode"] = "Full Time"
        print(9, record["mode"])

        record["duration"] = ''.join(sidebar_parts[1:30])
        print(10, record["duration"])

        record["modules"] = joined_text('//*[@id="collapseModules"]//text()')
        print(11, record["modules"])

        record["assessment"] = joined_text(
            '//*[@id="collapseAssessment"]//text()')
        print(12, record["assessment"])

        record["career"] = joined_text(
            '//*[@id="collapseCareerOpportunities"]//text()')
        print(13, record["career"])

        record["location"] = sidebar_text
        print(14, record["location"])

        record["entry_requirements"] = joined_text(
            '//*[@id="collapseEntryCriteria"]//text()')
        print(15, record["entry_requirements"])

        record["create_time"] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        print(15, record["create_time"])

        for key in field_order:
            item[key] = record[key]

        yield item
Beispiel #17
0
    def parse_item(self, response):
        """Build a ``HooliItem`` from one University of Chester course page.

        Every item field starts out as the placeholder string ``'NULL'``; the
        fields this page exposes are then overwritten with text pulled out of
        ``response`` via XPath.  Intermediate values are echoed with ``print``
        for debugging, numbered as in the original spider.

        Args:
            response: the downloaded page; only ``response.url`` and
                ``response.xpath(...)`` are used.

        Yields:
            One populated ``HooliItem``.
        """
        def joined_text(query):
            # All text nodes matched by ``query``, concatenated without separator.
            return ''.join(response.xpath(query).extract())

        print('==================================', response.url)
        item = HooliItem()

        # Keys are written to the item in exactly this order.
        field_order = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB",
            "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W",
            "TOEFL", "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W",
            "GRE", "GMAT", "LSAT", "MCAT",
            "working_experience", "interview", "portfolio",
            "application_documents", "how_to_apply", "entry_requirements",
            "chinese_requirements", "school_test",
            "SATI", "SATII", "SAT_code", "ACT", "ACT_code",
            "other", "create_time",
        )
        # Anything not scraped below keeps the 'NULL' placeholder.
        record = dict.fromkeys(field_order, 'NULL')

        record["url"] = response.url
        print(1, record["url"])

        record["university"] = 'University of Chester'
        print(2, record["university"])

        record["country"] = 'UK'
        record["website"] = 'https://www1.chester.ac.uk'
        record["degree_level"] = '1'

        record["programme"] = joined_text('//h1[@id="main-content"]//text()')
        print(3, record["programme"])

        record["degree_type"] = joined_text(
            '//h1[@id="main-content"]/div/text()')
        print(4, record["degree_type"])

        record["start_date"] = joined_text(
            '//div[@class="startdate m-facts__item"]//text()')
        print(5, record["start_date"])

        record["overview"] = joined_text(
            '//div[@class="courseajax_overview"]//text()').replace('\r\n', '')
        print(6, record["overview"])

        record["mode"] = joined_text(
            '//div[@class="mode m-facts__item"]//text()').replace('\r\n', '')
        print(7, record["mode"])

        record["duration"] = joined_text(
            '//div[@class="courseajax_duration m-facts__item"]//text()'
        ).replace('\r\n', '')
        print(8, record["duration"])

        modules = joined_text('//*[@id="learning"]//text()')
        for junk in ('\r\n', '\n', '\t'):
            modules = modules.replace(junk, '')
        record["modules"] = modules
        print(9, record["modules"])

        record["assessment"] = joined_text(
            '//div[@class="large-7 columns float-right m-sections__learning-section"]//text()'
        ).replace('\r\n', '')
        print(10, record["assessment"])

        fee_text = joined_text(
            '//div[@class="field-fees-international"]/p/text()'
        ).replace('\n', '')
        fee = self.getTuition_fee(fee_text)
        # Keep the parsed fee only when it compares greater than zero.  A
        # result that cannot be compared with an int (TypeError) is treated
        # as "no fee found" rather than swallowing every possible exception.
        try:
            fee_is_positive = fee > 0
        except TypeError:
            fee_is_positive = False
        record["tuition_fee"] = fee if fee_is_positive else 'NULL'
        print(11, record["tuition_fee"])

        record["location"] = joined_text(
            '//div[@id="edit-compulsory"]//text()')
        print(12, record["location"])

        ielts_text = joined_text(
            '//div[@class="courseajax_entryrequirementsint"]//text()'
        ).replace('\r\n', '')
        if "English Language Requirements" in ielts_text:
            # str.find() returns -1 for a missing marker; a -1 must never be
            # used as a slice bound, so both markers are checked explicitly.
            ielts_start = ielts_text.find("IELTS Academic:")
            if ielts_start != -1:
                ielts_end = ielts_text.find("Select your country", ielts_start)
                if ielts_end == -1:
                    snippet = ielts_text[ielts_start:]
                else:
                    snippet = ielts_text[ielts_start:ielts_end]
                # Cap the stored excerpt at 120 characters.
                record["IELTS"] = snippet[:120]
        print(13, record["IELTS"])

        record["entry_requirements"] = joined_text(
            '//div[@class="courseajax_entryrequirements"]//text()'
        ).replace('\r\n', '')
        print(14, record["entry_requirements"])

        record["create_time"] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        print(15, record["create_time"])

        for key in field_order:
            item[key] = record[key]

        yield item
Beispiel #18
0
    def parse_item(self, response):
        """Build a ``HooliItem`` from one Loughborough University course page.

        Every item field starts out as the placeholder string ``'NULL'``; the
        fields this page exposes are then overwritten with text pulled out of
        ``response`` via XPath.  Intermediate values are echoed with ``print``
        for debugging, numbered as in the original spider.

        Args:
            response: the downloaded page; only ``response.url`` and
                ``response.xpath(...)`` are used.

        Yields:
            One populated ``HooliItem``.
        """
        def joined_text(query):
            # All text nodes matched by ``query``, concatenated without separator.
            return ''.join(response.xpath(query).extract())

        def labelled_span(text, start_label, end_label=None):
            # Slice ``text`` from ``start_label`` up to the next ``end_label``.
            # Returns 'NULL' when ``start_label`` is absent and runs to the end
            # of ``text`` when no ``end_label`` follows: str.find() yields -1
            # in that case, which must not be used as a slice bound.
            start = text.find(start_label)
            if start == -1:
                return 'NULL'
            if end_label is None:
                return text[start:]
            end = text.find(end_label, start)
            return text[start:] if end == -1 else text[start:end]

        print('==================================', response.url)
        item = HooliItem()

        # Keys are written to the item in exactly this order.
        field_order = (
            "url", "university", "country", "city", "website", "department",
            "programme", "ucas_code", "degree_level", "degree_type",
            "start_date", "degree_description", "overview", "mode",
            "duration", "modules", "teaching", "assessment", "career",
            "application_date", "deadline", "application_fee", "tuition_fee",
            "location", "ATAS", "GPA", "average_score",
            "accredited_university", "Alevel", "IB",
            "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W",
            "TOEFL", "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W",
            "GRE", "GMAT", "LSAT", "MCAT",
            "working_experience", "interview", "portfolio",
            "application_documents", "how_to_apply", "entry_requirements",
            "chinese_requirements", "school_test",
            "SATI", "SATII", "SAT_code", "ACT", "ACT_code",
            "other", "create_time",
        )
        # Anything not scraped below keeps the 'NULL' placeholder.
        record = dict.fromkeys(field_order, 'NULL')

        record["url"] = response.url
        print(1, record["url"])

        record["university"] = 'Loughborough University'
        print(2, record["university"])

        record["department"] = joined_text(
            '//dd[@class="list__item--definition"]/text()')
        print(3, record["department"])

        record["country"] = 'UK'
        record["website"] = 'http://www.lboro.ac.uk'
        record["degree_level"] = '1'

        record["programme"] = joined_text('//h1[@id="top"]//text()')
        print(4, record["programme"])

        record["degree_type"] = joined_text('//h1[@id="top"]/span/text()')
        print(5, record["degree_type"])

        # The calendar entry carries both the start date and the deadline,
        # so it is queried once and sliced twice.
        calendar_text = joined_text(
            '//div[@class="list__content icon icon--calendar"]//text()')
        record["start_date"] = labelled_span(
            calendar_text, "Start date:", "Application deadline:")
        print(6, record["start_date"])

        record["overview"] = joined_text(
            '//div[@class="content-type content-type--main"]//text()')
        print(7, record["overview"])

        # Mode and duration are both taken from the same clock entry.
        clock_text = joined_text(
            '//div[@class="list__content icon icon--clock"]//text()')
        record["mode"] = clock_text
        print(8, record["mode"])

        record["duration"] = clock_text
        print(9, record["duration"])

        record["deadline"] = labelled_span(
            calendar_text, "Application deadline:")
        print(10, record["deadline"])

        record["tuition_fee"] = joined_text(
            '//div[@class="list__content icon icon--money"]//text()')
        print(11, record["tuition_fee"])

        details_text = joined_text(
            '//dl[@class="list list--definition list--pg-programme"]//text()')
        details_text = details_text.replace('\r\n', '').replace('    ', '')
        record["location"] = labelled_span(
            details_text, "Location:", "Application deadline:")
        print(12, record["location"])

        # The same editor block is stored as both the application
        # instructions and the entry requirements.
        editor_text = joined_text('//div[@class="editor"]//text()')
        record["how_to_apply"] = editor_text
        print(13, record["how_to_apply"])

        record["entry_requirements"] = editor_text
        print(14, record["entry_requirements"])

        record["create_time"] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        print(15, record["create_time"])

        for key in field_order:
            item[key] = record[key]

        yield item
    def parse_item(self, response):
        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        print(1, url)

        university = 'Royal Holloway University of LondonEgham'
        print(2, university)

        department = response.xpath(
            '//div[@id="details"]/table/tbody/tr[4]/td/a/div/text()').extract(
            )
        department = ''.join(department)
        print(3, department)

        country = 'UK'
        city = 'NULL'
        website = 'https://www.royalholloway.ac.uk'

        programme = response.xpath(
            '//div[@class="sys_large-col"]/h1/text()').extract()
        programme = ''.join(programme)
        print(4, programme)

        ucas_code = 'NULL'
        degree_level = '1'

        degree_type = response.xpath(
            '//div[@class="sys_large-col"]/h1/text()').extract()
        degree_type = ''.join(degree_type)
        try:
            if "BSc" in degree_type:
                degree_type = 'Bsc'
            elif "MSc" in degree_type:
                degree_type = "MSc"
            elif "BA" in degree_type:
                degree_type = 'BA'
            elif "MNSW" in degree_type:
                degree_type = 'MNSW'
            elif "PGCert" in degree_type:
                degree_type = 'PGCert'
            elif "MBA" in degree_type:
                degree_type = 'MBA'
            elif "MA" in degree_type:
                degree_type = 'MA'
            elif "MComp" in degree_type:
                degree_type = 'MComp'
            elif "PhD" in degree_type:
                degree_type = 'PhD'
            elif "FdA" in degree_type:
                degree_type = 'FdA'
            elif "PGCE" in degree_type:
                degree_type = 'PGCE'
            elif "IFP" in degree_type:
                degree_type = 'IFP'
            elif "LLB" in degree_type:
                degree_type = 'LLB'
            elif "MHealth Res" in degree_type:
                degree_type = 'MHealth Res'
            elif "MRes" in degree_type:
                degree_type = 'MRes'
            elif "MMed" in degree_type:
                degree_type = 'MMed'
            elif "MSci" in degree_type:
                degree_type = 'MSci'
            elif "MCh" in degree_type:
                degree_type = 'MCh'
            elif "LLM" in degree_type:
                degree_type = "LLM"
            elif "Y2QF" in degree_type:
                degree_type = "Y2QF"
            elif "Y2QG" in degree_type:
                degree_type = "Y2QG"
            else:
                degree_type = 'Ordinary degree'
        except:
            degree_type = "NULL"
        print(5, degree_type)

        start_date = response.xpath(
            '//div[@id="details"]/table/tbody/tr[2]/td/div/text()').extract()
        start_date = ''.join(start_date)
        print(6, start_date)

        overview = response.xpath('//div[@id="tab-1"]//text()').extract()
        overview = ''.join(overview).replace('\n', '')
        print(7, overview)

        mode = response.xpath(
            '//div[@id="details"]/table/tbody/tr[3]/td/div/text()').extract()
        mode = ''.join(mode)
        print(8, mode)

        duration = response.xpath(
            '//div[@id="details"]/table/tbody/tr[3]/td/div/text()').extract()
        duration = ''.join(duration)
        print(9, duration)

        modules = 'NULL'

        teaching = 'NULL'

        assessment = response.xpath('//div[@id="tab-3"]/p/text()').extract()
        assessment = ''.join(assessment)
        print(10, assessment)

        career = response.xpath('//div[@id="tab-5"]//text()').extract()
        career = ''.join(career).replace('\n', '')
        print(11, career)

        application_date = 'NULL'

        deadline = 'NULL'

        application_fee = 'NULL'

        tuition_fee_s = response.xpath(
            '//div[@id="tab-6"]/p//text()').extract()
        tuition_fee_s = ''.join(tuition_fee_s)
        tuition_fee_s = self.getTuition_fee(tuition_fee_s)
        try:
            if tuition_fee_s > 0:
                tuition_fee = tuition_fee_s
            else:
                tuition_fee = 'NULL'
        except:
            tuition_fee = '报错!'

        print(12, tuition_fee)

        location = response.xpath(
            '//div[@id="details"]/table/tbody/tr[5]/td/a/div/text()').extract(
            )
        location = ''.join(location)
        print(13, location)

        ATAS = 'NULL'

        GPA = 'NULL'

        average_score = 'NULL'

        accredited_university = 'NULL'

        Alevel = 'NULL'

        IB = 'NULL'

        IELTS_s = response.xpath('//div[@id="tab-4"]/p/text()').extract()
        IELTS_s = ''.join(IELTS_s)
        try:
            if "IELTS" in IELTS_s:
                start = IELTS_s.find("IELTS")
                IELTS = IELTS_s[start:]
                IELTS = IELTS[:100]
                item["IELTS"] = IELTS
            else:
                IELTS = "NULL"
        except:
            IELTS = "报错!"
        print(14, IELTS)

        IELTS_L = 'NULL'
        IELTS_S = 'NULL'
        IELTS_R = 'NULL'
        IELTS_W = 'NULL'

        TOEFL = 'NULL'
        TOEFL_L = 'NULL'
        TOEFL_S = 'NULL'
        TOEFL_R = 'NULL'
        TOEFL_W = 'NULL'

        GRE = 'NULL'

        GMAT = 'NULL'
        LSAT = 'NULL'
        MCAT = 'NULL'

        working_experience = 'NULL'

        interview = 'NULL'

        portfolio = 'NULL'

        application_documents = 'NULL'

        how_to_apply = 'NULL'

        entry_requirements = response.xpath(
            '//div[@id="tab-4"]//text()').extract()
        entry_requirements = ''.join(entry_requirements).replace('\n', '')
        # entry_requirements = entry_requirements.replace('({})','')
        print(15, entry_requirements)

        chinese_requirements = 'NULL'

        school_test = 'NULL'

        degree_description = 'NULL'

        SATI = 'NULL'

        SATII = 'NULL'

        SAT_code = 'NULL'

        ACT = 'NULL'

        ACT_code = 'NULL'

        other = 'NULL'

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        item["url"] = url
        item["university"] = university
        item["country"] = country
        item["city"] = city
        item["website"] = website
        item["department"] = department
        item["programme"] = programme
        item["ucas_code"] = ucas_code
        item["degree_level"] = degree_level
        item["degree_type"] = degree_type
        item["start_date"] = start_date
        item["degree_description"] = degree_description
        item["overview"] = overview
        item["mode"] = mode
        item["duration"] = duration
        item["modules"] = modules
        item["teaching"] = teaching
        item["assessment"] = assessment
        item["career"] = career
        item["application_date"] = application_date
        item["deadline"] = deadline
        item["application_fee"] = application_fee
        item["tuition_fee"] = tuition_fee
        item["location"] = location
        item["ATAS"] = ATAS
        item["GPA"] = GPA
        item["average_score"] = average_score
        item["accredited_university"] = accredited_university
        item["Alevel"] = Alevel
        item["IB"] = IB
        item["IELTS"] = IELTS
        item["IELTS_L"] = IELTS_L
        item["IELTS_S"] = IELTS_S
        item["IELTS_R"] = IELTS_R
        item["IELTS_W"] = IELTS_W
        item["TOEFL"] = TOEFL
        item["TOEFL_L"] = TOEFL_L
        item["TOEFL_S"] = TOEFL_S
        item["TOEFL_R"] = TOEFL_R
        item["TOEFL_W"] = TOEFL_W
        item["GRE"] = GRE
        item["GMAT"] = GMAT
        item["LSAT"] = LSAT
        item["MCAT"] = MCAT
        item["working_experience"] = working_experience
        item["interview"] = interview
        item["portfolio"] = portfolio
        item["application_documents"] = application_documents
        item["how_to_apply"] = how_to_apply
        item["entry_requirements"] = entry_requirements
        item["chinese_requirements"] = chinese_requirements
        item["school_test"] = school_test
        item["SATI"] = SATI
        item["SATII"] = SATII
        item["SAT_code"] = SAT_code
        item["ACT"] = ACT
        item["ACT_code"] = ACT_code
        item["other"] = other
        item["create_time"] = create_time

        yield item
Beispiel #20
0
    def parse_item(self, response):
        """Fill a HooliItem from an Exeter undergraduate course page.

        Each scraped field is the concatenation of the text nodes matched by
        one XPath query on ``response``; some fields additionally have
        line-break / tab sequences removed.  Fields that are not scraped are
        stored as the literal string ``'NULL'``.  Every value is echoed to
        stdout with a numeric label as it is produced.

        Args:
            response: Page response; only ``response.url`` and
                ``response.xpath(...).extract()`` are used here.

        Yields:
            The single populated ``HooliItem``.
        """
        def joined_text(query):
            # Concatenate every text node the XPath query matches.
            return ''.join(response.xpath(query).extract())

        def without(text, *tokens):
            # Strip each token in turn; the order matters ('\r\n' before '\n').
            for token in tokens:
                text = text.replace(token, '')
            return text

        print('==================================', response.url)

        item = HooliItem()
        missing = 'NULL'

        url = response.url
        print(1, url)

        university = "EXETER UNDERGRADUATE STUDY"
        print(2, university)

        programme = joined_text('//div[@id="left-col"]/h1//text()')
        print(3, programme)

        ucas_code = joined_text('//td[@class="exeter-course-ucascode"]//text()')
        print(4, ucas_code)

        degree_type = joined_text('//div[@id="left-col"]/h1/text()')
        print(5, degree_type)

        overview = joined_text('//div[@id="Overview"]//text()')
        print(6, overview)

        duration = joined_text('//td[@class="exeter-course-duration"]//text()')
        print(7, duration)

        # A-level and IB values are both read from the "typical offer" cell.
        Alevel = joined_text('//td[@class="exeter-course-typicaloffer"]//text()')
        print(8, Alevel)

        IB = joined_text('//td[@class="exeter-course-typicaloffer"]//text()')
        print(9, IB)

        location = joined_text('//td[@class="exeter-course-location"]//text()')
        print(10, location)

        modules = str(without(
            joined_text('//div[@class="container"]//text()'),
            '\n', '\r', '\t'))
        print(11, modules)

        assessment = without(
            joined_text('//div[@id="Learning"]//text()'),
            '\r\n', '\n', '\r')
        print(12, assessment)

        career = without(
            joined_text('//div[@id="Careers"]//text()'),
            '\r\n', '\n')
        print(13, career)

        entry_requirements = without(
            joined_text('//div[@id="Entry-requirements"]//text()'),
            '\r\n')
        print(14, entry_requirements)

        other = without(
            joined_text('//div[@id="course-synopsis"]//text()'),
            '\r\n', '\n', '\t')
        print(15, other)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(16, create_time)

        # Key order below is the order in which the item is populated.
        record = (
            ("url", url),
            ("university", university),
            ("country", 'UK'),
            ("city", missing),
            ("website", 'https://www.exeter.ac.uk'),
            ("department", missing),
            ("programme", programme),
            ("ucas_code", ucas_code),
            ("degree_level", '0'),
            ("degree_type", degree_type),
            ("start_date", missing),
            ("degree_description", missing),
            ("overview", overview),
            ("mode", missing),
            ("duration", duration),
            ("modules", modules),
            ("teaching", missing),
            ("assessment", assessment),
            ("career", career),
            ("application_date", missing),
            ("deadline", missing),
            ("application_fee", missing),
            ("tuition_fee", missing),
            ("location", location),
            ("ATAS", missing),
            ("GPA", missing),
            ("average_score", missing),
            ("accredited_university", missing),
            ("Alevel", Alevel),
            ("IB", IB),
            ("IELTS", missing),
            ("IELTS_L", missing),
            ("IELTS_S", missing),
            ("IELTS_R", missing),
            ("IELTS_W", missing),
            ("TOEFL", missing),
            ("TOEFL_L", missing),
            ("TOEFL_S", missing),
            ("TOEFL_R", missing),
            ("TOEFL_W", missing),
            ("GRE", missing),
            ("GMAT", missing),
            ("LSAT", missing),
            ("MCAT", missing),
            ("working_experience", missing),
            ("interview", missing),
            ("portfolio", missing),
            ("application_documents", missing),
            ("how_to_apply", missing),
            ("entry_requirements", entry_requirements),
            ("chinese_requirements", missing),
            ("school_test", missing),
            ("SATI", missing),
            ("SATII", missing),
            ("SAT_code", missing),
            ("ACT", missing),
            ("ACT_code", missing),
            ("other", other),
            ("create_time", create_time),
        )
        for key, value in record:
            item[key] = value

        yield item
Beispiel #21
0
    def parse_item(self, response):
        """Build a HooliItem from a Falmouth University course page.

        Text is pulled from ``response`` with XPath queries.  Several fields
        (start date, deadline, IELTS, how-to-apply and, when a "Benefits"
        heading is present, the overview) are cut out of a larger block of
        concatenated text between two heading strings.  Fields that are not
        scraped are stored as the literal string ``'NULL'``.  Every scraped
        value is echoed to stdout with a numeric label.

        Args:
            response: Page response; only ``response.url`` and
                ``response.xpath(...).extract()`` are used here.

        Yields:
            The single populated ``HooliItem``.
        """
        def joined_text(query):
            # Concatenate every text node the XPath query matches.
            return ''.join(response.xpath(query).extract())

        def section_between(text, start_marker, end_marker, default='NULL'):
            # Return text from start_marker up to (not including) end_marker,
            # or `default` when start_marker is absent.
            # str.find returns -1 for a missing marker; using that -1 as a
            # slice bound would silently drop the last character, so a missing
            # end marker means "run to the end of the text".  The end marker is
            # only searched for after the start marker, so an earlier
            # occurrence cannot collapse the slice to an empty string.
            begin = text.find(start_marker)
            if begin == -1:
                return default
            end = text.find(end_marker, begin + len(start_marker))
            return text[begin:] if end == -1 else text[begin:end]

        print('==================================', response.url)
        item = HooliItem()
        missing = 'NULL'

        url = response.url
        print(1, url)

        university = 'FALMOUTH UNIVERSITY'
        print(2, university)

        # The page heading is used for both the programme and the degree type.
        programme = joined_text('//div[@class="h1-box"]/h1/text()')
        print(3, programme)

        degree_type = programme
        print(4, degree_type)

        # The accordion text feeds several fields; extract it once.
        accordion_text = joined_text('//div[@class="accordion"]//text()')

        start_date = section_between(
            accordion_text,
            "Start dates and application deadlines",
            "News and Events")
        print(5, start_date)

        # Without a "Benefits" heading the whole content block is the overview.
        content_text = joined_text('//div[@class="content-block-wrapper"]//text()')
        overview = section_between(
            content_text,
            "Benefits",
            "How the course is taught",
            default=content_text)
        print(6, overview)

        # Mode and duration are both taken from the same definition list.
        mode = joined_text('//div[@class="content-block-wrapper"]//dl//text()')
        print(7, mode)

        duration = mode
        print(8, duration)

        modules = joined_text(
            '//div[@class="accordion ui-accordion ui-widget ui-helper-reset"]//text()')
        print(9, modules)

        assessment = accordion_text
        print(10, assessment)

        career = joined_text('//div[@class="field-career-opportunities"]//text()')
        print(11, career)

        # The deadline is the same accordion section as the start date.
        deadline = start_date
        print(11, deadline)

        IELTS = section_between(
            accordion_text,
            "Entry Requirements",
            "Financing your studies")
        print(12, IELTS)

        # Interview and portfolio both come from the selection-process field.
        interview = joined_text('//div[@class="field-selection-process"]//text()')
        print(13, interview)

        portfolio = interview
        print(14, portfolio)

        how_to_apply = section_between(
            accordion_text,
            "How to apply",
            "Start dates and application deadlines")
        print(13, how_to_apply)

        entry_requirements = joined_text(
            '//*[@id="start-of-content"]/div[2]/div[2]/div[1]//text()')
        print(14, entry_requirements)

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        # Populate the item in one fixed order so the key order no longer
        # depends on which headings happened to be present on the page.
        record = (
            ("url", url),
            ("university", university),
            ("country", 'UK'),
            ("city", missing),
            ("website", 'https://www.falmouth.ac.uk'),
            ("department", missing),
            ("programme", programme),
            ("ucas_code", missing),
            ("degree_level", '1'),
            ("degree_type", degree_type),
            ("start_date", start_date),
            ("degree_description", missing),
            ("overview", overview),
            ("mode", mode),
            ("duration", duration),
            ("modules", modules),
            ("teaching", missing),
            ("assessment", assessment),
            ("career", career),
            ("application_date", missing),
            ("deadline", deadline),
            ("application_fee", missing),
            ("tuition_fee", missing),
            ("location", missing),
            ("ATAS", missing),
            ("GPA", missing),
            ("average_score", missing),
            ("accredited_university", missing),
            ("Alevel", missing),
            ("IB", missing),
            ("IELTS", IELTS),
            ("IELTS_L", missing),
            ("IELTS_S", missing),
            ("IELTS_R", missing),
            ("IELTS_W", missing),
            ("TOEFL", missing),
            ("TOEFL_L", missing),
            ("TOEFL_S", missing),
            ("TOEFL_R", missing),
            ("TOEFL_W", missing),
            ("GRE", missing),
            ("GMAT", missing),
            ("LSAT", missing),
            ("MCAT", missing),
            ("working_experience", missing),
            ("interview", interview),
            ("portfolio", portfolio),
            ("application_documents", missing),
            ("how_to_apply", how_to_apply),
            ("entry_requirements", entry_requirements),
            ("chinese_requirements", missing),
            ("school_test", missing),
            ("SATI", missing),
            ("SATII", missing),
            ("SAT_code", missing),
            ("ACT", missing),
            ("ACT_code", missing),
            ("other", missing),
            ("create_time", create_time),
        )
        for key, value in record:
            item[key] = value

        yield item