Ejemplo n.º 1
0
    def parse_item(self, response):
        """Parse a Harper Adams undergraduate course page into a HooliItem.

        Besides reading ``response``, two extra HTTP requests are made with
        ``requests`` against the site's ``get-entry-requirements.cfm``
        endpoint to obtain the A-level and IB requirements of the course.

        Labelled facts (UCAS code, duration, start date, location) that are
        missing from the page are stored as ``''`` -- the fallback the
        'Typical offer' lookup already used -- instead of aborting the page
        with ``ValueError``/``IndexError``.

        NOTE(review): the item is only printed, never yielded; the
        ``yield item`` was already commented out and that behaviour is kept.

        :param response: response of the course page; the sixth
            '/'-separated piece of ``response.url`` is used as the course id.
        """
        requirements_url = (
            'https://www.harper-adams.ac.uk/courses/undergraduate/'
            'get-entry-requirements.cfm?id=%s&qualification=%s&year_of_entry=2018'
        )

        def fetch_requirement(course_id, qualification):
            """Download one entry-requirement fragment and strip its markup."""
            # Without a timeout ``requests`` waits indefinitely on a stalled
            # server, which would block this callback forever.
            reply = requests.get(
                requirements_url % (course_id, qualification), timeout=30)
            return remove_tags(reply.text).strip()

        def first_text(query):
            """Return the first text node matched by ``query`` or ''."""
            nodes = response.xpath(query).extract()
            return nodes[0] if nodes else ''

        def value_after(texts, label, offset=2):
            """Return the node ``offset`` places after ``label`` or ''."""
            try:
                return texts[texts.index(label) + offset]
            except (ValueError, IndexError):
                return ''

        print('-------------------', response.url)
        course_id = response.url.split('/')[5]
        Alevels = fetch_requirement(course_id, 'alevels')
        IB = fetch_requirement(course_id, 'IB')

        item = HooliItem()
        summary = response.xpath(
            '//div[@class="content-section-inner"]//text()').extract()

        # Programme name and degree type
        Master = first_text('//div[@id="course-title"]//text()')
        Course = first_text('//div[@class="page-heading"]/h1//text()')

        # Each fact is the text node two places after its label.
        CourseCode = value_after(summary, 'UCAS code')
        Duration = value_after(summary, 'Duration')
        StartDate = value_after(summary, 'Start date')
        Location = value_after(summary, 'Location')
        Typicaloffer = value_after(summary, 'Typical offer')

        # Programme description: the overview text that ends two nodes before
        # the 'How to apply' marker, followed by the placement section.
        overview_nodes = response.xpath(
            '//div[@id="overview"]//div[@class="content-section-inner"]//text()'
        ).extract()
        if 'How to apply' in overview_nodes:
            # Clamp at 0 so a marker near the start cannot turn the slice end
            # negative and silently cut nodes from the tail instead.
            cut = max(overview_nodes.index('How to apply') - 2, 0)
            overview_nodes = overview_nodes[:cut]
        placement_nodes = response.xpath(
            '//div[@id="placement"]//div[@class="content-section-inner"]//text()'
        ).extract()
        CourseOverview = ''.join(overview_nodes + placement_nodes).strip()

        # Assessment method
        Assessment = ''.join(response.xpath(
            '//div[@id="teaching"]//div[@class="content-section-inner"]//text()'
        ).extract()).strip()

        # Careers
        Career = ''.join(
            response.xpath('//div[@id="careers"]//text()').extract()).strip()

        # Tuition fee and language requirements are hard-coded values.
        item["university"] = 'Harper Adams'
        item["location"] = Location
        item["department"] = ''
        item["programme"] = Course
        item["ucas_code"] = CourseCode
        item["degree_type"] = Master
        item["overview"] = CourseOverview
        item["IELTS"] = '6.0 (minimum 5.5 in any component)'
        item["TOEFL"] = '80+ (minimum 18 reading, 18 Listening, 22 speaking, 20 writing)'
        item["Alevel"] = Alevels
        item["IB"] = IB
        item["teaching_assessment"] = Assessment
        item["career"] = Career
        item["tuition_fee"] = '10400'
        item["modules"] = ''
        item["duration"] = Duration
        item["start_date"] = StartDate
        item["deadline"] = ''
        item["entry_requirements"] = Typicaloffer
        item["url"] = response.url
        item["Justone"] = response.url
        # yield item
        print(item)
Ejemplo n.º 2
0
    def parse_brunel_postgraduate(self, response):
        """Parse a Brunel postgraduate course page and yield one HooliItem.

        The programme name and degree type come from the last URL segment:
        its final '-'-separated token is the degree type and the tokens
        before it, joined with spaces, are the programme name.  Facts that
        cannot be found on the page are stored as ``''``.

        :param response: response of the course page.
        """
        def value_after(texts, label, offset):
            """Return the node ``offset`` places after ``label`` or ''."""
            try:
                return texts[texts.index(label) + offset]
            except (ValueError, IndexError):
                return ''

        def until_back_to_top(texts):
            """Join the nodes preceding 'Back to top' (all of them if absent)."""
            if 'Back to top' in texts:
                texts = texts[:texts.index('Back to top')]
            return ''.join(texts)

        item = HooliItem()
        facts = response.xpath(
            '//div[@class="featureBlock clearfix"]//text()').extract()
        Duration = value_after(facts, 'Mode of study', 2).strip()
        StartDate = value_after(facts, 'Start date', 2).strip()

        # Programme name / degree type.  Dropping the last token by position
        # (rather than str.replace on the degree text) keeps names intact
        # when the degree letters also occur inside a word, e.g. 'ma' in
        # 'management'.
        slug_parts = response.url.split('/')[-1].split('-')
        Master = slug_parts[-1]
        Course = ' '.join(slug_parts[:-1])

        # IELTS
        language_nodes = response.xpath(
            '//div[@class="featureBlock"]//li//text()').extract()
        IELTS = language_nodes[0].replace('IELTS:', '') if language_nodes else ''

        # Tuition fee
        fee_nodes = response.xpath(
            '//div[@class="featureBlock"]/p/span/text()').extract()
        try:
            TuitionFee = fee_nodes[fee_nodes.index('International students:') + 1]
        except (ValueError, IndexError):
            TuitionFee = ''
            print(response.url, '------------------------------------------')
        else:
            TuitionFee = TuitionFee.replace(',', '').replace('£', '')

        # Assessment
        Assessment = ''.join(response.xpath(
            '//h4[@id="assessment"]/following-sibling::*//text()'
        ).extract()).strip()

        # Entry requirements
        EntryRequirements = ''.join(response.xpath(
            '//h2[@id="entrycriteria"]/following-sibling::ul//text()'
        ).extract()).strip()

        # Modules
        Modules = until_back_to_top(response.xpath(
            '//h2[@id="coursecontent"]/following-sibling::*//text()'
        ).extract())

        # Programme description
        CourseOverview = until_back_to_top(response.xpath(
            '//h2[@id="overview"]/following-sibling::*//text()').extract())

        item["university"] = 'Brunel University London'
        item["location"] = ''
        item["department"] = ''
        item["programme"] = Course
        item["ucas_code"] = ''
        item["degree_type"] = Master
        item["overview"] = CourseOverview
        item["IELTS"] = IELTS
        item["TOEFL"] = ''
        item["Alevel"] = ''
        item["IB"] = ''
        item["teaching_assessment"] = Assessment
        item["career"] = ''
        item["tuition_fee"] = TuitionFee
        item["modules"] = Modules
        item["duration"] = Duration
        item["start_date"] = StartDate
        item["deadline"] = ''
        item["entry_requirements"] = EntryRequirements
        item["url"] = response.url

        yield item
Ejemplo n.º 3
0
    def parse_brunel(self, response):
        """Parse a Brunel course page and yield one HooliItem.

        Most fields are read positionally from the numbered ``<section>``
        elements of the main article.  The degree type is the last
        '-'-separated token of the final URL segment.  Labels or markers
        that are missing yield ``''`` for the affected field; the modules and
        careers fields are always strings ('uncleared' when their section is
        empty).

        :param response: response of the course page.
        """
        def value_after(texts, label, offset):
            """Return the node ``offset`` places after ``label`` or ''."""
            try:
                return texts[texts.index(label) + offset]
            except (ValueError, IndexError):
                return ''

        def section_nodes(position):
            """Return the text nodes of the article's ``position``-th section."""
            return response.xpath(
                '//article[@class="mainArticle"]/section[%d]//text()' % position
            ).extract()

        def before(texts, marker):
            """Return ``texts`` cut just before ``marker`` (unchanged if absent)."""
            return texts[:texts.index(marker)] if marker in texts else texts

        print('brunel----------------------------brunel', response.url)
        item = HooliItem()

        # Programme name and degree type
        headings = response.xpath('//h1//text()').extract()
        programme = headings[0] if headings else ''
        Master = response.url.split('/')[-1].split('-')[-1]

        # Programme description
        overview_nodes = response.xpath(
            '//div[@class="important_course_info"]//text()').extract()
        overview = ''.join(overview_nodes).strip() if overview_nodes else 'null'

        facts = response.xpath(
            '//div[@class="featureBlock clearfix"]//text()').extract()
        Duration = value_after(facts, 'Mode of study', 2)
        UCAS_code = value_after(facts, 'UCAS Code', 2)
        startdate = value_after(facts, 'Start date', 2)

        # Modules: joined even when the 'Read more' marker is absent, so the
        # field never ends up holding a raw list.
        modules = section_nodes(2)
        if modules:
            read_more = '\n                  \n                \tRead more about the '
            modules = ''.join(before(modules, read_more))
        else:
            modules = 'uncleared'

        # Careers: same rule as modules.
        career = section_nodes(3)
        if career:
            career = ''.join(before(career, '» More about Employability')).strip()
        else:
            career = 'uncleared'

        # Entry requirements: the text between the two headings.
        entry_nodes = section_nodes(4)
        entry_start = 'Entry Criteria 2018/19'
        entry_end = 'International and EU Entry Requirements'
        if entry_start in entry_nodes and entry_end in entry_nodes:
            Application_requirements = ''.join(
                entry_nodes[entry_nodes.index(entry_start):
                            entry_nodes.index(entry_end)]).strip()
        else:
            Application_requirements = ''

        # IELTS: third node after the English-language heading.
        IELTS = value_after(entry_nodes, 'English Language Requirements', 3)
        IELTS = IELTS.replace('IELTS:', '').strip()

        # Assessment method: skip the first node, stop at 'Back to top'.
        Evaluation_method = ''.join(
            before(section_nodes(5), 'Back to top')[1:]).strip()

        # Tuition fee: keep only the digits of the international fee.
        fee_text = value_after(section_nodes(7), 'International students:', 2)
        tuition_fee = ''.join(re.findall(r'\d+', fee_text))

        item["university"] = 'Brunel University London'
        item["location"] = ''
        item["department"] = ''
        item["programme"] = programme
        item["ucas_code"] = UCAS_code
        item["degree_type"] = Master
        item["overview"] = overview
        item["IELTS"] = IELTS
        item["TOEFL"] = ''
        item["Alevel"] = ''
        item["IB"] = ''
        item["teaching_assessment"] = Evaluation_method
        item["career"] = career
        item["tuition_fee"] = tuition_fee
        item["modules"] = modules
        item["duration"] = Duration
        item["start_date"] = startdate
        item["deadline"] = ''
        item["entry_requirements"] = Application_requirements
        item["url"] = response.url
        yield item
Ejemplo n.º 4
0
    def parse_item(self, response):
        """Parse a Harper Adams postgraduate course page and yield a HooliItem.

        Pages whose degree heading is exactly "PgC" are reported with a print
        and skipped; this is decided before the extra modules request so no
        HTTP round trip is spent on a page that is discarded anyway.  For all
        other pages the module list is fetched with ``requests`` from the
        site's ``get-pg-route-modules.cfm`` endpoint.

        Labelled facts that are missing from the page are stored as ``''``.

        NOTE(review): ``item["type"] = type`` stores whatever ``type``
        resolves to at module level -- the builtin, unless the module defines
        its own ``type``.  Left unchanged; confirm the intended value.

        :param response: response of the course page; the sixth
            '/'-separated piece of ``response.url`` is used as the course id.
        """
        def value_after(texts, label, offset):
            """Return the node ``offset`` places after ``label`` or ''."""
            try:
                return texts[texts.index(label) + offset]
            except (ValueError, IndexError):
                return ''

        def section_text(query):
            """Return the stripped, concatenated text matched by ``query``."""
            return ''.join(response.xpath(query).extract()).strip()

        print(response.url, '---------------')

        Master = ''.join(
            response.xpath('//div[@class="page-heading"]/h2/text()').extract())
        if Master == "PgC":
            print(Master, 'do not need')
            return

        uid = response.url.split('/')[5]
        item = HooliItem()

        facts = response.xpath(
            '//div[@class="content-section-margin"]//text()').extract()
        Duration = value_after(facts, 'Duration', 2)
        StartDate = value_after(facts, 'Start date:', 1)

        # Programme name: last URL segment, dashes to spaces, title-cased.
        Course = response.url.split('/')[-1].replace('-', ' ').title()

        EntryRequirements = section_text('//div[@id="entry-requirements"]//text()')
        CourseOverview = section_text('//div[@id="overview"]//text()')

        # Modules come from a separate endpoint.  Without a timeout
        # ``requests`` waits indefinitely on a stalled server, which would
        # block this callback forever.
        modules_reply = requests.get(
            'https://www.harper-adams.ac.uk/shared/get-pg-route-modules.cfm'
            '?year_of_entry=2018&route=9&id=%s' % uid,
            timeout=30)
        modules = remove_tags(modules_reply.content).strip().replace(
            'Click module title to see full description:', '')

        Career = section_text('//div[@id="careers"]//text()')
        Assessment = section_text('//div[@id="teaching"]//text()')

        item["university"] = 'Harper Adams'
        item["location"] = 'Newport'
        item["department"] = ''
        item["programme"] = Course
        item["ucas_code"] = ''
        item["degree_type"] = Master
        item["overview"] = CourseOverview
        item["IELTS"] = '6.0(minimum 5.5 in any component)'
        item["TOEFL"] = '80+(minimum 18 reading, 18 Listening, 22 speaking, 20 writing)'
        item["Alevel"] = ''
        item["IB"] = ''
        item["teaching_assessment"] = Assessment
        item["career"] = Career
        item["tuition_fee"] = ''
        item["modules"] = modules
        item["duration"] = Duration
        item["start_date"] = StartDate
        item["deadline"] = ''
        item["entry_requirements"] = EntryRequirements
        item["url"] = response.url
        item["Justone"] = response.url
        item["type"] = type
        yield item
    # def parse(self, response):
    #     item=HooliItem()
    #     long_str=response.xpath('//div[@class="content-section-margin"]//text()').extract()
    #     index_Duration=long_str.index('Duration')
    #     Duration=long_str[index_Duration+2]
    #     # print(Duration)
    #     index_startDate=long_str.index('Start date:')
    #     StartDate=long_str[index_startDate+1]
    #     # print(StartDate)
    #     Course=response.url.split('/')[-1]
    #     Course=Course.replace('-',' ').title()
    #     # print(Course)
    #     EntryRequirements=response.xpath('//div[@id="entry-requirements"]//text()').extract()
    #     EntryRequirements=''.join(EntryRequirements).strip()
    #
    #     CourseOverview = response.xpath('//div[@id="overview"]//text()').extract()
    #     CourseOverview = ''.join(CourseOverview).strip()
    #     #modules
    #     Modules = response.xpath('//div[@id="modules"]//text()').extract()
    #     Modules = ''.join(Modules).strip()
    #     #careers
    #     Career = response.xpath('//div[@id="careers"]//text()').extract()
    #     Career = ''.join(Career).strip()
    #     #teaching
    #     Assessment = response.xpath('//div[@id="teaching"]//text()').extract()
    #     Assessment=''.join(Assessment).strip()
    #     Master=response.xpath('//div[@class="page-heading"]/h2/text()').extract()
    #     Master=''.join(Master)
    #     University = 'Harper Adams'
    #     item["url"] = response.url
    #     item["University"] = University
    #     item["Department"] = ''
    #     item["Location"] = ''
    #     item["Course"] = Course
    #     item["CourseCode"] = ''
    #     item["Master"] = Master
    #     item["CourseOverview"] = CourseOverview
    #     item["Alevel"] = ''
    #     item["IB"] = ''
    #     item["IELTS"] = ''
    #     item["TOEFL"] = ''
    #     item["Assessment"] = Assessment
    #     item["Career"] = Career
    #     item["TuitionFee"] = ''
    #     item["Modules"] = Modules
    #     item["Duration"] = Duration
    #     item["StartDate"] = StartDate
    #     item["Deadline"] = ''
    #     item["EntryRequirements"] = EntryRequirements
    #     yield item
Ejemplo n.º 5
0
    def parse_Southampton(self, response):
        """Build one HooliItem from a Southampton course page and yield it.

        The UCAS code and degree type are taken by position from the text
        nodes of the page's ``<aside>``; the remaining fields come from the
        numbered ``tabset-N`` panels and the fee table.  The duration is the
        parenthesised part of the page title that starts with a digit, and
        the programme name is the title with that part and the UCAS code
        removed.

        :param response: response of the course page.
        """
        def joined(query):
            """Concatenate every text node matched by ``query``."""
            return ''.join(response.xpath(query).extract())

        entry = HooliItem()

        # Programme title (still containing the duration and UCAS code)
        title = joined('//h1[@class="uos-page-title uos-main-title"]//text()')

        # Node 4 of the aside is used as the UCAS code, node 6 as the degree
        # type.
        aside_nodes = response.xpath('//aside//text()').extract()
        ucas = aside_nodes[4]
        degree = aside_nodes[6]

        # Programme description: intro + main grid + first tab
        description = (
            joined('//div[@class="uos-page-intro"]//text() ')
            + joined('//div[@class="uos-grid uos-grid-2-3"]//text()')
            + joined('//div[@data-target="tabset-1"]//text()')
        ).strip()

        # Academic requirements (second tab); each value follows its label.
        requirement_nodes = response.xpath(
            '//div[@data-target="tabset-2"]//text()').extract()

        def following(label):
            """Return the node right after ``label`` in the requirements tab."""
            return requirement_nodes[requirement_nodes.index(label) + 1]

        gcse = following('GCSE') if 'GCSE' in requirement_nodes else ''

        # A-level (the GCSE text is appended to it)
        if 'GCE A-level' in requirement_nodes:
            a_level = following('GCE A-level') + gcse
        else:
            a_level = 'uncleared'

        # IB
        if 'International Baccalaureate' in requirement_nodes:
            baccalaureate = following('International Baccalaureate')
        else:
            baccalaureate = 'uncleared'

        # The number of tab headings tells whether an assessment tab exists.
        tab_count = len(response.xpath(
            '//div[@id="js-component-tabs"]/h3/text()').extract())
        if tab_count < 6:
            assessment = 'uncleared'
        else:
            assessment = joined('//div[@data-target="tabset-6"]//text()')

        # Modules and careers
        module_text = joined('//div[@data-target="tabset-3"]//text()')
        career_text = joined('//div[@data-target="tabset-5"]//text()')

        # Tuition fee: the cell two places after 'Full-time', without the
        # thousands separator and currency sign.
        fee_cells = response.xpath(
            '//table[@class="uos-table"]//text()').extract()
        if 'Full-time' in fee_cells:
            fee = fee_cells[fee_cells.index('Full-time') + 2]
            fee = fee.replace(',', '').replace('£', '')
        else:
            fee = ''

        # Entry requirements: the whole second tab as one string
        requirements_text = ''.join(requirement_nodes)

        length = ''.join(re.findall(r'\(\d.*\)', title))
        programme = title.replace(length, '').replace(ucas, '').strip()

        # Assigned one by one, in this order, so the item's key order stays
        # the same.
        populated = (
            ("university", 'Southampton'),
            ("location", ''),
            ("department", ''),
            ("ucas_code", ucas),
            ("programme", programme),
            ("degree_type", degree),
            ("overview", description),
            ("IELTS", ''),
            ("TOEFL", ''),
            ("Alevel", a_level),
            ("IB", baccalaureate),
            ("teaching_assessment", assessment),
            ("career", career_text),
            ("tuition_fee", fee),
            ("modules", module_text),
            ("duration", length),
            ("start_date", ''),
            ("deadline", ''),
            ("entry_requirements", requirements_text),
            ("url", response.url),
        )
        for key, value in populated:
            entry[key] = value
        yield entry
Ejemplo n.º 6
0
    def parse_text(self, response):
        """Parse a University of York course page and yield one HooliItem.

        Two page layouts are handled.  URLs containing a 'study' path segment
        are read from the grid-box layout; every other URL is read from the
        'course-summary-table' layout, where the UCAS code and duration are
        taken at fixed offsets after the 'UCAS code' label.

        Facts that are missing from the page are stored as ``''``.  That
        includes the A-level requirement, which previously was left
        unassigned (and crashed the item assignment) when the page had no
        'A levels' label.

        The label strings are still written to ``self.startime``,
        ``self.kcsz`` and ``self.t_fee`` as before, in case other code reads
        them.

        :param response: response of the course page.
        """
        degree_pattern = r'[A-Za-z]{2,10}\s\([a-zA-Z]*\)'

        def value_after(texts, label, offset=1):
            """Return the node ``offset`` places after ``label`` or ''."""
            try:
                return texts[texts.index(label) + offset]
            except (ValueError, IndexError):
                return ''

        def joined(query):
            """Return the stripped, concatenated text matched by ``query``."""
            return ''.join(response.xpath(query).extract()).strip()

        def split_degree(title):
            """Split ``title`` into (name without degree, degree text)."""
            degree = ''.join(re.findall(degree_pattern, title))
            return title.replace(degree, ''), degree

        item = HooliItem()

        if 'study' in response.url.split('/'):
            url_text = response.xpath(
                '//div[@class="o-grid__box o-grid__box--twothirds"]//text()'
            ).extract()
            url_text_2 = response.xpath(
                '//div[@class="o-grid__box o-grid__box--full"]//ul//text()'
            ).extract()

            # Language requirements and department
            IELTS = value_after(url_text_2, 'IELTS:')
            TOEFL = value_after(url_text_2, 'TOEFL').replace(':', '')
            Department = value_after(url_text, 'Department')

            # Programme name and degree type
            titles = response.xpath(
                '//div[@class="c-figure__content c-figure__content--left c-figure__content--half"]/h1//text()'
            ).extract()
            Course, Master = split_degree(titles[0] if titles else '')

            # Start date
            self.startime = 'Start date'
            StartDate = value_after(url_text, self.startime).replace('(', '')

            # Programme description
            CourseOverview = joined(
                '//div[@class="o-grid__box o-grid__box--half o-grid__box--half@medium"]//text()'
            )

            # Course length and UCAS code
            Duration = value_after(url_text, 'Length')
            self.kcsz = 'UCAS code'
            ucas_code = value_after(url_text, self.kcsz)

            Assessment = joined(
                '//div[@class="o-grid__box o-grid__box--half o-grid__box--half@medium o-grid__box--full@small"]//text()'
            )

            # Entry requirements: the value sits two nodes after its label.
            long_str_eq = response.xpath(
                '//div[@id="entry"]//text()').extract()
            Alevel = value_after(long_str_eq, 'A levels', 2)
            IB = value_after(long_str_eq, 'International Baccalaureate', 2)

            # Tuition fee: first 'N,NNN'-style amount after the label,
            # without the thousands separator.
            self.t_fee = 'International fees'
            amounts = re.findall(
                r'\d+,\d+', value_after(url_text, self.t_fee, 2))
            TuitionFee = amounts[0].replace(',', '') if amounts else ''

            item["university"] = 'University of York'
            item["location"] = ''
            item["department"] = Department
            item["programme"] = Course
            item["ucas_code"] = ucas_code
            item["degree_type"] = Master
            item["overview"] = CourseOverview
            item["IELTS"] = IELTS
            item["TOEFL"] = TOEFL
            item["Alevel"] = Alevel
            item["IB"] = IB
            item["teaching_assessment"] = Assessment
            item["career"] = ''
            item["tuition_fee"] = TuitionFee
            item["modules"] = ''
            item["duration"] = Duration
            item["start_date"] = StartDate
            item["deadline"] = ''
            item["entry_requirements"] = ''
            item["url"] = response.url
            yield item
        else:
            url_text = response.xpath(
                '//table[@id="course-summary-table"]//text()').extract()
            # The label uses a non-breaking space; the values sit at fixed
            # offsets after it in the flattened table text.
            ucas_code = value_after(url_text, 'UCAS\xa0code', 8)
            Duration = value_after(url_text, 'UCAS\xa0code', 15)

            CourseOverview = joined('//div[@id="course-overview-content"]//text()')
            Modules = joined('//div[@id="course-content-content"]//text()')
            Assessment = joined('//div[@id="course-assessment-content"]//text()')

            # Programme name and degree type: second text node of the summary
            summary_nodes = response.xpath(
                '//*[@id="course-summary"]//text()').extract()
            Course, Master = split_degree(
                summary_nodes[1] if len(summary_nodes) > 1 else '')

            Career = joined('//*[@id="course-careers-content"]//text()')

            item["university"] = 'University of York'
            item["location"] = ''
            item["department"] = ''
            item["programme"] = Course
            item["ucas_code"] = ucas_code
            item["degree_type"] = Master
            item["overview"] = CourseOverview
            item["IELTS"] = 'IELTS 6.5'
            item["TOEFL"] = ''
            item["Alevel"] = ''
            item["IB"] = ''
            item["teaching_assessment"] = Assessment
            item["career"] = Career
            item["tuition_fee"] = ''
            item["modules"] = Modules
            item["duration"] = Duration
            item["start_date"] = ''
            item["deadline"] = ''
            item["entry_requirements"] = ''
            item["url"] = response.url

            yield item
Ejemplo n.º 7
0
    def parse_Dmu(self, response):
        """Parse a De Montfort course page and yield one HooliItem.

        Key facts (fee, location, UCAS code, duration) are read from the text
        nodes of the ``data-kftab="2"`` panel, each value being the node that
        follows its label.  A-level, IB and IELTS requirements are pulled
        from the entry-criteria section with regular expressions.  The same
        slideshow text is stored as both "Assessment" and "Modules".

        Values whose label is missing are stored as ``''`` (location falls
        back to a fixed address).  The label strings are still written to
        ``self.var_fee``, ``self.var_fee_2`` and ``self.var1`` as before, in
        case other code reads them.

        NOTE(review): a degree matched by the first pattern (e.g. with a
        parenthesised suffix) is stripped from the course name and then
        discarded, so "Master" is only ever filled by the MA/MSc fallback.
        That is kept as found -- confirm whether it is intended.

        :param response: response of the course page.
        """
        item = HooliItem()

        # Extracted once; every labelled lookup below reads this list.
        Internationnal = response.xpath(
            '//div[@data-kftab="2"]//text()').extract()

        def value_after(label):
            """Return the node following ``label`` in the facts panel or ''."""
            try:
                return Internationnal[Internationnal.index(label) + 1]
            except (ValueError, IndexError):
                return ''

        # Programme name and degree type
        titles = response.xpath(
            '//div[@class="block__details block__details--overlay block__details--courseOverlay"]//h1[@class="block__details__title"]//text()'
        ).extract()
        Course = titles[0].strip() if titles else ''
        Master = ''.join(
            re.findall(r'[A-Z]{1}[A-Za-z]{1,3}\s?\([a-zA-Z]*\)', Course))
        Course = Course.replace(Master, '')
        if Master == '':
            Master = ''.join(re.findall('MA|MSc', Course))
            print(Master, Course, response.url)
        else:
            Master = ''

        # Programme description
        CourseOverview = ''.join(response.xpath(
            '//div[@class="block large-8 columns course-col2"]//text()'
        ).extract()).strip()

        # Tuition fee: digits of every pound amount in the text after the
        # first fee label that is present.
        self.var_fee = 'Fees and funding:'
        self.var_fee_2 = 'Fees and funding 2017/18'
        fee_text = value_after(self.var_fee) or value_after(self.var_fee_2)
        TuitionFee = ''.join(re.findall(
            r'\d+', ''.join(re.findall(r"£\d*,?\d*", fee_text))))

        # Location
        if 'Location:' in Internationnal:
            Location = value_after('Location:')
        else:
            Location = 'The Gateway Leicester LE1 9BH '

        # UCAS course code: labelled value if present, otherwise every
        # 4-character code-shaped token found in the panel text.
        self.var1 = 'UCAS course code:'
        CourseCode = value_after(self.var1) or value_after('UCAS course codes:')
        if not CourseCode:
            CourseCode = ''.join(re.findall(
                r'[A-Z]{1}[A-Z0-9]{3}', ''.join(Internationnal)))
            CourseCode = CourseCode.replace('UCAS', '')

        # Course length
        Teaching_type = value_after('Duration:')

        # Entry requirements
        standard = ' '.join(response.xpath(
            '//div[@class="row row--block course-section course-section--criteria"]//text()'
        ).extract())
        IB = ''.join(re.findall(
            r'International Baccalaureate:[.\w\s+]{0,50}', standard))
        # No capturing group here: with a group, re.findall returns only the
        # group's text instead of the matched IELTS statement.
        IELTS = ''.join(re.findall(r'IELTS .* \d+.\d+ .*', standard))
        Alevel = ''.join(re.findall(
            r'Normally[0-9A-Z\(\s\),-:]*or', standard))

        # Modules and assessment (one shared block of text)
        Evaluation_method = ' '.join(response.xpath(
            '//div[@id="cycle-slideshow_course"]//text()').extract()).strip()

        # Careers
        Career = ''.join(response.xpath(
            '//div[@class="row row--block course-section course-section--opps"]//text()'
        ).extract()).strip()

        # "University" is assigned first so the item's key order is the same
        # as before.
        item["University"] = 'De Montfort'
        item["url"] = response.url
        item["Course"] = Course
        item["CourseCode"] = CourseCode
        item["Master"] = Master
        item["CourseOverview"] = CourseOverview
        item["Alevel"] = Alevel
        item["IB"] = IB
        item["IELTS"] = IELTS
        item["TOEFL"] = ''
        item["Assessment"] = Evaluation_method
        item["Career"] = Career
        item["TuitionFee"] = TuitionFee
        item["Modules"] = Evaluation_method
        item["Duration"] = Teaching_type
        item["StartDate"] = ''
        item["Deadline"] = ''
        item["EntryRequirements"] = standard
        item["Location"] = Location
        item["Department"] = ''

        yield item
Ejemplo n.º 8
0
        def parse(self, response):
            """Build one ``HooliItem`` from a Huddersfield course page.

            The title, start date, UCAS code, duration and overview are read
            with ``extract()[0]`` and therefore raise ``IndexError`` when their
            XPath matches nothing.  Assessment and the two offer fields are
            optional and fall back to ``'N/A'`` when absent.

            Args:
                response: Page response; only its ``xpath`` method is used.

            Yields:
                The populated ``HooliItem``.
            """

            def first_text(query):
                # First text node matched by ``query``; IndexError when none.
                return response.xpath(query).extract()[0]

            item = HooliItem()

            # 1. Course title
            Course = first_text('//h1/text()')

            # 2. Start date
            StartDate = first_text(
                '//div[@class = "col-xs-12 col-md-2 col-md-offset-1 border-left start"]/p/text()'
            )

            # 3. UCAS code (absolute path: tied to the current page layout)
            CourseCode = first_text(
                '/html/body/div[3]/div[2]/div/div[7]/p/text()')

            # 4. Course length
            Duration = first_text(
                '/html/body/div[3]/div[2]/div/div[3]/p/text()')

            # 5. Course description
            CourseOverview = first_text(
                '//*[@id="about"]/div/div/div[2]/text()')

            # 6. URL stored on the item.
            # NOTE(review): this is the site root for every item, not
            # response.url -- confirm that is intended.
            url = 'https://courses.hud.ac.uk'

            # 7. University name.
            # NOTE(review): value looks like a truncated "Huddersfield"; kept
            # unchanged because stored data may already depend on it.
            University = 'Huddersfiel'

            # 8. Assessment method (optional).  Only a missing node is treated
            # as "not available"; any other error is allowed to propagate.
            try:
                Assessment = first_text(
                    '//*[@id="detail"]/div/div/div[3]/p[1]/strong/text()')
            except IndexError:
                Assessment = 'N/A'

            # 9. A level offer (optional)
            try:
                Alevel = 'A Level - ' + first_text(
                    '/html/body/div[3]/div[2]/div/div[5]/p/span[1]/text()')
            except IndexError:
                Alevel = 'N/A'

            # 10. Second offer line; stored under the "IB" key but labelled
            # as BTEC in the text.
            try:
                IB = 'BTEC - ' + first_text(
                    '/html/body/div[3]/div[2]/div/div[5]/p[2]/span/text()')
            except IndexError:
                IB = 'N/A'

            # 11. Entry requirements: both offers combined into one string
            EntryRequirements = "BTEC: " + IB + "+" + "Alevel: " + Alevel

            item["Course"] = Course
            item["StartDate"] = StartDate
            item["CourseCode"] = CourseCode
            item["Duration"] = Duration
            item["CourseOverview"] = CourseOverview
            item["University"] = University
            item["Assessment"] = Assessment
            item["Alevel"] = Alevel
            item["IB"] = IB
            item["EntryRequirements"] = EntryRequirements
            item["url"] = url

            yield item
Ejemplo n.º 9
0
    def parse_item(self, response):
        """Fill a ``HooliItem`` from a Leeds Beckett University course page.

        Duration and location are looked up by their label in the course
        options block; start dates and the IELTS requirement are pulled out
        with regular expressions.  Repeated regex matches are de-duplicated
        in first-seen order so the stored strings are the same on every run.

        Args:
            response: Page response; its ``xpath`` method and ``url``
                attribute are used.

        NOTE(review): the item is populated but neither yielded nor returned
        in the code visible here -- confirm whether a trailing ``yield item``
        is missing.
        """
        print('----------------', response)

        item = HooliItem()
        degree_type = response.xpath(
            '//div[@class="course-hero__label"]//text()').extract()
        programme = response.xpath(
            '//h1[@class="course-hero__title"]//text()').extract()

        mode = 'Full-time'

        # Labels are matched verbatim, including the page's own whitespace.
        duration_label = '\r\n                    Duration\r\n                '
        location_label = '\r\n                    Main Location\r\n                '

        text1 = response.xpath(
            '//div[@class="course-hero__options course-options js-course_options"]//text()'
        ).extract()

        # The value sits two text nodes after its label.
        if duration_label in text1:
            duration = text1[get_index(text1, duration_label) + 2].strip()
        else:
            duration = ''
        if location_label in text1:
            location = text1[get_index(text1, location_label) + 2].strip()
        else:
            location = ''

        # Dates such as "17 September 2018"; dict.fromkeys drops duplicates
        # while keeping page order (a plain set would reorder between runs).
        date_matches = re.findall(r'\d{1,2}\s[a-zA-Z]*\s\d{4}', ''.join(text1))
        start_date = ','.join(dict.fromkeys(date_matches))

        overview = response.xpath(
            '//section[@id="overview_block"]//div[@class="tabs__panels"]//text()'
        ).extract()
        overview = clear_long_text(overview)

        entry_requirements = response.xpath(
            '//section[@id="requirements_block"]//div[@class="tabs__panels"]/div[2]//text()'
        ).extract()
        entry_requirements = clear_long_text(entry_requirements)

        ielts_matches = re.findall(r'IELTS\s\d.\d[\sa-zA-Z]*\d.\d',
                                   entry_requirements)
        IELTS = ''.join(dict.fromkeys(ielts_matches))

        career = response.xpath(
            '//section[@id="careers_block"]//div[@class="tabs__panels"]//text()'
        ).extract()
        career = clear_long_text(career)

        modules = response.xpath(
            '//section[@id="learning_block"]//div[@class="tabs__panels"]//text()'
        ).extract()
        modules = clear_long_text(modules)

        # The fee is searched for across the text of every tab panel.
        tuition_fee = find_fee(''.join(
            response.xpath('//div[@class="tabs__panels"]//text()').extract()))

        programme = ''.join(programme)
        degree_type = ''.join(degree_type)

        item["university"] = 'Leeds Beckett University'
        item["location"] = location
        item["department"] = ''
        item["programme"] = programme
        item["ucas_code"] = ''
        item["degree_type"] = degree_type
        item["mode"] = mode
        item["overview"] = overview
        item["IELTS"] = IELTS
        item["TOEFL"] = ''
        item["Alevel"] = ''
        item["IB"] = ''
        item["teaching_assessment"] = ''
        item["career"] = career
        item["tuition_fee"] = tuition_fee
        item["modules"] = modules
        item["duration"] = duration
        item["start_date"] = start_date
        item["deadline"] = ''
        item["entry_requirements"] = entry_requirements
        item["url"] = response.url
        item["how_to_apply"] = ''
Ejemplo n.º 10
0
    def parse_item(self, response):
        """Fill a ``HooliItem`` from a Birkbeck course page and print it.

        Start date, location, UCAS code and duration are located by their
        label inside the ``major-intro`` block.  The programme title is the
        fourth text node of that block, cut at the first ``:``; a bracketed
        suffix such as ``(BSc)`` is split off as the degree type.

        Args:
            response: Page response; its ``xpath`` method and ``url``
                attribute are used.

        NOTE(review): emitting the item is currently disabled -- full-time
        courses are only printed, nothing is yielded or returned.
        """
        print('----------------', response.url)
        item = HooliItem()
        text1 = response.xpath('//div[@class="major-intro"]//text()').extract()

        # Values sit a fixed number of text nodes after their label.
        start_date = text1[get_index(text1, 'Start date') + 2].strip()
        location = text1[get_index(text1, 'Location') + 3].strip()
        if 'UCAS Code' in text1:
            ucas_code = text1[get_index(text1, 'UCAS Code') + 2].strip()
        else:
            ucas_code = ''

        programme = text1[3].strip().split(':')[0]
        degree_type = ''.join(re.findall(r'\(.*\)', programme))

        if 'Duration' in text1:
            durations = text1[get_index(text1, 'Duration') + 3]
        else:
            durations = ''
        duration = ''.join(
            re.findall(r'(?i)[\sa-zA-Z]*year[\sa-zA-Z]*full-time', durations))
        # A membership test rather than joined regex matches: text that
        # mentions "full-time" more than once must still compare equal to
        # 'full-time' below instead of becoming 'full-timefull-time'.
        mode = 'full-time' if 'full-time' in durations else ''

        overview = response.xpath(
            '//div[@class="row column max-medium"]/div/p//text()|//div[@class="row column max-medium"]/div/ul//text()'
        ).extract()
        overview = clear_long_text(overview)
        modules = response.xpath('//div[@id="courseStructure"]//text()').extract()
        modules = clear_long_text(modules)

        entry_requirements = response.xpath(
            '//div[@class="row column max-xxlarge"]//text()').extract()
        title = response.xpath(
            '//div[@class="column medium-10 large-8"]//ul/li/a//text()').extract()
        title = clear_long_text(title)
        entry_requirements = clear_long_text(entry_requirements)
        entry_requirements = title + '\n' + entry_requirements

        # Two band scores with only letters/commas/whitespace between them.
        IELTS = ''.join(
            re.findall(r'\d\.\d[\sa-zA-Z\,]*\d\.\d', entry_requirements))
        tuition_fee = find_fee_s(entry_requirements)
        programme = programme.replace(degree_type, '')

        item["university"] = 'Birkbeck University of London'
        item["location"] = location
        item["department"] = ''
        item["programme"] = programme
        item["ucas_code"] = ucas_code
        item["degree_type"] = degree_type
        item["mode"] = mode
        item["overview"] = overview
        item["IELTS"] = IELTS
        item["TOEFL"] = ''
        item["Alevel"] = ''
        item["IB"] = ''
        item["teaching_assessment"] = ''
        item["career"] = ''
        item["tuition_fee"] = tuition_fee
        item["modules"] = modules
        item["duration"] = duration
        item["start_date"] = start_date
        item["deadline"] = ''
        item["entry_requirements"] = entry_requirements
        item["url"] = response.url
        item["how_to_apply"] = ''
        item["degree_level"] = 0

        if mode == 'full-time':
            # yield item
            print(item)
        else:
            print('do not need part-time')
Ejemplo n.º 11
0
    def parse_arts(self, response):
        """Yield a ``HooliItem`` for a University of the Arts London course.

        Pages whose URL has no ``courses`` path segment are ignored.  Start
        date, course length and UCAS code are read as the text node right
        after their label; the degree prefix and course name are derived from
        the second-to-last URL segment.

        Args:
            response: Page response; its ``xpath`` method and ``url``
                attribute are used.

        Yields:
            The populated ``HooliItem``.

        Raises:
            ValueError: If one of the expected labels is missing on the page.
            IndexError: If the college name or the fee amount is missing.
        """
        url_parts = response.url.split('/')
        if 'courses' not in url_parts:
            return

        def text_after(texts, label):
            # Entry immediately following ``label``; ValueError when absent.
            return texts[texts.index(label) + 1]

        print('--------------------------', response.url)
        item = HooliItem()

        # University
        University = 'University of the Arts London'

        # College
        Department = response.xpath(
            '//nav[@class="college"]//text()').extract()[0]

        url_long_str = response.xpath(
            '//div[@class="ual-container"]//text()').extract()

        # Start date, course length and UCAS code
        StartDate = text_after(url_long_str, 'Start date')
        Duration = text_after(url_long_str, 'Course length')
        CourseCode = text_after(url_long_str, 'UCAS code')

        # Degree prefix is the first dash-separated token of the URL slug;
        # the course name is the rest of the slug with dashes as spaces.
        slug = url_parts[-2]
        Master = slug.split('-')[0].upper()
        Course = slug.upper().replace(Master, '').replace('-', ' ').strip()

        # Course description
        CourseOverview = ''.join(
            response.xpath('//div[@id="tab1-panel"]//text()').extract())

        # Modules
        Modules = ''.join(
            response.xpath('//div[@id="tab2-panel"]//text()').extract())

        # Entry requirements
        EntryRequirements = ''.join(
            response.xpath('//div[@id="tab3-panel"]//text()').extract())
        IELTS = ''.join(
            re.findall(r'IELTS[, \da-zA-Z.() ]{0,88}', EntryRequirements))

        # Tuition fee: first "£12,345"-style amount after the label,
        # stored without the currency sign and thousands separator.
        url_long_str2 = response.xpath(
            '//div[@id="tab4-panel"]//text()').extract()
        TuitionFee = text_after(url_long_str2, 'International fee')
        TuitionFee = re.findall(r"£\d+,\d+", TuitionFee)[0]
        TuitionFee = TuitionFee.replace(',', '').replace('£', '')

        # Careers
        Career = ''.join(
            response.xpath('//div[@id="tab5-panel"]//text()').extract())

        item["url"] = response.url
        item["University"] = University
        item["Department"] = Department
        item["Location"] = ''
        item["Course"] = Course
        item["CourseCode"] = CourseCode
        item["Master"] = Master
        item["CourseOverview"] = CourseOverview
        item["Alevel"] = ''
        item["IB"] = ''
        item["IELTS"] = IELTS
        item["TOEFL"] = ''
        item["Assessment"] = ''
        item["Career"] = Career
        item["TuitionFee"] = TuitionFee
        item["Modules"] = Modules
        item["Duration"] = Duration
        item["StartDate"] = StartDate
        item["Deadline"] = ''
        item["EntryRequirements"] = EntryRequirements
        yield item
Ejemplo n.º 12
0
    def parse_item(self, response):
        """Yield a ``HooliItem`` for a St Andrews course page.

        Labelled values (UCAS code, duration, A-level and IB offers, IELTS
        text, overseas fee) are read at a fixed offset after their label and
        default to ``''`` when the label is not on the page.  Start and end
        dates are the first ``Start...`` / ``End...`` regex match in the
        course block, also defaulting to ``''``.

        Args:
            response: Page response; its ``xpath`` method and ``url``
                attribute are used.

        Yields:
            The populated ``HooliItem``.
        """
        item = HooliItem()
        Course = response.xpath('//section//h2//text()').extract()[0]
        long_str = response.xpath(
            '//section[@class="sta-grey-light course"]//text()').extract()
        print(long_str, response.url)

        def text_after(label, offset):
            # Text node ``offset`` places after ``label``, '' if no label.
            if label not in long_str:
                return ''
            return long_str[long_str.index(label) + offset]

        def section_text(query):
            # All text under ``query`` concatenated into one string.
            return ''.join(response.xpath(query).extract())

        CourseCode = text_after('UCAS code', 1)

        # Degree label: last dash-separated token of the URL slug.
        Master = response.url.split('/')[-2].split('-')[-1].upper()

        Duration = text_after('Course duration', 1)

        long_str_date = ''.join(long_str)
        start_matches = re.findall('Start.+', long_str_date)
        StartDate = start_matches[0] if start_matches else ''
        # Fall back to '' as StartDate does; without this an unmatched
        # deadline would be stored as an empty list.
        end_matches = re.findall('End.+', long_str_date)
        Deadline = end_matches[0] if end_matches else ''

        Alevel = text_after('GCE A-Levels', 2)
        IB = text_after('IB points', 2)

        # IELTS: the three text nodes following the label, joined.
        if 'International applicants' in long_str:
            index_IELTS = long_str.index('International applicants')
            IELTS = ''.join(long_str[index_IELTS + 1:index_IELTS + 4])
        else:
            IELTS = ''

        CourseOverview = section_text('//section[2]//text()')
        Assessment = section_text('//section[3]//text()').strip()
        Modules = section_text('//div[@id="year-tabs"]//text()').strip()

        # Tuition fee: overseas amount without comma and pound sign.
        Fee = response.xpath('//section[5]//text()').extract()
        if 'Overseas' in Fee:
            TuitionFee = Fee[Fee.index('Overseas') + 2]
            TuitionFee = TuitionFee.replace(',', '').replace('£', '')
        else:
            TuitionFee = ''

        Career = section_text('//section[6]//text()').strip()
        EntryRequirements = ''.join(long_str).strip()
        url = response.url
        University = 'St-Andrews'

        item["url"] = url
        item["University"] = University
        item["Course"] = Course
        item["CourseCode"] = CourseCode
        item["Master"] = Master
        item["CourseOverview"] = CourseOverview
        item["Alevel"] = Alevel
        item["IB"] = IB
        item["IELTS"] = IELTS
        item["TOEFL"] = ''
        item["Assessment"] = Assessment
        item["Career"] = Career
        item["TuitionFee"] = TuitionFee
        item["Modules"] = Modules
        item["Duration"] = Duration
        item["StartDate"] = StartDate
        item["Deadline"] = Deadline
        item["EntryRequirements"] = EntryRequirements
        item["Location"] = ''
        item["Department"] = ''

        yield item