コード例 #1
0
    def parse(self, response):
        """Scrape one Bournemouth University course page and yield its item.

        The course fields are read from ``response`` with XPath queries,
        cleaned with the project's helper functions and stored on an item
        created by ``get_item1(ScrapyschoolEnglandItem1)``.

        Args:
            response: the downloaded course page; only ``response.xpath`` and
                ``response.url`` are used.

        Yields:
            The populated item (exactly one per response).
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(xpath):
            # Concatenate the serialized nodes selected by ``xpath``.
            return ''.join(response.xpath(xpath).extract())

        def plain_text(xpath):
            # Selected markup with the tags removed.
            return remove_tags(joined(xpath))

        def clean_html(xpath):
            # Selected markup passed through remove_class + clear_space_str.
            return clear_space_str(remove_class(joined(xpath)))

        # 1. university
        university = 'Bournemouth University'

        # 2. location
        location = plain_text(
            "//*[contains(text(),'Location:')]//following-sibling::p")

        # 3. programme_en / 4. degree_name
        # The first word of the heading is taken as the degree name and is
        # then stripped out of the programme title.
        programme_en = plain_text('/html/body/div/section//h1')
        heading_words = programme_en.split()
        degree_name = heading_words[0] if heading_words else ''
        programme_en = programme_en.replace('-', '')
        programme_en = programme_en.replace(degree_name, '')
        programme_en = clear_space_str(programme_en)
        if '–' in programme_en:
            programme_en = programme_en.replace('–', '').strip()
        # Drop the HTML entity first so no 'amp;' residue is left behind,
        # matching how the undergraduate spider cleans its titles.
        programme_en = programme_en.replace('&amp;', '').replace('&', '')

        # 5. degree_type
        degree_type = 2

        # 6. teach_time
        # NOTE(review): the previous version scraped the 'Delivery:' field and
        # then unconditionally overwrote the result with this constant, so the
        # constant is the only value that was ever emitted.  The dead scrape
        # was removed; confirm that a fixed value is really what is wanted.
        teach_time = 'full time'

        # 7. duration / 8. duration_per
        # duration_per is 1 for phrases given in years and 3 for phrases given
        # in months.  Rules are tried in order and the first phrase found in
        # the text wins, so a longer phrase must come before any rule that is
        # a substring of it.
        duration_text = plain_text(
            "//*[contains(text(),'Duration:')]//following-sibling::p")
        duration_rules = (
            ('1 year', 1, 1),
            ('12-18 months', 12, 3),
            # Was unreachable when placed after '36 months'.
            ('18-36 months', 18, 3),
            ('36 months', 36, 3),
            ('1 to 2 years', 1, 1),
            ('2 years', 2, 1),
            ('3-5 years', 3, 1),
            ('48 months', 48, 3),
            ('12 months', 12, 3),
            ('5 years', 5, 1),
            ('3 years', 3, 1),
            ('14 months', 14, 3),
            ('15 months', 15, 3),
            ('18-24 months', 18, 3),
            ('27 months', 27, 3),
            ('8 months', 8, 3),
            ('Nine months', 9, 3),
        )
        duration, duration_per = 1, 1  # fallback when no phrase matches
        for phrase, amount, unit in duration_rules:
            if phrase in duration_text:
                duration, duration_per = amount, unit
                break

        # 9. overview_en
        overview_en = clean_html('//*[@id="main-content"]/div/section[2]/p')

        # 11. modules_en
        modules_en = clean_html(
            "//section[@id='course-details']//div[@id='accordion-1']")

        # 12. start_date
        start_date = plain_text(
            "//*[contains(text(),'Next start date:')]//following-sibling::p")
        start_date = ','.join(tracslateDate(clear_space_str(start_date)))

        # 13. rntry_requirements
        rntry_requirements = clean_html(
            "//*[contains(text(),'Entry requirements')]"
            "/../following-sibling::div[1]")

        # 14-18. ielts
        # With one to four "d.d" numbers in the entry requirements, the last
        # one is used for every component score and the one before it (or the
        # same one, if there is only one) for the overall score.  Any other
        # count leaves all scores as None.
        scores = re.findall(r'\d\.\d', rntry_requirements)
        if 1 <= len(scores) <= 4:
            component = scores[-1]
            ielts = scores[-2] if len(scores) > 1 else scores[0]
        else:
            ielts = component = None
        ielts_l = ielts_s = ielts_r = ielts_w = component

        # 19. career_en
        career_en = clean_html(
            "//*[contains(text(),'Careers')]/../following-sibling::*"
            "|//*[contains(text(),'Careers')]//following-sibling::*")

        # 20. tuition_fee / 21. tuition_fee_pre
        tuition_fee = getTuition_fee(joined(
            '//*[@id="fees-box"]/div/div/span'
            '|//*[@id="fees-box"]/div[2]/div[2]/p[2]'
            '|//*[@id="fees-box"]/div/div[2]/ul/li[1]'))
        tuition_fee_pre = '£'

        # 22. url
        url = response.url

        # 23. application_open_date
        application_open_date = '2018-7-18'

        # 24. apply_pre / 25. apply_fee
        apply_pre = '£'
        apply_fee = 0

        # 26. apply_proces_en (fixed text, not scraped)
        apply_proces_en = '<p>Step 1: Application Complete all sections of your country’s application form. Step 2: Terms and conditions You must read, understand and agree to be bound by the terms and conditions before moving on to the next step. Step 3: Confirmation Sign the application form to confirm you have provided correct details and you agree to the terms and conditions. Step 4: Other documents Scan, attach and send in your additional documents to the email address on the application form: Academic transcripts and exam results English test score if required If you do not have academic transcripts or English test results, you can still apply and we can make you a conditional offer, the conditions of which you will need to satisfy before we confirm your place. Step 5: Assessment We’ll contact you to arrange one or more of the following if required: English language test Mathematics test Interview This will allow us to further assess your suitability for the program. Step 6: Admission decision Receive an admission decision and, if your application is successful, accompanying offer letter. Step 7: Deposit Accept your offer by paying the deposit. Step 8: Pre-arrival Receive confirmation of program acceptance, pre-arrival information, plus guidance on finding local accommodation if you are coming from outside the USPP host city to study. For conditional offers, these items are issued once we receive proof that the conditions of the offer have been met. Step 9: Travel Arrange travel to USPP location if applicable.Arrive at your USPP center for orientation before classes begin. Step 10: Begin your USPP program USPP teaching begins. View the program timeline for next steps.</p>'

        # 27. require_chinese_en (fixed text, not scraped)
        # Written as two adjacent literals: the text used to contain a raw
        # line break inside a plain "..." string, which is a syntax error.
        require_chinese_en = (
            "<p>This is a guide to the normal entry requirements, assuming you’ve followed the Chinese education system. An admissions tutor will study your application, so make sure you include your academic background and personal information when you apply.Entry requirements vary depending on what sort of course you’re coming to BU to study. BU International College Foundation Certificates You can undertake a Foundation Certificate before going on to an undergraduate course if you’ve completed 11 years of schooling or Senior High School Year 2 in China and have a minimum of IELTS (Academic) 5.0. Undergraduate courses You can apply to study a Bachelor's degree from year one if you hold a Chinese Senior High School Diploma plus successful completion of a relevant first-year undergraduate programme in a recognised Chinese university, or a Diploma from Specialized College (zhongzhuan). Chinese Senior High School certificate of graduation with overall HuiKao result grade B average,  transcripts of 3 years with 85% average (85% also eligible for AES). Top-up courses You need to hold a College Graduation Diploma (Dazhuan awarded by a university/college on completion of two to three years of study), or a BTEC Higher National Diploma or Foundation degree in a relevant subject.Postgraduate courses ​You need to have a Bachelor's (Honours) degree from a recognised Chinese university, normally from a four-year undergraduate programme, or a Bachelors degree from Higher Education Self-Study Examinations, or a Top-up degree or university-recognised Pre-Master’s Foundation programme. "
            "Grade requirements from Chinese Bachelor's degree holders are as below: Applicants from 985 or 211 universities Media studies and other subjects equivalent to UK 2:1 degree	65% +	GPA 2.25 + Business and subjects equivalent to UK 2:2 degree	60% +	GPA 2.0 + Academic Excellence Scholarship (automatic award of £3500)	75% +	GPA 2.75 + Applicants from other universities Media studies and other subjects equivalent to UK 2:1 degree	70% +	GPA 2.5 + Business and subjects equivalent to UK 2:2 degree	65% +	GPA 2.25 + Academic Excellence Scholarship (automatic award of £3500)	80% +	GPA 3.0 + Research programmes You need a good postgraduate degree to be considered for a BU research programme. Please see more detail on the postgraduate research page.You can find more information about English language requirements for entry to BU on our English language requirements page. Full information about preparatory courses is available on the Bournemouth University International College website.If you need help with your visa or want more information about the immigration process, you can find it on our immigration information page.</p>"
        )

        item['require_chinese_en'] = require_chinese_en
        item['apply_proces_en'] = apply_proces_en
        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['location'] = location
        item['programme_en'] = programme_en
        item['degree_name'] = degree_name
        item['degree_type'] = degree_type
        item['teach_time'] = teach_time
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['start_date'] = start_date
        item['rntry_requirements'] = rntry_requirements
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['url'] = url
        item['application_open_date'] = application_open_date
        yield item
コード例 #2
0
    def parse(self, response):
        """Scrape one Edge Hill University course page and yield its item.

        The course fields are read from ``response`` with XPath queries,
        cleaned with the project's helper functions and stored on an item
        created by ``get_item1(ScrapyschoolEnglandItem1)``.  A page that lacks
        a heading or a numeric course length still produces an item instead
        of raising ``IndexError``.

        Args:
            response: the downloaded course page; only ``response.xpath`` and
                ``response.url`` are used.

        Yields:
            The populated item (exactly one per response).
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(xpath):
            # Concatenate the serialized nodes selected by ``xpath``.
            return ''.join(response.xpath(xpath).extract())

        def plain_text(xpath):
            # Selected markup with the tags removed.
            return remove_tags(joined(xpath))

        def clean_html(xpath):
            # Selected markup passed through remove_class.
            return remove_class(joined(xpath))

        # 1. university
        university = 'Edge Hill University'

        # 2. url
        url = response.url

        # 3. programme_en
        programme_en = plain_text('//*[@id="primary"]/header/h1/a')

        # 4. degree_type
        degree_type = 2

        # 5. degree_name
        # The first word of the heading is taken as the degree name and is
        # then stripped out of the programme title.  An empty heading yields
        # an empty degree name rather than an IndexError.
        heading_words = programme_en.split()
        degree_name = heading_words[0] if heading_words else ''
        programme_en = programme_en.replace(degree_name, '').strip()

        # 6. teach_time / 7. duration / 8. duration_per
        length_text = plain_text(
            "//*[contains(text(),'Length:')]//following-sibling::*")
        numbers = re.findall(r'\d+', length_text)
        # First number in the text, kept as a string; None when the page
        # states no number at all.
        duration = numbers[0] if numbers else None
        if 'Months' in length_text:
            duration_per = 3
        elif 'Weeks' in length_text:
            duration_per = 4
        else:
            duration_per = 1
        teach_time = 'Full-Time' if 'Full-Time' in length_text else 'Part-Time'

        # 9. start_date
        start_date = ','.join(tracslateDate(plain_text(
            "//*[contains(text(),'Dates:')]//following-sibling::*")))

        # 10. department
        department = plain_text(
            "//*[contains(text(),'Department:')]//following-sibling::*")

        # 11. location
        location = plain_text(
            "//*[contains(text(),'Location:')]//following-sibling::*")

        # 12. overview_en
        overview_en = '<p>' + clean_html(
            '//*[@id="overview"]/div[1]/div/ul/li/text()') + '</p>'

        # 13. assessment_en
        assessment_en = clean_html(
            "//*[contains(text(),'How will I be assessed?')]"
            "//following-sibling::*[1]")

        # 14. modules_en
        modules_en = clean_html('//*[@id="modules"]/h4/strong')

        # 15. rntry_requirements
        rntry_requirements = clean_html(
            "//*[contains(text(),'Entry Requirements')]//following-sibling::*")

        # 16-20. ielts
        # One "d.d" number is used for every score; with two, the first is
        # the overall score and the second every component score.  Any other
        # count falls back to the fixed defaults below.
        # NOTE(review): scraped scores are strings while the defaults are
        # floats; kept as-is so the emitted values do not change.
        scores = re.findall(r'\d\.\d', plain_text(
            "//*[contains(text(),'English Language Requirements')]"
            "//following-sibling::*[1]"))
        if len(scores) == 1:
            ielts = component = scores[0]
        elif len(scores) == 2:
            ielts, component = scores
        else:
            ielts, component = 6.5, 6.0
        ielts_r = ielts_w = ielts_s = ielts_l = component

        # 21. career_en
        career_en = clean_html(
            "//*[contains(text(),'What are my career prospects?')]"
            "//following-sibling::*")

        # 22. tuition_fee
        tuition_fee = getTuition_fee(plain_text(
            "//*[contains(text(),'Tuition Fees')]//following-sibling::*"))

        # 23. tuition_fee_pre
        tuition_fee_pre = '£'

        # 24. apply_proces_en
        apply_proces_en = clean_html(
            "//h4[contains(text(),'How to Apply')]//following-sibling::*")

        # 25. apply_pre
        apply_pre = '£'

        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['teach_time'] = teach_time
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['start_date'] = start_date
        item['department'] = department
        item['location'] = location
        item['overview_en'] = overview_en
        item['assessment_en'] = assessment_en
        item['modules_en'] = modules_en
        item['rntry_requirements'] = rntry_requirements
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['apply_proces_en'] = apply_proces_en
        yield item
コード例 #3
0
    def parse(self, response):
        """Scrape one Bournemouth University course page and yield its item.

        The course fields are read from ``response`` with XPath queries,
        cleaned with the project's helper functions and stored on an item
        created by ``get_item1(ScrapyschoolEnglandItem1)``.  A page whose
        duration text contains no digit still produces an item instead of
        raising ``IndexError``.

        Args:
            response: the downloaded course page; only ``response.xpath`` and
                ``response.url`` are used.

        Yields:
            The populated item (exactly one per response).
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(xpath):
            # Concatenate the serialized nodes selected by ``xpath``.
            return ''.join(response.xpath(xpath).extract())

        def plain_text(xpath):
            # Selected markup with the tags removed.
            return remove_tags(joined(xpath))

        def clean_html(xpath):
            # Selected markup passed through remove_class + clear_space_str.
            return clear_space_str(remove_class(joined(xpath)))

        # 1. university
        university = 'Bournemouth University'

        # 2. location
        location = plain_text(
            "//*[contains(text(),'Location:')]//following-sibling::p")

        # 3. programme_en / 4. degree_name
        # The first word of the heading is taken as the degree name and is
        # then stripped out of the programme title.
        programme_en = plain_text('/html/body/div/section//h1')
        heading_words = programme_en.split()
        degree_name = heading_words[0] if heading_words else ''
        programme_en = programme_en.replace('-', '')
        programme_en = programme_en.replace(degree_name, '')
        programme_en = clear_space_str(programme_en)
        if '–' in programme_en:
            programme_en = programme_en.replace('–', '').strip()
        programme_en = programme_en.replace('&amp;', '')
        programme_en = programme_en.replace('(Hons)', '').strip()

        # 5. degree_type
        degree_type = 1

        # 6. ucascode
        ucascode = plain_text(
            "//*[contains(text(),'UCAS Code:')]//following-sibling::*")

        # 7. duration / 8. duration_per
        duration_text = plain_text(
            "//*[contains(text(),'Duration:')]//following-sibling::p")
        duration_per = 1
        if 'Four years' in duration_text:
            duration = 4
        else:
            # First single digit in the text, kept as a string; None when
            # the text contains no digit at all.
            digits = re.findall(r'\d', duration_text)
            duration = digits[0] if digits else None

        # 9. overview_en
        overview_en = clean_html('//*[@id="main-content"]/div/section[3]/p')

        # 10. alevel
        # The second matching paragraph is used; 'N/A' when fewer than two
        # paragraphs match.
        alevel_paragraphs = response.xpath(
            "//*[contains(text(),'GCSEs')]//preceding-sibling::p").extract()
        if len(alevel_paragraphs) > 1:
            alevel = remove_tags(alevel_paragraphs[1])
        else:
            alevel = 'N/A'

        # 11. modules_en
        modules_en = clean_html(
            "//section[@id='course-details']//div[@id='accordion-1']")

        # 12. start_date
        start_date = plain_text(
            "//*[contains(text(),'Next start date:')]//following-sibling::p")
        start_date = ','.join(tracslateDate(clear_space_str(start_date)))

        # 13. ib (capped at 500 characters)
        ib = plain_text(
            "//*[contains(text(),'International Baccalaureate')]/..")[:500]

        # 14-18. ielts
        # The raw entry-requirements markup is only searched for scores; it is
        # not stored on the item.  With one to four "d.d" numbers, the last
        # one is used for every component score and the one before it (or the
        # same one, if there is only one) for the overall score.  Any other
        # count leaves all scores as None.
        rntry_requirements = joined('//*[@id="entry-requirements"]/div')
        scores = re.findall(r'\d\.\d', rntry_requirements)
        if 1 <= len(scores) <= 4:
            component = scores[-1]
            ielts = scores[-2] if len(scores) > 1 else scores[0]
        else:
            ielts = component = None
        ielts_l = ielts_s = ielts_r = ielts_w = component

        # 19. career_en
        career_en = clean_html(
            "//*[contains(text(),'Careers')]//following-sibling::*")

        # 20. tuition_fee / 21. tuition_fee_pre
        tuition_fee = getTuition_fee(joined(
            '//*[@id="fees-box"]/div/div/span'
            '|//*[@id="fees-box"]/div[2]/div[2]/p[2]'
            '|//*[@id="fees-box"]/div/div[2]/ul/li[1]'))
        tuition_fee_pre = '£'

        # 22. url
        url = response.url

        # 23. application_open_date
        application_open_date = '2018-7-18'

        # 24. apply_pre / 25. apply_fee
        apply_pre = '£'
        apply_fee = 0

        # 26. apply_proces_en (fixed text, not scraped)
        apply_proces_en = '<p>Step 1: Application Complete all sections of your country’s application form. Step 2: Terms and conditions You must read, understand and agree to be bound by the terms and conditions before moving on to the next step. Step 3: Confirmation Sign the application form to confirm you have provided correct details and you agree to the terms and conditions. Step 4: Other documents Scan, attach and send in your additional documents to the email address on the application form: Academic transcripts and exam results English test score if required If you do not have academic transcripts or English test results, you can still apply and we can make you a conditional offer, the conditions of which you will need to satisfy before we confirm your place. Step 5: Assessment We’ll contact you to arrange one or more of the following if required: English language test Mathematics test Interview This will allow us to further assess your suitability for the program. Step 6: Admission decision Receive an admission decision and, if your application is successful, accompanying offer letter. Step 7: Deposit Accept your offer by paying the deposit. Step 8: Pre-arrival Receive confirmation of program acceptance, pre-arrival information, plus guidance on finding local accommodation if you are coming from outside the USPP host city to study. For conditional offers, these items are issued once we receive proof that the conditions of the offer have been met. Step 9: Travel Arrange travel to USPP location if applicable.Arrive at your USPP center for orientation before classes begin. Step 10: Begin your USPP program USPP teaching begins. View the program timeline for next steps.</p>'

        # 27. assessment_en
        assessment_en = remove_class(joined(
            "//*[contains(text(),'How you will be assessed')]"
            "//following-sibling::p|//*[@id='accordion-1']/div[6]"))

        item['assessment_en'] = assessment_en
        item['alevel'] = alevel
        item['ib'] = ib
        item['ucascode'] = ucascode
        item['apply_proces_en'] = apply_proces_en
        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['location'] = location
        item['programme_en'] = programme_en
        item['degree_name'] = degree_name
        item['degree_type'] = degree_type
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['start_date'] = start_date
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['url'] = url
        item['application_open_date'] = application_open_date
        yield item
コード例 #4
0
    def parse(self, response):
        pass
        item = get_item1(ScrapyschoolEnglandItem1)

        #1.university
        university = 'London School of Economics and Political Science'
        # print(university)

        #2.url
        url = response.url
        # print(url)

        #3.teach_type
        if '2018/VRS' in url:
            teach_type = 'research'
        elif '2018/MResPhD' in url:
            teach_type = 'research'
        elif '2018/MPhilPhD' in url:
            teach_type = 'research'
        else:
            teach_type = 'taught'
        # print(teach_type)

        #4.programme_en
        programme_en = response.xpath(
            '//*[@id="form1"]/header[2]/div/div[2]/h1').extract()
        programme_en = ''.join(programme_en)
        programme_en = remove_tags(programme_en)
        # print(programme_en)

        #5.degree_type
        degree_type = 2

        #6.degree_name
        if 'MSc' in programme_en:
            degree_name = 'MSc'
        elif 'LSE' in programme_en:
            degree_name = 'LSE'
        elif 'MPA' in programme_en:
            degree_name = 'MPA'
        elif 'LLM' in programme_en:
            degree_name = 'LLM'
        elif 'Diploma' in programme_en:
            degree_name = 'Diploma'
        elif 'MA' in programme_en:
            degree_name = 'MA'
        elif 'MPhil/PhD' in programme_en:
            degree_name = 'MPhil/PhD'
        elif 'MRes/PhD' in programme_en:
            degree_name = 'MRes/PhD'
        elif 'Visiting Research' in programme_en:
            degree_name = 'Visiting Research'
        else:
            degree_name = 'N/A'
        # print(degree_name)
        programme_en = programme_en.replace(degree_name,
                                            '').strip().replace('-', '')
        # print(programme_en)

        #7.department
        department = response.xpath(
            '//*[@id="form1"]/div[4]/div/div[1]/div/ul/li[2]').extract()
        department = ''.join(department)
        department = remove_tags(department)
        # print(department)

        #8.overview_en
        overview_en = response.xpath(
            '//*[@id="form1"]/div[3]/div/div[2]/div/p|//*[@id="form1"]/div[4]/div/div[2]/div/p'
        ).extract()
        overview_en = ''.join(overview_en)
        overview_en = remove_class(overview_en)
        overview_en = clear_space_str(overview_en)
        # print(overview_en)

        #9.start_date
        start_date = response.xpath(
            "//*[contains(text(),'Start date')]//following-sibling::*"
        ).extract()
        start_date = ''.join(start_date)
        start_date = remove_tags(start_date)
        try:
            start_date = tracslateDate(start_date)[0]
            if 'Introductory' in start_date:
                start_date = '2018-9'
            elif 'Early' in start_date:
                start_date = '2018-9'
            elif 'First' in start_date:
                start_date = '2018-9'
            elif 'Mandatory' in start_date:
                start_date = '2018-9'
            elif 'Suspended' in start_date:
                start_date = '2018-9'
            elif 'Late' in start_date:
                start_date = '2018-9'
            elif 'Intake' in start_date:
                start_date = '2018-9'
            else:
                start_date = start_date
        except:
            start_date = 'N/A'
        # print(start_date)

        #10.deadline
        deadline = response.xpath(
            "//*[contains(text(),'Application deadline')]//following-sibling::*"
        ).extract()
        deadline = ''.join(deadline)
        deadline = remove_tags(deadline)
        deadline = tracslateDate(deadline)
        deadline = ''.join(deadline)
        deadline = deadline.replace('None', '').replace('However', '').replace(
            'Apply', '').replace('Sciences', '')
        # print(deadline)

        #11.duration
        duration_list = response.xpath(
            "//*[contains(text(),'Duration')]//following-sibling::*").extract(
            )
        duration_list = ''.join(duration_list)
        duration_list = remove_tags(duration_list)
        # print(duration_list)
        if 'Nine months' in duration_list:
            duration = 9
            duration_per = 3
        elif 'Ten months' in duration_list:
            duration = 10
            duration_per = 3
        elif 'months' in duration_list:
            duration = re.findall('\d+', duration_list)[0]
            # print(duration)
            duration_per = 3
        elif 'Three-four years' in duration_list:
            duration = 3
            duration_per = 1
        elif 'Three to four years' in duration_list:
            duration = 3
            duration_per = 1
        elif 'Four to five' in duration_list:
            duration = 4
            duration_per = 1
        elif 'Five years' in duration_list:
            duration = 5
            duration_per = 1
        elif '3 to 4 years' in duration_list:
            duration = 3
            duration_per = 1
        elif 'Six years' in duration_list:
            duration = 6
            duration_per = 1
        else:
            duration = 1
            duration_per = 1
        # print(duration,'********************',duration_per)

        #12.tuition_fee
        tuition_fee = response.xpath(
            "//*[contains(text(),'Tuition fee')]//following-sibling::*[1]"
        ).extract()
        tuition_fee = ''.join(tuition_fee)
        tuition_fee = remove_tags(tuition_fee)
        tuition_fee = getTuition_fee(tuition_fee)
        # print(tuition_fee)

        #13.location
        location = response.xpath(
            "//*[contains(text(),'Location')]//following-sibling::*[1]"
        ).extract()
        location = ''.join(location)
        location = remove_tags(location)
        # print(location)

        #14.rntry_requirements
        rntry_requirements = response.xpath(
            "//*[contains(text(),'Minimum entry requirement')]//following-sibling::*[1]"
        ).extract()
        rntry_requirements = ''.join(rntry_requirements)
        rntry_requirements = remove_class(rntry_requirements)
        rntry_requirements = clear_space_str(rntry_requirements)
        # print(rntry_requirements)

        #15.modules_en
        modules_en = response.xpath(
            "//*[contains(text(),'Programme structure and courses')]/../../following-sibling::*"
        ).extract()
        modules_en = ''.join(modules_en)
        modules_en = remove_class(modules_en)
        modules_en = clear_space_str(modules_en)
        # print(modules_en)

        #16.assessment_en
        assessment_en = response.xpath(
            "//h3[contains(text(),'ssessment')]//following-sibling::*"
        ).extract()
        assessment_en = ''.join(assessment_en)
        assessment_en = remove_class(assessment_en)
        assessment_en = clear_space_str(assessment_en)
        # print(assessment_en)

        #17.career_en
        career_en = response.xpath(
            "//*[contains(text(),'Support for your career')]//preceding-sibling::*"
        ).extract()
        career_en = ''.join(career_en)
        career_en = clear_space_str(career_en)
        career_en = remove_class(career_en)
        # print(career_en)

        #18-22.ielts and its sub-scores (ielts_r, ielts_l, ielts_w, ielts_s)
        if 'MPhil' in programme_en:
            ielts = 7.0
            ielts_r = 6.5
            ielts_l = 6.5
            ielts_w = 7.0
            ielts_s = 6.5
        elif 'LLM' in programme_en:
            ielts = 7.5
            ielts_r = 6.5
            ielts_l = 7.0
            ielts_w = 7.0
            ielts_s = 6.5
        else:
            ielts = 7.0
            ielts_r = 6.5
            ielts_l = 6.5
            ielts_w = 6.0
            ielts_s = 6.0

        #23.require_chinese_en
        require_chinese_en = "<p>Graduate entry requirements for applicants from China Taught master's programmes (MSc/MA/MPA/LLM)To be considered for admission to a taught master's programme, we would normally require a bachelor's degree with an overall mark of 85 per cent from applicants who have attended a highly regarded institution in China, with all other applicants we would normally require a mark of at least 90 per cent.Research programmes (MPhil/MRes/PhD)To be considered for admission to a research programme, we would normally require a master's degree with an overall mark of 85 per cent/B from applicants who have attended a highly regarded institution, while all other applicants are normally required to obtain a mark of 90 per cent/A.</p>"

        #24.apply_proces_en
        apply_proces_en = 'http://www.lse.ac.uk/study-at-lse/Graduate/Prospective-students/How-to-Apply'

        #25.teach_time
        teach_time = 'Full time'

        #26.tuition_fee_pre
        tuition_fee_pre = '£'
        #27-31.toefl and its sub-scores (toefl_r, toefl_l, toefl_w, toefl_s)
        if 'MPhil' in programme_en:
            toefl = 100
            toefl_r = 23
            toefl_l = 22
            toefl_w = 27
            toefl_s = 22
        elif 'LLM' in programme_en:
            toefl = 109
            toefl_r = 23
            toefl_l = 24
            toefl_w = 27
            toefl_s = 22
        else:
            toefl = 100
            toefl_r = 23
            toefl_l = 22
            toefl_w = 24
            toefl_s = 22
        #32.apply_pre
        apply_pre = '£'
        #33.apply_documents_en
        apply_documents_en = '<p>We welcome applications from all suitably qualified prospective students and want to recruit students with the very best academic merit, potential and motivation, irrespective of their background. We carefully consider each application on an individual basis, taking into account all the information presented on your application form, including your: academic achievement (including predicted and achieved grades) personal statement two references CV</p>'
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_w'] = toefl_w
        item['toefl_l'] = toefl_l
        item['toefl_s'] = toefl_s
        item['apply_pre'] = apply_pre
        item['apply_documents_en'] = apply_documents_en
        item['university'] = university
        item['url'] = url
        item['teach_type'] = teach_type
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['department'] = department
        item['overview_en'] = overview_en
        item['start_date'] = start_date
        item['deadline'] = deadline
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['tuition_fee'] = tuition_fee
        item['location'] = location
        item['rntry_requirements'] = rntry_requirements
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['require_chinese_en'] = require_chinese_en
        item['apply_proces_en'] = apply_proces_en
        item['teach_time'] = teach_time
        item['tuition_fee_pre'] = tuition_fee_pre
        yield item