Python clear_duration Examples

Programming Language: Python

Namespace/Package Name: scrapySchool_England.middlewares

Method/Function: clear_duration

Examples at hotexamples.com: 23

Python clear_duration - 23 examples found. These are the top-rated real-world Python examples of scrapySchool_England.middlewares.clear_duration, extracted from open-source projects. You can rate examples to help us improve their quality.

Example #1

Show file

File: DeMontfortUniversity_P.py Project: histudent/python_spider

    def parse(self, response):
        """Scrape one De Montfort University course page into an item.

        Extracts the course title, overview, tuition fee, duration, entry
        requirements, modules/assessment, careers and IELTS text from
        ``response`` with XPath, passes them through the project helpers and
        yields a single populated item.

        Raises:
            IndexError: if the course title heading is not present on the page.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)

        Course = response.xpath(
            '//div[@class="block__details block__details--overlay block__details--courseOverlay"]//h1[@class="block__details__title"]//text()'
        ).extract()[0]
        Course = Course.strip()

        # An award written with a bracketed suffix is cut out of the title.
        Master = ''.join(
            re.findall(r'[A-Z]{1}[A-Za-z]{1,3}\s?\([a-zA-Z]*\)', Course))
        programme = Course.replace(Master, '')
        if Master == '':
            Master = ''.join(re.findall('MA|MSc', Course))
        else:
            # NOTE(review): a bracketed award is removed from the title but is
            # not kept as the degree name — confirm this is intended.
            Master = ''

        # Programme description
        CourseOverview = response.xpath(
            '//div[@class="block large-8 columns course-col2"]').extract()
        overview = clear_same_s(remove_class(CourseOverview))

        # Tuition fee
        tuition_fee = response.xpath(
            '//*[contains(text(),"£")]//text()').extract()
        tuition_fee = getTuition_fee(tuition_fee)

        # Course duration; the same text decides full-time ('1') vs other ('2').
        duration = response.xpath(
            '//*[contains(text(),"uration")]/..//text()').extract()
        mode = '1' if re.findall('(?i)full', ''.join(duration)) else '2'
        try:
            duration = clear_duration(duration)
        except Exception:
            # Best effort: an unparseable duration must not lose the item.
            duration = {'duration_per': None, 'duration': None}
        print(duration)

        # Entry requirements
        standard = response.xpath(
            '//div[@class="row row--block course-section course-section--criteria"]'
        ).extract()
        standard = clear_same_s(remove_class(standard))

        # Modules and assessment (both fields come from the same section)
        Evaluation_method = response.xpath(
            '//div[@id="cycle-slideshow_course"]').extract()
        Evaluation_method = clear_same_s(remove_class(Evaluation_method))
        teaching_assessment = Evaluation_method.strip()

        # Careers
        Career = response.xpath(
            '//div[@class="row row--block course-section course-section--opps"]'
        ).extract()
        career = clear_same_s(remove_class(Career))

        IELTS = response.xpath(
            '//*[contains(text(),"IELTS")]//text()').extract()
        ielts = get_ielts(IELTS)
        if ielts != {} and ielts != []:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']
        else:
            item['ielts'] = ''
            item['ielts_l'] = ''
            item['ielts_s'] = ''
            item['ielts_r'] = ''
            item['ielts_w'] = ''

        university = 'De Montfort University'
        programme = programme.replace(Master, '').strip()

        item["university"] = university
        # De Montfort University is located in Leicester.
        item["location"] = 'Leicester'
        item["department"] = ''
        item["programme_en"] = programme
        item["degree_name"] = Master
        item['degree_type'] = 2
        item["teach_time"] = mode
        item['teach_type'] = 'taught'
        item["overview_en"] = overview
        item["assessment_en"] = teaching_assessment
        item["career_en"] = career
        item["tuition_fee"] = tuition_fee
        item['tuition_fee_pre'] = '£'
        item["modules_en"] = Evaluation_method
        item["duration"] = duration['duration']
        item['duration_per'] = duration['duration_per']
        item["start_date"] = '2018-9'
        item["rntry_requirements"] = standard
        item["url"] = response.url

        yield item

Example #2

Show file

    def parse_main(self,response):
        """Fill an item from an Anglia Ruskin University course page.

        Reads programme, location, start date, duration, overview, careers,
        modules, fees and department from ``response``; adds fixed language
        requirements and application guidance; and resolves the entry
        requirements either from the page or from the course-widget API.
        """
        print(response.url)
        item=get_item1(ScrapyschoolEnglandItem1)
        item['university']='Anglia Ruskin University'
        item['url']=response.url
        item['teach_time']='1'

        # The heading text is split on CRLF; a four-part split carries the
        # programme in part 1 and the degree in part 2.
        programme=''.join(response.xpath('//h1/text()').extract()).split('\r\n')
        if len(programme)==4:
            prog=programme[1].strip()
            item['degree_name']=programme[2].strip()
        else:
            prog=''.join(programme)
        item['programme_en']=prog

        location=response.xpath('//div[@class="course-summary__text"]/p[@class="course-summary__locations"]/a/text()').extract()
        # De-duplicate while keeping page order so the output is deterministic.
        item['location']=','.join(dict.fromkeys(location))

        start_date=response.xpath('//div[@class="course-summary__text"]/p[@class="course-summary__entry"]/text()').extract()
        item['start_date']=','.join(tracslateDate(start_date))

        duration=response.xpath('//div[@class="course-summary__teaching"]/p[1]/text()').extract()
        try:
            duration=clear_duration(duration)
            item['duration']=duration['duration']
            item['duration_per']=duration['duration_per']
        except Exception:
            # Best effort: leave the duration fields untouched when parsing fails.
            pass

        item['overview_en']=remove_class(response.xpath('//div[@id="overview"]').extract())
        item['career_en']=remove_class(response.xpath('//div[@id="careers"]').extract())
        # remove_class is applied once, like the other sections.
        item['modules_en']=remove_class(response.xpath('//div[@id="modulesassessment"]').extract())

        item['ielts']='6.5'
        item['ielts_l']='5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'
        item['ielts_desc']='Our standard entry criteria for postgraduate courses is IELTS 6.5 or equivalent, with nothing lower than 5.5 in any of the four elements (listening, speaking, reading and writing).'
        item['toefl']='88'
        item['toefl_l']='17'
        item['toefl_s'] = '20'
        item['toefl_r'] = '18'
        item['toefl_w'] = '17'
        item['toefl_desc']="TOEFL iBT with 88 overall and a minimum of 17 in Writing and Listening, 18 in Reading and 20 in Speaking"

        fee=response.xpath('//div[@id="feesfunding"]//text()').extract()
        tuition_fee=getTuition_fee(fee)
        # presumably 2018 is a year picked up instead of a fee — TODO confirm
        if tuition_fee==2018:
            tuition_fee=0
        item['tuition_fee']=tuition_fee
        item['tuition_fee_pre']='£'

        # The department name is derived from the last path segment of the link.
        department=response.xpath('//a[contains(text(),"Visit your")]/@href').extract()
        department=''.join(department).split('/')[-1]
        item['department']=department.title().replace('-',' ')

        # Steps are listed in numerical order.
        how_to_apply=[
            "<p>Step 1 - Choose your course</p>",
            "<p>Step 2 - Submit your application form</p>",
            "<p>Step 3 - Check your email regularly</p>",
            "<p>Step 4 - Receive our decision on your application</p>",
            "<p>Step 5 - Start your visa application</p>",
        ]
        item['apply_proces_en']='\n'.join(how_to_apply)

        apply_d=[
            "<ul><li>Qualification certificates and transcripts, including certified translations, where applicable</li>",
            "<li>A personal statement. You can download and complete our Personal Statement Form.</li>",
            "<li>References/recommendation letters</li>",
            "<li>Curriculum vitae/resume</li>",
            "<li>Passport</li>",
            "<li>Current and previous visa(s) (if applicable)</li>",
            "<li>Proof of name change (if applicable)</li>",
            "<li>Portfolio (if applicable)</li></ul>",
        ]
        item['apply_documents_en']='\n'.join(apply_d)

        courseid=response.xpath('//input[@id="erastracode"]/@value').extract()
        if not courseid or courseid==['']:
            # No course code available: take the requirements from the page.
            rntry=response.xpath('//h4[contains(text(),"ain")]/following-sibling::*').extract()
            item['rntry_requirements'] = remove_class(rntry)
        else:
            cid=re.findall('[A-Z0-9]+',courseid[0])
            courseid='%20'.join(cid)
            rntry_url='https://www.anglia.ac.uk/api/coursewidget/multipleentryrequirements?academicYears=2017%2C2018&moaCode=FT&astraCode='+courseid
            try:
                # The timeout keeps a stalled API call from blocking forever.
                reply=requests.get(rntry_url,timeout=30)
                rntry_content=json.loads(reply.text)[0]['GroupItems'][0]['Text'][0]
                rntry_content='<div>'+rntry_content+'</div>'
            except Exception:
                # Best effort: network, JSON or shape errors give an empty field.
                rntry_content=''

            item['rntry_requirements'] = rntry_content
        # NOTE(review): the item is never yielded; the yield below was already
        # commented out — confirm whether this callback is intentionally disabled.
        # yield item

Example #3

Show file

File: StaffordshireUniversity_P.py Project: histudent/python_spider

    def parses(self, response):
        """Scrape a Staffordshire University course page and yield one item.

        Populates programme/degree, duration, start date, department, fee,
        overview, modules, entry requirements, careers, IELTS and assessment
        fields from XPath selections on ``response``.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'Staffordshire University'
        item['url'] = response.url
        item['location'] = 'Staffordshire'

        # Split the heading into programme name and degree abbreviation.
        title = response.xpath(
            '//div[@class="col-sm-9"]/h1/text()|//div[@id="main"]//h1/text()'
        ).extract()
        title = ''.join(title).strip()
        degree = re.findall(r'[A-Z]{2}[/a-zA-Z\s]*', title)
        programme = title.replace(''.join(degree), '').strip()
        if not degree:
            # No degree in the heading: fall back to the hero sub-heading.
            degree = response.xpath(
                '//h2[@class="hero_header text-center"]/text()').extract()
        item['degree_name'] = ''.join(degree).strip()
        item['programme_en'] = programme

        duration = response.xpath(
            '//th[contains(text(),"Duration")]/following-sibling::td/text()|//dt[contains(text(),"Duration")]/following-sibling::dd[1]/text()'
        ).extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        start_date = response.xpath(
            '//dt[contains(text(),"Academic year:")]/following-sibling::dd/text()'
        ).extract()
        if not start_date:
            start_date = response.xpath(
                '//th[contains(text(),"Course start")]/following-sibling::td/text()'
            ).extract()
        start_date = tracslateDate(start_date)
        item['start_date'] = ','.join(start_date).strip()

        department = response.xpath(
            '//th[contains(text(),"School")]/following-sibling::td/text()'
        ).extract()
        item['department'] = ''.join(department).strip()

        fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        overview = response.xpath(
            '//div[@id="key-features"]|'
            '//section[@class="course-details_section summary-section"]//div[@class="medium-8 medium-pull-4 large-pull-3 column"]'
        ).extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//div[@id="course-content"]|//section[@id="contents"]|//div[@id="course-summary"]'
        ).extract()
        item['modules_en'] = remove_class(modules)

        rntry = response.xpath(
            '//div[@id="course-entry-requirements"]|//section[@id="entry"]'
        ).extract()
        item['rntry_requirements'] = remove_class(rntry)

        career = response.xpath(
            '//div[@id="graduate-destinations"]|//section[@id="careers"]'
        ).extract()
        item['career_en'] = remove_class(career)

        ielts = response.xpath(
            '//*[contains(text(),"IELTS")]//text()').extract()
        ielts = ''.join(ielts).strip()
        item['ielts_desc'] = ielts
        ielts = get_ielts(ielts)
        # Only copy scores when the helper returned something; a result that
        # lacks a key leaves the remaining score fields untouched.
        if ielts:
            try:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, TypeError):
                pass

        assessment = response.xpath(
            '//a[contains(text(),"ssessment")]/../following-sibling::div[1]'
        ).extract()
        item['assessment_en'] = remove_class(assessment)

        yield item

Example #4

Show file

File: BuckinghamshireNewUniversity_P.py Project: histudent/python_spider

    def parses(self, response):
        """Scrape a Buckinghamshire New University course page and yield one item.

        Populates location, programme, degree, duration, start date, overview,
        modules, careers and entry requirements from ``response``, and adds
        fixed fee, application and IELTS values.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['url'] = response.url
        item['university'] = 'Buckinghamshire New University'

        location = response.xpath(
            '//ul[@class="course-details"]/li[contains(text(),"Location")]/text()'
        ).extract()
        item['location'] = ''.join(location).replace('Location:', '').strip()

        programme = response.xpath(
            '//h1[@class="banner-title"]/text()').extract()
        item['programme_en'] = ''.join(programme).strip()
        degree_name = response.xpath(
            '//p[@class="school-code"]/text()').extract()
        item['degree_name'] = ''.join(degree_name).strip()

        duration = response.xpath(
            '//ul[@class="course-details"]/li[contains(text(),"Duration")]/text()'
        ).extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        start_date = response.xpath(
            '//ul[@class="course-details"]/li[contains(text(),"Start Date")]/text()'
        ).extract()
        start_date = tracslateDate(start_date)
        # Store the parsed start date; it was computed but never saved before.
        # assumes tracslateDate returns a list of strings — TODO confirm
        item['start_date'] = ','.join(start_date)

        overview = response.xpath(
            '//h2[contains(text(),"Course Overview")]/..').extract()
        item['overview_en'] = remove_class(overview)
        modules = response.xpath(
            '//h2[contains(text(),"Course Modules")]/..').extract()
        item['modules_en'] = remove_class(modules)
        career = response.xpath(
            '//h2[contains(text(),"Employability")]/..').extract()
        item['career_en'] = remove_class(career)

        entry = response.xpath(
            '//h3[contains(text(),"What are the course entry requirements?")]/following-sibling::p[position()<=3]'
        ).extract()
        # Debug output: the URL when nothing matched, otherwise the raw match.
        if entry == []:
            print(response.url)
        else:
            print(entry)
        item['rntry_requirements'] = remove_class(entry)
        item['tuition_fee'] = '11500'
        item['tuition_fee_pre'] = '£'

        chi = [
            ' <div>  ',
            ' <p>Academic entry requirements</p ><p>We require successful completion of a 学士学位 (Bachelor degree) or successful completion of a three-year 本科毕业证书 (Benke) with an overall pass from a UK NARIC-recognised or Ministry of Education-listed institution.</p ><p>Mathematics entry requirements</p ><p>Students need the equivalent of GCSE Mathematics grade C/4.</p >  ',
            ' </div>  ',
        ]
        htp = [
            '<p>There&rsquo;s still time to apply for September 2018. Visit our <a hre>clearing section</a> to find out more.</p><p><strong>Check you meet the entry requirements</strong></p><p>Once you&rsquo;ve had a good look at our course information, and chosen which one feels right for you, before applying it&rsquo;s worth checking that you meet the entry requirements for your country.</p><p>We welcome applications from students with a wide range of qualifications from around the world. You&rsquo;ll find details of the exact academic and English language requirements for your country on our <a hre>country pages</a>.</p><p>Every student studying with us also needs to meet our <a hre>English language requirements</a> and we will ask you to provide evidence to show you have good enough English to study a higher education course in the UK.</p><p><strong>Different ways to apply</strong></p><p>When you are ready to apply for your course, you can do so in one of three ways:</p><ul><li>directly through our <a href="https://www.applycpd.com/bucks?tabid=21">application portal</a></li><li>through <a hre>UCAS</a>, or</li><li>through a recruitment agent in your country (see <a hre>your country page</a> for details of agents we work with who are operating locally to you).</li></ul><p>It doesn&rsquo;t matter which of these routes you use, but we advise you to apply early to give yourself enough time to prepare for moving to the UK and arranging your visa, if you need one.</p><p>If you&rsquo;ve missed out on your first choices, declined any offers made to you, or you&rsquo;re applying to university after&nbsp;30 June, you can also apply to us through <a hre>Clearing</a>.</p>',
        ]
        item['require_chinese_en'] = remove_class(chi)
        item['apply_desc_en'] = remove_class(htp)
        item['ielts'] = '6.0'
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'

        yield item

Example #5

Show file

File: StMaryUniversityTwickenham_P.py Project: histudent/python_spider

    def parse(self, response):
        """Scrape a St Mary's University, Twickenham course page and yield one item.

        Populates entry requirements, modules, overview, duration, programme,
        careers, fee, IELTS and assessment fields from ``response`` and adds a
        fixed deadline and list of application documents.
        """
        print(response.url)
        item=get_item1(ScrapyschoolEnglandItem1)

        item['university']="St Mary's University, Twickenham"
        item['url']=response.url
        item['location']='London'

        rntry=response.xpath('//h2[contains(text(),"Entry requirements")]/following-sibling::div').extract()
        item['rntry_requirements']=remove_class(rntry)

        modules=response.xpath('//h2[contains(text(),"Course")]/../following-sibling::div//ul').extract()
        item['modules_en']=remove_class(modules)

        overview=response.xpath('//div[@id="overview"]//div[@class="large-8 columns content"]').extract()
        item['overview_en']=remove_class(overview)

        duration=response.xpath('//p[contains(text(),"uration")]/preceding-sibling::p/text()').extract()
        duration=clear_duration(duration)
        item['duration']=duration['duration']
        item['duration_per']=duration['duration_per']

        # Two heading text nodes are read as programme name then degree.
        programme=response.xpath('//h1/text()').extract()
        if len(programme)==2:
            item['programme_en']=programme[0]
            item['degree_name']=programme[1]
        else:
            item['programme_en']=''.join(programme).strip()

        career=response.xpath('//section[@id="careers"]').extract()
        item['career_en']=remove_class(career)

        fee=response.xpath('//h2[contains(text(),"Tuition")]/following-sibling::*/text()').extract()
        item['tuition_fee']=getTuition_fee(fee)
        item['tuition_fee_pre']='£'

        item['deadline']='2019-7-31'

        apply_d=[
            "<ul><li>Copies of academic transcripts and certificates</li>",
            "<li>A Copy of your English language requirements (if needed)</li>",
            "<li>A Copy of your passport</li>",
            "<li>Visa history questionnaire</li></ul>",
        ]
        item['apply_documents_en']='\n'.join(apply_d)

        ielts=response.xpath('//h4[contains(text(),"International re")]/following-sibling::p[1]/text()').extract()
        ielts=get_ielts(''.join(ielts).strip())
        # Only copy scores when the helper returned something; a result that
        # lacks a key leaves the remaining score fields untouched.
        if ielts:
            try:
                item['ielts_l']=ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, TypeError):
                pass

        assessment=response.xpath('//h2[contains(text(),"ssessment")]/following-sibling::p[position()<=5]').extract()
        if assessment==[]:
            # Report pages where no assessment section was found.
            print(response.url)
        item['assessment_en']=remove_class(assessment)

        yield item

Example #6

Show file

File: UniversityofBedfordshire_P.py Project: histudent/python_spider

    def parse_main(self,response):
        print(response.url)
        item=get_item1(ScrapyschoolEnglandItem1)

        item['university']='University of Bedfordshire'
        item['url']=response.url

        programme=response.xpath('//div[@id="inner-course-content"]/h1/text()').extract()
        # print(programme)
        programme=''.join(programme)
        # print(programme)
        item['tuition_fee_pre']='£'
        if 'MBA' in programme:
            # print(programme)
            item['tuition_fee']='14000'
        else:
            item['tuition_fee']='12750'

        programme=programme.split('-')
        if len(programme)==2:
            prog=programme[0].strip()
            degr=programme[1].strip()
            # print(prog)
            # print(degr)
            item['degree_name']=degr
            try:
                if degr[0] == 'M':
                    item['degree_type'] = '2'
                elif degr[0] == 'P':
                    item['degree_type'] = '3'
            except:
                pass
        else:
            prog=''.join(programme).strip()
        item['programme_en']=prog
        location=response.xpath('//strong[contains(text(),"Campus Location")]/../text()').extract()
        location=''.join(location).replace('-','').strip()
        # print(location)
        item['location']=location

        duration=response.xpath('//strong[contains(text(),"Duration")]/../text()').extract()
        duration=clear_duration(duration)
        # print(duration)
        item['duration']=duration['duration']
        item['duration_per']=duration['duration_per']

        mode=response.xpath('//strong[contains(text(),"Attendance")]/../text()').extract()
        mode=''.join(mode)
        mode=re.findall('(?i)full',mode)
        if mode!=[]:
            item['teach_time']='1'
        else:
            item['teach_time']='2'

        start_date=response.xpath('//strong[contains(text(),"Start")]/../text()').extract()
        # print(start_date)
        start_date=tracslateDate(start_date)
        # print(start_date)
        start_date=','.join(start_date)
        # print(start_date)
        item['start_date']=start_date

        overview=response.xpath('//div[@id="why_content"]').extract()
        overview=remove_class(overview)
        # print(overview)
        item['overview_en']=overview

        modules=response.xpath('//div[@id="unit_content"]').extract()
        modules=remove_class(modules)
        # print(modules)
        item['modules_en']=modules

        assessment_en=response.xpath('//div[@id="how_content"]').extract()
        assessment_en=remove_class(assessment_en)
        item['assessment_en']=assessment_en

        rntry=response.xpath('//h2[@id="entry"]/following-sibling::div/ul[@class="tab-content"]/div[3]').extract()
        rntry=remove_class(rntry)
        # print(rntry)
        item['rntry_requirements']=rntry

        item['ielts']='6.0'
        item['ielts_l']='5.5'
        item['ielts_s']='5.5'
        item['ielts_r']='5.5'
        item['ielts_w']='5.5'
        # item['toefl']='80'
        item['toefl_l']='17'
        item['toefl_s']='20'
        item['toefl_r']='18'
        item['toefl_w']='17'

        career=response.xpath('//div[@id="career_content"]').extract()
        career=remove_class(career)
        # print(career)
        item['career_en']=career

        apply_d=['<p>There are two ways you can make a direct application to the University of Bedfordshire:</p><ul><li><a href="https://evision.beds.ac.uk/urd/sits.urd/run/siw_ipp_lgn.login?process=siw_ipp_app&amp;code1=OA_FORM&amp;code2=0007">Apply online now for 2017/18</a> Courses starting from 1 August 2017 to 31 July 2018</li><li>Download <span class="include_asset_summary"><a href="https://www.beds.ac.uk/__data/assets/pdf_file/0006/441798/International-Application-web-2018.pdf">an application form - <img src="https://www.beds.ac.uk/__data/asset_types/pdf_file/icon.png" alt="" title="" height="16" width="16"  class="sq-icon" /> PDF  1.0 MB ',
'</a></span> and submit it to our <a href="https://www.beds.ac.uk/international/international-applications/contactus">Admissions Team</a> along with scans of your supporting documents, via email, post or in person at the International Office.</li></ul><p>You can post your completed form to:</p><p>University of Bedfordshire International Admissions/International Office/University Square/Luton/Bedfordshire/LU1 3JU/United Kingdom</p><h4>Please note</h4><ul><li><strong>BSc (Hons) Nursing Studies</strong> Level 3 and <strong>MSc Advanced Nursing Studies</strong> are available to overseas students - please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a></li><li><strong>Healthcare, Nursing and Midwifery students</strong> - many of these courses are not available to overseas students due to UK immigration law in regard to bursary funding. Please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a> to find out if you are eligible to apply.</li></ul><p>*Please note that international students studying on a Tier 4 Student Visa must choose a full-time Undergraduate or Postgraduate course and are not eligible for part-time study.</p><p>Watch some more tips and advice on making your application to Bedfordshire:</p>',]
        apply_d='\n'.join(apply_d)
        item['apply_documents_en']=apply_d

        # item['application_open_date']='2018-8'
        # item['deadline']='2019-7'

        # print(item)
        yield item

Example #7

Show file

    def pro_parse(self, response):
        """Scrape a London South Bank University course page and yield one item.

        Populates programme/degree, fee, overview, modules, careers, entry
        requirements, IELTS, application text, duration, mode, start date and
        department from XPath selections on ``response``.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)
        item['url'] = response.url
        item['university'] = 'London South Bank University'
        item['location'] = 'London'
        item['tuition_fee_pre'] = '£'

        # The last breadcrumb is read as "<programme> - <degree>".
        pro = response.xpath('//div[@id="breadcrumbs"]//span/text()').extract()
        title = pro[-1] if pro else ''
        prog = title.split('-')
        if len(prog) == 2:
            programme = prog[0].strip()
            degree_type = prog[1].strip()
            item['degree_name'] = degree_type
            # startswith is safe on an empty degree string.
            if degree_type.startswith('M'):
                item['degree_type'] = '2'
            elif degree_type.startswith('P'):
                item['degree_type'] = '3'
        else:
            # Store the breadcrumb text itself, not the list produced by split().
            programme = title.strip()
        item['programme_en'] = programme

        fee = response.xpath(
            '//div[@id="tab_fees_and_funding"]//*[contains(text(),"£")]//text()'
        ).extract()
        item['tuition_fee'] = getTuition_fee(fee)

        overview = response.xpath('//div[@id="tab_overview"]').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath('//div[@id="tab_modules"]').extract()
        item['modules_en'] = remove_class(modules)

        career = response.xpath('//div[@id="tab_employability"]').extract()
        item['career_en'] = remove_class(career)

        rntry = response.xpath('//div[@id="tab_entry_requirements"]').extract()
        rntry = remove_class(rntry)
        item['rntry_requirements'] = rntry

        # IELTS scores are parsed out of the cleaned entry-requirements text.
        ielts = get_ielts(rntry)
        if ielts != [] and ielts != {}:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']

        apply_desc_en = response.xpath(
            '//div[@id="tab_how_to_apply"]').extract()
        item['apply_desc_en'] = remove_class(apply_desc_en)

        duration = response.xpath(
            '//td/span[contains(text(),"Duration")]/following-sibling::div/text()'
        ).extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        mode = response.xpath(
            '//td/span[contains(text(),"Mode")]/following-sibling::div/text()'
        ).extract()
        # Any mention of "full" (case-insensitive) marks the course as '1'.
        item['teach_time'] = '1' if re.findall('(?i)full', ''.join(mode)) else '2'

        start_date = response.xpath(
            '//td/span[contains(text(),"Start")]/following-sibling::div/text()'
        ).extract()
        try:
            # De-duplicate in page order so the stored value is deterministic.
            months = dict.fromkeys(tracslateDate(start_date))
            item['start_date'] = ','.join(
                ['2019' + '-' + month for month in months])
        except Exception:
            # Best effort: leave start_date unset when the dates cannot be parsed.
            pass

        item['department'] = ''.join(
            response.xpath(
                '//a[contains(text(),"School of")]/text()').extract())
        yield item

Example #8

Show file

File: UniversityCollegeLondon_P.py Project: histudent/python_spider

    def parses(self, response):
        """Build one programme item from a University College London page.

        Scrapes location, title/degree, study mode, department, overview,
        application dates, tuition fee, duration, start date, English
        language requirements, entry requirements, modules and career text
        from ``response`` and yields the populated item.

        Args:
            response: the downloaded programme page; only ``url`` and
                ``xpath()`` are used here.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the fields above filled in.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'University College London'
        item['url'] = response.url
        item['tuition_fee_pre'] = '£'

        location = response.xpath(
            '//div/strong[contains(text(),"Location")]/../text()').extract()
        item['location'] = ''.join(location).strip()

        programme = ''.join(
            response.xpath('//h1[@class="heading"]//text()').extract())
        degree_name = re.findall(r'[MB][A-Z]{1,2}[a-z]*', programme)
        # De-duplicate while keeping first-seen order; joining a set() gave an
        # arbitrary order, which made the replace() below miss the title text.
        degree_name = ''.join(dict.fromkeys(degree_name)).strip()
        item['programme_en'] = programme.replace(degree_name, '')
        item['degree_name'] = degree_name
        item['degree_type'] = '2'

        # Any text node mentioning "FT" marks the programme as full time.
        # NOTE(review): teach_time is an int here; confirm whether the
        # pipeline expects the string form instead.
        mode = response.xpath('//*[contains(text(),"FT")]//text()').extract()
        item['teach_time'] = 1 if mode else 2

        department = response.xpath(
            '//h5[contains(text(),"Department website")]/following-sibling::p/a/text()'
        ).extract()
        item['department'] = ''.join(department).strip()

        overview = response.xpath(
            '//article[@class="article"]/h1/following-sibling::article/p[1]'
        ).extract()
        item['overview_en'] = remove_class(overview)

        application_open_date = tracslateDate(response.xpath(
            '//div[contains(text(),"Open")]/text()').extract())
        item['application_open_date'] = ','.join(
            dict.fromkeys(application_open_date))

        deadline = tracslateDate(response.xpath(
            '//div[contains(text(),"Close")]/text()').extract())
        item['deadline'] = ','.join(dict.fromkeys(deadline))

        item['tuition_fee'] = getTuition_fee(
            response.xpath('//*[contains(text(),"£")]//text()').extract())

        duration = response.xpath(
            '//h4[contains(text(),"uration")]/following-sibling::div/text()'
        ).extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        start_date = tracslateDate(response.xpath(
            '//h4[contains(text(),"tarts")]/following-sibling::p//text()'
        ).extract())
        item['start_date'] = ','.join(dict.fromkeys(start_date))

        item['apply_fee'] = '75'
        item['apply_pre'] = '£'

        # The page only names a level; the score texts for each level are
        # hard-coded below.
        eng_level = response.xpath(
            '//p[contains(text(),"English language")]/strong/text()').extract(
            )
        eng_level = ''.join(eng_level).strip()
        if eng_level == 'Standard':
            ielts = 'Overall grade of 6.5 with a minimum of 6.0 in each of the subtests.'
            toefl = 'Overall score of 92 with 24/30 in reading and writing and 20/30 in speaking and listening.'
        elif eng_level == 'Good':
            ielts = 'Overall grade of 7.0 with a minimum of 6.5 in each of the subtests.'
            toefl = 'Overall score of 100 with 24/30 in reading and writing and 20/30 in speaking and listening.'
        elif eng_level == 'Advanced':
            ielts = 'Overall grade of 7.5 with a minimum of 6.5 in each of the subtests.'
            toefl = 'Overall score of 109 with 24/30 in reading and writing and 20/30 in speaking and listening.'
        else:
            ielts = ''
            toefl = ''

        ieltss = get_ielts(ielts)
        if ieltss != {} and ieltss != []:
            item['ielts_l'] = ieltss['IELTS_L']
            item['ielts_s'] = ieltss['IELTS_S']
            item['ielts_r'] = ieltss['IELTS_R']
            item['ielts_w'] = ieltss['IELTS_W']
            item['ielts'] = ieltss['IELTS']

        # Pull the scores out by position in the sentence. Collecting every
        # number also picked up the "/30" denominators and stored them as the
        # listening and writing scores.
        toefl_scores = re.search(
            r'score of (\d+) with (\d+)/30 in reading and writing '
            r'and (\d+)/30 in speaking and listening', toefl)
        if toefl_scores:
            overall, read_write, speak_listen = toefl_scores.groups()
            item['toefl'] = overall
            item['toefl_r'] = read_write
            item['toefl_w'] = read_write
            item['toefl_s'] = speak_listen
            item['toefl_l'] = speak_listen
        item['ielts_desc'] = ielts
        item['toefl_desc'] = toefl

        rntry_requirements = response.xpath(
            '//h4[contains(text(),"ntry")]/following-sibling::p[1]').extract()
        item['rntry_requirements'] = remove_class(rntry_requirements)

        # Fixed text for applicants with Chinese qualifications.
        chinese_requirement = [
            "<div>Equivalent qualifications for China",
            "Bachelor's degree with a minimum overall average mark of 80%. Please note that a number of programmes / departments will require higher marks.",
            "ALTERNATIVE QUALIFICATIONS",
            "Medical/ Dental/ Master's degree; Doctorate.</div>",
        ]
        item['require_chinese_en'] = '\n'.join(chinese_requirement)

        modules = response.xpath(
            '//h2[contains(text(),"About this")]/following-sibling::div'
        ).extract()
        item['modules_en'] = remove_class(modules)

        career = response.xpath(
            '//h2[contains(text(),"Career")]/following-sibling::div').extract(
            )
        item['career_en'] = remove_class(career)

        yield item

Example #9

Show file

File: CityUniversityOfLondon_P.py Project: histudent/python_spider

    def parse_main(self, response):
        """Fill a course item from a City, University of London course page.

        Programme name, degree name and department come from
        ``response.meta``; fee, overview, modules, entry requirements, IELTS,
        career, duration/study mode, start date, application process and
        assessment are scraped from the page with XPath.

        NOTE(review): no ``yield``/``return`` of the item is visible in this
        block - confirm the item is emitted somewhere.
        """

        def section(xpath):
            # Extract the nodes, strip markup, then drop repeated fragments.
            return clear_same_s(remove_class(response.xpath(xpath).extract()))

        item = get_item1(ScrapyschoolEnglandItem1)
        meta = response.meta
        item['university'] = "City, University of London"
        item['url'] = response.url
        item['location'] = 'London'
        item['programme_en'] = meta['programme']
        item['degree_name'] = meta['degree_name']
        item['tuition_fee_pre'] = '£'
        item['teach_type'] = 'taught'
        item['department'] = ' '.join(set(meta['department']))

        # Fee: try the "Fee" heading first; when that parses to 0, fall back
        # to any <span> whose text contains a pound sign.
        fee_amount = getTuition_fee(response.xpath(
            '//h3[contains(text(),"Fee")]/../../following-sibling::div//text()'
        ).extract())
        if fee_amount == 0:
            fee_amount = getTuition_fee(response.xpath(
                '//span[contains(text(),"£")]//text()').extract())
        item['tuition_fee'] = fee_amount

        item['overview_en'] = section(
            '//h2[contains(text(),"Who is it")]/following-sibling::*|'
            '//h2[contains(text(),"Overview")]/following-sibling::*')

        item['modules_en'] = section(
            '//h2[contains(text(),"Structure")]/following-sibling::*|'
            '//h2[contains(text(),"Modules")]/following-sibling::*')

        item['rntry_requirements'] = section(
            '//h3[contains(text(),"Entry")]/following-sibling::*|//div[@id="entryreq"]'
        )

        ielts_scores = get_ielts(response.xpath(
            '//*[contains(text(),"IELTS")]//text()').extract())
        if ielts_scores != {} and ielts_scores != []:
            for field, key in (('ielts_l', 'IELTS_L'),
                               ('ielts_s', 'IELTS_S'),
                               ('ielts_r', 'IELTS_R'),
                               ('ielts_w', 'IELTS_W'),
                               ('ielts', 'IELTS')):
                item[field] = ielts_scores[key]

        item['career_en'] = section(
            '//h2[contains(text(),"Career")]/following-sibling::*')

        # The duration text doubles as the study-mode source: any
        # case-insensitive "full" means full time ('1'), otherwise '2'.
        duration_text = response.xpath(
            '//span[contains(text(),"Duration")]/../following-sibling::div//text()|'
            '//h3[contains(text(),"Duration")]/following-sibling::*//text()'
        ).extract()
        is_full_time = re.findall('(?i)full', ''.join(duration_text))
        item['teach_time'] = '1' if is_full_time else '2'
        parsed_duration = clear_duration(duration_text)
        item['duration'] = parsed_duration['duration']
        item['duration_per'] = parsed_duration['duration_per']

        item['start_date'] = ','.join(tracslateDate(response.xpath(
            '//h3[contains(text(),"Start date")]/following-sibling::p/text()'
        ).extract()))

        item['apply_proces_en'] = remove_class(response.xpath(
            '//h3[contains(text(),"How to apply")]/following-sibling::*|//div[@id="howtoapply"]'
        ).extract())

        # Fixed text for applicants with Chinese qualifications.
        item['require_chinese_en'] = "<p>Applicants will be considered for most postgraduate courses with a good Chinese bachelor’s degree from a recognised University.Students who don’t meet the requirements for direct entry may have the option to undertake our Graduate Diploma programme at INTO City, which then offers the opportunity for guaranteed entry into City’s Masters programmes.</p>"

        item['assessment_en'] = remove_class(response.xpath(
            '//h2[contains(text(),"Teaching and learning")]/following-sibling::*|//h3[contains(text(),"ssessment")]/following-sibling::*'
        ).extract())

Example #10

Show file

    def parse_main(self, response):
        """Build one course item from a Middlesex University course page.

        Scrapes the title (split into programme and degree name), start date,
        duration/study mode, fee, overview, modules, entry requirements,
        IELTS scores and career text from ``response`` and yields the item.

        Args:
            response: the downloaded course page; only ``url`` and
                ``xpath()`` are used here.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)``.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)

        item['university'] = 'Middlesex University'
        item['url'] = response.url
        item['location'] = 'London'

        programme = ''.join(response.xpath(
            '//div[@class="course-page-banner__texts"]/h1/text()').extract())
        # Everything from the first run of two or more capitals onwards is
        # taken as the degree name.
        degree_name = ''.join(re.findall('[A-Z]{2,}.*', programme))
        if degree_name != programme:
            programme = programme.replace(degree_name, '')
        item['programme_en'] = programme
        item['degree_name'] = degree_name
        # startswith() is safe on an empty degree name, so no try/except is
        # needed; degree_type stays unset for any other initial.
        if degree_name.startswith('M'):
            item['degree_type'] = '2'
        elif degree_name.startswith('P'):
            item['degree_type'] = '3'

        start_date = tracslateDate(response.xpath(
            '//span[contains(text(),"Start")]/../following-sibling::div//text()'
        ).extract())
        item['start_date'] = ','.join(start_date)

        duration = response.xpath(
            '//span[contains(text(),"Duration")]/../following-sibling::div//text()'
        ).extract()
        # Read the study mode before handing the list to clear_duration().
        mode = re.findall('(?i)full', ''.join(duration))
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']
        item['teach_time'] = '1' if mode else '2'

        fee = response.xpath(
            '//span[contains(text(),"Fees")]/../following-sibling::div//text()'
        ).extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        overview = response.xpath(
            '//h2[contains(text(),"Overview")]/following-sibling::*').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//h2[contains(text(),"Course content")]/following-sibling::*'
        ).extract()
        item['modules_en'] = remove_class(modules)

        rntry = response.xpath(
            '//h2[contains(text(),"Entry requirements")]/following-sibling::*'
        ).extract()
        item['rntry_requirements'] = remove_class(rntry)

        ielts_desc = ''.join(response.xpath(
            '//p[contains(text(),"IELTS")]//text()').extract())
        item['ielts_desc'] = ielts_desc
        ielts = get_ielts(ielts_desc)
        # Was "!= [] or != {}", which is always true; "and" makes the guard
        # actually skip empty results.
        if ielts != [] and ielts != {}:
            try:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, TypeError):
                # Best effort: an incomplete or unexpected result leaves the
                # remaining IELTS fields unset instead of aborting the item.
                pass

        career = response.xpath(
            '//h2[contains(text(),"Careers")]/following-sibling::*').extract()
        item['career_en'] = remove_class(career)

        yield item

Example #11

Show file

File: UniversityOfYork_P.py Project: histudent/python_spider

    def parse(self, response):
        """Fill a course item from a University of York course page.

        Scrapes start date, overview, modules, tuition fee, assessment, entry
        requirements, IELTS/TOEFL scores, career text, department, programme
        name, duration and degree name. Most fields use a union of several
        XPath alternatives.

        NOTE(review): no ``yield``/``return`` of the item is visible in this
        block - confirm the item is emitted somewhere.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'University of York'
        item['url'] = response.url
        item['location'] = 'York'
        item['tuition_fee_pre'] = '£'

        start_date = response.xpath(
            '//h4[contains(text(),"Start date")]/following-sibling::p//text()'
        ).extract()
        item['start_date'] = ','.join(tracslateDate(start_date))

        overview = response.xpath(
            '//div[@class="o-grid__box o-grid__box--half o-grid__box--half@medium"]|'
            '//h2[contains(text(),"verview")]/following-sibling::*|'
            '//h2[contains(text(),"At a glance")]/following-sibling::*|'
            '//h2[contains(text(),"Course summary")]/following-sibling::*|'
            '//h2[contains(text(),"At a Glance")]/following-sibling::*|'
            '//div[@id="mdcolumn"]/h1/following-sibling::*[position()<5]'
        ).extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//div[@id="content_modules"]|'
            '//h2[contains(text(),"Course structure")]/following-sibling::*|'
            '//th[contains(text(),"Module")]/../../..|'
            '//h2[contains(text(),"ontent")]/following-sibling::*|'
            '//h3[contains(text(),"What does the course cover?")]/following-sibling::p[1]|'
            '//strong[contains(text(),"Course structure")]/../following-sibling::*[position()<=5]|'
            '//h2[contains(text(),"Structure and ethos")]/..|'
            '//h2[contains(text(),"Modules")]/following-sibling::*|'
            '//h2[contains(text(),"Structure and Ethos")]/following-sibling::*|'
            '//h2[contains(text(),"module")]/following-sibling::*').extract()
        item['modules_en'] = remove_class(modules)

        tuition_fee = response.xpath(
            '//div[@id="fees"]/following-sibling::div[1]//*[contains(text(),"£")]//text()'
        ).extract()
        item['tuition_fee'] = getTuition_fee(tuition_fee)

        assessment = response.xpath(
            '//h2[contains(text(),"Teaching and assessment")]/../../following-sibling::div[1]'
            '|//h2[contains(text(),"ssessment")]/following-sibling::*|'
            '//h2[contains(text(),"ssessment")]/following-sibling::*[position()<=5]|'
            '//strong[contains(text(),"Specialist training tailored to your interests and aspirations")]/../following-sibling::*|'
            '//span[contains(text(),"ssessment")]/../following-sibling::*[position()<=3]|'
            '//h3[contains(text(),"ssessment")]/following-sibling::*[position()<=3]|'
            '//strong[contains(text(),"SUMMER TERM")]/../following-sibling::*|'
            '//strong[contains(text(),"ssessment")]/../following-sibling::*[position()<=3]|'
            '//h2[contains(text(),"Teaching")]/following-sibling::*|'
            '//blockquote[@class="rightBox"]/following-sibling::*[1]|'
            '//h2[contains(text(),"Dissertation")]/following-sibling::p[1]|'
            '//p[contains(text(),"This programme aims: ")]/following-sibling::table[1]'
        ).extract()
        item['assessment_en'] = remove_class(assessment)

        entry_requirements = response.xpath(
            '//div[@id="entry"]|'
            '//h2[contains(text(),"requirement")]/following-sibling::*|'
            '//h2[contains(text(),"pplicants")]/following-sibling::*|'
            '//h3[contains(text(),"Entry Requirements")]/following-sibling::*|'
            '//h2[contains(text(),"Entry")]/following-sibling::*[position()>1]|'
            '//h3[contains(text(),"International students")]/following-sibling::*|'
            '//h3[contains(text(),"Entry requirements")]/following-sibling::*[position()<4]|'
            '//h2[contains(text(),"English Language Requirements")]/following-sibling::*[position()<3]'
        ).extract()
        item['rntry_requirements'] = remove_class(entry_requirements)

        ielts = response.xpath(
            '//*[contains(text(),"IELTS")]//text()').extract()
        ielts = get_ielts(ielts)
        if ielts != {} and ielts != []:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']

        toefl_desc = ''.join(response.xpath(
            '//*[contains(text(),"TOEFL")]//text()').extract()).strip()
        item['toefl_desc'] = toefl_desc
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        toefl = re.findall(r'\d{2,3}', toefl_desc)
        # Only the "overall plus one minimum" shape is handled: the larger
        # number is the overall score, the smaller applies to every skill.
        if len(toefl) == 2:
            toefl = list(map(int, toefl))
            item['toefl'] = max(toefl)
            item['toefl_l'] = min(toefl)
            item['toefl_w'] = min(toefl)
            item['toefl_r'] = min(toefl)
            item['toefl_s'] = min(toefl)

        career = response.xpath(
            '//div[@class="o-grid__box o-grid__box--half"]|'
            '//h2[contains(text(),"areer")]/following-sibling::*|'
            '//h2[contains(text(),"Employment relevance")]/following-sibling::*|'
            '//p[contains(text(),"employment,")]/following-sibling::ul[1]|'
            '//p[contains(text(),"This programme aims: ")]/following-sibling::ul[1]|'
            '//h3[contains(text(),"areers")]/following-sibling::ul[1]|'
            '//h2[contains(text(),"Employment outcomes")]/following-sibling::*|'
            '//h3[contains(text(),"What can it lead to?")]/following-sibling::p[1]'
        ).extract()
        item['career_en'] = remove_class(career)

        department = response.xpath(
            '//h4[contains(text(),"Department")]/following-sibling::p//text()|//div[@id="location"]/h1//text()'
        ).extract()
        item['department'] = ''.join(department)

        programme = ''.join(response.xpath(
            '//div[@id="mdcolumn"]/h1/text()|//div[@class="c-figure__content c-figure__content--left c-figure__content--half"]/h1/text()'
        ).extract())
        # Strip a leading "<Word> in " prefix from the title.
        clears = re.findall('[A-Za-z]+ in ', programme)
        item['programme_en'] = programme.replace(''.join(clears), '').strip()

        duration = response.xpath(
            '//h4[contains(text(),"Length")]/following-sibling::p//text()'
        ).extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        major_type1 = ''.join(response.xpath(
            '//div[@class="c-figure__content c-figure__content--left c-figure__content--half"]/h1/text()|//div[@id="content-container"]//h1/text()'
        ).extract())
        item['major_type1'] = major_type1
        # Every token starting with two capitals counts as a degree name.
        degree_name = re.findall('[A-Z]{2}[a-zA-Z]*', major_type1)
        item['degree_name'] = '/'.join(degree_name).strip()

Example #12

Show file

File: BirkbeckUniversityOfLondon_P.py Project: histudent/python_spider

    def programme(self, response):
        """Build one course item from a Birkbeck course page.

        Only pages whose duration text mentions "full" (case-insensitive)
        produce an item; part-time-only pages are reported and skipped before
        anything else is scraped.

        Args:
            response: the downloaded course page; only ``url`` and
                ``xpath()`` are used here.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` for
            full-time courses; nothing otherwise.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)

        duration_text = ''.join(response.xpath(
            '//dt[contains(text(),"uration")]/following-sibling::dd[1]//text()'
        ).extract())
        if not re.findall('(?i)full', duration_text):
            print('这个专业只有兼职，不要！！！')
            return

        title = ''.join(response.xpath('//h1/text()').extract())
        # The degree is the parenthesised part of the title; anything from
        # the first ':' onwards is dropped from the programme name.
        deg = ''.join(re.findall(r'\(.*\)', title))
        clears = ''.join(re.findall(':.*', title))
        item['programme_en'] = title.replace(clears, '').replace(deg, '').strip()
        item['degree_name'] = deg.replace('(', '').replace(')', '').strip()
        item['url'] = response.url

        start_date = response.xpath(
            '//dt[contains(text(),"tart date")]/following-sibling::dd[1]//text()'
        ).extract()
        item['start_date'] = ','.join(tracslateDate(start_date))
        item['university'] = 'Birkbeck, University of London'
        item['location'] = ''.join(response.xpath(
            '//dt[contains(text(),"ocation")]/following-sibling::dd[1]//text()'
        ).extract())

        # Only the fragment that ends in "full" is passed to clear_duration.
        dura = clear_duration(re.findall(r'[a-zA-Z0-9\s]+full', duration_text))
        item['duration'] = dura['duration']
        item['duration_per'] = dura['duration_per']

        overview = response.xpath(
            '//h2[contains(text(),"Highlights")]/preceding-sibling::div[1]'
        ).extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//h2[contains(text(),"Course structure")]/following-sibling::section'
        ).extract()
        item['modules_en'] = remove_class(modules)

        entry = response.xpath(
            '//h2[contains(text(),"ntry requirements")]/following-sibling::*'
        ).extract()
        item['rntry_requirements'] = remove_class(entry)

        # Fixed text for applicants with Chinese qualifications. The second
        # element is one string built from adjacent literals; the original
        # literal was broken across two physical lines.
        chinese = [
            '<h3 class="content-show">Postgraduate entry requirements</h3>',
            "<ul><li>Please <a>check your postgraduate course online</a> to see if your programme of study has an entry requirement of a UK undergraduate degree with a 2:1 or a 2:2 classification. </li>"
            "<li>To study a Master's degree that requires a UK undergraduate degree with a <strong>2:2 classification</strong>, you will typically need to have one of the following:</li>"
            "<ul><li>a Bachelor's degree (<i>Xueshi</i><span>) from a 211, 985 or top national university with an overall average grade of 70% </span></li>"
            "<li>a Bachelor's degree from a national university with an overall average grade of 75% </li>"
            "<li>a Bachelor's degree from a high-ranking private university with an overall average grade of 75% </li>"
            "<li>a Master's degree with an overall average grade of 60%. </li></ul>"
            "<li>To study a Master's degree that requires a UK undergraduate degree with a <strong>2:1 classification</strong>, you will typically need to have one of the following: </li>"
            "<ul><li>a Bachelor's degree (<i>Xueshi</i><span>) from a 211, 985 or top national university with an overall average grade of 75% </span></li>"
            "<li>a Bachelor's degree from a national university with an overall average grade of 80% </li>"
            "<li>a Bachelor's degree from a high-ranking private university with an overall average grade of 80% </li>"
            "<li>a Master's degree with an overall average grade of 70%. </li></ul>"
            "<li>If you do not meet these criteria, you can apply for Birkbeck’s <a>International Foundation Programme</a><span>, which acts as a bridge between undergraduate and postgraduate study, preparing students to study a Master’s degree in the UK. "
            "There are progression pathways onto various courses at Birkbeck.</span></li>"
            "<li>Another option is the <a>Master's Foundation programme</a>, at our partner provider OnCampus London, which is available for two- or three-term progression onto a wide range of Master’s Degrees at Birkbeck.</li>"
            "<li>If your transcript is provided in GPA format and not a percentage value, <a>please contact our International Office</a> to check your equivalency. For most institutions: </li>"
            "<ul><li>80% is equivalent to 4/5 or 3.3/4 </li><li>75% is equivalent to 3.5/5 or 2.7/4. </li></ul>",
        ]
        item['require_chinese_en'] = remove_class(chinese)

        # The sub-scores must agree with the description: listening is 21
        # (it was stored as 22) and the overall 92 was never stored.
        item['toefl_desc'] = 'overall score of 92, with 22 in Reading, 21 in Listening, 23 in Speaking, 24 in Writing.'
        item['toefl'] = '92'
        item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w'] = '21', '23', '22', '24'

        ielts = response.xpath('//*[contains(text(),"IELTS")]//text()').extract()
        ies = re.findall(r'\d\.?\d?', ''.join(ielts))
        # Only the "overall plus one minimum" shape is handled: the larger
        # number is the overall band, the smaller applies to every skill.
        if len(ies) == 2:
            ies = list(map(float, ies))
            item['ielts'] = max(ies)
            item['ielts_l'] = min(ies)
            item['ielts_s'] = min(ies)
            item['ielts_r'] = min(ies)
            item['ielts_w'] = min(ies)
        item['ielts_desc'] = '\n'.join(ielts).strip()

        assessment = response.xpath(
            '//h2[contains(text(),"Assessment")]/following-sibling::*').extract()
        item['assessment_en'] = remove_class(assessment)

        department = response.xpath(
            '//a[contains(text(),"isit the")]/text()').extract()
        item['department'] = ''.join(department).replace('Visit the', '').strip()

        howtoapply = response.xpath(
            '//h2[contains(text(),"How to apply")]/following-sibling::*').extract()
        item['apply_proces_en'] = remove_class(howtoapply)

        print('这个专业要')
        yield item

Example #13

Show file

File: UniversityForTheCreativeArts_P.py Project: histudent/python_spider

    def parse(self, response):
        """Build one course item from a University for the Creative Arts page.

        Scrapes programme name, degree name/type, duration, campus, start
        month, overview, modules, career text, UK entry requirements and
        portfolio guidance from ``response``. IELTS scores, tuition fee and
        deadline are fixed values set below.

        Args:
            response: the downloaded course page; only ``url`` and
                ``xpath()`` are used here.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)``.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)

        item['university'] = 'University for the Creative Arts'
        item['url'] = response.url

        item['programme_en'] = ''.join(response.xpath('//h1/text()').extract())

        # The line under the title is split on '-': three parts means the
        # first part is the degree name, four parts is treated as a
        # pre-degree course.
        degr = ''.join(
            response.xpath('//h1/following-sibling::p[1]/text()').extract()
        ).split('-')
        if len(degr) == 3:
            degree_name = degr[0]
            item['degree_name'] = degree_name
            # startswith() is safe on an empty string, so no try/except is
            # needed; degree_type stays unset for any other initial.
            if degree_name.startswith('M'):
                item['degree_type'] = '2'
            elif degree_name.startswith('P'):
                item['degree_type'] = '3'
        elif len(degr) == 4:
            item['degree_name'] = 'Pre-degree'
            item['degree_type'] = '2'

        duration = response.xpath(
            '//p[contains(text(),"Length of study")]/following-sibling::p/text()'
        ).extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        location = response.xpath(
            '//p[contains(text(),"Campus")]/following-sibling::p/text()'
        ).extract()
        item['location'] = ''.join(location)

        start_date = response.xpath(
            '//p[contains(text(),"Start month")]/following-sibling::p/text()'
        ).extract()
        item['start_date'] = ','.join(tracslateDate(start_date))

        overview = response.xpath('//div[@class="cell overview"]').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//div[@id="syllabus"]/following-sibling::section[@class="article-content-area"][1]'
        ).extract()
        item['modules_en'] = remove_class(modules)

        career = response.xpath(
            '//div[contains(text(),"Career")]/following-sibling::div').extract(
            )
        item['career_en'] = remove_class(career)

        item['ielts'] = '6'
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'

        rntry = response.xpath(
            '//h3[contains(text(),"UK entry requirements")]/following-sibling::*'
        ).extract()
        item['rntry_requirements'] = remove_class(rntry)
        portfolio = response.xpath(
            '//h3[contains(text(),"Your portfolio")]/following-sibling::*'
        ).extract()
        item['portfolio_desc_en'] = remove_class(portfolio)

        item['tuition_fee'] = '13540'
        item['tuition_fee_pre'] = '£'

        item['deadline'] = '2019-3'

        yield item

Example #14

Show file

File: UniversityOfLeicester_P.py Project: histudent/python_spider

    def parse_main(self, response):
        """Scrape a University of Leicester course page and yield its items.

        Page-level fields (department, overview, entry requirements, fee,
        careers, modules, assessment, IELTS/TOEFL scores) are filled in once.
        The "Course", "Qualification", "Duration" and "Start Dates" spans are
        then walked in lockstep and the item is yielded once per row, except
        for rows whose qualification is PGDip, PGCert or PGCE.

        Args:
            response: response object exposing ``.url`` and ``.xpath()``.

        Yields:
            The populated item, once per accepted row.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'University of Leicester'
        item['url'] = response.url
        item['tuition_fee_pre'] = '£'

        department = response.xpath(
            '//dt[contains(text(),"Department")]/following-sibling::dd/text()'
        ).extract()
        item['department'] = ''.join(department).strip()

        overview = response.xpath(
            '//h2[contains(text(),"Course description")]/following-sibling::*'
        ).extract()
        item['overview_en'] = remove_class(overview)

        # Fixed admissions text for Chinese applicants; not scraped from the page.
        chinese_require = [
            "<p>",
            "If you have completed a four-year Bachelors degree in China, you can be considered for entry to a Masters degree at Leicester. Our requirements depend on the rank of the university from which you graduated and your chosen Masters degree. The following is intended as a guide to our requirements:</p>",
            "<p>If you have graduated from a 'top 200' university in China, you may be asked for 70% overall if you are applying for an Engineering or Science degree, or 75% for an Arts, Humanities, Law or Social Science degree. You may need to have scores of at least 80% in modules that are particularly relevant to your chosen Master&rsquo;s degree. The School of Museum Studies requires at least 80% overall.</p>",
            "<p>If you graduated from a Chinese university ranked below the top 200 you may require higher scores (80-85%).</p>",
            "<p>If you have completed a three-year college diploma from a Chinese university, you will need to take an accepted one-year Pre-Masters course or upgrade your diploma to a Bachelor&rsquo;s degree before applying for a Master&rsquo;s degree.</p>",
        ]
        item['require_chinese_en'] = remove_class(chinese_require)

        rntry = response.xpath(
            '//h2[contains(text(),"Entry requirements")]/following-sibling::*'
        ).extract()
        # Drop three fixed phrases from the cleaned entry-requirements text.
        rntry = remove_class(rntry).replace(
            'International Qualifications', ''
        ).replace('Countries list', '').replace(
            'Find your country in this list to check equivalent qualifications, scholarships and additional requirements.',
            '')
        item['rntry_requirements'] = rntry

        fee = response.xpath(
            '//h3[contains(text(),"International Students")]/following-sibling::*//text()'
        ).extract()
        item['tuition_fee'] = getTuition_fee(fee)

        career = response.xpath('//div[@id="careers"]').extract()
        item['career_en'] = remove_class(career)

        modules = response.xpath('//div[@id="course-structure"]').extract()
        item['modules_en'] = remove_class(modules)

        assessment = response.xpath(
            '//h2[contains(text(),"Teaching and learning")]/following-sibling::div'
        ).extract()
        item['assessment_en'] = remove_class(assessment)

        ielts = response.xpath(
            '//*[contains(text(),"IELTS")]//text()').extract()
        ielts = get_ielts(ielts)
        # Skip every empty result (list or dict); the other spiders in this
        # project guard against both, and indexing an empty dict would raise.
        if ielts:
            item['ielts'] = ielts['IELTS']
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']

        # Derive the overall TOEFL score from the overall IELTS score.
        if item['ielts'] == 6.0:
            item['toefl'] = 80
        elif item['ielts'] == 6.5:
            item['toefl'] = 90
        elif item['ielts'] == 7.0:
            item['toefl'] = 100
        if item['toefl'] is not None:
            item['toefl_l'] = '17'
            item['toefl_s'] = '20'
            item['toefl_r'] = '18'
            item['toefl_w'] = '17'

        programme = response.xpath(
            '//span[contains(text(),"Course")]/following-sibling::span/text()'
        ).extract()
        degree_name = response.xpath(
            '//span[contains(text(),"Qualification")]/following-sibling::span/text()'
        ).extract()
        duration = response.xpath(
            '//span[contains(text(),"Duration")]/following-sibling::span/text()'
        ).extract()
        start_date = response.xpath(
            '//span[contains(text(),"Start Dates")]/following-sibling::span/text()'
        ).extract()
        if start_date == []:
            # Placeholder entries so zip() below still produces rows
            # (at most four) when the page lists no start dates.
            start_date = ['', '', '', '']

        # NOTE(review): the same item object is mutated and re-yielded on each
        # iteration; confirm downstream consumers do not keep references to
        # earlier yields.
        for pro, deg, dur, sta in zip(programme, degree_name, duration,
                                      start_date):
            item['programme_en'] = pro
            item['degree_name'] = deg
            dura = clear_duration(dur)
            item['duration'] = dura['duration']
            item['duration_per'] = dura['duration_per']
            item['start_date'] = ','.join(tracslateDate(sta))
            if re.findall('(?i)full', dur):
                item['teach_time'] = 'fulltime'
            else:
                item['teach_time'] = 'parttime'
            if deg not in ('PGDip', 'PGCert', 'PGCE'):
                yield item

Example #15

Show file

 def parse_main(self, response):
     """Scrape an Oxford Brookes University course page and yield one item.

     Fills programme, degree name, department, start date, teaching mode,
     duration, overview, modules, fee, entry requirements, careers and
     IELTS scores from the page.

     Args:
         response: response object exposing ``.url`` and ``.xpath()``.

     Yields:
         The populated item.
     """
     print('进入一个详情页')
     item = get_item1(ScrapyschoolEnglandItem1)
     item['university'] = 'Oxford Brookes University'
     item['url'] = response.url
     item['location'] = 'London'

     programme = response.xpath('//h1/text()').extract()
     item['programme_en'] = ''.join(programme).strip()

     degree_name = response.xpath(
         '//h1/following-sibling::h2/text()').extract()
     item['degree_name'] = ''.join(degree_name).strip()

     department = response.xpath(
         '//h1/following-sibling::h2/following-sibling::p/a/text()'
     ).extract()
     item['department'] = ''.join(department).strip()

     start_date = response.xpath(
         '//h3[contains(text(),"Available")]/following-sibling::p[1]/text()'
     ).extract()
     item['start_date'] = ','.join(tracslateDate(start_date))

     duration = response.xpath(
         '//h3[contains(text(),"Course length")]/following-sibling::ul//text()'
     ).extract()
     if re.findall('(?i)full', ''.join(duration)):
         item['teach_time'] = 'fulltime'
     else:
         item['teach_time'] = 'parttime'
     # Best effort: leave the duration fields untouched when the text cannot
     # be parsed, but no longer swallow KeyboardInterrupt/SystemExit.
     try:
         duration = clear_duration(duration)
         item['duration'] = duration['duration']
         item['duration_per'] = duration['duration_per']
     except Exception:
         pass

     overview = response.xpath(
         '//h1/following-sibling::h2/following-sibling::p/following-sibling::*'
     ).extract()
     item['overview_en'] = remove_class(overview)

     modules = response.xpath('//div[@id="section-two"]').extract()
     item['modules_en'] = remove_class(modules)

     fee = response.xpath('//p[contains(text(),"£")]/text()').extract()
     item['tuition_fee'] = getTuition_fee(fee)
     item['tuition_fee_pre'] = '£'

     rntry = response.xpath('//div[@id="section-four"]').extract()
     item['rntry_requirements'] = remove_class(rntry)

     career = response.xpath('//div[@id="section-five"]').extract()
     item['career_en'] = remove_class(career)

     ielts_text = response.xpath(
         '//*[contains(text(),"IELTS")]/text()').extract()
     # Every "d.d" token in the IELTS text, in document order.
     ielts = re.findall(r'\d\.\d', ''.join(ielts_text))
     if len(ielts) == 3:
         # Three scores: overall, then reading/writing, then listening/speaking.
         item['ielts'] = ielts[0]
         item['ielts_l'] = ielts[2]
         item['ielts_s'] = ielts[2]
         item['ielts_r'] = ielts[1]
         item['ielts_w'] = ielts[1]
     elif len(ielts) == 1:
         # A single score applies to the overall band and every component.
         item['ielts'] = ielts[0]
         item['ielts_l'] = ielts[0]
         item['ielts_s'] = ielts[0]
         item['ielts_r'] = ielts[0]
         item['ielts_w'] = ielts[0]
     elif ielts:
         if len(ielts) == 2:
             # Exactly two scores are stored as floats; other counts keep the
             # matched strings, as before.
             ielts = list(map(float, ielts))
         highest, lowest = max(ielts), min(ielts)
         item['ielts'] = highest
         item['ielts_l'] = lowest
         item['ielts_s'] = lowest
         item['ielts_r'] = lowest
         item['ielts_w'] = lowest
     yield item

Example #16

Show file

    def parse_main(self, response):
        """Scrape a University of Glasgow programme page and yield one item.

        Several fields (teaching mode, location, start date, deadline, fee
        currency, teaching type) are hard-coded; the rest are read from the
        page. The item is yielded only when a programme title was found.

        Args:
            response: response object exposing ``.url`` and ``.xpath()``.

        Yields:
            The populated item, if the programme title is non-empty.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)
        item['teach_time'] = 'fulltime'
        item['university'] = 'University of Glasgow'
        item['url'] = response.url
        item['location'] = 'Glasgow'
        item['start_date'] = '2018-9'
        item['deadline'] = '2018-7'
        item["tuition_fee_pre"] = "£"
        item['teach_type'] = 'taught'

        programme = response.xpath(
            '//div[@id="prog-title"]/h1/text()').extract()
        programme = ''.join(programme)
        item['programme_en'] = programme

        degree_type = response.xpath(
            '//div[@id="prog-title"]/h1/span/text()').extract()
        item['degree_name'] = ''.join(degree_type)

        duration = response.xpath(
            '//li[contains(text(),"full-time")]/text()').extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        overview = response.xpath(
            '//h2[contains(text(),"Why this programme")]/following-sibling::*'
        ).extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//h2[contains(text(),"Programme str")]/following-sibling::*'
        ).extract()
        item['modules_en'] = remove_class(clear_same_s(modules))

        career = response.xpath(
            '//h2[contains(text(),"Career")]/following-sibling::*').extract()
        item['career_en'] = remove_class(clear_same_s(career))

        # Only the #fees container is used; an earlier query on the
        # "Fees and" heading was overwritten before use and has been dropped.
        fees = response.xpath('//div[@id="fees"]//text()').extract()
        tuition_fee = getTuition_fee(fees)
        if tuition_fee == 2018:
            # 2018 is treated as a year mistaken for a fee, not a real amount.
            tuition_fee = '0'
        item['tuition_fee'] = tuition_fee

        IELTS = response.xpath(
            '//*[contains(text(),"IELTS")]/../following-sibling::ul[1]//text()'
        ).extract()
        ielts = get_ielts(IELTS)
        if ielts != {} and ielts != []:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']

        TOEFL = response.xpath(
            '//*[contains(text(),"TOEFL")]/..//text()').extract()
        toefl = get_toefl(TOEFL)
        if toefl != []:
            # Best effort: a short or malformed result leaves the remaining
            # TOEFL fields unset instead of aborting the page.
            try:
                item['toefl_r'] = toefl[1]
                item['toefl_l'] = toefl[2]
                item['toefl_s'] = toefl[3]
                item['toefl_w'] = toefl[4]
                item['toefl'] = toefl[0]
            except (IndexError, KeyError, TypeError):
                pass

        entry = response.xpath(
            '//h2[contains(text(),"Entry requirements")]/following-sibling::*'
        ).extract()
        item['rntry_requirements'] = remove_class(clear_same_s(entry))

        apply_d = response.xpath(
            '//h3[contains(text(),"Documents")]/following-sibling::ul[1]'
        ).extract()
        item['apply_proces_en'] = remove_class(clear_same_s(apply_d))

        if programme != '':
            yield item

Example #17

Show file

File: UniversityOfLiverpool_P.py Project: histudent/python_spider

 def parse_career(self, response):
     """Fill a University of Liverpool item from the career-prospects page.

     Values collected by earlier callbacks are read from ``response.meta``
     (overview, modules, ielts, toefl, rntry_requirements, tuition_fee,
     apply); careers, department and duration come from this page, and the
     programme and degree names are derived from the URL.

     Args:
         response: response object exposing ``.url``, ``.meta`` and
             ``.xpath()``.
     """
     # NOTE(review): as shown, this callback populates the item but never
     # yields or returns it; confirm whether the tail of the method is missing.
     print(response.url)
     item = get_item1(ScrapyschoolEnglandItem1)
     item['overview_en'] = response.meta['overview']
     item['modules_en'] = response.meta['modules']
     ielts = response.meta['ielts']
     toefls = response.meta['toefl']
     item['rntry_requirements'] = response.meta['rntry_requirements']
     item['tuition_fee'] = response.meta['tuition_fee']
     item['tuition_fee_pre'] = '£'
     item['apply_documents_en'] = response.meta['apply']

     career = response.xpath('//section[@class="content"]').extract()
     item['career_en'] = remove_class(career)

     # Department is the last breadcrumb link; an empty list is stored as-is
     # when there is no breadcrumb. An earlier "Faculty" link query was
     # overwritten before use and has been dropped.
     department = response.xpath(
         '//nav[@id="breadcrumb"]/ul/li/a/text()').extract()
     if department != []:
         department = department[-1]
     item['department'] = department

     item['university'] = 'University of Liverpool'
     item['url'] = response.url.replace('career-prospects', 'overview')
     item['location'] = 'Liverpool'

     # The programme slug is the third-from-last URL path segment.
     programme = response.url.split('/')[-3]
     programme = programme.replace('-', ' ').title()
     degree_name = re.findall(r'\sM[sarbm][a-z]{0,2}', programme)
     degree_name = ' '.join(degree_name).strip()
     item['programme_en'] = programme.replace(degree_name, '').strip()
     item['degree_name'] = degree_name.replace('Mana', '')

     item['toefl_desc'] = ''.join(toefls)
     item['ielts_desc'] = ''.join(ielts)
     ielts = get_ielts(ielts)
     if ielts != {} and ielts != []:
         item['ielts_l'] = ielts['IELTS_L']
         item['ielts_s'] = ielts['IELTS_S']
         item['ielts_r'] = ielts['IELTS_R']
         item['ielts_w'] = ielts['IELTS_W']
         item['ielts'] = ielts['IELTS']

     toefl = re.findall(r'\d{1,3}', ''.join(toefls))
     if len(toefl) == 4:
         item['toefl'] = toefl[0]
         item['toefl_l'] = toefl[1]
         item['toefl_w'] = toefl[1]
         item['toefl_r'] = toefl[2]
         item['toefl_s'] = toefl[3]
     elif len(toefl) == 2:
         toefl = list(map(int, toefl))
         item['toefl'] = max(toefl)
         item['toefl_l'] = min(toefl)
         item['toefl_w'] = min(toefl)
         item['toefl_r'] = min(toefl)
         item['toefl_s'] = min(toefl)

     duration = response.xpath(
         '//li[contains(text(),"duration")]/span/text()').extract()
     # Keep only the entries mentioning "Full". Build a new list rather than
     # deleting while iterating, which skipped the element after each
     # deletion and could leave other entries in place.
     duration = [entry for entry in duration if 'Full' in entry]
     duration = clear_duration(duration)
     item['duration'] = duration['duration']
     item['duration_per'] = duration['duration_per']

Example #18

Show file

    def parse(self, response):
        """Scrape a Leeds Trinity University course page and yield one item.

        Fills programme, degree name, overview, duration, modules, fee, entry
        requirements, IELTS scores and careers from the page, and attaches a
        fixed application-process description.

        Args:
            response: response object exposing ``.url`` and ``.xpath()``.

        Yields:
            The populated item.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['location'] = 'Leeds'
        item['university'] = 'Leeds Trinity University'
        item['url'] = response.url

        programme = response.xpath(
            '//h1[@class="course-title"]/text()').extract()
        programme = ''.join(programme).strip()
        degree_name = response.xpath(
            '//h2[@class="course-title"]/text()').extract()
        degree_name = ''.join(degree_name).strip()
        item['degree_type'] = '2'
        item['programme_en'] = programme
        item['degree_name'] = degree_name

        overview = response.xpath(
            '//h2/a[contains(text(),"Overview")]/../following-sibling::*'
        ).extract()
        item['overview_en'] = remove_class(overview)

        duration = response.xpath(
            '//div[contains(text(),"Course type")]/span/text()').extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        modules = response.xpath(
            '//div[contains(@class,"structure")]').extract()
        item['modules_en'] = remove_class(modules)

        fee = response.xpath(
            '//div[contains(@class,"fees")]//text()').extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        rntry = response.xpath('//div[contains(@class,"entry")]').extract()
        # IELTS scores are parsed from the raw entry-requirements markup,
        # before it is cleaned.
        ielts = get_ielts(rntry)
        item['rntry_requirements'] = remove_class(rntry)
        # The previous guard (`!= [] or != {}`) was always true and relied on
        # a bare except to hide the resulting lookups on empty results.
        if ielts:
            try:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, TypeError):
                # Best effort: a partial result leaves the remaining fields unset.
                pass

        career = response.xpath('//div[contains(@class,"graduate")]').extract()
        item['career_en'] = remove_class(career)

        # Fixed application steps, emitted as an HTML list.
        # NOTE(review): "[email protected]" looks like an e-mail obfuscation
        # placeholder rather than a real address; confirm the intended value.
        apply_p = [
            "Choose a course and check its entry requirements using our course finder. You can find out more about us and your chosen course by coming to an Open Day.",
            "Apply for your chosen course by downloading the relevant application form below. Complete the application form and return it, along with your references (if they’re required) to the Admissions team at [email protected] or by post to: Admissions Team, Leeds Trinity University, Horsforth, Leeds, LS18 5HD",
            "The Admissions team will acknowledge receipt of your application by email, process your application and forward it to the relevant Programme Leader within three days of receipt.",
            "The Programme Leader will review your application and either make a decision based on your application or invite you to attend an Interview Day at Leeds Trinity University. Those selected for an interview will be contacted with the details of the interview within ten days of your application being processed.",
            "The Admissions team will notify you of your interview outcome in writing within five working days of receiving a decision from the Programme Leader.",
            "Made an offer? You should reply to accept or decline your offer at [email protected]. If you accept, you’ll need to prove that you satisfy the conditions outlined in your offer letter, usually by presenting the relevant supporting documentation in person to Leeds Trinity University, Student Administration Office (AM36).",
        ]
        item['apply_proces_en'] = (
            '<ul><li>' + '</li><li>'.join(apply_p) + '</li></ul>')

        yield item

Example #19

Show file

File: NorwichUniversityoftheArts_P.py Project: histudent/python_spider

    def parse_main(self, response):
        """Scrape a Norwich University of the Arts course page and yield one item.

        Splits the course title into programme and degree name, derives the
        degree type from the degree name's first letter, and fills duration,
        teaching mode, overview, careers, entry requirements, fee and
        application text. IELTS values are hard-coded.

        Args:
            response: response object exposing ``.url`` and ``.xpath()``.

        Yields:
            The populated item.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'Norwich University of the Arts'
        item['url'] = response.url
        item['location'] = 'Norfolk'

        programme = response.xpath(
            '//span[contains(text(),"Course")]/../../following-sibling::span/span/text()'
        ).extract()
        # De-duplicate repeated title fragments while keeping document order;
        # joining a set made the result depend on hash ordering.
        programme = ''.join(dict.fromkeys(programme)).strip()
        # Runs of two or more capitals are taken as the degree name.
        degree_name = re.findall('[A-Z]{2,}', programme)
        degree_name = ''.join(degree_name).strip()
        programme = programme.replace(degree_name, '').strip()
        item['programme_en'] = programme
        item['degree_name'] = degree_name
        # startswith() is safe on an empty degree name, so no exception
        # handling is needed here.
        if degree_name.startswith('M'):
            item['degree_type'] = '2'
        elif degree_name.startswith('P'):
            item['degree_type'] = '3'

        duration = response.xpath(
            '//strong[contains(text(),"Course length")]/../text()').extract()
        mode = re.findall('(?i)full', ''.join(duration))
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']
        item['teach_time'] = '1' if mode else '2'

        overview = response.xpath(
            '//strong[contains(text(),"Course length")]/../../following-sibling::*'
        ).extract()
        item['overview_en'] = remove_class(overview)

        career = response.xpath(
            '//h3[contains(text(),"career")]/following-sibling::ul').extract()
        item['career_en'] = remove_class(career)

        item[
            'ielts_desc'] = "BA and MA applicants are required to have a minimum UKVI approved IELTS exam score of 6.0 overall, with a minimum of 5.5 in each section"
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'
        item['ielts'] = '6.0'

        rntry = response.xpath('//div[@id="entry-requirements"]').extract()
        item['rntry_requirements'] = remove_class(rntry)

        # NOTE(review): the portfolio guidance is written to 'apply_proces_en'
        # and then overwritten by the how-to-apply text below, so it never
        # reaches the output. It was presumably meant for a dedicated
        # portfolio field; confirm the field name before changing it.
        portfolio_desc_en = response.xpath(
            '//div[@id="portfolio-guidance"]').extract()
        item['apply_proces_en'] = remove_class(portfolio_desc_en)

        fee = response.xpath('//div[@id="fees-funding"]//text()').extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        how_to_apply = response.xpath('//div[@id="how-to-apply"]').extract()
        item['apply_proces_en'] = remove_class(how_to_apply)

        yield item

Example #20

Show file

File: LeedsBeckettUniversity_P.py Project: histudent/python_spider

    def parse_main(self, response):
        """Scrape a Leeds Beckett University course page and yield one item.

        Fills degree name, programme, department, teaching mode, start dates,
        duration, overview, entry requirements, IELTS scores, careers, modules
        and the highest tuition fee found on the page, then attaches fixed
        application-document and application-process text.

        Args:
            response: response object exposing ``.url`` and ``.xpath()``.

        Yields:
            The populated item.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'Leeds Beckett University'
        item['url'] = response.url
        # Location is fixed; a page scrape whose result was never used has
        # been dropped.
        item['location'] = 'Leeds'

        degree_name = response.xpath(
            '//div[@class="course-hero__label"]/text()').extract()
        item['degree_name'] = ''.join(degree_name).strip()
        programme = response.xpath(
            '//h1[@class="course-hero__title"]/text()').extract()
        item['programme_en'] = ''.join(programme).strip()

        department = response.xpath(
            '//div[@class="course-hero__labels"]/a/text()').extract()
        item['department'] = ''.join(department)

        mode = response.xpath(
            '//div[contains(text(),"Attendance")]/following-sibling::div//text()'
        ).extract()
        item['teach_time'] = '1' if re.findall('(?i)full', ''.join(mode)) else '2'

        start_date = response.xpath(
            '//div[contains(text(),"Start Date")]/following-sibling::div//text()'
        ).extract()
        # De-duplicate while keeping the original order; joining a set made
        # the order of the dates depend on hash ordering.
        start_date = dict.fromkeys(tracslateDate(start_date))
        item['start_date'] = ','.join(start_date)

        duration = response.xpath(
            '//div[contains(text(),"Duration")]/following-sibling::span//text()'
        ).extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        overview = response.xpath(
            '//h2[contains(text(),"Overview")]/../following-sibling::div'
        ).extract()
        item['overview_en'] = remove_class(overview)

        rntry = response.xpath(
            '//h2[contains(text(),"Entry Requirements")]/../following-sibling::div'
        ).extract()
        item['rntry_requirements'] = remove_class(rntry)

        IELTS = response.xpath('//div[@class="entry-ielts"]/text()').extract()
        ielts = get_ielts(IELTS)
        # The previous guard (`!=[] or !={}`) was always true and relied on a
        # bare except to hide lookups on empty results.
        if ielts:
            try:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, TypeError):
                # Best effort: a partial result leaves the remaining fields unset.
                pass

        career = response.xpath(
            '//h3[contains(text(),"Careers")]/following-sibling::div').extract()
        item['career_en'] = remove_class(career)

        modules = response.xpath(
            '//div[@class="course-modules__table-modules"]//div[@class="course-modules__dropdowns"]'
        ).extract()
        item['modules_en'] = remove_class(modules)

        fee_text = response.xpath('//div[contains(text(),"£")]/text()').extract()
        fee_text = ''.join(fee_text).strip()
        # Accept comma-grouped amounts ("£13,500") as well as plain digits
        # ("£13500"); the old pattern required three consecutive digits right
        # after "£" and so never matched grouped amounts.
        amounts = [
            int(match.replace('£', '').replace(',', ''))
            for match in re.findall(r'£\d{1,3}(?:,\d{3})+|£\d{3,}', fee_text)
        ]
        if amounts:
            # Keep the highest amount found; leave the field untouched when
            # the page shows no fee.
            item['tuition_fee'] = max(amounts)
        item['tuition_fee_pre'] = '£'

        # Fixed list of required application documents.
        apply_d = [
            "Academic Certificates.",
            "Evidence of your English language ability (see below).",
            "A photocopy of your passport.",
            "A reference to support your application – either academic or professional.",
            "A completed Agent Consent Form (required if you are applying via or with the help of an agent).",
        ]
        item['apply_documents_en'] = '\n'.join(apply_d)

        # Fixed description of the application process.
        apply_p = [
            "Applying for a postgraduate course",
            "Once you have found the course you want to study in our online prospectus you will then click on the ‘Apply Now’ button located at the top of the online course page. ",
            "You will be asked to create an account on our application portal and complete your application via your Leeds Beckett account. Once you have submitted your application you should receive a decision within six weeks of applying. The exception to this is if the course you have applied for has a closing date specified. In this case, we will wait until the closing date has passed before we contact you",
        ]
        item['apply_proces_en'] = '\n'.join(apply_p)

        yield item

Example #21

Show file

    def parses(self, response):
        """Build one course item from a Durham University course page.

        Scraped values (title, degree, duration, fees, department, overview,
        modules, assessment, entry requirements, careers) are read from
        ``response`` via XPath; language requirements and the application
        guidance are fixed texts hard-coded below.

        Args:
            response: the downloaded course page; only ``response.url`` and
                ``response.xpath(...)`` are used here.

        Yields:
            The populated item created by ``get_item1(ScrapyschoolEnglandItem1)``.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'Durham University'
        item['url'] = response.url
        item['location'] = 'Durham'
        item['tuition_fee_pre'] = '£'

        # Programme title and degree label both live in the page title bar.
        programme = response.xpath(
            '//div[@id="course"]/div[@class="row-fluid titlebar"]/h1/span[@class="span7 title"]/text()').extract()
        item['programme_en'] = ''.join(programme).strip()
        degree_type = response.xpath(
            '//div[@id="course"]/div[@class="row-fluid titlebar"]/h1//span[@class="type"]/text()').extract()
        item['degree_name'] = ''.join(degree_type).strip()

        # clear_duration is a project helper; its result is indexed by the
        # 'duration' and 'duration_per' keys.
        duration = response.xpath(
            '//th[contains(text(),"Duration")]/following-sibling::td//text()').extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        # NOTE(review): this only tests whether a "Mode" row exists, not what
        # it says (full-time vs part-time) -- confirm that is intended.
        mode = response.xpath(
            '//th[contains(text(),"Mode")]/following-sibling::td//text()').extract()
        item['teach_time'] = 1 if mode else 2

        tuition = response.xpath(
            '//th[contains(text(),"nternational")]/following-sibling::td/text()').extract()
        item['tuition_fee'] = getTuition_fee(tuition)

        department = response.xpath('//div[@id="department"]/h3[1]/text()').extract()
        item['department'] = ' '.join(department)

        # The overview is every course-content node serialised before the
        # "Structure" heading; the modules are the heading's following siblings.
        coursecontent = response.xpath('//div[@id="coursecontent"]//*').extract()
        structure_heading = response.xpath(
            '//div[@id="coursecontent"]/h2[contains(text(),"Structure")]/self::*').extract()
        if structure_heading:
            overview = coursecontent[0:coursecontent.index(structure_heading[0])]
        else:
            overview = coursecontent
        item['overview_en'] = remove_class(overview)
        modules = response.xpath(
            '//div[@id="coursecontent"]/h2[contains(text(),"Structure")]/following-sibling::*').extract()
        item['modules_en'] = remove_class(modules)

        # Fixed language requirements (not scraped from the page).
        item['ielts'] = '6.5'
        item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w'] = '6.0', '6.0', '6.0', '6.0'
        item['toefl'] = '92'
        # Fix: the original assigned 'toefl_l' four times, leaving the
        # speaking/reading/writing component scores unset.
        item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w'] = '23', '23', '23', '23'
        item['ielts_desc'] = '6.5 (no component under 6.0)'
        item['toefl_desc'] = 'TOEFL iBT (internet based test): 92 (no component under 23)'

        assessment = response.xpath('//div[@id="learning"]').extract()
        item['assessment_en'] = remove_class(assessment)

        rntry = response.xpath('//div[@id="admissions"]').extract()
        item['rntry_requirements'] = remove_class(rntry)

        # The scraped start-date text is only used as a presence flag; the
        # stored value is the fixed intake '2019-10'.
        start_date = response.xpath(
            '//th[contains(text(),"tart Date")]/following-sibling::td/text()').extract()
        if ''.join(start_date):
            item['start_date'] = '2019-10'

        # Each list below is joined with '</p><p>' so every entry becomes its
        # own HTML paragraph (the first/last entries carry the outer tags).
        apply_proces = [
            "<p>Apply Online",
            "Stage One: Check entry requirements",
            "Stage Two: Complete the application form",
            "Stage Three: We process your application",
            "Stage Four: We communicate a decision",
            "Stage Five: Next steps</p>",
        ]
        item['apply_proces_en'] = '</p><p>'.join(apply_proces)

        apply_documents_en = [
            "<p>Personal details",
            "Your education and qualifications already achieved and details of any qualifications that you are currently studying for, if applicable",
            "The names and addresses of two academic referees",
            "A Personal Statement",
            "Supporting documents (for example, degree certificates / transcripts, English Language evidence if you are not a native English speaker, CV, samples of academic work).</p>",
        ]
        item['apply_documents_en'] = '</p><p>'.join(apply_documents_en)

        # Fix: the original list had no commas, so adjacent literals were
        # concatenated into a single element and the join added no paragraph
        # breaks.
        apply_desc = [
            "<p>The standard minimum entry requirement to study a postgraduate programme at Durham University is normally achievement of an upper second class UK honours degree (2:1) or equivalent qualification and two satisfactory academic references. Full details of qualification equivalencies by country can be found here. For applicants who are not Native English speakers, English language evidence may also be required.",
            "However, some Academic Departments and programmes have different or additional entry requirements. Therefore, before you apply, it is important to check the appropriate course listing in the courses database or departmental web page to ensure that you meet or are able to meet before the programme commencement date:",
            "• The Academic Department and specific programme’s entry requirements and, if applicable, any English language requirements",
            "• The financial requirements of the programme you are interested in (including deposit payment, tuition fees and any other associated costs).</p>",
        ]
        item['apply_desc_en'] = '</p><p>'.join(apply_desc)

        career = response.xpath('//div[@id="opportunities"]').extract()
        item['career_en'] = remove_class(career)

        yield item

Example #22

Show file

File: ManchesterMetropolitanUniversity_P.py Project: histudent/python_spider

    def parses(self, response):
        """Build one course item from a Manchester Metropolitan University page.

        Scraped values (degree, programme, overview, careers, entry
        requirements, modules, fees, duration, start date, department) are
        read from ``response`` via XPath; the IELTS/TOEFL scores and their
        descriptions are fixed values hard-coded below.

        Args:
            response: the downloaded course page; ``response.url``,
                ``response.status`` and ``response.xpath(...)`` are used.

        Yields:
            The populated item, unless the response status is 404, in which
            case the URL is appended to ``errorurl.txt`` and nothing is
            yielded.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print('开始下载', response.url, '的数据')
        item['university'] = 'Manchester Metropolitan University'
        item['url'] = response.url
        item['location'] = 'Manchester'

        # The <h1> holds the programme name as its own text and the degree
        # label inside a nested <span>.
        degree_name = response.xpath('//h1/span/text()').extract()
        item['degree_name'] = ''.join(degree_name)
        programme = response.xpath('//h1/text()').extract()
        item['programme_en'] = ''.join(programme).strip()
        item['degree_type'] = 2

        overview = response.xpath(
            '//h2[contains(text(),"Overview")]/following-sibling::article'
        ).extract()
        item['overview_en'] = remove_class(overview)

        career = response.xpath(
            '//h2[contains(text(),"Career")]/following-sibling::p').extract()
        item['career_en'] = remove_class(career)

        rntry = response.xpath(
            '//h2[contains(text(),"Entry")]/following-sibling::p').extract()
        # Debug output of the numbers found in the entry-requirement text.
        # Raw string: '\d' and '\.' in a plain literal are invalid escape
        # sequences and trigger a warning on current Python versions.
        entry_numbers = re.findall(r'\d\.?\d?', ''.join(rntry))
        print(entry_numbers)
        item['rntry_requirements'] = remove_class(rntry)

        modules = response.xpath(
            '//h2[contains(text(),"Course")]/following-sibling::div').extract()
        item['modules_en'] = remove_class(modules)

        fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        # Fixed language requirements (not scraped from the page).
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'
        item['ielts'] = '6.5'
        item['ielts_desc'] = 'For Postgraduate courses, we usually ask for IELTS 6.5 (No less than 5.5 in any section) or equivalent.'

        item['toefl_desc'] = 'Overall score: 89 With no individual test score below: Listening: 17 Reading: 18 Speaking: 20 Writing : 17'
        item['toefl'] = '89'
        item['toefl_l'] = '17'
        item['toefl_s'] = '20'
        item['toefl_r'] = '18'
        item['toefl_w'] = '17'

        # clear_duration is a project helper; its result is indexed by the
        # 'duration' and 'duration_per' keys.
        length_text = response.xpath(
            '//li[contains(text(),"Length")]/span//text()').extract()
        duration = clear_duration(length_text)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        # tracslateDate is a project helper; its result is joined with commas,
        # so it is presumably a list of date strings -- verify against helper.
        start_date = response.xpath(
            '//li[contains(text(),"Start")]/span//text()').extract()
        item['start_date'] = ','.join(tracslateDate(start_date))

        item['department'] = ''.join(
            response.xpath(
                '//span[@id="department_name"]/text()').extract()).strip()

        # Missing pages are recorded for later inspection instead of being
        # emitted as items.
        if response.status == 404:
            print("****404****")
            with open("errorurl.txt", 'a+') as f:
                f.write(response.url + "\n")
        else:
            yield item

Example #23

Show file

    def parse(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)
        item['location'] = 'Newcastle'
        item['university'] = 'Northumbria University'
        item['url'] = response.url

        programme = response.xpath(
            '//div[@class="col-sm-6"]/h1/text()|//div[@class="hero-content"]/h1/text()|//header[@class="course-heading"]/h1/text()'
        ).extract()
        programme = ''.join(programme).strip()
        degree_name = re.findall('[A-Z]{2,}.*', programme)
        degree_name = ''.join(degree_name)
        if degree_name != programme:
            programme = programme.replace(degree_name, '')
        item['programme_en'] = programme
        item['degree_name'] = degree_name
        try:
            if degree_name[0] == 'M':
                item['degree_type'] = '2'
            elif degree_name[0] == 'P':
                item['degree_type'] = '3'
        except:
            pass

        dur = response.xpath(
            '//strong[contains(text(),"Mode")]/../text()|//span[contains(text(),"uration")]/../text()'
        ).extract()
        # print(dur)
        duration = clear_duration(dur)
        # print(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']
        item['teach_time'] = '1'

        start_date = response.xpath(
            '//strong[contains(text(),"Start")]/../text()|//span[contains(text(),"Start")]/../text()'
        ).extract()
        start_date = list(set(start_date))
        # print(start_date)
        start_date = tracslateDate(start_date)
        # print(start_date)
        start_date = ','.join(start_date)
        item['start_date'] = start_date

        deadline = response.xpath(
            '//span[contains(text(),"deadline")]/../text()').extract()
        deadline = list(set(deadline))
        # print(deadline)
        deadline = tracslateDate(deadline)
        # print(deadline)
        deadline = ''.join(deadline)
        item['deadline'] = deadline

        ielts = response.xpath(
            '//*[contains(text(),"IELTS")]/text()').extract()
        item['ielts_desc'] = ''.join(ielts).strip()
        ielts = get_ielts(ielts)
        try:
            if ielts != [] or ielts != {}:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
        except:
            pass
        if ielts == []:
            ielts = response.xpath(
                '//*[contains(text(),"English Language requirements")]/../text()'
            ).extract()
            ielts = get_ielts(ielts)
            try:
                if ielts != [] or ielts != {}:
                    item['ielts_l'] = ielts['IELTS_L']
                    item['ielts_s'] = ielts['IELTS_S']
                    item['ielts_r'] = ielts['IELTS_R']
                    item['ielts_w'] = ielts['IELTS_W']
                    item['ielts'] = ielts['IELTS']
            except:
                pass
            # print(ielts)

        overview = response.xpath(
            '//div[@id="tab-0"]//div[@class="rich-text"]|//h3[contains(text(),"Overview")]/following-sibling::p'
        ).extract()
        overview = remove_class(overview)
        # print(overview)
        item['overview_en'] = overview

        modules = response.xpath(
            '//div[@id="tab-1"]//div[@class="rich-text"]|//div[@id="modules"]'
        ).extract()
        modules = remove_class(modules)
        # print(modules)
        item['modules_en'] = modules

        rntry = response.xpath(
            '//*[contains(text(),"English Language requirements")]/..'
        ).extract()
        rntry = remove_class(rntry)
        # print(rntry)
        item['rntry_requirements'] = rntry

        howtoapply = response.xpath('//div[@id="how-to-apply"]').extract()
        howtoapply = remove_class(howtoapply)
        item['apply_proces_en'] = howtoapply

        department = response.xpath(
            '//strong[contains(text(),"Department")]/../text()').extract()
        department = ''.join(department).strip()
        item['department'] = department

        fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
        # print(fee)
        tuition_fee = getTuition_fee(fee)
        # print(tuition_fee)
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = '£'

        career = response.xpath(
            '//h1[contains(text(),"career")]/../following-sibling::div|//div[@id="tab-5"]'
        ).extract()
        career = remove_class(career)
        # print(career)
        item['career_en'] = career