Python tracslateDate Exemples, scrapySchool_England.middlewares.tracslateDate Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : LeedsBeckettUniversity_P.py Projet : histudent/python_spider

    def parse_main(self, response):
        """Scrape one Leeds Beckett University course page into an item.

        Reads the degree label, programme title, department, attendance
        mode, start date, duration, overview, entry requirements, IELTS
        scores, careers, modules and tuition fee from ``response`` via XPath,
        adds the fixed application-document / application-process texts and
        yields the populated item.

        Args:
            response: Downloaded course page; must expose ``url`` and
                ``xpath()``.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the fields above filled in.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'Leeds Beckett University'
        item['url'] = response.url
        # The location is a fixed value for this spider. (The page's
        # "Location" label used to be scraped here but was never used.)
        item['location'] = 'Leeds'

        degree_name = response.xpath(
            '//div[@class="course-hero__label"]/text()').extract()
        item['degree_name'] = ''.join(degree_name).strip()

        programme = response.xpath(
            '//h1[@class="course-hero__title"]/text()').extract()
        item['programme_en'] = ''.join(programme).strip()

        department = response.xpath(
            '//div[@class="course-hero__labels"]/a/text()').extract()
        item['department'] = ''.join(department)

        # teach_time is '1' when the attendance text mentions "full"
        # (case-insensitive), otherwise '2'.
        attendance = ''.join(response.xpath(
            '//div[contains(text(),"Attendance")]/following-sibling::div//text()'
        ).extract())
        item['teach_time'] = '1' if re.search('(?i)full', attendance) else '2'

        start_date = tracslateDate(response.xpath(
            '//div[contains(text(),"Start Date")]/following-sibling::div//text()'
        ).extract())
        # De-duplicate while keeping first-seen order; a plain set() made the
        # order of the joined dates vary from run to run.
        item['start_date'] = ','.join(dict.fromkeys(start_date))

        duration = clear_duration(response.xpath(
            '//div[contains(text(),"Duration")]/following-sibling::span//text()'
        ).extract())
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        overview = response.xpath(
            '//h2[contains(text(),"Overview")]/../following-sibling::div'
        ).extract()
        item['overview_en'] = remove_class(overview)

        rntry = response.xpath(
            '//h2[contains(text(),"Entry Requirements")]/../following-sibling::div'
        ).extract()
        item['rntry_requirements'] = remove_class(rntry)

        ielts = get_ielts(
            response.xpath('//div[@class="entry-ielts"]/text()').extract())
        # The old guard (`!= [] or != {}`) was always true; test emptiness
        # directly and only tolerate a missing key / non-mapping result.
        if ielts:
            try:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, TypeError):
                # Best effort: keep whatever scores were set before the miss.
                pass

        career = response.xpath(
            '//h3[contains(text(),"Careers")]/following-sibling::div'
        ).extract()
        item['career_en'] = remove_class(career)

        modules = response.xpath(
            '//div[@class="course-modules__table-modules"]'
            '//div[@class="course-modules__dropdowns"]').extract()
        item['modules_en'] = remove_class(modules)

        # Tuition fee: the largest pound amount found in any <div> text.
        # Accept both "£12500" and comma-grouped "£12,500"; the previous
        # pattern could not match the grouped form, so its comma stripping
        # never had any effect.
        fee_text = ''.join(
            response.xpath('//div[contains(text(),"£")]/text()').extract())
        amounts = [
            int(amount.replace(',', ''))
            for amount in re.findall(
                r'£(\d{1,3}(?:,\d{3})+|\d{3,})', fee_text)
        ]
        if amounts:
            item['tuition_fee'] = max(amounts)
        item['tuition_fee_pre'] = '£'

        apply_documents = [
            "Academic Certificates.",
            "Evidence of your English language ability (see below).",
            "A photocopy of your passport.",
            "A reference to support your application – either academic or professional.",
            "A completed Agent Consent Form (required if you are applying via or with the help of an agent).",
        ]
        item['apply_documents_en'] = '\n'.join(apply_documents)

        apply_process = [
            "Applying for a postgraduate course",
            "Once you have found the course you want to study in our online prospectus you will then click on the ‘Apply Now’ button located at the top of the online course page. ",
            "You will be asked to create an account on our application portal and complete your application via your Leeds Beckett account. Once you have submitted your application you should receive a decision within six weeks of applying. The exception to this is if the course you have applied for has a closing date specified. In this case, we will wait until the closing date has passed before we contact you",
        ]
        item['apply_proces_en'] = '\n'.join(apply_process)

        yield item

Exemple #2

0

Afficher le fichier

Fichier : UlsterUniversity_P.py Projet : histudent/python_spider

    def parse(self, response):
        """Scrape one Ulster University course page into an item.

        Splits the ``<h1>`` heading into programme name and degree name,
        derives ``degree_type`` from the degree's first letter, and fills in
        overview, modules, entry requirements, careers, start date, IELTS
        scores and the international tuition fee.

        Args:
            response: Downloaded course page; must expose ``url`` and
                ``xpath()``.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the fields above filled in.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)

        item['university'] = 'Ulster University'
        item['url'] = response.url
        item['location'] = 'Belfast'
        item['teach_time'] = '1'

        heading = ''.join(response.xpath('//h1//text()').extract()).strip()
        # Everything from a hyphen to the end of its line is taken as the
        # degree part; the remainder (minus any '*') is the programme name.
        degree_part = ''.join(re.findall('-.+', heading))
        programme = heading.replace(degree_part, '').replace('*', '').strip()
        degree = degree_part.replace('-', '').strip()
        item['programme_en'] = programme
        item['degree_name'] = degree
        # startswith() is safe on an empty degree string, so no try/except
        # is needed around the first-letter check.
        if degree.startswith('M'):
            item['degree_type'] = '2'
        elif degree.startswith('P'):
            item['degree_type'] = '3'

        overview = response.xpath(
            '//h2[contains(text(),"Overview")]/following-sibling::*'
        ).extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath('//div[@id="modules"]').extract()
        item['modules_en'] = remove_class(modules)

        rntry = response.xpath('//div[@id="entryconditions"]').extract()
        item['rntry_requirements'] = remove_class(rntry)

        career = response.xpath('//div[@id="opportunities"]').extract()
        item['career_en'] = remove_class(career)

        start_date = tracslateDate(response.xpath(
            '//h3[contains(text(),"Start dates")]/following-sibling::*//text()'
        ).extract())
        # De-duplicate in first-seen order (set() order varies between runs)
        # and join with ',' like the other spiders do; this one used '.'.
        item['start_date'] = ','.join(dict.fromkeys(start_date)).strip()

        ielts = get_ielts(
            response.xpath('//*[contains(text(),"IELTS")]//text()').extract())
        # The old guard (`!= [] or != {}`) was always true; test emptiness
        # directly and only tolerate a missing key / non-mapping result.
        if ielts:
            try:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, TypeError):
                # Best effort: keep whatever scores were set before the miss.
                pass

        fee = response.xpath(
            '//dt[contains(text(),"International:")]/following-sibling::dd/text()'
        ).extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        yield item

Exemple #3

0

Afficher le fichier

Fichier : UniversityCollegeLondon_P.py Projet : histudent/python_spider

    def parses(self, response):
        """Scrape one University College London programme page into an item.

        Fills in location, programme/degree name, attendance mode,
        department, overview, application window, tuition fee, duration,
        start date, English-language scores (IELTS and TOEFL, derived from
        the page's "Standard" / "Good" / "Advanced" level), entry
        requirements, the fixed China-equivalence text, modules and careers.

        Args:
            response: Downloaded programme page; must expose ``url`` and
                ``xpath()``.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the fields above filled in.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'University College London'
        item['url'] = response.url
        item['tuition_fee_pre'] = '£'

        location = response.xpath(
            '//div/strong[contains(text(),"Location")]/../text()').extract()
        item['location'] = ''.join(location).strip()

        programme = ''.join(
            response.xpath('//h1[@class="heading"]//text()').extract())
        # Degree tokens look like "MSc", "MA", "BSc"...; de-duplicate them in
        # first-seen order so the joined name is deterministic (set() is not).
        degree_tokens = re.findall('[MB][A-Z]{1,2}[a-z]*', programme)
        degree_name = ''.join(dict.fromkeys(degree_tokens)).strip()
        item['programme_en'] = programme.replace(degree_name, '')
        item['degree_name'] = degree_name
        item['degree_type'] = '2'

        # teach_time is 1 when any element's text contains "FT", otherwise 2.
        full_time_hits = response.xpath(
            '//*[contains(text(),"FT")]//text()').extract()
        item['teach_time'] = 1 if full_time_hits else 2

        department = response.xpath(
            '//h5[contains(text(),"Department website")]/following-sibling::p/a/text()'
        ).extract()
        item['department'] = ''.join(department).strip()

        overview = response.xpath(
            '//article[@class="article"]/h1/following-sibling::article/p[1]'
        ).extract()
        item['overview_en'] = remove_class(overview)

        application_open_date = tracslateDate(response.xpath(
            '//div[contains(text(),"Open")]/text()').extract())
        item['application_open_date'] = ','.join(
            dict.fromkeys(application_open_date))

        deadline = tracslateDate(response.xpath(
            '//div[contains(text(),"Close")]/text()').extract())
        item['deadline'] = ','.join(dict.fromkeys(deadline))

        item['tuition_fee'] = getTuition_fee(
            response.xpath('//*[contains(text(),"£")]//text()').extract())

        duration = clear_duration(response.xpath(
            '//h4[contains(text(),"uration")]/following-sibling::div/text()'
        ).extract())
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        start_date = tracslateDate(response.xpath(
            '//h4[contains(text(),"tarts")]/following-sibling::p//text()'
        ).extract())
        item['start_date'] = ','.join(dict.fromkeys(start_date))

        item['apply_fee'] = '75'
        item['apply_pre'] = '£'

        # The page names an English level; each level maps to a fixed
        # (IELTS description, TOEFL description) pair. Unknown or missing
        # levels yield empty descriptions and no scores.
        english_levels = {
            'Standard': (
                'Overall grade of 6.5 with a minimum of 6.0 in each of the subtests.',
                'Overall score of 92 with 24/30 in reading and writing and 20/30 in speaking and listening.',
            ),
            'Good': (
                'Overall grade of 7.0 with a minimum of 6.5 in each of the subtests.',
                'Overall score of 100 with 24/30 in reading and writing and 20/30 in speaking and listening.',
            ),
            'Advanced': (
                'Overall grade of 7.5 with a minimum of 6.5 in each of the subtests.',
                'Overall score of 109 with 24/30 in reading and writing and 20/30 in speaking and listening.',
            ),
        }
        eng_level = ''.join(response.xpath(
            '//p[contains(text(),"English language")]/strong/text()'
        ).extract()).strip()
        ielts, toefl = english_levels.get(eng_level, ('', ''))

        ieltss = get_ielts(ielts)
        if ieltss:
            item['ielts_l'] = ieltss['IELTS_L']
            item['ielts_s'] = ieltss['IELTS_S']
            item['ielts_r'] = ieltss['IELTS_R']
            item['ielts_w'] = ieltss['IELTS_W']
            item['ielts'] = ieltss['IELTS']

        # Pull the scores out of the TOEFL sentence, skipping the "/30"
        # denominators. The previous parser counted those denominators as
        # scores, so listening and writing were always stored as '30'.
        toefl_scores = re.findall(r'(?<!/)\b\d{1,3}\b', toefl)
        if len(toefl_scores) == 3:
            overall, reading_writing, speaking_listening = toefl_scores
            item['toefl'] = overall
            item['toefl_r'] = reading_writing
            item['toefl_w'] = reading_writing
            item['toefl_s'] = speaking_listening
            item['toefl_l'] = speaking_listening
        item['ielts_desc'] = ielts
        item['toefl_desc'] = toefl

        rntry_requirements = response.xpath(
            '//h4[contains(text(),"ntry")]/following-sibling::p[1]').extract()
        item['rntry_requirements'] = remove_class(rntry_requirements)

        chinese_reuqirement = [
            "<div>Equivalent qualifications for China",
            "Bachelor's degree with a minimum overall average mark of 80%. Please note that a number of programmes / departments will require higher marks.",
            "ALTERNATIVE QUALIFICATIONS",
            "Medical/ Dental/ Master's degree; Doctorate.</div>",
        ]
        item['require_chinese_en'] = '\n'.join(chinese_reuqirement)

        modules = response.xpath(
            '//h2[contains(text(),"About this")]/following-sibling::div'
        ).extract()
        item['modules_en'] = remove_class(modules)

        career = response.xpath(
            '//h2[contains(text(),"Career")]/following-sibling::div'
        ).extract()
        item['career_en'] = remove_class(career)

        yield item

Exemple #4

0

Afficher le fichier

Fichier : UniversityOfLeicester_P.py Projet : histudent/python_spider

    def parse_main(self, response):
        """Scrape a University of Leicester course page into one item per award.

        Page-level fields (department, overview, China requirements, entry
        requirements, tuition fee, careers, modules, assessment, IELTS and
        the TOEFL scores derived from the IELTS overall band) are collected
        once. The page's parallel "Course" / "Qualification" / "Duration" /
        "Start Dates" columns are then walked in lockstep and one item is
        yielded per row, skipping PGDip, PGCert and PGCE awards.

        Args:
            response: Downloaded course page; must expose ``url`` and
                ``xpath()``.

        Yields:
            A separate copy of the page-level item for each retained row,
            with programme, degree, duration, start date and teach_time set.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'University of Leicester'
        item['url'] = response.url
        item['tuition_fee_pre'] = '£'

        department = response.xpath(
            '//dt[contains(text(),"Department")]/following-sibling::dd/text()'
        ).extract()
        item['department'] = ''.join(department).strip()

        overview = response.xpath(
            '//h2[contains(text(),"Course description")]/following-sibling::*'
        ).extract()
        item['overview_en'] = remove_class(overview)

        chinese_require = [
            "<p>",
            "If you have completed a four-year Bachelors degree in China, you can be considered for entry to a Masters degree at Leicester. Our requirements depend on the rank of the university from which you graduated and your chosen Masters degree. The following is intended as a guide to our requirements:</p>",
            "<p>If you have graduated from a 'top 200' university in China, you may be asked for 70% overall if you are applying for an Engineering or Science degree, or 75% for an Arts, Humanities, Law or Social Science degree. You may need to have scores of at least 80% in modules that are particularly relevant to your chosen Master&rsquo;s degree. The School of Museum Studies requires at least 80% overall.</p>",
            "<p>If you graduated from a Chinese university ranked below the top 200 you may require higher scores (80-85%).</p>",
            "<p>If you have completed a three-year college diploma from a Chinese university, you will need to take an accepted one-year Pre-Masters course or upgrade your diploma to a Bachelor&rsquo;s degree before applying for a Master&rsquo;s degree.</p>",
        ]
        item['require_chinese_en'] = remove_class(chinese_require)

        rntry = response.xpath(
            '//h2[contains(text(),"Entry requirements")]/following-sibling::*'
        ).extract()
        # Strip the boilerplate "find your country" widget text from the
        # cleaned entry-requirements markup.
        rntry = remove_class(rntry).replace(
            'International Qualifications', ''
        ).replace('Countries list', '').replace(
            'Find your country in this list to check equivalent qualifications, scholarships and additional requirements.',
            '')
        item['rntry_requirements'] = rntry

        fee = response.xpath(
            '//h3[contains(text(),"International Students")]/following-sibling::*//text()'
        ).extract()
        item['tuition_fee'] = getTuition_fee(fee)

        career = response.xpath('//div[@id="careers"]').extract()
        item['career_en'] = remove_class(career)

        modules = response.xpath('//div[@id="course-structure"]').extract()
        item['modules_en'] = remove_class(modules)

        assessment = response.xpath(
            '//h2[contains(text(),"Teaching and learning")]/following-sibling::div'
        ).extract()
        item['assessment_en'] = remove_class(assessment)

        ielts = get_ielts(
            response.xpath('//*[contains(text(),"IELTS")]//text()').extract())
        # Guard against any empty result ([] / {} / None), not just [], and
        # tolerate a missing key instead of aborting the whole page.
        if ielts:
            try:
                item['ielts'] = ielts['IELTS']
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
            except (KeyError, TypeError):
                pass

        # Derive the TOEFL overall score from the IELTS overall band.
        # NOTE(review): reading item['ielts'] / item['toefl'] here relies on
        # get_item1 pre-populating every field - confirm against the helper.
        if item['ielts'] == 6.0:
            item['toefl'] = 80
        elif item['ielts'] == 6.5:
            item['toefl'] = 90
        elif item['ielts'] == 7.0:
            item['toefl'] = 100
        if item['toefl'] is not None:
            item['toefl_l'] = '17'
            item['toefl_s'] = '20'
            item['toefl_r'] = '18'
            item['toefl_w'] = '17'

        programme = response.xpath(
            '//span[contains(text(),"Course")]/following-sibling::span/text()'
        ).extract()
        degree_name = response.xpath(
            '//span[contains(text(),"Qualification")]/following-sibling::span/text()'
        ).extract()
        duration = response.xpath(
            '//span[contains(text(),"Duration")]/following-sibling::span/text()'
        ).extract()
        start_date = response.xpath(
            '//span[contains(text(),"Start Dates")]/following-sibling::span/text()'
        ).extract()
        if not start_date:
            # No start dates on the page: pad with blanks so zip() still
            # walks every programme row (a fixed 4-entry pad dropped rows
            # beyond the fourth).
            start_date = [''] * len(programme)

        skipped_awards = ('PGDip', 'PGCert', 'PGCE')
        for pro, deg, dur, sta in zip(programme, degree_name, duration,
                                      start_date):
            if deg in skipped_awards:
                continue
            # Yield an independent copy per row; re-yielding the one shared
            # item and mutating it afterwards can corrupt rows that are
            # still being processed downstream.
            course = item.copy()
            course['programme_en'] = pro
            course['degree_name'] = deg
            dura = clear_duration(dur)
            course['duration'] = dura['duration']
            course['duration_per'] = dura['duration_per']
            course['start_date'] = ','.join(tracslateDate(sta))
            # The duration text doubles as the attendance-mode indicator.
            if re.search('(?i)full', dur):
                course['teach_time'] = 'fulltime'
            else:
                course['teach_time'] = 'parttime'
            yield course

Exemple #5

0

Afficher le fichier

Fichier : HarperAdamsUniveristy_P.py Projet : histudent/python_spider

    def parses(self, response):
        # print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        Duration = response.xpath(
            '//h4[contains(text(),"Duration")]/following-sibling::p[1]//text()'
        ).extract()
        Duration = ''.join(Duration)
        if '1' in Duration:
            item['duration'] = 1
            item['duration_per'] = 1
        if 'full' in Duration:
            item['teach_time'] = 'fulltime'
        else:
            item['teach_time'] = 'parttime'

        StartDate = response.xpath(
            '//*[contains(text(),"Start")]/../text()').extract()
        try:
            StartDate = tracslateDate(StartDate)
            StartDate = ','.join(StartDate)
            item["start_date"] = StartDate
        except:
            pass
        Course = response.url.split('/')[-1]
        Course = Course.replace('-', ' ').title()
        EntryRequirements = response.xpath(
            '//div[@id="entry-requirements"]').extract()
        EntryRequirements = remove_class(EntryRequirements)
        EntryRequirements = clear_same_s(EntryRequirements)
        CourseOverview = response.xpath('//div[@id="overview"]').extract()
        CourseOverview = remove_class(CourseOverview)
        CourseOverview = clear_same_s(CourseOverview)
        Career = response.xpath('//div[@id="careers"]').extract()
        # if Career==[]:
        #     print(response.url)
        Career = remove_class(Career)
        Career = clear_same_s(Career)
        Assessment = response.xpath('//div[@id="teaching"]').extract()
        if Assessment == []:
            print(response.url)
        Assessment = remove_class(Assessment)
        Master = response.xpath(
            '//div[@class="page-heading"]/h2/text()').extract()
        Master = ''.join(Master)
        university = 'Harper Adams University'
        item['ielts'] = '6.0'
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'
        item['toefl_r'] = '18'
        item['toefl_l'] = '18'
        item['toefl_s'] = '22'
        item['toefl_w'] = '20'
        item['toefl'] = '80'
        item["university"] = university
        item["programme_en"] = Course
        item["degree_name"] = Master
        item["overview_en"] = CourseOverview
        item["assessment_en"] = Assessment
        item["career_en"] = Career
        item["tuition_fee"] = '12650'
        item['tuition_fee_pre'] = '£'
        item['apply_proces_en'] = remove_class([
            "<div>",
            "    	<div>",
            "	        <div>",
            "                <div>",
            "                    <div>",
            "                        <div>",
            "                        ",
            "                        	<p>Applying for university as an international student is similar to the process that UK students follow, but there are a few extra steps.</p>",
            "                            ",
            "                            <p>These include:</p>",
            "							<ul>",
            "                            	<li>Taking an <a>English language test</a></li>",
            "								<li>Applying for a <a>visa</a></li>",
            "								<li>Attending a pre-sessional course</li>",
            "                            </ul>",
            "							<p>To understand the general steps for applying to one of courses, take a look at our <a>How to apply</a> pages.</p>",
            "                        </div>",
            "                    </div><div>",
            "                     	<div>    ",
            "                            ",
            "                         </div>",
            "                    </div>",
            "                 </div>",
            "					",
            "			</div>",
            "		</div>",
            "	</div>",
            "    <div>",
            "        ",
            "        <div>",
            "            <div>",
            "                <div>",
            "                    <div>Before you apply</div>",
            "                </div>",
            "                <div>",
            "                    <div>",
            "                        <div>",
            "                            <div>",
            "                    			<div>",
            "                    				<div>",
            "                                    	",
            "                                        <p>To study on a course at Harper Adams, you'll need to meet the entry requirements listed on the <a>English language requirements</a> and you may need to take an English language test.</p>",
            "										<p>Like UK students, if you're applying for one of our undergraduate courses, you'll need to apply through the <a>complete an application form</a>.</p>",
            "									</div>",
            "                                </div><div>",
            "                                    <div>    ",
            "                                        ",
            "                                     </div>",
            "                                </div>",
            "                            </div>",
            "                        </div>",
            "                    </div>",
            "                </div>",
            "            </div>",
            "            <div>",
            "                <div>",
            "                    <div>After you apply</div>",
            "                </div>",
            "                <div>",
            "                    <div>",
            "                        <div>",
            "                            <div>",
            "                    			<div>",
            "                    				<div>",
            "                                    	",
            "                                        <p>We'll look at your application and decide if you meet the entry requirements. We may ask to interview you. We'll keep you updated about the status of your application by email or post.</p>",
            "										<p>If we accept your application, we'll send you either an unconditional or conditional offer. Unconditional offers mean you have been accepted to study on a course without any other requirements. A conditional offer means you'll have to give us some additional information or prove a qualification.</p>",
            "									</div>",
            "                                </div><div>",
            "                                    <div>    ",
            "                                        ",
            "                                     </div>",
            "                                </div>",
            "                            </div>",
            "                        </div>",
            "                    </div>",
            "                </div>",
            "            </div>",
            "            <div>",
            "                <div>",
            "                    <div>Before you arrive</div>",
            "                </div>",
            "                <div>",
            "                    <div>",
            "                        <div>",
            "                            <div>",
            "                    			<div>",
            "                    				<div>",
            "                                    	",
            "                                        <p>Depending on where you're coming from, you'll need to <a>visa pages</a> to find out more.</p>",
            "                                        <p>As part of the visa application process, you may need to submit a Confirmation of Acceptance for Studies (CAS) number or a similar letter that says you've been accepted to study here.</p>",
            "                                        <h3>Confirmation of Acceptance for Studies (CAS) number</h3>",
            "                                        <p>If you meet all of the conditions of your offer by the deadline printed on your offer letter, we'll give you a Confirmation of Acceptance for Studies (CAS) number. You'll need your CAS number to apply for your visa.</p>",
            "                                        <p>Your CAS number is unique to you and your place at Harper Adams. It can't be transferred to any other university. If you decide to withdraw your application, you must let us know so we can cancel your CAS number.</p>",
            "                                        <h3>Short-term study visa letters</h3>",
            "                                        <p>If you're applying for a course that requires a <a>short-term study visa</a>, and you've met any offer conditions we've set, we'll give you a letter that confirms we've accepted you. You'll need to submit this with your visa application. You may also need to show it when you enter the UK.</p>",
            "                                        <h3>Applying for accommodation</h3>",
            "                                        <p>You'll need to apply for <a>accommodation</a> before you arrive in the UK. We'll send you details of how to do this along with your offer letter. You'll need to tell the university in advance if you're bringing family to live with you.</p>",
            "									</div>",
            "                                </div><div>",
            "                                    <div>    ",
            "                                        ",
            "                                     </div>",
            "                                </div>",
            "                            </div>",
            "                        </div>",
            "                    </div>",
            "                </div>",
            "            </div>",
            "            <div>",
            "                <div>",
            "                    <div>When you arrive</div>",
            "                </div>",
            "                <div>",
            "                    <div>",
            "                        <div>",
            "                            <div>",
            "                    			<div>",
            "                    				<div>",
            "                                    	",
            "                                        <p>We'll let you know the date that you need to arrive by in your offer letter. You'll need to make arrangements to travel to the UK and get to Harper Adams by this date.</p>",
            "										<p>When you first arrive in the UK, you'll need to go through immigration controls. To help you get through immigration as quickly and easily as possible, you should:</p>",
            "                                        <ul>",
            "                                            <li>Not arrive before the start date of your visa</li>",
            "                                            <li>Make sure you've filled in a landing card (if required) and included details of a UK contact - this can be the university's address or the address of a landlord</li>",
            "                                            <li>Have your passport, CAS or offer letter, details of where you'll stay and proof that you have enough money to study here ready to show immigration officers</li>",
            "                                            <li>Make sure you know the conditions of your visa, when it expires, and the number of hours you are allowed to work</li>",
            "                                            <li>Declare any sums of cash over &euro;10,000 (or equivalent in your currency).</li>",
            "                                        </ul>",
            "                                        <p>To avoid any issues at immigration, you should not:</p>",
            "                                        <ul>",
            "                                            <li>Bring food or drink (such as meat, dairy products, fish, eggs, honey, fruit, vegetables or plants) with you.</li>",
            "                                            <li>Bring counterfeit goods, firearms, weapons or indecent/obscene material with you.</li>",
            "                                        </ul>",
            "                                        ",
            "                                        <p>More information on travelling through the UK border can be found at <a>www.gov.uk/government/publications/coming-to-the-uk/faster-travel-through-the-uk-border</a></p>",
            "									</div>",
            "                                </div><div>",
            "                                    <div>    ",
            "                                        ",
            "                                     </div>",
            "                                </div>",
            "                            </div>",
            "                        </div>",
            "                    </div>",
            "                </div>",
            "            </div>",
            "            <div>",
            "                <div>",
            "                    <div>After you arrive</div>",
            "                </div>",
            "                <div>",
            "                    <div>",
            "                        <div>",
            "                            <div>",
            "                    			<div>",
            "                    				<div>",
            "                                    	",
            "                                        <p>On your first day at Harper Adams, you'll need to bring your passport and visa (as well as any certificates or documents we've requested) so we can make a copy for our reference.</p>",
            "									</div>",
            "                                </div><div>",
            "                                    <div>    ",
            "                                        ",
            "                                     </div>",
            "                                </div>",
            "                            </div>",
            "                        </div>",
            "                    </div>",
            "                </div>",
            "            </div>",
        ])

        item["rntry_requirements"] = EntryRequirements
        item["url"] = response.url
        item['location'] = 'Edgmond'

        modu = response.xpath(
            '//div[@class="tabmenu"]/ul/li/a/@onclick').extract()
        mod = response.xpath(
            '//div[@class="tabmenu"]/ul/li/a/@title').extract()
        print(mod)
        print(modu)
        modules = []
        for i, j in zip(mod, modu):
            if 'M' in i:
                print('要这个专业的课程')
                print(i)
                id = re.findall('\d+', j)
                fullurl = 'https://www.harper-adams.ac.uk/shared/get-pg-route-modules.cfm?id=' + str(
                    id[0]) + '&year_of_entry=' + str(id[1]) + '&route=' + str(
                        id[2])
                print(fullurl)
                modre = etree.HTML(requests.get(fullurl).content).xpath(
                    '//div[@class="content-section-inner"]')
                ma = ''
                for mas in modre:
                    ma += etree.tostring(mas,
                                         method='html',
                                         encoding='unicode')
                # parMod=remove_class(ma)
                modules += ma

                # print(id)
            else:
                modules = ''
        # print(modules)
        item['modules_en'] = remove_class(modules)

        # print(item)
        yield item

Exemple #6

0

Afficher le fichier

Fichier : StaffordshireUniversity_P.py Projet : histudent/python_spider

    def parses(self, response):
        """Parse a Staffordshire University course page into one item.

        Reads the title, duration, start date, school, tuition fee, the main
        content sections and the IELTS requirements from ``response`` and
        yields a single populated ``ScrapyschoolEnglandItem1``.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'Staffordshire University'
        item['url'] = response.url
        item['location'] = 'Staffordshire'

        # The heading carries programme and degree together; the degree is
        # taken to be every run that starts with two consecutive capitals.
        title = ''.join(
            response.xpath(
                '//div[@class="col-sm-9"]/h1/text()|//div[@id="main"]//h1/text()'
            ).extract()).strip()
        degree = re.findall(r'[A-Z]{2}[/a-zA-Z\s]*', title)
        programme = title.replace(''.join(degree), '').strip()
        if not degree:
            # Nothing degree-like in the heading: use the hero sub-heading.
            degree = response.xpath(
                '//h2[@class="hero_header text-center"]/text()').extract()
        item['degree_name'] = ''.join(degree).strip()
        item['programme_en'] = programme

        # clear_duration is indexed with 'duration' and 'duration_per' below.
        duration = clear_duration(
            response.xpath(
                '//th[contains(text(),"Duration")]/following-sibling::td/text()|//dt[contains(text(),"Duration")]/following-sibling::dd[1]/text()'
            ).extract())
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        start_date = response.xpath(
            '//dt[contains(text(),"Academic year:")]/following-sibling::dd/text()'
        ).extract()
        if not start_date:
            # Alternative page layout: the start date sits in a table row.
            start_date = response.xpath(
                '//th[contains(text(),"Course start")]/following-sibling::td/text()'
            ).extract()
        item['start_date'] = ','.join(tracslateDate(start_date)).strip()

        department = response.xpath(
            '//th[contains(text(),"School")]/following-sibling::td/text()'
        ).extract()
        item['department'] = ''.join(department).strip()

        fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        overview = response.xpath(
            '//div[@id="key-features"]|'
            '//section[@class="course-details_section summary-section"]//div[@class="medium-8 medium-pull-4 large-pull-3 column"]'
        ).extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//div[@id="course-content"]|//section[@id="contents"]|//div[@id="course-summary"]'
        ).extract()
        item['modules_en'] = remove_class(modules)

        rntry = response.xpath(
            '//div[@id="course-entry-requirements"]|//section[@id="entry"]'
        ).extract()
        item['rntry_requirements'] = remove_class(rntry)

        career = response.xpath(
            '//div[@id="graduate-destinations"]|//section[@id="careers"]'
        ).extract()
        item['career_en'] = remove_class(career)

        ielts_text = ''.join(
            response.xpath(
                '//*[contains(text(),"IELTS")]//text()').extract()).strip()
        item['ielts_desc'] = ielts_text
        ielts = get_ielts(ielts_text)
        if ielts:
            try:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, TypeError):
                # Best effort: an incomplete or unexpected parse result
                # leaves the remaining IELTS fields untouched.
                pass

        assessment = response.xpath(
            '//a[contains(text(),"ssessment")]/../following-sibling::div[1]'
        ).extract()
        item['assessment_en'] = remove_class(assessment)

        yield item

Exemple #7

0

Afficher le fichier

    def parse_main(self, response):
        """Parse an Anglia Ruskin University course page into an item.

        Fills the item from ``response`` (title, locations, start date,
        duration, content sections, fee, department) plus fixed language
        requirements and application texts.  Entry requirements come either
        from the page itself or, when a course code is present, from the
        university's course-widget JSON endpoint.

        NOTE(review): the item is built but never yielded -- the original
        ended with a commented-out ``yield item``.  Confirm whether this
        callback is intentionally disabled before re-enabling it.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'Anglia Ruskin University'
        item['url'] = response.url
        item['teach_time'] = '1'

        # The <h1> text is split on CRLF; a four-part heading carries the
        # programme in part 1 and the degree in part 2.
        heading = ''.join(response.xpath('//h1/text()').extract()).split('\r\n')
        if len(heading) == 4:
            prog = heading[1].strip()
            item['degree_name'] = heading[2].strip()
        else:
            prog = ''.join(heading)
        item['programme_en'] = prog

        location = response.xpath('//div[@class="course-summary__text"]/p[@class="course-summary__locations"]/a/text()').extract()
        # set() drops repeated campus names before joining.
        item['location'] = ','.join(set(location))

        start_date = response.xpath('//div[@class="course-summary__text"]/p[@class="course-summary__entry"]/text()').extract()
        item['start_date'] = ','.join(tracslateDate(start_date))

        duration = response.xpath('//div[@class="course-summary__teaching"]/p[1]/text()').extract()
        try:
            duration = clear_duration(duration)
            item['duration'] = duration['duration']
            item['duration_per'] = duration['duration_per']
        except Exception:
            # Best effort: pages without a parsable duration keep going.
            pass

        overview = response.xpath('//div[@id="overview"]').extract()
        item['overview_en'] = remove_class(overview)

        career = response.xpath('//div[@id="careers"]').extract()
        item['career_en'] = remove_class(career)

        modules = response.xpath('//div[@id="modulesassessment"]').extract()
        # remove_class is applied twice, exactly as before; presumably it is
        # idempotent -- TODO confirm and drop the second call.
        modules = remove_class(modules)
        item['modules_en'] = remove_class(modules)

        # Fixed language requirements used for every course.
        item['ielts'] = '6.5'
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'
        item['ielts_desc'] = 'Our standard entry criteria for postgraduate courses is IELTS 6.5 or equivalent, with nothing lower than 5.5 in any of the four elements (listening, speaking, reading and writing).'
        item['toefl'] = '88'
        item['toefl_l'] = '17'
        item['toefl_s'] = '20'
        item['toefl_r'] = '18'
        item['toefl_w'] = '17'
        item['toefl_desc'] = "TOEFL iBT with 88 overall and a minimum of 17 in Writing and Listening, 18 in Reading and 20 in Speaking"

        fee = response.xpath('//div[@id="feesfunding"]//text()').extract()
        tuition_fee = getTuition_fee(fee)
        if tuition_fee == 2018:
            # Presumably a year picked up instead of a fee; store 0 instead.
            tuition_fee = 0
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = '£'

        # The department is derived from the last path segment of the
        # "Visit your ..." link, e.g. "some-faculty" -> "Some Faculty".
        department = response.xpath('//a[contains(text(),"Visit your")]/@href').extract()
        department = ''.join(department).split('/')[-1]
        item['department'] = department.title().replace('-', ' ')

        # Steps are listed in numerical order (4 and 5 used to be swapped).
        how_to_apply = [
            "<p>Step 1 - Choose your course</p>",
            "<p>Step 2 - Submit your application form</p>",
            "<p>Step 3 - Check your email regularly</p>",
            "<p>Step 4 - Receive our decision on your application</p>",
            "<p>Step 5 - Start your visa application</p>",
        ]
        item['apply_proces_en'] = '\n'.join(how_to_apply)

        apply_d = [
            "<ul><li>Qualification certificates and transcripts, including certified translations, where applicable</li>",
            "<li>A personal statement. You can download and complete our Personal Statement Form.</li>",
            "<li>References/recommendation letters</li>",
            "<li>Curriculum vitae/resume</li>",
            "<li>Passport</li>",
            "<li>Current and previous visa(s) (if applicable)</li>",
            "<li>Proof of name change (if applicable)</li>",
            "<li>Portfolio (if applicable)</li></ul>",
        ]
        item['apply_documents_en'] = '\n'.join(apply_d)

        courseid = response.xpath('//input[@id="erastracode"]/@value').extract()
        if not courseid or courseid == ['']:
            # No course code (input missing or empty): take the entry
            # requirements straight from the page.
            rntry = response.xpath('//h4[contains(text(),"ain")]/following-sibling::*').extract()
            item['rntry_requirements'] = remove_class(rntry)
        else:
            cid = re.findall('[A-Z0-9]+', courseid[0])
            courseid = '%20'.join(cid)
            rntry_url = 'https://www.anglia.ac.uk/api/coursewidget/multipleentryrequirements?academicYears=2017%2C2018&moaCode=FT&astraCode=' + courseid
            try:
                # A timeout keeps a stalled endpoint from hanging the crawl.
                payload = json.loads(requests.get(rntry_url, timeout=30).text)
                rntry_content = '<div>' + payload[0]['GroupItems'][0]['Text'][0] + '</div>'
            except Exception:
                # Network, JSON or shape errors all mean "no requirements".
                rntry_content = ''

            item['rntry_requirements'] = rntry_content

Exemple #8

0

Afficher le fichier

    def pro_parse(self, response):
        """Parse a London South Bank University course page into one item.

        The last breadcrumb is expected to look like ``"<programme> - <degree>"``.
        When it does, both parts are stored (whitespace-stripped) and the
        degree's first letter selects ``degree_type``; otherwise the whole
        breadcrumb text is used as the programme name.  The remaining fields
        are read from the page's tab sections and summary table.  Yields the
        populated item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)
        item['url'] = response.url
        item['university'] = 'London South Bank University'
        item['location'] = 'London'
        item['tuition_fee_pre'] = '£'

        crumbs = response.xpath('//div[@id="breadcrumbs"]//span/text()').extract()
        # An absent breadcrumb trail yields an empty title instead of an
        # IndexError.
        title = crumbs[-1] if crumbs else ''
        parts = title.split('-')
        if len(parts) == 2:
            programme = parts[0].strip()
            degree_type = parts[1].strip()
            item['degree_name'] = degree_type
            # startswith() is safe on an empty degree string.
            if degree_type.startswith('M'):
                item['degree_type'] = '2'
            elif degree_type.startswith('P'):
                item['degree_type'] = '3'
        else:
            # Keep the programme a string; the split result is a list.
            programme = title.strip()
        item['programme_en'] = programme

        fee = response.xpath(
            '//div[@id="tab_fees_and_funding"]//*[contains(text(),"£")]//text()'
        ).extract()
        item['tuition_fee'] = getTuition_fee(fee)

        overview = response.xpath('//div[@id="tab_overview"]').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath('//div[@id="tab_modules"]').extract()
        item['modules_en'] = remove_class(modules)

        career = response.xpath('//div[@id="tab_employability"]').extract()
        item['career_en'] = remove_class(career)

        rntry = response.xpath('//div[@id="tab_entry_requirements"]').extract()
        rntry = remove_class(rntry)
        item['rntry_requirements'] = rntry

        # IELTS scores are parsed out of the cleaned entry-requirements text.
        ielts = get_ielts(rntry)
        if ielts != [] and ielts != {}:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']

        apply_desc_en = response.xpath(
            '//div[@id="tab_how_to_apply"]').extract()
        item['apply_desc_en'] = remove_class(apply_desc_en)

        duration = response.xpath(
            '//td/span[contains(text(),"Duration")]/following-sibling::div/text()'
        ).extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        mode = response.xpath(
            '//td/span[contains(text(),"Mode")]/following-sibling::div/text()'
        ).extract()
        mode = ''.join(set(mode))
        # Any mention of "full" (case-insensitive) marks the course full-time.
        if re.findall('(?i)full', mode):
            item['teach_time'] = '1'
        else:
            item['teach_time'] = '2'

        start_date = response.xpath(
            '//td/span[contains(text(),"Start")]/following-sibling::div/text()'
        ).extract()
        try:
            start_date = list(set(tracslateDate(start_date)))
            # Each translated date is prefixed with the fixed year 2019.
            item['start_date'] = ','.join(
                ['2019' + '-' + i for i in start_date])
        except Exception:
            # Best effort: an unparsable start date is simply left unset.
            pass

        item['department'] = ''.join(
            response.xpath(
                '//a[contains(text(),"School of")]/text()').extract())
        yield item

Exemple #9

0

Afficher le fichier

 def parse_main(self, response):
     """Parse an Oxford Brookes University course page into one item.

     Reads title, degree, department, start date, duration/mode, content
     sections, fee and IELTS scores from ``response`` and yields the
     populated item.

     NOTE(review): ``teach_time`` is stored here as 'fulltime'/'parttime';
     confirm that downstream consumers expect these values.
     """
     print('进入一个详情页')
     item = get_item1(ScrapyschoolEnglandItem1)
     item['university'] = 'Oxford Brookes University'
     item['url'] = response.url
     # Oxford Brookes is based in Oxford (this used to say 'London').
     item['location'] = 'Oxford'

     programme = response.xpath('//h1/text()').extract()
     item['programme_en'] = ''.join(programme).strip()

     degree_name = response.xpath(
         '//h1/following-sibling::h2/text()').extract()
     item['degree_name'] = ''.join(degree_name).strip()

     department = response.xpath(
         '//h1/following-sibling::h2/following-sibling::p/a/text()'
     ).extract()
     item['department'] = ''.join(department).strip()

     start_date = response.xpath(
         '//h3[contains(text(),"Available")]/following-sibling::p[1]/text()'
     ).extract()
     item['start_date'] = ','.join(tracslateDate(start_date))

     duration = response.xpath(
         '//h3[contains(text(),"Course length")]/following-sibling::ul//text()'
     ).extract()
     # The course-length list doubles as the study-mode indicator.
     if re.findall('(?i)full', ''.join(duration)):
         item['teach_time'] = 'fulltime'
     else:
         item['teach_time'] = 'parttime'
     try:
         duration = clear_duration(duration)
         item['duration'] = duration['duration']
         item['duration_per'] = duration['duration_per']
     except Exception:
         # Best effort: pages without a parsable duration keep going.
         pass

     overview = response.xpath(
         '//h1/following-sibling::h2/following-sibling::p/following-sibling::*'
     ).extract()
     item['overview_en'] = remove_class(overview)

     modules = response.xpath('//div[@id="section-two"]').extract()
     item['modules_en'] = remove_class(modules)

     fee = response.xpath('//p[contains(text(),"£")]/text()').extract()
     item['tuition_fee'] = getTuition_fee(fee)
     item['tuition_fee_pre'] = '£'

     rntry = response.xpath('//div[@id="section-four"]').extract()
     item['rntry_requirements'] = remove_class(rntry)

     career = response.xpath('//div[@id="section-five"]').extract()
     item['career_en'] = remove_class(career)

     ielts_text = ''.join(
         response.xpath('//*[contains(text(),"IELTS")]/text()').extract())
     # Every "d.d" number near an IELTS mention is treated as a score.
     scores = re.findall(r'\d\.\d', ielts_text)
     if len(scores) == 3:
         # Three scores: first is the overall band, second is used for
         # reading/writing, third for listening/speaking.
         item['ielts'] = scores[0]
         item['ielts_l'] = scores[2]
         item['ielts_s'] = scores[2]
         item['ielts_r'] = scores[1]
         item['ielts_w'] = scores[1]
     elif scores:
         if len(scores) == 2:
             # NOTE(review): only the two-score case stores floats; every
             # other case stores the matched strings.  Kept as before.
             scores = list(map(float, scores))
         # Highest value is the overall band, lowest applies to each
         # component (a single score therefore fills all five fields).
         overall, lowest = max(scores), min(scores)
         item['ielts'] = overall
         item['ielts_l'] = lowest
         item['ielts_s'] = lowest
         item['ielts_r'] = lowest
         item['ielts_w'] = lowest
     yield item

Exemple #10

0

Afficher le fichier

Fichier : CityUniversityOfLondon_P.py Projet : histudent/python_spider

    def parse_main(self, response):
        """Fill an item from a City, University of London course page.

        Programme, degree name and department are taken from
        ``response.meta``; fee, content sections, IELTS scores, duration,
        study mode, start date and application text are read from the page.
        The item is populated in place; nothing is yielded or returned.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "City, University of London"
        item['url'] = response.url
        item['location'] = 'London'
        item['programme_en'] = response.meta['programme']
        item['degree_name'] = response.meta['degree_name']
        item['tuition_fee_pre'] = '£'
        item['teach_type'] = 'taught'
        # presumably meta['department'] is a list of names; set() removes
        # repeats before joining -- verify against the caller.
        item['department'] = ' '.join(set(response.meta['department']))

        fee_texts = response.xpath(
            '//h3[contains(text(),"Fee")]/../../following-sibling::div//text()'
        ).extract()
        fee_value = getTuition_fee(fee_texts)
        if fee_value == 0:
            # Nothing found under the fee heading: retry on any span that
            # contains a pound sign.
            fee_texts = response.xpath(
                '//span[contains(text(),"£")]//text()').extract()
            fee_value = getTuition_fee(fee_texts)
        item['tuition_fee'] = fee_value

        overview_html = response.xpath(
            '//h2[contains(text(),"Who is it")]/following-sibling::*|'
            '//h2[contains(text(),"Overview")]/following-sibling::*'
        ).extract()
        item['overview_en'] = clear_same_s(remove_class(overview_html))

        modules_html = response.xpath(
            '//h2[contains(text(),"Structure")]/following-sibling::*|'
            '//h2[contains(text(),"Modules")]/following-sibling::*'
        ).extract()
        item['modules_en'] = clear_same_s(remove_class(modules_html))

        entry_html = response.xpath(
            '//h3[contains(text(),"Entry")]/following-sibling::*|//div[@id="entryreq"]'
        ).extract()
        item['rntry_requirements'] = clear_same_s(remove_class(entry_html))

        scores = get_ielts(
            response.xpath(
                '//*[contains(text(),"IELTS")]//text()').extract())
        if not (scores == {} or scores == []):
            # Copy the parsed scores into the item, component by component.
            for field, key in (('ielts_l', 'IELTS_L'),
                               ('ielts_s', 'IELTS_S'),
                               ('ielts_r', 'IELTS_R'),
                               ('ielts_w', 'IELTS_W'),
                               ('ielts', 'IELTS')):
                item[field] = scores[key]

        career_html = response.xpath(
            '//h2[contains(text(),"Career")]/following-sibling::*').extract()
        item['career_en'] = clear_same_s(remove_class(career_html))

        duration_texts = response.xpath(
            '//span[contains(text(),"Duration")]/../following-sibling::div//text()|'
            '//h3[contains(text(),"Duration")]/following-sibling::*//text()'
        ).extract()
        # The duration text doubles as the study-mode indicator.
        is_full_time = re.search('(?i)full', ''.join(duration_texts)) is not None
        item['teach_time'] = '1' if is_full_time else '2'
        parsed_duration = clear_duration(duration_texts)
        item['duration'] = parsed_duration['duration']
        item['duration_per'] = parsed_duration['duration_per']

        start_texts = response.xpath(
            '//h3[contains(text(),"Start date")]/following-sibling::p/text()'
        ).extract()
        item['start_date'] = ','.join(tracslateDate(start_texts))

        apply_html = response.xpath(
            '//h3[contains(text(),"How to apply")]/following-sibling::*|//div[@id="howtoapply"]'
        ).extract()
        item['apply_proces_en'] = remove_class(apply_html)

        # Fixed guidance text for applicants holding Chinese qualifications.
        item['require_chinese_en'] = "<p>Applicants will be considered for most postgraduate courses with a good Chinese bachelor’s degree from a recognised University.Students who don’t meet the requirements for direct entry may have the option to undertake our Graduate Diploma programme at INTO City, which then offers the opportunity for guaranteed entry into City’s Masters programmes.</p>"

        assessment_html = response.xpath(
            '//h2[contains(text(),"Teaching and learning")]/following-sibling::*|//h3[contains(text(),"ssessment")]/following-sibling::*'
        ).extract()
        item['assessment_en'] = remove_class(assessment_html)

Exemple #11

0

Afficher le fichier

    def parse_main(self, response):
        """Parse a Middlesex University course page into one item.

        Splits the banner heading into programme and degree, then reads
        start date, duration/mode, fee, the main content sections and the
        IELTS requirements from ``response``.  Yields the populated item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)

        item['university'] = 'Middlesex University'
        item['url'] = response.url
        item['location'] = 'London'

        programme = ''.join(
            response.xpath('//div[@class="course-page-banner__texts"]/h1/text()').extract())
        # The degree is everything from the first run of two or more
        # capitals to the end of the line.
        degree_name = ''.join(re.findall('[A-Z]{2,}.*', programme))
        if degree_name != programme:
            programme = programme.replace(degree_name, '')
        item['programme_en'] = programme
        item['degree_name'] = degree_name
        # startswith() is safe on an empty degree, so no try/except needed.
        if degree_name.startswith('M'):
            item['degree_type'] = '2'
        elif degree_name.startswith('P'):
            item['degree_type'] = '3'

        start_date = response.xpath('//span[contains(text(),"Start")]/../following-sibling::div//text()').extract()
        item['start_date'] = ','.join(tracslateDate(start_date))

        duration = response.xpath('//span[contains(text(),"Duration")]/../following-sibling::div//text()').extract()
        # The duration text doubles as the study-mode indicator.
        mode = re.findall('(?i)full', ''.join(duration))
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']
        item['teach_time'] = '1' if mode else '2'

        fee = response.xpath('//span[contains(text(),"Fees")]/../following-sibling::div//text()').extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        overview = response.xpath('//h2[contains(text(),"Overview")]/following-sibling::*').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath('//h2[contains(text(),"Course content")]/following-sibling::*').extract()
        item['modules_en'] = remove_class(modules)

        rntry = response.xpath('//h2[contains(text(),"Entry requirements")]/following-sibling::*').extract()
        item['rntry_requirements'] = remove_class(rntry)

        ielts_text = ''.join(
            response.xpath('//p[contains(text(),"IELTS")]//text()').extract())
        item['ielts_desc'] = ielts_text
        ielts = get_ielts(ielts_text)
        if ielts:
            try:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, TypeError):
                # Best effort: an incomplete or unexpected parse result
                # leaves the remaining IELTS fields untouched.
                pass

        career = response.xpath('//h2[contains(text(),"Careers")]/following-sibling::*').extract()
        item['career_en'] = remove_class(career)

        yield item

Exemple #12

0

Afficher le fichier

Fichier : UniversityOfYork_P.py Projet : histudent/python_spider

    def parse(self, response):
        # print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        university = 'University of York'
        item['university'] = university
        item['url'] = response.url
        item['location'] = 'York'
        item['tuition_fee_pre'] = '£'
        start_date = response.xpath(
            '//h4[contains(text(),"Start date")]/following-sibling::p//text()'
        ).extract()
        start_date = tracslateDate(start_date)
        start_date = ','.join(start_date)
        item['start_date'] = start_date
        overview = response.xpath(
            '//div[@class="o-grid__box o-grid__box--half o-grid__box--half@medium"]|'
            '//h2[contains(text(),"verview")]/following-sibling::*|'
            '//h2[contains(text(),"At a glance")]/following-sibling::*|'
            '//h2[contains(text(),"Course summary")]/following-sibling::*|'
            '//h2[contains(text(),"At a Glance")]/following-sibling::*|'
            '//div[@id="mdcolumn"]/h1/following-sibling::*[position()<5]'
        ).extract()
        overview = remove_class(overview)
        item['overview_en'] = overview
        # print(overview)
        modules = response.xpath(
            '//div[@id="content_modules"]|'
            '//h2[contains(text(),"Course structure")]/following-sibling::*|'
            '//th[contains(text(),"Module")]/../../..|'
            '//h2[contains(text(),"ontent")]/following-sibling::*|'
            '//h3[contains(text(),"What does the course cover?")]/following-sibling::p[1]|'
            '//strong[contains(text(),"Course structure")]/../following-sibling::*[position()<=5]|'
            '//h2[contains(text(),"Structure and ethos")]/..|'
            '//h2[contains(text(),"Modules")]/following-sibling::*|'
            '//h2[contains(text(),"Structure and Ethos")]/following-sibling::*|'
            '//h2[contains(text(),"module")]/following-sibling::*').extract()
        modules = remove_class(modules)
        item['modules_en'] = modules
        # print(modules)
        tuition_fee = response.xpath(
            '//div[@id="fees"]/following-sibling::div[1]//*[contains(text(),"£")]//text()'
        ).extract()
        tuition_fee = getTuition_fee(tuition_fee)
        item['tuition_fee'] = tuition_fee
        # print(tuition_fee)
        assessment = response.xpath(
            '//h2[contains(text(),"Teaching and assessment")]/../../following-sibling::div[1]'
            '|//h2[contains(text(),"ssessment")]/following-sibling::*|'
            '//h2[contains(text(),"ssessment")]/following-sibling::*[position()<=5]|'
            '//strong[contains(text(),"Specialist training tailored to your interests and aspirations")]/../following-sibling::*|'
            '//span[contains(text(),"ssessment")]/../following-sibling::*[position()<=3]|'
            '//h3[contains(text(),"ssessment")]/following-sibling::*[position()<=3]|'
            '//strong[contains(text(),"SUMMER TERM")]/../following-sibling::*|'
            '//strong[contains(text(),"ssessment")]/../following-sibling::*[position()<=3]|'
            '//h2[contains(text(),"Teaching")]/following-sibling::*|'
            '//blockquote[@class="rightBox"]/following-sibling::*[1]|'
            '//h2[contains(text(),"Dissertation")]/following-sibling::p[1]|'
            '//p[contains(text(),"This programme aims: ")]/following-sibling::table[1]'
        ).extract()
        # if assessment==[]:
        #     print(response.url)
        assessment = remove_class(assessment)
        item['assessment_en'] = assessment
        # print(assessment)

        entry_requirements = response.xpath(
            '//div[@id="entry"]|'
            '//h2[contains(text(),"requirement")]/following-sibling::*|'
            '//h2[contains(text(),"pplicants")]/following-sibling::*|'
            '//h3[contains(text(),"Entry Requirements")]/following-sibling::*|'
            '//h2[contains(text(),"Entry")]/following-sibling::*[position()>1]|'
            '//h3[contains(text(),"International students")]/following-sibling::*|'
            '//h3[contains(text(),"Entry requirements")]/following-sibling::*[position()<4]|'
            '//h2[contains(text(),"English Language Requirements")]/following-sibling::*[position()<3]'
        ).extract()
        # if entry_requirements==[]:
        #     print(response.url)
        entry_requirements = remove_class(entry_requirements)
        item['rntry_requirements'] = entry_requirements
        # print(entry_requirements)

        ielts = response.xpath(
            '//*[contains(text(),"IELTS")]//text()').extract()
        ielts = get_ielts(ielts)
        if ielts != {} and ielts != []:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']
        toefl = response.xpath(
            '//*[contains(text(),"TOEFL")]//text()').extract()
        toefl = ''.join(toefl).strip()
        item['toefl_desc'] = toefl
        toefl = re.findall('\d{2,3}', toefl)
        if len(toefl) == 2:
            toefl = list(map(int, toefl))
            item['toefl'] = max(toefl)
            item['toefl_l'] = min(toefl)
            item['toefl_w'] = min(toefl)
            item['toefl_r'] = min(toefl)
            item['toefl_s'] = min(toefl)

        career = response.xpath(
            '//div[@class="o-grid__box o-grid__box--half"]|'
            '//h2[contains(text(),"areer")]/following-sibling::*|'
            '//h2[contains(text(),"Employment relevance")]/following-sibling::*|'
            '//p[contains(text(),"employment,")]/following-sibling::ul[1]|'
            '//p[contains(text(),"This programme aims: ")]/following-sibling::ul[1]|'
            '//h3[contains(text(),"areers")]/following-sibling::ul[1]|'
            '//h2[contains(text(),"Employment outcomes")]/following-sibling::*|'
            '//h3[contains(text(),"What can it lead to?")]/following-sibling::p[1]'
        ).extract()
        # if career==[]:
        #     print(response.url)
        career = remove_class(career)
        # print(career)
        item['career_en'] = career
        departnemt = response.xpath(
            '//h4[contains(text(),"Department")]/following-sibling::p//text()|//div[@id="location"]/h1//text()'
        ).extract()
        departnemt = ''.join(departnemt)
        item['department'] = departnemt
        # pro = response.meta['programme']
        # item['programme_en'] = pro
        # duration = response.meta['duration']
        # print(duration)
        # duration = clear_duration(duration)
        # item['duration'] = duration['duration']
        # item['duration_per'] = duration['duration_per']
        programme = response.xpath(
            '//div[@id="mdcolumn"]/h1/text()|//div[@class="c-figure__content c-figure__content--left c-figure__content--half"]/h1/text()'
        ).extract()
        # print(programme)
        clears = re.findall('[A-Za-z]+ in ', ''.join(programme))
        programme = ''.join(programme).replace(''.join(clears), '').strip()
        item['programme_en'] = programme
        duration = response.xpath(
            '//h4[contains(text(),"Length")]/following-sibling::p//text()'
        ).extract()
        # print(duration)
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        major_type1 = response.xpath(
            '//div[@class="c-figure__content c-figure__content--left c-figure__content--half"]/h1/text()|//div[@id="content-container"]//h1/text()'
        ).extract()
        major_type1 = ''.join(major_type1)
        item['major_type1'] = major_type1
        # if 'diploma' not in response.url:
        # print(response.url)
        # print(major_type1)
        degree_name = re.findall('[A-Z]{2}[a-zA-Z]*', major_type1)
        # print(degree_name)
        degree_name = '/'.join(degree_name).strip()
        item['degree_name'] = degree_name

Exemple #13

0

Afficher le fichier

Fichier : BirkbeckUniversityOfLondon_P.py Projet : histudent/python_spider

    def programme(self, response):
        """Build one item from a Birkbeck course page; yield it for full-time courses.

        Most fields are read from the page with XPath. The China-specific
        entry requirements and the TOEFL figures are hard-coded. The item is
        yielded only when the text under the "Duration" label contains
        "full" (case-insensitive); otherwise a console message is printed and
        nothing is yielded.

        :param response: the downloaded course page.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)

        # The parenthesised part of the <h1> is taken as the degree name; it
        # and whatever ':.*' matches are removed from the programme name.
        title = ''.join(response.xpath('//h1/text()').extract())
        deg = ''.join(re.findall(r'\(.*\)', title))
        clears = ''.join(re.findall(r':.*', title))
        item['programme_en'] = title.replace(clears, '').replace(deg, '').strip()
        item['degree_name'] = deg.replace('(', '').replace(')', '').strip()
        item['url'] = response.url

        start_date = response.xpath('//dt[contains(text(),"tart date")]/following-sibling::dd[1]//text()').extract()
        item['start_date'] = ','.join(tracslateDate(start_date))
        item['university'] = 'Birkbeck, University of London'
        item['location'] = ''.join(
            response.xpath('//dt[contains(text(),"ocation")]/following-sibling::dd[1]//text()').extract())

        duration_text = ''.join(
            response.xpath('//dt[contains(text(),"uration")]/following-sibling::dd[1]//text()').extract())
        # Non-empty only when the duration text mentions a full-time option.
        mode = re.findall('(?i)full', duration_text)
        # Only the fragment leading up to "full" is handed to clear_duration,
        # which is indexed below by 'duration' and 'duration_per'.
        dura = clear_duration(re.findall(r'[a-zA-Z0-9\s]+full', duration_text))
        item['duration'] = dura['duration']
        item['duration_per'] = dura['duration_per']

        overview = response.xpath('//h2[contains(text(),"Highlights")]/preceding-sibling::div[1]').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath('//h2[contains(text(),"Course structure")]/following-sibling::section').extract()
        item['modules_en'] = remove_class(modules)

        entry = response.xpath('//h2[contains(text(),"ntry requirements")]/following-sibling::*').extract()
        item['rntry_requirements'] = remove_class(entry)

        # Hard-coded China-specific entry requirements. The second element is
        # written as two adjacent literals joined by an explicit "\n" so the
        # text keeps its line break without an unterminated string literal.
        chinese = [
            '<h3 class="content-show">Postgraduate entry requirements</h3>',
            "<ul><li>Please <a>check your postgraduate course online</a> to see if your programme of study has an entry requirement of a UK undergraduate degree with a 2:1 or a 2:2 classification. </li><li>To study a Master's degree that requires a UK undergraduate degree with a <strong>2:2 classification</strong>, you will typically need to have one of the following:</li><ul><li>a Bachelor's degree (<i>Xueshi</i><span>) from a 211, 985 or top national university with an overall average grade of 70% </span></li><li>a Bachelor's degree from a national university with an overall average grade of 75% </li><li>a Bachelor's degree from a high-ranking private university with an overall average grade of 75% </li><li>a Master's degree with an overall average grade of 60%. </li></ul><li>To study a Master's degree that requires a UK undergraduate degree with a <strong>2:1 classification</strong>, you will typically need to have one of the following: </li><ul><li>a Bachelor's degree (<i>Xueshi</i><span>) from a 211, 985 or top national university with an overall average grade of 75% </span></li><li>a Bachelor's degree from a national university with an overall average grade of 80% </li><li>a Bachelor's degree from a high-ranking private university with an overall average grade of 80% </li><li>a Master's degree with an overall average grade of 70%. </li></ul><li>If you do not meet these criteria, you can apply for Birkbeck’s <a>International Foundation Programme</a><span>, which acts as a bridge between undergraduate and postgraduate study, preparing students to study a Master’s degree in the UK. \n"
            "There are progression pathways onto various courses at Birkbeck.</span></li><li>Another option is the <a>Master's Foundation programme</a>, at our partner provider OnCampus London, which is available for two- or three-term progression onto a wide range of Master’s Degrees at Birkbeck.</li><li>If your transcript is provided in GPA format and not a percentage value, <a>please contact our International Office</a> to check your equivalency. For most institutions: </li><ul><li>80% is equivalent to 4/5 or 3.3/4 </li><li>75% is equivalent to 3.5/5 or 2.7/4. </li></ul>",
        ]
        item['require_chinese_en'] = remove_class(chinese)

        # Section scores follow the description text: Reading 22,
        # Listening 21, Speaking 23, Writing 24.
        item['toefl_desc'] = 'overall score of 92, with 22 in Reading, 21 in Listening, 23 in Speaking, 24 in Writing.'
        item['toefl_l'] = '21'
        item['toefl_s'] = '23'
        item['toefl_r'] = '22'
        item['toefl_w'] = '24'

        ielts = response.xpath('//*[contains(text(),"IELTS")]//text()').extract()
        ies = re.findall(r'\d\.?\d?', ''.join(ielts))
        # Scores are filled in only when exactly two numbers are found: the
        # larger is used as the overall band, the smaller for every section.
        if len(ies) == 2:
            ies = list(map(float, ies))
            item['ielts'] = max(ies)
            item['ielts_l'] = min(ies)
            item['ielts_s'] = min(ies)
            item['ielts_r'] = min(ies)
            item['ielts_w'] = min(ies)
        item['ielts_desc'] = '\n'.join(ielts).strip()

        assessment = response.xpath('//h2[contains(text(),"Assessment")]/following-sibling::*').extract()
        item['assessment_en'] = remove_class(assessment)

        department = response.xpath('//a[contains(text(),"isit the")]/text()').extract()
        item['department'] = ''.join(department).replace('Visit the', '').strip()

        howtoapply = response.xpath('//h2[contains(text(),"How to apply")]/following-sibling::*').extract()
        item['apply_proces_en'] = remove_class(howtoapply)

        if mode:
            print('这个专业要')
            yield item
        else:
            # No full-time mention in the duration text: drop the page.
            print('这个专业只有兼职，不要！！！')

Exemple #14

0

Afficher le fichier

Fichier : UniversityForTheCreativeArts_P.py Projet : histudent/python_spider

    def parse(self, response):
        """Build and yield one item from a University for the Creative Arts course page.

        Programme, duration, campus, start month, overview, modules, careers,
        entry requirements and portfolio text are read from the page with
        XPath. IELTS scores, tuition fee and deadline are hard-coded.

        :param response: the downloaded course page.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)

        item['university'] = 'University for the Creative Arts'
        item['url'] = response.url

        item['programme_en'] = ''.join(response.xpath('//h1/text()').extract())

        # The paragraph after the <h1> is split on "-": with three parts the
        # first one is the degree name, with four parts the course is
        # recorded as a pre-degree.
        degr = ''.join(
            response.xpath('//h1/following-sibling::p[1]/text()').extract()
        ).split('-')
        if len(degr) == 3:
            degree_name = degr[0]
            item['degree_name'] = degree_name
            # startswith() is safe on an empty degree name, so no exception
            # handling is needed here.
            if degree_name.startswith('M'):
                item['degree_type'] = '2'
            elif degree_name.startswith('P'):
                item['degree_type'] = '3'
        elif len(degr) == 4:
            item['degree_name'] = 'Pre-degree'
            item['degree_type'] = '2'

        duration = clear_duration(
            response.xpath(
                '//p[contains(text(),"Length of study")]/following-sibling::p/text()'
            ).extract())
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        item['location'] = ''.join(
            response.xpath(
                '//p[contains(text(),"Campus")]/following-sibling::p/text()'
            ).extract())

        start_date = tracslateDate(
            response.xpath(
                '//p[contains(text(),"Start month")]/following-sibling::p/text()'
            ).extract())
        item['start_date'] = ','.join(start_date)

        overview = response.xpath('//div[@class="cell overview"]').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//div[@id="syllabus"]/following-sibling::section[@class="article-content-area"][1]'
        ).extract()
        item['modules_en'] = remove_class(modules)

        career = response.xpath(
            '//div[contains(text(),"Career")]/following-sibling::div').extract()
        item['career_en'] = remove_class(career)

        item['ielts'] = '6'
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'

        # NOTE(review): a hard-coded China-specific requirements text used to
        # be built here and was overwritten by the scraped value before being
        # used; if it is wanted, it presumably belongs in
        # item['require_chinese_en'] - confirm.
        rntry = response.xpath(
            '//h3[contains(text(),"UK entry requirements")]/following-sibling::*'
        ).extract()
        item['rntry_requirements'] = remove_class(rntry)

        portfolio = response.xpath(
            '//h3[contains(text(),"Your portfolio")]/following-sibling::*'
        ).extract()
        item['portfolio_desc_en'] = remove_class(portfolio)

        item['tuition_fee'] = '13540'
        item['tuition_fee_pre'] = '£'

        item['deadline'] = '2019-3'

        yield item

Exemple #15

0

Afficher le fichier

Fichier : UniversityofBedfordshire_P.py Projet : histudent/python_spider

    def parse_main(self, response):
        """Build and yield one item from a University of Bedfordshire course page.

        Page sections are read with XPath. Tuition fee, IELTS/TOEFL scores
        and the application instructions are hard-coded.

        :param response: the downloaded course page.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)

        item['university'] = 'University of Bedfordshire'
        item['url'] = response.url

        title = ''.join(
            response.xpath('//div[@id="inner-course-content"]/h1/text()').extract())

        # Flat fees: one figure for MBA programmes, another for the rest.
        item['tuition_fee_pre'] = '£'
        item['tuition_fee'] = '14000' if 'MBA' in title else '12750'

        # A title with exactly one "-" is read as "<programme> - <degree>".
        parts = title.split('-')
        if len(parts) == 2:
            prog = parts[0].strip()
            degr = parts[1].strip()
            item['degree_name'] = degr
            # startswith() is safe on an empty degree string.
            if degr.startswith('M'):
                item['degree_type'] = '2'
            elif degr.startswith('P'):
                item['degree_type'] = '3'
        else:
            # Keep the title as written: re-joining the split pieces would
            # silently delete every hyphen from the programme name.
            prog = title.strip()
        item['programme_en'] = prog

        location = response.xpath('//strong[contains(text(),"Campus Location")]/../text()').extract()
        item['location'] = ''.join(location).replace('-', '').strip()

        duration = clear_duration(
            response.xpath('//strong[contains(text(),"Duration")]/../text()').extract())
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        # teach_time is '1' when the attendance text mentions "full"
        # (case-insensitive), '2' otherwise.
        attendance = ''.join(
            response.xpath('//strong[contains(text(),"Attendance")]/../text()').extract())
        if re.findall('(?i)full', attendance):
            item['teach_time'] = '1'
        else:
            item['teach_time'] = '2'

        start_date = tracslateDate(
            response.xpath('//strong[contains(text(),"Start")]/../text()').extract())
        item['start_date'] = ','.join(start_date)

        item['overview_en'] = remove_class(
            response.xpath('//div[@id="why_content"]').extract())

        item['modules_en'] = remove_class(
            response.xpath('//div[@id="unit_content"]').extract())

        item['assessment_en'] = remove_class(
            response.xpath('//div[@id="how_content"]').extract())

        rntry = response.xpath(
            '//h2[@id="entry"]/following-sibling::div/ul[@class="tab-content"]/div[3]').extract()
        item['rntry_requirements'] = remove_class(rntry)

        item['ielts'] = '6.0'
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'
        item['toefl_l'] = '17'
        item['toefl_s'] = '20'
        item['toefl_r'] = '18'
        item['toefl_w'] = '17'

        item['career_en'] = remove_class(
            response.xpath('//div[@id="career_content"]').extract())

        # Hard-coded application instructions, stored as two fragments that
        # are joined with a newline.
        apply_d = [
            '<p>There are two ways you can make a direct application to the University of Bedfordshire:</p><ul><li><a href="https://evision.beds.ac.uk/urd/sits.urd/run/siw_ipp_lgn.login?process=siw_ipp_app&amp;code1=OA_FORM&amp;code2=0007">Apply online now for 2017/18</a> Courses starting from 1 August 2017 to 31 July 2018</li><li>Download <span class="include_asset_summary"><a href="https://www.beds.ac.uk/__data/assets/pdf_file/0006/441798/International-Application-web-2018.pdf">an application form - <img src="https://www.beds.ac.uk/__data/asset_types/pdf_file/icon.png" alt="" title="" height="16" width="16"  class="sq-icon" /> PDF  1.0 MB ',
            '</a></span> and submit it to our <a href="https://www.beds.ac.uk/international/international-applications/contactus">Admissions Team</a> along with scans of your supporting documents, via email, post or in person at the International Office.</li></ul><p>You can post your completed form to:</p><p>University of Bedfordshire International Admissions/International Office/University Square/Luton/Bedfordshire/LU1 3JU/United Kingdom</p><h4>Please note</h4><ul><li><strong>BSc (Hons) Nursing Studies</strong> Level 3 and <strong>MSc Advanced Nursing Studies</strong> are available to overseas students - please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a></li><li><strong>Healthcare, Nursing and Midwifery students</strong> - many of these courses are not available to overseas students due to UK immigration law in regard to bursary funding. Please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a> to find out if you are eligible to apply.</li></ul><p>*Please note that international students studying on a Tier 4 Student Visa must choose a full-time Undergraduate or Postgraduate course and are not eligible for part-time study.</p><p>Watch some more tips and advice on making your application to Bedfordshire:</p>',
        ]
        item['apply_documents_en'] = '\n'.join(apply_d)

        yield item

Exemple #16

0

Afficher le fichier

Fichier : BuckinghamshireNewUniversity_P.py Projet : histudent/python_spider

    def parses(self, response):
        """Build and yield one item from a Buckinghamshire New University course page.

        Location, programme, degree, duration, start date and the main text
        sections are read from the page with XPath. Tuition fee, IELTS
        scores, the China-specific requirements and the application guidance
        are hard-coded.

        :param response: the downloaded course page.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['url'] = response.url
        item['university'] = 'Buckinghamshire New University'

        location = response.xpath(
            '//ul[@class="course-details"]/li[contains(text(),"Location")]/text()'
        ).extract()
        location = ''.join(location).replace('Location:', '').strip()

        programme = response.xpath(
            '//h1[@class="banner-title"]/text()').extract()
        item['programme_en'] = ''.join(programme).strip()

        degree_name = response.xpath(
            '//p[@class="school-code"]/text()').extract()
        item['degree_name'] = ''.join(degree_name).strip()
        item['location'] = location

        duration = clear_duration(
            response.xpath(
                '//ul[@class="course-details"]/li[contains(text(),"Duration")]/text()'
            ).extract())
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        start_date = tracslateDate(
            response.xpath(
                '//ul[@class="course-details"]/li[contains(text(),"Start Date")]/text()'
            ).extract())
        # Store the normalised dates; previously they were computed and then
        # dropped, leaving the field unset for this university.
        item['start_date'] = ','.join(start_date)

        overview = response.xpath(
            '//h2[contains(text(),"Course Overview")]/..').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//h2[contains(text(),"Course Modules")]/..').extract()
        item['modules_en'] = remove_class(modules)

        career = response.xpath(
            '//h2[contains(text(),"Employability")]/..').extract()
        item['career_en'] = remove_class(career)

        entry = response.xpath(
            '//h3[contains(text(),"What are the course entry requirements?")]/following-sibling::p[position()<=3]'
        ).extract()
        # Console diagnostics: the URL when nothing matched, the raw match
        # otherwise.
        if entry == []:
            print(response.url)
        else:
            print(entry)
        item['rntry_requirements'] = remove_class(entry)
        item['tuition_fee'] = '11500'

        chi = [
            ' <div>  ',
            ' <p>Academic entry requirements</p ><p>We require successful completion of a 学士学位 (Bachelor degree) or successful completion of a three-year 本科毕业证书 (Benke) with an overall pass from a UK NARIC-recognised or Ministry of Education-listed institution.</p ><p>Mathematics entry requirements</p ><p>Students need the equivalent of GCSE Mathematics grade C/4.</p >  ',
            ' </div>  ',
        ]
        htp = [
            '<p>There&rsquo;s still time to apply for September 2018. Visit our <a hre>clearing section</a> to find out more.</p><p><strong>Check you meet the entry requirements</strong></p><p>Once you&rsquo;ve had a good look at our course information, and chosen which one feels right for you, before applying it&rsquo;s worth checking that you meet the entry requirements for your country.</p><p>We welcome applications from students with a wide range of qualifications from around the world. You&rsquo;ll find details of the exact academic and English language requirements for your country on our <a hre>country pages</a>.</p><p>Every student studying with us also needs to meet our <a hre>English language requirements</a> and we will ask you to provide evidence to show you have good enough English to study a higher education course in the UK.</p><p><strong>Different ways to apply</strong></p><p>When you are ready to apply for your course, you can do so in one of three ways:</p><ul><li>directly through our <a href="https://www.applycpd.com/bucks?tabid=21">application portal</a></li><li>through <a hre>UCAS</a>, or</li><li>through a recruitment agent in your country (see <a hre>your country page</a> for details of agents we work with who are operating locally to you).</li></ul><p>It doesn&rsquo;t matter which of these routes you use, but we advise you to apply early to give yourself enough time to prepare for moving to the UK and arranging your visa, if you need one.</p><p>If you&rsquo;ve missed out on your first choices, declined any offers made to you, or you&rsquo;re applying to university after&nbsp;30 June, you can also apply to us through <a hre>Clearing</a>.</p>',
        ]
        item['require_chinese_en'] = remove_class(chi)
        item['apply_desc_en'] = remove_class(htp)

        item['ielts'] = '6.0'
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'

        yield item

Exemple #17

0

Afficher le fichier

Fichier : ManchesterMetropolitanUniversity_P.py Projet : histudent/python_spider

    def parses(self, response):
        """Build and yield one item from a Manchester Metropolitan University course page.

        A 404 response is not parsed: its URL is appended to ``errorurl.txt``
        and nothing is yielded. For any other status the page sections are
        read with XPath, while location, degree type and the IELTS/TOEFL
        figures are hard-coded.

        :param response: the downloaded course page.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print('开始下载', response.url, '的数据')

        # Bail out before doing any extraction on an error page.
        if response.status == 404:
            print("****404****")
            with open("errorurl.txt", 'a+') as f:
                f.write(response.url + "\n")
            return

        item['university'] = 'Manchester Metropolitan University'
        item['url'] = response.url
        item['location'] = 'Manchester'

        degree_name = response.xpath('//h1/span/text()').extract()
        item['degree_name'] = ''.join(degree_name)

        programme = response.xpath('//h1/text()').extract()
        item['programme_en'] = ''.join(programme).strip()
        item['degree_type'] = 2

        overview = response.xpath(
            '//h2[contains(text(),"Overview")]/following-sibling::article'
        ).extract()
        item['overview_en'] = remove_class(overview)

        career = response.xpath(
            '//h2[contains(text(),"Career")]/following-sibling::p').extract()
        item['career_en'] = remove_class(career)

        rntry = response.xpath(
            '//h2[contains(text(),"Entry")]/following-sibling::p').extract()
        # Console diagnostic only: numbers found in the entry text are
        # printed and not stored on the item.
        entry_numbers = re.findall(r'\d\.?\d?', ''.join(rntry))
        print(entry_numbers)
        item['rntry_requirements'] = remove_class(rntry)

        modules = response.xpath(
            '//h2[contains(text(),"Course")]/following-sibling::div').extract()
        item['modules_en'] = remove_class(modules)

        fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'
        item['ielts'] = '6.5'
        item['ielts_desc'] = 'For Postgraduate courses, we usually ask for IELTS 6.5 (No less than 5.5 in any section) or equivalent.'

        item['toefl_desc'] = 'Overall score: 89 With no individual test score below: Listening: 17 Reading: 18 Speaking: 20 Writing : 17'
        item['toefl'] = '89'
        item['toefl_l'] = '17'
        item['toefl_s'] = '20'
        item['toefl_r'] = '18'
        item['toefl_w'] = '17'

        duration = clear_duration(
            response.xpath(
                '//li[contains(text(),"Length")]/span//text()').extract())
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        start_date = tracslateDate(
            response.xpath(
                '//li[contains(text(),"Start")]/span//text()').extract())
        item['start_date'] = ','.join(start_date)

        item['department'] = ''.join(
            response.xpath(
                '//span[@id="department_name"]/text()').extract()).strip()

        yield item

Exemple #18

0

Afficher le fichier

    def parse(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)
        item['location'] = 'Newcastle'
        item['university'] = 'Northumbria University'
        item['url'] = response.url

        programme = response.xpath(
            '//div[@class="col-sm-6"]/h1/text()|//div[@class="hero-content"]/h1/text()|//header[@class="course-heading"]/h1/text()'
        ).extract()
        programme = ''.join(programme).strip()
        degree_name = re.findall('[A-Z]{2,}.*', programme)
        degree_name = ''.join(degree_name)
        if degree_name != programme:
            programme = programme.replace(degree_name, '')
        item['programme_en'] = programme
        item['degree_name'] = degree_name
        try:
            if degree_name[0] == 'M':
                item['degree_type'] = '2'
            elif degree_name[0] == 'P':
                item['degree_type'] = '3'
        except:
            pass

        dur = response.xpath(
            '//strong[contains(text(),"Mode")]/../text()|//span[contains(text(),"uration")]/../text()'
        ).extract()
        # print(dur)
        duration = clear_duration(dur)
        # print(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']
        item['teach_time'] = '1'

        start_date = response.xpath(
            '//strong[contains(text(),"Start")]/../text()|//span[contains(text(),"Start")]/../text()'
        ).extract()
        start_date = list(set(start_date))
        # print(start_date)
        start_date = tracslateDate(start_date)
        # print(start_date)
        start_date = ','.join(start_date)
        item['start_date'] = start_date

        deadline = response.xpath(
            '//span[contains(text(),"deadline")]/../text()').extract()
        deadline = list(set(deadline))
        # print(deadline)
        deadline = tracslateDate(deadline)
        # print(deadline)
        deadline = ''.join(deadline)
        item['deadline'] = deadline

        ielts = response.xpath(
            '//*[contains(text(),"IELTS")]/text()').extract()
        item['ielts_desc'] = ''.join(ielts).strip()
        ielts = get_ielts(ielts)
        try:
            if ielts != [] or ielts != {}:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
        except:
            pass
        if ielts == []:
            ielts = response.xpath(
                '//*[contains(text(),"English Language requirements")]/../text()'
            ).extract()
            ielts = get_ielts(ielts)
            try:
                if ielts != [] or ielts != {}:
                    item['ielts_l'] = ielts['IELTS_L']
                    item['ielts_s'] = ielts['IELTS_S']
                    item['ielts_r'] = ielts['IELTS_R']
                    item['ielts_w'] = ielts['IELTS_W']
                    item['ielts'] = ielts['IELTS']
            except:
                pass
            # print(ielts)

        overview = response.xpath(
            '//div[@id="tab-0"]//div[@class="rich-text"]|//h3[contains(text(),"Overview")]/following-sibling::p'
        ).extract()
        overview = remove_class(overview)
        # print(overview)
        item['overview_en'] = overview

        modules = response.xpath(
            '//div[@id="tab-1"]//div[@class="rich-text"]|//div[@id="modules"]'
        ).extract()
        modules = remove_class(modules)
        # print(modules)
        item['modules_en'] = modules

        rntry = response.xpath(
            '//*[contains(text(),"English Language requirements")]/..'
        ).extract()
        rntry = remove_class(rntry)
        # print(rntry)
        item['rntry_requirements'] = rntry

        howtoapply = response.xpath('//div[@id="how-to-apply"]').extract()
        howtoapply = remove_class(howtoapply)
        item['apply_proces_en'] = howtoapply

        department = response.xpath(
            '//strong[contains(text(),"Department")]/../text()').extract()
        department = ''.join(department).strip()
        item['department'] = department

        fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
        # print(fee)
        tuition_fee = getTuition_fee(fee)
        # print(tuition_fee)
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = '£'

        career = response.xpath(
            '//h1[contains(text(),"career")]/../following-sibling::div|//div[@id="tab-5"]'
        ).extract()
        career = remove_class(career)
        # print(career)
        item['career_en'] = career