Ejemplos de get_item1 en Python

Lenguaje de programación: Python

Namespace/Package Name: scrapySchool_England.getItem

Método / Función: get_item1

Ejemplos en hotexamples.com: 30

Python get_item1 - 30 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de scrapySchool_England.getItem.get_item1 extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

Archivo: UniversityOfTheArtsLondon_P.py Proyecto: histudent/python_spider

 def parses(self, response):
     """Yield an item carrying the module HTML of a UAL course page.

     The item's ``url`` comes from ``response.meta['url']``; the module
     markup is whatever follows the "course detail" / "course unit"
     headings, passed through ``remove_class``.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     item['url'] = response.meta['url']
     item['university'] = 'University of the Arts London'
     print(response.url)

     module_query = '//h2[contains(text(),"ourse detail")]/../following-sibling::div|//h2[contains(text(),"ourse unit")]/../following-sibling::*[1]'
     module_nodes = response.xpath(module_query).extract()
     item['modules_en'] = remove_class(module_nodes)

     yield item

Ejemplo n.º 2

Mostrar archivo

Archivo: UniversityCollegeLondon_P_copy.py Proyecto: histudent/python_spider

 def parses(self, response):
     """Yield an item with the overseas tuition fee of a UCL course page.

     The fee is read from the text of the first ``<span>`` following the
     "Overseas" label. Nothing is yielded when no figure of the form
     ``NN,NNN`` is found there.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     item['university'] = 'University College London'
     item['url'] = response.meta['url']
     print(response.url)
     tuition = response.xpath(
         '//strong[contains(text(),"Overseas")]/following-sibling::span[1]/text()'
     ).extract()
     print(tuition)
     # Raw string: '\d' and '\,' inside a plain literal are invalid escape
     # sequences and raise a warning on current Python versions.
     tui = re.findall(r'\d{2},\d{3}', ''.join(tuition))
     # NOTE(review): if several figures match they are concatenated into a
     # single digit string -- confirm whether only the first should be kept.
     item['tuition_fee'] = ''.join(tui).replace(',', '').strip()
     if tui:
         yield item

Ejemplo n.º 3

Mostrar archivo

Archivo: getUrl.py Proyecto: histudent/python_spider

 def parse(self, response):
     """Print every href found inside the teaser-image blocks of the page.

     An item is created but is neither filled in nor yielded; the callback
     only prints the extracted links.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     teaser_hrefs = response.xpath(
         "//div[@class='l-teaser--image c-teaser--image']//@href"
     ).extract()
     for href in teaser_hrefs:
         print(href)

Ejemplo n.º 4

Mostrar archivo

 def parsea(self, response):
     """Yield an item with the overseas tuition fee of a St Andrews course page.

     The fee is read from the first text node following the "Overseas"
     label. Nothing is yielded when no figure of the form ``NN,NNN`` is
     found there.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     item['url'] = response.meta['url']
     item['university'] = "University of St Andrews"
     print(response.url)
     tuition = response.xpath(
         '//strong[contains(text(),"versea")]/following-sibling::text()[1]'
     ).extract()
     print(tuition)
     # Raw string: '\d' and '\,' inside a plain literal are invalid escape
     # sequences and raise a warning on current Python versions.
     tui = re.findall(r'\d{2},\d{3}', ''.join(tuition))
     print(tui)
     # NOTE(review): if several figures match they are concatenated into a
     # single digit string -- confirm whether only the first should be kept.
     item['tuition_fee'] = ''.join(tui).replace(',', '').strip()
     if tui:
         yield item

Ejemplo n.º 5

Mostrar archivo

Archivo: UniversityOfOxford_P.py Proyecto: histudent/python_spider

 def pars(self, response):
     """Yield an item with the overseas tuition fee of an Oxford course page.

     The fee is taken from the first table cell containing "£" that follows
     an "Overseas" cell. Nothing is yielded when no such cell exists; when
     the cell exists but holds no ``NN,NNN`` figure the item is still
     yielded with an empty ``tuition_fee``.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     item['url'] = response.meta['url']
     item['university'] = "University of Oxford"
     print(response.url)
     tuition = response.xpath(
         '//td[contains(text(),"versea")]/following-sibling::td[contains(text(),"£")]/text()'
     ).extract()
     print(tuition)
     if tuition:
         # Raw string: '\d' and '\,' inside a plain literal are invalid
         # escape sequences and raise a warning on current Python versions.
         tui = re.findall(r'\d{2},\d{3}', tuition[0])
         item['tuition_fee'] = ''.join(tui).replace(',', '').strip()
         print(item['tuition_fee'])
         yield item

Ejemplo n.º 6

Mostrar archivo

 def parsess(self, response):
     """Yield an item holding the overview of a Glasgow programme page.

     The overview is the cleaned markup of the siblings following the
     "Why this programme" heading, prefixed by the cleaned
     ``intro-sentence`` paragraph when the page has one.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     item['university'] = 'University of Glasgow'
     item['url'] = response.url

     intro_nodes = response.xpath('//p[@class="intro-sentence"]').extract()
     body_nodes = response.xpath(
         '//h2[contains(text(),"Why this programme")]/following-sibling::*'
     ).extract()
     body_html = remove_class(body_nodes)

     if not intro_nodes:
         item['overview_en'] = body_html
     else:
         item['overview_en'] = remove_class(intro_nodes) + body_html
     yield item

Ejemplo n.º 7

Mostrar archivo

 def tuition(self, response):
     """Yield an item with programme title and international fee for Glasgow.

     The item is only yielded when a fee text node is found after the
     "full" label under the "International" heading; the "£" sign is
     removed from that text.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     item['url'] = response.meta['url']
     item['university'] = 'University of Glasgow'
     print(response.url)

     title_parts = response.xpath('//div[@id="prog-title"]/h1/text()').extract()
     item['programme_en'] = ''.join(title_parts).strip()

     fee_texts = response.xpath(
         '//h4[contains(text(),"nternationa")]/following-sibling::ul/li/strong[contains(text(),"ull")]/following-sibling::text()[1]'
     ).extract()
     print(fee_texts)
     if not fee_texts:
         return
     item['tuition_fee'] = fee_texts[0].replace('£', '').strip()
     yield item

Ejemplo n.º 8

Mostrar archivo

Archivo: UniversityOfOxford_P.py Proyecto: histudent/python_spider

 def parsesss(self, response):
     """Yield an item with module markup fetched from an Oxford course webpage.

     The page's "Course webpage" link is followed with a direct
     ``requests.get`` call; the structure panel / matching paragraphs of
     that page are serialised to HTML and cleaned with ``remove_class``.
     Nothing is yielded when the link is missing.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     item['university'] = "University of Oxford"
     item['url'] = response.url
     coursePageUrl = response.xpath(
         '//a[contains(text(),"Course webpage")]/@href').extract()
     if not coursePageUrl:
         return
     # NOTE(review): synchronous request without a timeout inside a spider
     # callback -- consider a timeout or a regular scrapy Request.
     modRes = etree.HTML(requests.get(coursePageUrl[0]).content)
     modules = modRes.xpath(
         '//dd[@id="panel-structure"]/div|//h2/strong[contains(text()," in ")]/../following-sibling::p'
     )
     # One HTML string per node. The previous `mod += <str>` extended the
     # list one character at a time instead of appending the fragment.
     mod = [
         etree.tostring(mo, method='html', encoding='unicode')
         for mo in modules
     ]
     item['modules_en'] = remove_class(mod)
     yield item

Ejemplo n.º 9

Mostrar archivo

Archivo: UniversityOfLiverpool_P.py Proyecto: histudent/python_spider

 def parse(self, response):
     """Yield an item with entry requirements and careers text for Liverpool.

     Entry requirements come from the current response; the careers
     content is fetched through ``self.getRes`` from the sibling
     ``careers/`` URL and converted with ``self.getTag``. The response URL
     is printed when the careers lookup yields an empty list.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     entry_nodes = response.xpath('//article[@class="content"]').extract()
     item['university'] = 'University of Liverpool'
     item['url'] = response.url.replace('entry-requirements/', '').strip()
     item['rntry_requirements'] = remove_class(entry_nodes)

     career_url = response.url.replace('entry-requirements/', 'careers/')
     career_articles = self.getRes(career_url).xpath(
         '//article[@class="content"]')
     career_tags = self.getTag(career_articles)
     if career_tags == []:
         print(response.url)
     item['career_en'] = remove_class(career_tags)
     yield item

Ejemplo n.º 10

Mostrar archivo

    def parse_data(self, response):
        """Yield an item with the overseas tuition fee for a Sheffield course.

        The ``course=...`` token found in the page text is appended to the
        fee lookup URL, which is fetched with ``requests`` using
        ``self.headers``. Any exception raised while doing so is appended
        to a per-university error file and printed instead of propagating.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "The University of Sheffield"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            course_hits = re.findall(r'course=.+"', response.text)
            course_code = ''.join(course_hits)
            course_code = course_code.replace("course=", '')
            course_code = course_code.replace('"', '')
            fee_url = "https://ssd.dept.shef.ac.uk/fees/pgt/api/lookup.php?year=2019&status=Overseas&course=" + course_code
            fee_response = requests.get(fee_url, headers=self.headers)
            pound_amounts = re.findall(r"&pound;\d+", fee_response.text)
            if pound_amounts:
                digits = ''.join(pound_amounts).replace('&pound;', '')
                item['tuition_fee'] = int(digits)
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])

            yield item
        except Exception as err:
            error_path = ("scrapySchool_England/error/" + item['university'] +
                          str(item['degree_type']) + ".txt")
            with open(error_path, 'a', encoding="utf-8") as log_file:
                log_file.write(
                    str(err) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(err))
            print("报错url：", response.url)
Ejemplo n.º 11

Mostrar archivo

 def parse(self, response):
     """Yield an item with the "Approach to assessment" section of the page.

     The section is the run of siblings after that heading, cut at the
     next ``<h2>`` when there is one. When that run is empty, the text
     nodes of the heading's parent are used instead and wrapped in a
     ``<div>``.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     item['university'] = 'Oxford Brookes University'
     item['url'] = response.url

     siblings = response.xpath(
         '//h2[contains(text(),"Approach to assessment")]/following-sibling::*'
     ).extract()
     next_heading = response.xpath(
         '//h2[contains(text(),"Approach to assessment")]/following-sibling::h2[1]/self::*'
     ).extract()
     if next_heading:
         section = siblings[:siblings.index(next_heading[0])]
     else:
         section = siblings

     if section:
         item['assessment_en'] = remove_class(section)
     else:
         parent_text = response.xpath(
             '//h2[contains(text(),"Approach to assessment")]/../text()'
         ).extract()
         item['assessment_en'] = '<div>' + remove_class(parent_text) + '</div>'
     yield item

Ejemplo n.º 12

Mostrar archivo

Archivo: UniversityOfOxford_P.py Proyecto: histudent/python_spider

 def parsess(self, response):
     """Yield an item with overview and career sections of an Oxford course page.

     The overview is the intro field text (wrapped in ``<p>``) followed by
     the children of ``#content-tab`` up to its first ``<h2>``. The career
     section is the "Graduate destinations" heading plus its following
     siblings up to the next ``<h2>``. In both cases the full run is kept
     when no terminating ``<h2>`` exists.
     """
     item = get_item1(ScrapyschoolEnglandItem1)
     item['university'] = "University of Oxford"
     item['url'] = response.url
     overview_pre = response.xpath(
         '//div[@class="field field-name-field-intro field-type-text-long field-label-hidden"]//text()'
     ).extract()
     overview = response.xpath(
         '//div[@id="content-tab"]/child::*').extract()
     overview_spilt = response.xpath(
         '//div[@id="content-tab"]/child::h2[1]').extract()
     # Guard the split the same way the career section does: a page with no
     # <h2> in the tab used to raise IndexError on overview_spilt[0].
     if overview_spilt:
         overview = overview[0:overview.index(overview_spilt[0])]
     item['overview_en'] = '<p>' + ''.join(
         overview_pre).strip() + '</p>' + remove_class(overview)
     career = response.xpath(
         '//h2[contains(text(),"Graduate destinations")]/self::*|//h2[contains(text(),"Graduate destinations")]/following-sibling::*'
     ).extract()
     career_spilt = response.xpath(
         '//h2[contains(text(),"Graduate destinations")]/following-sibling::h2[1]'
     ).extract()
     if career_spilt:
         career = career[0:career.index(career_spilt[0])]
     item['career_en'] = remove_class(career)
     yield item

Ejemplo n.º 13

Mostrar archivo

Archivo: SheffieldHallamUniversity_u.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Yield one fully populated item for a Sheffield Hallam course page.

        Each field is scraped with its own XPath, joined into one string and
        cleaned with the project's helper functions; constant fields
        (university, location, degree type, currency prefix, Chinese entry
        requirements) are hard-coded.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(query):
            # Concatenate every fragment extracted for one XPath query.
            return ''.join(response.xpath(query).extract())

        # Programme title and degree name.
        programme_en = remove_tags(joined("/html/body/section[1]//h1"))
        degree_name = remove_tags(joined('/html/body/section[1]/div/div[2]/span'))

        # Tuition fee.
        tuition_fee = getTuition_fee(
            joined("//*[contains(text(),'What is the fee?')]//following-sibling::*"))

        # Duration is kept as the cleaned page text.
        duration = remove_tags(
            joined("//*[contains(text(),'How long will I study?')]//following-sibling::*")
        ).strip()

        # UCAS code.
        ucascode = clear_space_str(remove_tags(
            joined("//*[contains(text(),'What is the UCAS code?')]//following-sibling::*")))

        # Overview and careers.
        overview_en = clear_space_str(remove_class(
            joined("//*[contains(text(),'Course summary')]//following-sibling::*")))
        career_en = remove_class(
            joined("//*[contains(text(),'Future careers')]//following-sibling::*"))

        # A-level requirement and application link.
        alevel = remove_tags(
            joined('//*[@id="entry-requirements"]/div/div[1]/ul[2]/li[1]'))
        apply_proces_en = joined('//*[@id="apply-now"]/div[1]//a/@href')

        # IELTS: only filled in when exactly two band scores are found; the
        # first is the overall score, the second is used for every component.
        ielts_desc = remove_tags(joined('//*[@id="entry-requirements"]/div/div[1]'))
        band_scores = re.findall(r'[567]\.\d', ielts_desc)
        if len(band_scores) == 2:
            ielts, component_band = band_scores
        else:
            ielts = component_band = None

        require_chinese_en = '<p>The following qualifications from China will be considered for entry on to undergraduate programmes, with a minimum average of 60 per cent: Diploma from Specialised College (Zhongzhnan) Diploma from Vocational Secondary School (Zhixiao) Three year middle school diploma plus foundation degree A levels Graduate Diploma from: Radio and TV Universities Spare Time Universities Training Colleges for Administrative cadres Higher Education Self Study Examinations Adult Education/Adult Education in Science and Technology subjects Senior High School Diploma Chinese University Entrance Examination (until 2003) College Graduation Diploma (Dazhuan awarded by university/college on completion of 2-3 years study) Applicants who have completed the first year of an undergraduate degree at a Chinese university may be considered for direct entry to Sheffield Hallam University undergraduate programmes.Sheffield Hallam welcomes applications from international school students taking the International Baccalaureate Diploma and those achieving 28 points or more will usually be successful in obtaining an offer of a place on our undergraduate programmes. For information about IB points equivalences against the UCAS tariff please visit the UCAS website.</p>'

        # Start date: two special cases, otherwise a translated month in 2018.
        start_text = clear_space_str(remove_tags(
            joined("//*[contains(text(),'When do I start?')]//following-sibling::*")))
        if 'September, January' in start_text:
            start_date = '2018-9,2019-1'
        elif 'January' in start_text:
            start_date = '2019-1'
        else:
            start_date = '2018-' + str(translate_month(start_text))

        # Modules.
        modules_en = remove_class(joined('//div[@data-section="split"][6]'))

        item['modules_en'] = modules_en
        item['ielts'] = ielts
        item['ielts_r'] = component_band
        item['ielts_w'] = component_band
        item['ielts_s'] = component_band
        item['ielts_l'] = component_band
        item['start_date'] = start_date
        item['apply_pre'] = '£'
        item['require_chinese_en'] = require_chinese_en
        item['ielts_desc'] = ielts_desc
        item['university'] = 'Sheffield Hallam University'
        item['url'] = response.url
        item['programme_en'] = programme_en
        item['degree_type'] = 1
        item['degree_name'] = degree_name
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = '£'
        item['duration'] = duration
        item['location'] = 'Sheffield'
        item['ucascode'] = ucascode
        item['overview_en'] = overview_en
        item['career_en'] = career_en
        item['alevel'] = alevel
        item['apply_proces_en'] = apply_proces_en
        item['duration_per'] = 1
        yield item

Ejemplo n.º 14

Mostrar archivo

    def parse(self, response):
        """Yield one fully populated item for a Bournemouth University course page.

        Each field is scraped with its own XPath, joined into one string and
        cleaned with the project's helper functions. Duration and IELTS
        scores are derived from the scraped text by the lookup tables below;
        the application process, Chinese entry requirements and a few other
        fields are hard-coded.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        university = 'Bournemouth University'

        # Location.
        location = response.xpath(
            "//*[contains(text(),'Location:')]//following-sibling::p").extract(
            )
        location = remove_tags(''.join(location))

        # Programme title; its first word is taken as the degree name.
        programme_en = response.xpath('/html/body/div/section//h1').extract()
        programme_en = remove_tags(''.join(programme_en))
        title_words = programme_en.split()
        # An empty title has no first word (was a bare `except:` around [0]).
        degree_name = title_words[0] if title_words else ''
        programme_en = programme_en.replace('-', '')
        programme_en = programme_en.replace(degree_name, '')
        programme_en = clear_space_str(programme_en)
        if '–' in programme_en:
            programme_en = programme_en.replace('–', '').strip()
        programme_en = programme_en.replace('&amp;', '')

        degree_type = 2

        # NOTE(review): the "Delivery:" text used to be scraped with
        # "//*[contains(text(),'Delivery:')]//following-sibling::*" and then
        # unconditionally overwritten by this constant, so the stored value
        # has always been 'full time'. The dead scrape was removed; confirm
        # whether the scraped value was meant to be kept instead.
        teach_time = 'full time'

        # Duration: the first rule whose text occurs in the page text wins,
        # so a range must come before any shorter text it contains
        # ('18-36 months' was unreachable behind '36 months').
        # duration_per is presumably a unit code (1 = years, 3 = months) --
        # TODO confirm against the item schema.
        duration_text = response.xpath(
            "//*[contains(text(),'Duration:')]//following-sibling::p").extract(
            )
        duration_text = remove_tags(''.join(duration_text))
        duration_rules = (
            ('1 year', 1, 1),
            ('12-18 months', 12, 3),
            ('18-36 months', 18, 3),
            ('36 months', 36, 3),
            ('1 to 2 years', 1, 1),
            ('2 years', 2, 1),
            ('3-5 years', 3, 1),
            ('48 months', 48, 3),
            ('12 months', 12, 3),
            ('5 years', 5, 1),
            ('3 years', 3, 1),
            ('14 months', 14, 3),
            ('15 months', 15, 3),
            ('18-24 months', 18, 3),
            ('27 months', 27, 3),
            ('8 months', 8, 3),
            ('Nine months', 9, 3),
        )
        duration, duration_per = 1, 1  # fallback when no rule matches
        for needle, length, unit in duration_rules:
            if needle in duration_text:
                duration, duration_per = length, unit
                break

        # Overview.
        overview_en = response.xpath(
            '//*[@id="main-content"]/div/section[2]/p').extract()
        overview_en = clear_space_str(remove_class(''.join(overview_en)))

        # Modules.
        modules_en = response.xpath(
            "//section[@id='course-details']//div[@id='accordion-1']").extract(
            )
        modules_en = clear_space_str(remove_class(''.join(modules_en)))

        # Start date(s), comma separated.
        start_date = response.xpath(
            "//*[contains(text(),'Next start date:')]//following-sibling::p"
        ).extract()
        start_date = clear_space_str(remove_tags(''.join(start_date)))
        start_date = ','.join(tracslateDate(start_date))

        # Entry requirements.
        rntry_requirements = response.xpath(
            "//*[contains(text(),'Entry requirements')]/../following-sibling::div[1]"
        ).extract()
        rntry_requirements = clear_space_str(
            remove_class(''.join(rntry_requirements)))

        # IELTS: depending on how many "d.d" figures the requirements text
        # contains, pick the positions of the overall and component scores.
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        ielts_list = re.findall(r'\d\.\d', rntry_requirements)
        score_positions = {4: (2, 3), 3: (1, 2), 2: (0, 1), 1: (0, 0)}
        if len(ielts_list) in score_positions:
            overall_at, component_at = score_positions[len(ielts_list)]
            ielts = ielts_list[overall_at]
            component_band = ielts_list[component_at]
        else:
            ielts = component_band = None

        # Careers.
        career_en = response.xpath(
            "//*[contains(text(),'Careers')]/../following-sibling::*|//*[contains(text(),'Careers')]//following-sibling::*"
        ).extract()
        career_en = clear_space_str(remove_class(''.join(career_en)))

        # Tuition fee.
        tuition_fee_list = response.xpath(
            '//*[@id="fees-box"]/div/div/span|//*[@id="fees-box"]/div[2]/div[2]/p[2]|//*[@id="fees-box"]/div/div[2]/ul/li[1]'
        ).extract()
        tuition_fee = getTuition_fee(''.join(tuition_fee_list))
        tuition_fee_pre = '£'

        url = response.url
        application_open_date = '2018-7-18'
        apply_pre = '£'
        apply_fee = 0
        apply_proces_en = '<p>Step 1: Application Complete all sections of your country’s application form. Step 2: Terms and conditions You must read, understand and agree to be bound by the terms and conditions before moving on to the next step. Step 3: Confirmation Sign the application form to confirm you have provided correct details and you agree to the terms and conditions. Step 4: Other documents Scan, attach and send in your additional documents to the email address on the application form: Academic transcripts and exam results English test score if required If you do not have academic transcripts or English test results, you can still apply and we can make you a conditional offer, the conditions of which you will need to satisfy before we confirm your place. Step 5: Assessment We’ll contact you to arrange one or more of the following if required: English language test Mathematics test Interview This will allow us to further assess your suitability for the program. Step 6: Admission decision Receive an admission decision and, if your application is successful, accompanying offer letter. Step 7: Deposit Accept your offer by paying the deposit. Step 8: Pre-arrival Receive confirmation of program acceptance, pre-arrival information, plus guidance on finding local accommodation if you are coming from outside the USPP host city to study. For conditional offers, these items are issued once we receive proof that the conditions of the offer have been met. Step 9: Travel Arrange travel to USPP location if applicable.Arrive at your USPP center for orientation before classes begin. Step 10: Begin your USPP program USPP teaching begins. View the program timeline for next steps.</p>'

        # NOTE(review): this literal used to span a physical line break,
        # which is a SyntaxError for a single-quoted string. The break is
        # kept as an explicit "\n"; confirm it is not an extraction artefact.
        require_chinese_en = (
            "<p>This is a guide to the normal entry requirements, assuming you’ve followed the Chinese education system. An admissions tutor will study your application, so make sure you include your academic background and personal information when you apply.Entry requirements vary depending on what sort of course you’re coming to BU to study. BU International College Foundation Certificates You can undertake a Foundation Certificate before going on to an undergraduate course if you’ve completed 11 years of schooling or Senior High School Year 2 in China and have a minimum of IELTS (Academic) 5.0. Undergraduate courses You can apply to study a Bachelor's degree from year one if you hold a Chinese Senior High School Diploma plus successful completion of a relevant first-year undergraduate programme in a recognised Chinese university, or a Diploma from Specialized College (zhongzhuan). Chinese Senior High School certificate of graduation with overall HuiKao result grade B average,  transcripts of 3 years with 85% average (85% also eligible for AES). Top-up courses You need to hold a College Graduation Diploma (Dazhuan awarded by a university/college on completion of two to three years of study), or a BTEC Higher National Diploma or Foundation degree in a relevant subject.Postgraduate courses You need to have a Bachelor's (Honours) degree from a recognised Chinese university, normally from a four-year undergraduate programme, or a Bachelors degree from Higher Education Self-Study Examinations, or a Top-up degree or university-recognised Pre-Master’s Foundation programme. \n"
            "Grade requirements from Chinese Bachelor's degree holders are as below: Applicants from 985 or 211 universities Media studies and other subjects equivalent to UK 2:1 degree	65% +	GPA 2.25 + Business and subjects equivalent to UK 2:2 degree	60% +	GPA 2.0 + Academic Excellence Scholarship (automatic award of £3500)	75% +	GPA 2.75 + Applicants from other universities Media studies and other subjects equivalent to UK 2:1 degree	70% +	GPA 2.5 + Business and subjects equivalent to UK 2:2 degree	65% +	GPA 2.25 + Academic Excellence Scholarship (automatic award of £3500)	80% +	GPA 3.0 + Research programmes You need a good postgraduate degree to be considered for a BU research programme. Please see more detail on the postgraduate research page.You can find more information about English language requirements for entry to BU on our English language requirements page. Full information about preparatory courses is available on the Bournemouth University International College website.If you need help with your visa or want more information about the immigration process, you can find it on our immigration information page.</p>"
        )

        item['require_chinese_en'] = require_chinese_en
        item['apply_proces_en'] = apply_proces_en
        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['location'] = location
        item['programme_en'] = programme_en
        item['degree_name'] = degree_name
        item['degree_type'] = degree_type
        item['teach_time'] = teach_time
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['start_date'] = start_date
        item['rntry_requirements'] = rntry_requirements
        item['ielts'] = ielts
        item['ielts_r'] = component_band
        item['ielts_s'] = component_band
        item['ielts_w'] = component_band
        item['ielts_l'] = component_band
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['url'] = url
        item['application_open_date'] = application_open_date
        yield item

Ejemplo n.º 15

Mostrar archivo

Archivo: RoyalAgriculturalUniversity_u.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Scrape one Royal Agricultural University course page into an item.

        Reads the programme title, degree name, overview, UCAS code, modules,
        entry requirements, tuition fee, career prospects, application
        process, A-level and IB requirements from ``response`` via XPath,
        adds a set of hard-coded values (start date, deadline, IELTS scores,
        duration, currency symbols, boilerplate texts) and yields one item.

        Args:
            response: response object for the course page; only
                ``response.url`` and ``response.xpath`` are used.

        Yields:
            The object returned by ``get_item1(ScrapyschoolEnglandItem1)``
            with the fields listed at the end of this method assigned.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'Royal Agricultural University'

        # 2. url
        url = response.url

        # 3. programme_en (raw page title, still containing the degree name)
        programme_en = response.xpath(
            '//*[@id="site"]//div[1]/div//h1').extract()
        programme_en = ''.join(programme_en)
        programme_en = remove_tags(programme_en)

        # 4. degree_type
        degree_type = 1

        # 5. degree_name: the first word of the title once '(Hons) ' has been
        # dropped; whatever follows it is the programme name.  Splitting once
        # removes only the leading degree token (a blanket replace() would
        # also delete the same letters further into the title), and an empty
        # title no longer raises IndexError.
        programme_en = programme_en.replace('(Hons) ', '')
        title_parts = programme_en.split(None, 1)
        if title_parts:
            degree_name = title_parts[0]
            if len(title_parts) > 1:
                programme_en = title_parts[1].strip()
            else:
                programme_en = ''
        else:
            degree_name = ''
            programme_en = ''

        # 6. overview_en
        overview_en = response.xpath(
            '//*[@id="course-overview"]/div[1]/p').extract()
        overview_en = ''.join(overview_en)
        overview_en = remove_class(overview_en)

        # 7. ucascode
        ucascode = response.xpath(
            '//*[@id="site"]/div/main/div/div/div[2]/div/div/div/h3'
        ).extract()
        ucascode = ''.join(ucascode)
        ucascode = remove_tags(ucascode)
        ucascode = clear_space_str(ucascode)

        # 8. modules_en
        modules_en = response.xpath(
            "//*[contains(text(),'Modules')]//following-sibling::ul/li"
        ).extract()
        modules_en = '\n'.join(modules_en)
        modules_en = remove_class(modules_en)

        # 9. apply_desc_en
        apply_desc_en = response.xpath(
            '//*[@id="course-requirements"]/div[1]').extract()
        apply_desc_en = ''.join(apply_desc_en)
        apply_desc_en = remove_class(apply_desc_en)

        # 10. tuition_fee
        tuition_fee = response.xpath(
            '//*[@id="course-fees"]/div[1]/table[1]/tbody/tr[1]/td[3]'
        ).extract()
        tuition_fee = ''.join(tuition_fee)
        tuition_fee = remove_tags(tuition_fee)
        tuition_fee = getTuition_fee(tuition_fee)

        # 11. tuition_fee_pre
        tuition_fee_pre = '£'

        # 12. career_en
        career_en = response.xpath(
            "//*[contains(text(),'Prospects')]//following-sibling::*"
        ).extract()
        career_en = ''.join(career_en)
        career_en = remove_class(career_en).strip()

        # 13. apply_proces_en
        apply_proces_en = response.xpath(
            "//*[contains(text(),'Apply now')]//following-sibling::div[1]"
        ).extract()
        apply_proces_en = ''.join(apply_proces_en)
        apply_proces_en = remove_class(apply_proces_en).strip()

        # 14. start_date
        start_date = '2018-9-1'

        # 15. assessment_en: fixed boilerplate text.  The literal used to be
        # a single-quoted string broken across two physical lines, which is
        # a syntax error; the line break is now an explicit '\n' between two
        # adjacent literals.
        assessment_en = (
            '<p>During your undergraduate degree, you probably became familiar with many of the methods of delivery and study that we expect you to continue with during your postgraduate course. It is expected that you come already equipped with the basics in academic study, such as the ability to find, evaluate, manage, present and critique research or industry relevant output. There is a greater emphasis on independence and individual contribution towards the topics covered, and so the expectation is that students will actively participate in class-based activities from the outset. Giving presentations, critiquing case studies, using peer-to-peer feedback, working in groups on topical problems and justifying opinions based on the evidence is the norm for postgraduate study. It is not uncommon for students to arrive at a particular postgraduate qualification with very diverse backgrounds, qualifications and experience and we welcome these different perspectives in the classroom to bring a debate alive, however, it does require the student to take responsibility for their own subject knowledge gaps and motivate themselves to fill them. Of course, there will be support and guidance provided for good sources of information, however, it is not expected that these gaps will be specifically addressed within the taught sessions.For most postgraduate programmes group sizes are in the range of between 20 – 100 depending on the course and electives chosen (if relevant). However, alongside the lectures are small group seminars and tutorials where you will have the opportunity to explore key concepts in more detail, discuss topical issues relating to the key themes and undertake practical activities that help set the theories in context. \n'
            'To compliment the lectures and seminars, there may also be practical sessions, laboratory classes, off-site visits, case studies, guest speakers and field trips that are included in your timetabled activities depending on the modules you are studying.</p>'
        )

        # 16. deadline
        deadline = '2018-11,2019-5'

        # 17. require_chinese_en: fixed boilerplate text.
        # NOTE(review): the '[email protected]' fragments look like obfuscated
        # e-mail addresses captured from the page - confirm the real ones.
        require_chinese_en = '<p>International Foundation Year We run an International Foundation Year programme in partnership with our partner, INTO London World education Centre based in London.  To enquire about the programme please get in touch with our admissions team: [email protected] Undergraduate Degrees (Bachelors) Senior Secondary School Graduation certificate 高中毕业证书 with overall grade B or higher (to include Maths) Plus Gao Kao – Chinese University/College Entrance examination (高考) with good grades OR completion of a recognised International Foundation course with overall grade 60% or above OR successful completion of 1 year of University degree with a minimum of 60%.And IELTS band score 6.0 overall or above with no less than 5.5 in each component of the academic IELTS test. (The test must have been taken within two years of the start of the course). =Academic transfers to RAU into Years 2 and 3 are possible. For more information contact [email protected]</p>'

        # 18. ielts: overall score plus the four component scores
        ielts = 6.0
        ielts_s = 5.5
        ielts_w = 5.5
        ielts_l = 5.5
        ielts_r = 5.5

        # 19. apply_pre
        apply_pre = '£'

        # 20. alevel: first bullet of the first requirements list
        alevel = response.xpath(
            '//*[@id="course-requirements"]/div[1]/div/ul[1]/li[1]').extract()
        alevel = ''.join(alevel)
        alevel = remove_class(alevel)

        # 21. duration
        duration = 3

        # 22. ib: fourth bullet of the first requirements list
        ib = response.xpath(
            '//*[@id="course-requirements"]/div[1]/div/ul[1]/li[4]').extract()
        ib = ''.join(ib)
        ib = remove_class(ib)

        item['ib'] = ib
        item['duration'] = duration
        item['alevel'] = alevel
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['apply_desc_en'] = apply_desc_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['career_en'] = career_en
        item['apply_proces_en'] = apply_proces_en
        item['start_date'] = start_date
        item['assessment_en'] = assessment_en
        item['deadline'] = deadline
        item['require_chinese_en'] = require_chinese_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['ucascode'] = ucascode
        yield item

Ejemplo n.º 16

Mostrar archivo

Archivo: ManchesterMetropolitanUniversity_P.py Proyecto: histudent/python_spider

    def parses(self, response):
        """Scrape one Manchester Metropolitan University course page.

        A response with status 404 is recorded in ``errorurl.txt`` and
        produces no item.  Otherwise the degree name, programme title,
        overview, career text, entry requirements, modules, tuition fee,
        duration, start date and department are read from the page via
        XPath, fixed IELTS/TOEFL values are added, and one item is yielded.

        Args:
            response: response object for the course page; ``response.url``,
                ``response.status`` and ``response.xpath`` are used.

        Yields:
            The object returned by ``get_item1(ScrapyschoolEnglandItem1)``
            with the scraped fields assigned.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print('开始下载', response.url, '的数据')

        # Reject error pages before any extraction is attempted: the helper
        # calls below would otherwise run against a page without the
        # expected markup, and their result would be thrown away anyway.
        if response.status == 404:
            print("****404****")
            with open("errorurl.txt", 'a', encoding="utf-8") as f:
                f.write(response.url + "\n")
            return

        item['university'] = 'Manchester Metropolitan University'
        item['url'] = response.url
        item['location'] = 'Manchester'

        degree_name = response.xpath('//h1/span/text()').extract()
        item['degree_name'] = ''.join(degree_name)

        programme = response.xpath('//h1/text()').extract()
        item['programme_en'] = ''.join(programme).strip()

        item['degree_type'] = 2

        overview = response.xpath(
            '//h2[contains(text(),"Overview")]/following-sibling::article'
        ).extract()
        item['overview_en'] = remove_class(overview)

        career = response.xpath(
            '//h2[contains(text(),"Career")]/following-sibling::p').extract()
        item['career_en'] = remove_class(career)

        rntry = response.xpath(
            '//h2[contains(text(),"Entry")]/following-sibling::p').extract()
        # Debug aid only: number-like tokens found in the entry requirements.
        # Raw string, so '\d' is a regex class and not an invalid escape.
        ieltssss = re.findall(r'\d\.?\d?', ''.join(rntry))
        print(ieltssss)
        item['rntry_requirements'] = remove_class(rntry)

        modules = response.xpath(
            '//h2[contains(text(),"Course")]/following-sibling::div'
        ).extract()
        item['modules_en'] = remove_class(modules)

        fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        # Language requirements are fixed values, not scraped from the page.
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'
        item['ielts'] = '6.5'
        item[
            'ielts_desc'] = 'For Postgraduate courses, we usually ask for IELTS 6.5 (No less than 5.5 in any section) or equivalent.'

        item[
            'toefl_desc'] = 'Overall score: 89 With no individual test score below: Listening: 17 Reading: 18 Speaking: 20 Writing : 17'
        item['toefl'] = '89'
        item['toefl_l'] = '17'
        item['toefl_s'] = '20'
        item['toefl_r'] = '18'
        item['toefl_w'] = '17'

        turation = response.xpath(
            '//li[contains(text(),"Length")]/span//text()').extract()
        # clear_duration is indexed by 'duration' and 'duration_per' below.
        duration = clear_duration(turation)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        start_date = response.xpath(
            '//li[contains(text(),"Start")]/span//text()').extract()
        start_date = tracslateDate(start_date)
        item['start_date'] = ','.join(start_date)

        item['department'] = ''.join(
            response.xpath(
                '//span[@id="department_name"]/text()').extract()).strip()

        yield item

Ejemplo n.º 17

Mostrar archivo

    def parse(self, response):
        """Scrape one Murdoch University course page into an item.

        Reads the degree name, programme title, overview, location, duration
        and department from ``response`` via XPath, downloads the course
        structure from the Murdoch handbook site for ``modules_en``, picks
        IELTS/TOEFL scores from a fixed table keyed on the degree name, adds
        hard-coded fee and application texts, and yields one item.

        Args:
            response: response object for the course page; only
                ``response.url`` and ``response.xpath`` are used.

        Yields:
            The object returned by ``get_item1(ScrapyschoolEnglandItem1)``
            with the scraped fields assigned.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'Murdoch University'

        # 2. url
        url = response.url

        # 3. degree_name
        degree_name = response.xpath(
            '//*[@id="course-overview"]/div/div/div/div/h3').extract()
        degree_name = ''.join(degree_name)
        degree_name = remove_tags(degree_name).strip()

        # 4. programme_en
        programme_en = response.xpath(
            '//*[@id="course-overview"]/div/div/div/div/h1').extract()
        programme_en = ''.join(programme_en)
        programme_en = remove_tags(programme_en).strip()

        # 5. degree_overview_en
        degree_overview_en = response.xpath(
            '//*[@id="course-description-and-structure"]/div/div/div/div/p'
        ).extract()
        degree_overview_en = ''.join(degree_overview_en)
        degree_overview_en = remove_class(degree_overview_en)

        # 6. location
        location = response.xpath(
            '//*[@id="course-description-and-structure"]/div/div/div/div/ul/li[1]/ul/li/strong'
        ).extract()
        location = ''.join(location)
        location = remove_tags(location)

        # 7. duration: 1.5 when the text mentions it, otherwise the first
        # digit found (kept as a string, as before).  Both lookups are
        # guarded so a page without a duration no longer raises IndexError;
        # in that case item['duration'] is left as get_item1 created it.
        duration = None
        duration_nodes = response.xpath(
            "//*[contains(text(),'Course Duration')]//following-sibling::*"
        ).extract()
        if duration_nodes:
            duration_text = remove_tags(duration_nodes[0]).strip()
            if '1.5' in duration_text:
                duration = 1.5
            else:
                digits = re.findall(r'\d', duration_text)
                if digits:
                    duration = digits[0]

        # 8. department
        department = response.xpath(
            "//*[contains(text(),'School')]//following-sibling::*//strong"
        ).extract()
        department = ''.join(department)
        department = remove_tags(department).strip()
        if 'School of Business and GovernanceSocial' in department:
            department = 'School of Business and GovernanceSocial'

        # 9. degree_type
        degree_type = 2

        # 10. start_date
        start_date = '2,7'

        # 11. modules_en: fetched from the handbook site using the course
        # code found in the page's hidden 'course' input.
        cour = response.xpath("//input[@name='course']//@value").extract()
        cour = ''.join(cour)
        cour = remove_tags(cour)
        modules_en_url = 'https://handbook.murdoch.edu.au/courses/details/?us=' + str(
            cour) + '&year=2019&structure=true'
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        # Always a string: an empty page or a failed download leaves '' here
        # instead of the empty list the XPath call returns.
        modules_en = ''
        try:
            # Without a timeout this call can block forever on a stalled
            # connection.
            data = requests.get(modules_en_url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print('modules_en request failed:', modules_en_url, str(exc))
        else:
            handbook_tree = None
            if data.text.strip():
                handbook_tree = etree.HTML(data.text)
            if handbook_tree is not None:
                doc = ""
                for node in handbook_tree.xpath('/html/body/div'):
                    doc += etree.tostring(node,
                                          encoding='unicode',
                                          pretty_print=False,
                                          method='html')
                    # Cleaned after every node, exactly as before.
                    doc = remove_class(doc)
                modules_en = doc

        # 12. rntry_requirements_en is not collected by this spider.

        # 13. tuition_fee
        tuition_fee = '1390/学分'

        # 14. apply_pre
        apply_pre = '$'

        # 15. tuition_fee_pre
        tuition_fee_pre = '$'

        # 16-25. ielts / toefl scores.  Each row is
        # (marker looked for in degree_name,
        #  (ielts, ielts_r, ielts_w, ielts_s, ielts_l),
        #  (toefl, toefl_r, toefl_w, toefl_s, toefl_l)).
        # Rows are tried in order and the first match wins.
        language_rules = (
            ('Master of Teaching (Primary)',
             (7.5, 7.0, 7.0, 8.0, 8.0), (102, 24, 24, 29, 29)),
            ('Master of Veterinary',
             (7.0, 6.5, 6.5, 6.5, 6.5), (92, 20, 20, 20, 20)),
            ('Master of Food Security',
             (6.5, 6.0, 6.0, 6.0, 6.0), (79, 18, 18, 18, 18)),
            ('Master of Education',
             (6.5, 6.0, 6.0, 6.0, 6.0), (90, 20, 20, 20, 20)),
        )
        # Values used when no marker matches.
        ielts_scores = (6.0, 6.0, 6.0, 6.0, 6.0)
        toefl_scores = (73, 18, 18, 18, 18)
        for marker, ielts_row, toefl_row in language_rules:
            if marker in degree_name:
                ielts_scores = ielts_row
                toefl_scores = toefl_row
                break
        ielts, ielts_r, ielts_w, ielts_s, ielts_l = ielts_scores
        toefl, toefl_r, toefl_w, toefl_s, toefl_l = toefl_scores

        # 26. apply_documents_en: fixed boilerplate text.
        # NOTE(review): "Surgeon?s" looks like a lost apostrophe - confirm
        # against the source page before correcting the stored text.
        apply_documents_en = '<p>Ready to apply? Before you start, make sure you have all of the following documentation ready for a quick application.Completed official Academic Transcripts and Certificates of Completion – both original and English translated versions A certified copy of your veterinary degree A certified copy of current registration with your local Veterinary Surgeon?s Board A recent Curriculum Vitae Two referee reports – one academic and one personal A typed, signed 500-word personal statement outlining how your veterinary work experience relates to this course English Language Proficiency Document (if available)</p>'

        # 27. apply_desc_en: fixed boilerplate text.
        apply_desc_en = "<p>Your Application Checklist Check the course details Check the entry requirements for the course to clarify your eligibility Check your eligibility for a scholarship Prepare your documentation (see the checklist below) Ask us any questions you might have (we're here to help!) Now you're ready to apply!</p>"

        item['university'] = university
        item['url'] = url
        item['degree_name'] = degree_name
        item['programme_en'] = programme_en
        item['degree_overview_en'] = degree_overview_en
        item['location'] = location
        if duration is not None:
            item['duration'] = duration
        item['department'] = department
        item['degree_type'] = degree_type
        item['start_date'] = start_date
        item['modules_en'] = modules_en
        item['tuition_fee'] = tuition_fee
        item['apply_pre'] = apply_pre
        item['tuition_fee_pre'] = tuition_fee_pre
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_w'] = toefl_w
        item['toefl_s'] = toefl_s
        item['toefl_l'] = toefl_l
        item['apply_documents_en'] = apply_documents_en
        item['apply_desc_en'] = apply_desc_en
        yield item

Ejemplo n.º 18

Mostrar archivo

Archivo: TheUniversityOfManchester_P.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.manchester.ac.uk/"
        item['university'] = "The University of Manchester"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item['location'] = "Oxford Rd, Manchester, M13 9PL, UK"
        print("===============================")
        print(response.url)
        try:
            # print(response.url)
            # 专业、学位类型
            programmeDegree = response.xpath(
                "//div[@id='course-profile']/div[@class='heading']/h1//text()"
            ).extract()
            clear_space(programmeDegree)
            programmeDegreeStr = ''.join(programmeDegree).strip()
            print(programmeDegreeStr)
            # degree_type = list(re.findall(r"^(\w{0,6})|(\w{0,6}/\w{0,6})\s", programmeDegreeStr)[0])
            degree_type = re.findall(
                r"^(Postgraduate\sCertificate)|(MBA)|^(\w{0,6}/\w{0,6}/\w{0,6})|^(\w{0,6}/\w{0,6})|^(\w{0,6})\s",
                programmeDegreeStr)
            if len(degree_type) > 0:
                degree_type = list(degree_type[0])
                print("degree_type = ", degree_type)
                item['degree_name'] = ''.join(degree_type).strip()
                if item['degree_name'] == "MBA":
                    item['programme_en'] = programmeDegreeStr
                else:
                    item['programme_en'] = programmeDegreeStr.replace(
                        item['degree_name'], "").strip("in").strip()
                # item['programme_en'] = programme[-1].strip()
            else:
                item['programme_en'] = programmeDegreeStr
            print("item['degree_name'] = ", item['degree_name'])
            print("item['programme_en'] = ", item['programme_en'])

            start_date = response.xpath(
                "//*[contains(text(), 'Year of entry:')]//text()").extract()
            item['start_date'] = ''.join(start_date).replace(
                "Year of entry:", "").strip()
            # print("item['start_date'] = ", item['start_date'])

            duration = response.xpath(
                "//div[@id='course-profile']/div[@class='course-profile-content full-page']/div[@class='fact-file']/dl/dd[2]//text()"
            ).extract()
            durationStr = ''.join(duration)
            # print("durationStr = ", durationStr)
            if "full" in durationStr or "Full" in durationStr or "FT" in durationStr or "ft" in durationStr:
                item['teach_time'] = "fulltime"
            duration_re = re.findall(
                r"([a-zA-Z0-9\.]+\s)(year|month|week|yr|yft){1}|([0-9\.]+)(yr|yft|\-month){1}",
                durationStr, re.I)
            # print("duration_re = ", duration_re)
            d_dict = {
                "One": "1",
                "Two": "2",
                "Three": "3",
                "Four": "4",
                "Five": "5",
                "Six": "6",
                "Seven": "7",
                "Eight": "8",
                "Nine": "9",
                "Ten": "10",
                "one": "1",
                "two": "2",
                "three": "3",
                "four": "4",
                "five": "5",
                "six": "6",
                "seven": "7",
                "eight": "8",
                "nine": "9",
                "ten": "10",
            }
            if len(duration_re) > 0:
                d_int = re.findall(r"\d+", ''.join(duration_re[0]))
                if len(d_int) > 0:
                    item['duration'] = int(''.join(d_int))
                else:
                    d = re.findall(
                        r"(One)|(Two)|(Three)|(Four)|(Five)|(Six)|(Seven)|(Eight)|(Nine)|(Ten)|(one)|(two)|(three)|(four)|(five)|(six)|(seven)|(eight)|(nine)|(ten)",
                        ', '.join(duration_re[0]))
                    print("d = ", d)
                    item['duration'] = int(d_dict.get(''.join(d[0]).strip()))
                if "y" in ''.join(duration_re[0]) or "Y" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 1
                elif "m" in ''.join(duration_re[0]) or "M" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 3
                elif "w" in ''.join(duration_re[0]) or "W" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 4
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            department = response.xpath(
                "//*[contains(text(), 'Academic department')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(department)
            # print(department)
            if len(department) > 0:
                item['department'] = department[0]
            # print("item['department'] = ", item['department'])

            # 专业描述，雅思托福，就业方向, 学术要求，How To Apply
            overview = response.xpath(
                '//h3[@id="course-overview"]/following-sibling::div[1]'
            ).extract()
            overview1 = response.xpath(
                '//h3[@id="course-description"]/following-sibling::div[1]'
            ).extract()
            print('===', len(overview1))
            if len(overview1) == 2:
                overview1 = [overview1[0]]
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview)) + remove_class(
                    clear_lianxu_space((overview1)))
            print("item['overview_en'] = ", item['overview_en'])

            # Entry requirements
            rntry_requirements = response.xpath(
                '//h2[@id="entry-requirements"]/following-sibling::*[position()<9]//text()'
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'IELTS')]//text()"
            ).extract()
            if len(ielts_desc) == 0:
                ielts_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1][contains(text(), 'IELTS')]//text()"
                ).extract()
            if ''.join(ielts_desc).strip() == "IELTS":
                ielts_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'IELTS')]/..//text()"
                ).extract()
            clear_space(ielts_desc)
            # if len(ielts_desc) > 0:
            item['ielts_desc'] = clear_lianxu_space(ielts_desc)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            toefl_desc = response.xpath(
                "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'TOEFL')]//text()"
            ).extract()
            if len(toefl_desc) == 0:
                toefl_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1][contains(text(), 'TOEFL')]//text()"
                ).extract()
            if ''.join(toefl_desc).strip() == "IBT TOEFL:":
                toefl_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'TOEFL')]/..//text()"
                ).extract()
            clear_space(toefl_desc)
            item['toefl_desc'] = clear_lianxu_space(toefl_desc).replace(
                "\nTOEFL code for Manchester is 0757", "").strip()
            # print("item['toefl_desc']: ", item['toefl_desc'])

            # ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            ielts_list = re.findall(r"[567]\.\d|[678]", item['ielts_desc'])
            # print("ielts_list: ", ielts_list)
            if len(ielts_list) == 1:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[0]
                item['ielts_s'] = ielts_list[0]
                item['ielts_r'] = ielts_list[0]
                item['ielts_w'] = ielts_list[0]
            elif len(ielts_list) == 2:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[1]
                item['ielts_s'] = ielts_list[1]
                item['ielts_r'] = ielts_list[1]
                item['ielts_w'] = ielts_list[1]
            elif len(ielts_list) == 3 or len(ielts_list) > 3:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[2]
                item['ielts_s'] = ielts_list[2]
                item['ielts_r'] = ielts_list[2]
                item['ielts_w'] = ielts_list[1]
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            toefl_list = re.findall(r"1[0-1]\d|[12789]\d", item['toefl_desc'])
            # print(toefl_list)
            if len(toefl_list) == 1:
                item['toefl'] = toefl_list[0]
                # item['toefl_l'] = toefl_list[0]
                # item['toefl_r'] = toefl_list[0]
                # item['toefl_s'] = toefl_list[0]
                # item['toefl_w'] = toefl_list[0]
            elif len(toefl_list) == 2:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[1]
                item['toefl_s'] = toefl_list[1]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 3:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[2]
                item['toefl_r'] = toefl_list[2]
                item['toefl_s'] = toefl_list[2]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 4:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[3]
                item['toefl_r'] = toefl_list[1]
                item['toefl_s'] = toefl_list[2]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 5:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[3]
                item['toefl_s'] = toefl_list[4]
                item['toefl_w'] = toefl_list[2]
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #                             item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            apply_proces_en = response.xpath(
                '//h2[@id="application-and-selection"]/following-sibling::*[position()<15]'
            ).extract()
            apply_proces_en_str = remove_class(
                clear_lianxu_space(apply_proces_en))
            # print(apply_proces_en_str.index("<h2>Course details</h2>"))
            if apply_proces_en_str.find("<h2>Course details</h2>") == -1:
                apply_proces_en_s1 = apply_proces_en_str[
                    0:len(apply_proces_en_str)]
            else:
                apply_proces_en_s1 = apply_proces_en_str[:apply_proces_en_str.find(
                    "<h2>Course details</h2>") - 1]
            item['apply_proces_en'] = apply_proces_en_s1
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            interview_desc_en = response.xpath(
                '//h3[contains(text(), "Interview requirements")]/following-sibling::div[1]'
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en'] = ", item['interview_desc_en'])

            modules_en = response.xpath(
                "//*[contains(text(), 'Course unit details')]/following-sibling::*[position()<5]"
            ).extract()
            if len(modules_en) == 0:
                modules_en = response.xpath(
                    "//*[contains(text(), 'Course unit list')]/following-sibling::*[position()<3]"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                '//*[@id="teaching-and-learning"]/following-sibling::*[position()<4]'
            ).extract()
            if len(assessment_en) == 0:
                assessment_en = response.xpath(
                    '//*[@id="coursework-and-assessment"]/following-sibling::*[position()<4]'
                ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en'] = ", item['assessment_en'])

            career_en = response.xpath(
                '//*[@id="careers"]/following-sibling::*').extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en'] = ", item['career_en'])

            fee1 = response.xpath(
                "//div[@id='course-profile']/div[@class='course-profile-content full-page']/ul[1]/li[1]//text()"
            ).extract()
            # print(fee1)
            fee = clear_lianxu_space(fee1)
            fee_re = re.findall(
                r"International\sstudents\s\(per\sannum\):\s£[\d,]+", fee)
            fee_re1 = re.findall(r"£[\d,]+", ''.join(fee_re))
            # print("fee_re1: ", fee_re1)
            f = ''.join(fee_re1).replace("£", "").replace(",", "").strip()
            if len(f) != 0:
                item['tuition_fee'] = int(f)
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])

            item[
                'require_chinese_en'] = """<h2>Master's entry requirements</h2>
<p><span>For entry onto our master&rsquo;s degrees we require&nbsp;a minimum overall mark of 80% or CGPA of 3.0/4.0 in a Law degree with an average of 80% or higher in law units from a well ranked institution. We will accept relevant degrees for the MA study.</span></p>
<p>For all our LLM courses (except the LLM Healthcare Ethics and Law) we require an undergraduate Law degree. For MA courses we consider degrees in relevant disciplines.</p>"""
            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 19

Mostrar archivo

    def parse(self, response):
        """Build and yield one item from a Sheffield Hallam University course page.

        Every scraped field is read from ``response`` with XPath, cleaned with
        the project helpers (``remove_tags``, ``remove_class``,
        ``clear_space_str``, ``getTuition_fee``, ``translate_month``) and
        stored on a fresh item created by ``get_item1``.  Fields that are not
        scraped (university, location, degree_type, currency prefixes, the
        China-specific requirements) are fixed literals.

        Yields:
            The populated item (exactly one per response).
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def _joined(xpath):
            """Return the concatenated extract() output of *xpath*."""
            return ''.join(response.xpath(xpath).extract())

        # 1. university / 2. url
        university = 'Sheffield Hallam University'
        url = response.url

        # 3. programme_en
        programme_en = remove_tags(_joined("/html/body/section[1]//h1"))

        # 4. degree_type
        degree_type = 2

        # 5. degree_name
        degree_name = remove_tags(
            _joined('/html/body/section[1]/div/div[2]/span'))

        # 6. tuition_fee / 7. tuition_fee_pre
        tuition_fee = getTuition_fee(
            _joined(
                "//*[contains(text(),'What is the fee?')]//following-sibling::*"
            ))
        tuition_fee_pre = '£'

        # 8. duration: first integer found in the "How long will I study?"
        # section, falling back to 1 when the section has no digits.
        duration_text = remove_tags(
            _joined(
                "//*[contains(text(),'How long will I study?')]//following-sibling::*"
            ))
        try:
            duration = re.findall(r'\d+', duration_text)[0]
        except IndexError:
            duration = 1

        # 16. duration_per
        # NOTE(review): the original derived duration_per from the duration
        # (3 when duration > 5, else 1) and then unconditionally overwrote it
        # with 1 further down.  The emitted value (1) is kept unchanged here;
        # confirm whether the derived value was the intended one.
        duration_per = 1

        # 9. location
        location = 'Sheffield'

        # 10. teach_time: anything not explicitly "Full-time" is reported as
        # part-time.
        teach_time_text = remove_tags(
            _joined('/html/body/section[1]//span[1]'))
        teach_time = 'Full-time' if 'Full-time' in teach_time_text else 'Part-time'

        # 11. overview_en
        overview_en = clear_space_str(
            remove_class(
                _joined(
                    "//*[contains(text(),'Course summary')]//following-sibling::*"
                )))

        # 12. career_en
        career_en = clear_space_str(
            remove_class(
                _joined(
                    "//*[contains(text(),'Future careers')]//following-sibling::*"
                )))

        # 13. rntry_requirements (field name spelled as in the item schema)
        rntry_requirements = remove_class(
            _joined('//*[@id="entry-requirements"]/div'))

        # 14. modules_en
        modules_en = clear_space_str(
            remove_class(
                _joined(
                    "//*[contains(text(),'Compulsory modules')]/../following-sibling::*"
                )))

        # 15. apply_proces_en
        apply_proces_en = _joined('//*[@id="apply-now"]/div[1]//a/@href')

        # 17-21. IELTS: the first score-like number in the entry requirements
        # is the overall band, the second is used for every component.
        ielts_list = re.findall(r'[567]\.\d', rntry_requirements)
        if ielts_list:
            ielts = ielts_list[0]
            # With a single number on the page there is no separate component
            # minimum; reuse the overall band instead of raising IndexError
            # (which used to abort the callback and drop the item).
            component = ielts_list[1] if len(ielts_list) > 1 else ielts
            ielts_r = ielts_l = ielts_s = ielts_w = component
        else:
            ielts = 6.5
            ielts_r = ielts_l = ielts_s = ielts_w = 6.0

        # 22. require_chinese_en
        require_chinese_en = '<p>The following qualifications from China will be considered for entry on to postgraduate taught programmes, with a usual minimum average of 60 per cent Four year Bachelor Degree from a recognised university Three year university diploma plus relevant work experience Successful completion of a recognised pre-masters course</p>'

        # 23. apply_pre
        apply_pre = '£'

        # 24. start_date: two known phrasings are mapped to fixed values,
        # anything else goes through translate_month and is prefixed "2018-".
        start_date = clear_space_str(
            remove_tags(
                _joined(
                    "//*[contains(text(),'When do I start?')]//following-sibling::*"
                )))
        if 'September, January' in start_date:
            start_date = '2018-9,2019-1'
        elif 'January' in start_date:
            start_date = '2019-1'
        else:
            start_date = '2018-' + str(translate_month(start_date))

        item['start_date'] = start_date
        item['apply_pre'] = apply_pre
        item['require_chinese_en'] = require_chinese_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['duration'] = duration
        item['location'] = location
        item['teach_time'] = teach_time
        item['overview_en'] = overview_en
        item['career_en'] = career_en
        item['rntry_requirements'] = rntry_requirements
        item['modules_en'] = modules_en
        item['apply_proces_en'] = apply_proces_en
        item['duration_per'] = duration_per
        yield item

Ejemplo n.º 20

Mostrar archivo

Archivo: TeessideUniversity_u.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Build and yield one item from a Teesside University course page.

        Scraped fields are read from ``response`` with XPath and cleaned with
        the project helpers (``remove_tags``, ``remove_class``,
        ``clear_space_str``); tuition fee, start date, currency prefixes, the
        fees URL and the China-specific requirements are fixed literals.
        IELTS scores are not scraped: component scores are chosen from
        keywords in ``degree_name`` and the overall score from keywords in
        ``programme_en``.

        Yields:
            The populated item (exactly one per response).
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def _joined(xpath):
            """Return the concatenated extract() output of *xpath*."""
            return ''.join(response.xpath(xpath).extract())

        # 1. university / 2. url
        university = 'Teesside University'
        url = response.url

        # 3. programme_en
        programme_en = remove_tags(
            _joined(
                '//*[@id="coursepage"]/section[1]/div[1]/div/div[1]/h1/text()'
            )).strip()

        # 4. degree_type
        degree_type = 1

        # 5. degree_name, with any "(Hons)" marker dropped
        degree_name = remove_tags(
            _joined(
                '//*[@id="coursepage"]/section[1]/div[1]/div/div[1]/h1/span'
            )).strip()
        degree_name = degree_name.replace('(Hons)', '').strip()

        # 6. overview_en
        overview_en = remove_class(_joined('//*[@id="tab1"]/div/div[1]/div'))

        # 7. modules_en
        modules_en = remove_class(
            _joined('//*[@id="tab2"]/div[1]/div/div[1]/div[1]'))

        # 8. assessment_en
        assessment_en = remove_class(
            _joined('//*[@id="tab2"]/div[1]/div/div[1]/div[3]/p'))

        # 9. career_en
        career_en = remove_class(
            _joined(
                "//*[contains(text(),'Career opportunities')]//following-sibling::*"
            ))

        # 10. ucascode: first four characters of the cleaned text.  Slicing
        # never raises, so the old try/except could not produce its 'N/A'
        # fallback; apply it explicitly when nothing was scraped.
        ucascode = clear_space_str(
            _joined(
                '//*[@id="coursepage"]/section[1]/div[1]/div/div[2]/div/div[2]/p/text()'
            ))
        ucascode = ucascode[:4] if ucascode else 'N/A'

        # 11. department
        department = remove_tags(
            _joined(
                '//*[@id="coursepage"]/section[1]/div[1]/div/div[2]/div/div[3]/a/p'
            )).replace('&amp; ', '')

        # 12. duration (raw text of the first list entry, not parsed further)
        duration = remove_tags(
            _joined('//*[@id="courseinfopdf"]/div[1]/ul/li[1]'))

        # 13. tuition_fee (fixed value, not scraped)
        tuition_fee = 11825

        # 14. apply_desc_en
        apply_desc_en = remove_class(_joined('//*[@id="tab3"]/div/div[1]/div'))

        # 15. start_date (fixed value, not scraped)
        start_date = '2018-10-13'

        # 16. tuition_fee_pre
        tuition_fee_pre = '£'

        # 17. other
        other = 'https://www.tees.ac.uk/sections/international/fees.cfm'

        # 18. require_chinese_en
        require_chinese_en = '<p>For entry onto a Foundation or Extended programme, applicants require:  Huikao (Chinese senior secondary school graduation certificate) Successful completion of the first two years of Senior Secondary School with a minimum average of 70% or successful completion of Senior Secondary School with a minimum average of 60% For entry onto an Undergraduate programme, applicants require:  For entry onto Year 1:Huikao (Chinese senior secondary school graduation certificate) Successful completion of Senior Secondary School with a minimum average of 80% Or Gaokao (Chinese university or college entrance exam) with a minimum score of 500 For entry onto Higher National Diploma: Gaokao with a minimum score of 450 For entry onto Integrated Master of Engineering – MEng (Hons): Gaokao with a minimum score of 550 For entry onto Undergraduate top-up programmes (third-year entry) Dazhuan (three-year college graduation diploma) with a minimum of 70% average or, SQA Higher National Diploma with BBC as minimum or, Edexcel Higher National Diploma – standard UK entry requirements or, UK accredited foundation degree</p>'

        # 19-23. IELTS component scores, keyed on degree_name.  The first
        # group is tested before 'Nursing Studies', as in the original chain.
        higher_component_courses = (
            'Dental Hygiene and Dental Therapy',
            'Diagnostic Radiography',
            'Midwifery',
            'Physiotherapy',
            'Occupational Therapy',
        )
        if any(course in degree_name for course in higher_component_courses):
            component = 6.5
        elif 'Nursing Studies' in degree_name:
            component = 7.0
        else:
            component = 5.5
        ielts_r = ielts_w = ielts_l = ielts_s = component

        # Overall IELTS score, keyed on programme_en; groups are tested in
        # order and the first matching keyword wins.
        # NOTE(review): the original also set an overall score per
        # degree_name above, but this programme_en lookup always overwrote
        # it; that net behaviour is preserved - confirm it is intended.
        overall_by_subject = (
            (5.5, ('Fine Art', 'Design', 'Media Production', 'Engineering',
                   'Science', 'Computing', 'Media Studies', 'Journalism')),
            (6.0, ('Business', 'English', 'Sport', 'History', 'Psychology',
                   ' Criminology', 'Sociology', 'Youth Studies', 'Education',
                   'Law', 'Crime', 'Investigation')),
            (7.0, ('Health', )),
        )
        ielts = 6.0
        for score, subjects in overall_by_subject:
            if any(subject in programme_en for subject in subjects):
                ielts = score
                break

        # 24. apply_pre
        apply_pre = '£'

        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['ucascode'] = ucascode
        item['department'] = department
        item['duration'] = duration
        item['tuition_fee'] = tuition_fee
        item['apply_desc_en'] = apply_desc_en
        item['start_date'] = start_date
        item['other'] = other
        item['tuition_fee_pre'] = tuition_fee_pre
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['require_chinese_en'] = require_chinese_en
        item['apply_pre'] = apply_pre
        # The original populated the item and then dropped it; emit it so the
        # scraped data actually reaches the item pipeline.
        yield item

Ejemplo n.º 21

Mostrar archivo

    def parse(self, response):
        """Build and yield item(s) from a University of South Wales course page.

        Page-level fields are scraped once with XPath and cleaned with the
        project helpers (``remove_tags``, ``remove_class``, ``getT_fee``).
        UCAS code, duration, start date and location come from a table under
        the element with id "2019", falling back to the one with id "2018".
        When that table lists more than one UCAS code, one item is yielded
        per table row; otherwise a single item is yielded using the
        page-level values.

        Yields:
            The populated item, once per course-table row or once overall.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def _joined(xpath):
            """Return the concatenated extract() output of *xpath*."""
            return ''.join(response.xpath(xpath).extract())

        def _table_cells(column):
            """Return the raw cells of *column* from the 2019 table, else 2018."""
            cells = response.xpath(
                '//*[@id="2019"]/div/table/tbody/tr/td[%d]' % column).extract()
            if not cells:
                cells = response.xpath(
                    '//*[@id="2018"]/div/table/tbody/tr/td[%d]' %
                    column).extract()
            return cells

        # 1. university / 2. url
        university = 'University of South Wales'
        url = response.url

        # 3. programme_en
        programme_en = remove_tags(
            _joined('//*[@id="uni"]/section[1]/div//h1')).strip()

        # 4. degree_type
        degree_type = 1

        # 5. degree_name: first known award found in the title (order is
        # significant), then removed from the programme name.
        known_degrees = (
            'BA (Anrh)',
            'BA (Hons)',
            'BEng (Hons)',
            'BMus (Hons)',
            'BSc (Hons)',
            'Bachelor of Nursing (Hons)',
            'Bachelor of Midwifery (Hons)',
            'Foundation Degree',
            'Foundation Year',
            'HNC',
            'HND',
            'LLB (Hons)',
            'MComp',
            'MEng',
            'MGeog',
            'MMath',
            'MSci',
        )
        degree_name = next(
            (degree for degree in known_degrees if degree in programme_en),
            '')
        programme_en = programme_en.replace(degree_name, '').strip()

        # 6. overview_en
        overview_en = remove_class(
            _joined('//*[@id="uni"]/section[1]/div[2]//p'))

        # 7. duration / 8. duration_per: first integer in the duration
        # column (0 when there is none); values above 6 select
        # duration_per 3, everything else 1.
        duration_cells = _table_cells(3)
        duration = remove_tags(''.join(duration_cells))
        try:
            duration = re.findall(r'\d+', duration)[0]
        except IndexError:
            duration = 0
        duration_per = 3 if int(duration) > 6 else 1

        # 9. ucascode: all codes of the table joined with '*'
        ucascode_cells = _table_cells(1)
        ucascode = remove_tags('*'.join(ucascode_cells))

        # 10. start_date (first row of the 2018 table only)
        start_date = remove_tags(
            _joined('//*[@id="2018"]/div/table/tbody/tr[1]/td[4]')).strip()

        # 11. modules_en
        modules_en = remove_class(_joined('//*[@id="eleven"]/div'))

        # 12. assessment_en
        assessment_en = remove_class(
            _joined(
                "//*[contains(text(),'Assessment')]//following-sibling::div[1]"
            ))

        # 13. apply_desc_en
        apply_desc_en = remove_class(_joined('//*[@id="odin"]/div'))

        # 14. career_en
        career_en = remove_class(_joined('//*[@id="careers_panel"]/div'))

        # 15. tuition_fee / 16. tuition_fee_pre
        tuition_fee = getT_fee(
            remove_tags(
                _joined(
                    "//*[contains(text(),'Full-time International')]//following-sibling::*"
                )))
        tuition_fee_pre = '£'

        # 17-21. IELTS (fixed values, not scraped)
        ielts = 6.0
        ielts_l = 5.5
        ielts_w = 5.5
        ielts_s = 5.5
        ielts_r = 5.5

        # 22. apply_pre
        apply_pre = '£'

        # 23. alevel
        alevel = remove_tags(
            _joined(
                "//*[contains(text(),'Typical A-Level Offer')]//following-sibling::*"
            ))

        # 24. ib
        ib = remove_tags(
            _joined(
                "//*[contains(text(),'Typical IB Offer')]//following-sibling::*"
            ))

        # 25. location (2018 table only)
        location = remove_tags(
            _joined('//*[@id="2018"]/div/table/tbody/tr/td[5]/a'))

        item['alevel'] = alevel
        item['ib'] = ib
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['duration_per'] = duration_per
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['apply_desc_en'] = apply_desc_en
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['ielts_w'] = ielts_w

        start_date_cells = _table_cells(4)
        location_cells = _table_cells(5)
        if len(ucascode_cells) > 1:
            # One item per table row; zip stops at the shortest column.
            # NOTE(review): the same item object is mutated and re-yielded
            # on every iteration, as in the original - confirm downstream
            # consumers do not hold on to earlier yields.
            for code_cell, duration_cell, start_cell, location_cell in zip(
                    ucascode_cells, duration_cells, start_date_cells,
                    location_cells):
                try:
                    row_duration = re.findall(r'\d+', duration_cell)[0]
                except IndexError:
                    row_duration = ''
                item['ucascode'] = remove_tags(code_cell)
                item['duration'] = row_duration
                item['start_date'] = remove_tags(start_cell).strip()
                item['location'] = remove_tags(location_cell)
                yield item
        else:
            item['ucascode'] = ucascode
            item['duration'] = duration
            item['start_date'] = start_date
            item['location'] = location
            yield item

Ejemplo n.º 22

Mostrar archivo

    def parse(self, response):
        """Build and yield one item from a University of Suffolk course page.

        Scraped fields are read from ``response`` with XPath and cleaned with
        the project helpers (``remove_tags``, ``remove_class``,
        ``getTuition_fee``).  Duration, duration_per, the IELTS component
        scores, the currency prefixes and the application URL are fixed
        literals.

        Yields:
            The populated item (exactly one per response).
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def _joined(xpath, sep=''):
            """Return the extract() output of *xpath* joined with *sep*."""
            return sep.join(response.xpath(xpath).extract())

        # 1. university / 2. url
        university = 'University of Suffolk'
        url = response.url

        # 3. programme_en
        programme_en = remove_tags(
            _joined(
                '/html/body/div/div[2]/div/div[1]/div[1]/div[2]/header/h1'))

        # 4. degree_type
        degree_type = 1

        # 5. degree_name: first known award found in the title (order is
        # significant, e.g. 'BA (Hons)' is tested before 'BA'), then removed
        # from the programme name.
        known_degrees = (
            'BSc (Hons)',
            'MSci',
            'BA (Hons)',
            'FdSc',
            'BA',
            'FdA',
            'Dip/HE',
            'LLB (Hons)',
            'HNC',
            'HND',
            'BEng (Hons)',
        )
        degree_name = next(
            (degree for degree in known_degrees if degree in programme_en),
            '')
        programme_en = programme_en.replace(degree_name, '').strip()

        # 6. location
        location = remove_tags(
            _joined("//*[contains(text(),'Location:')]//following-sibling::*",
                    ',')).strip()

        # 7. duration
        # NOTE(review): the original scraped the "Duration:" text and tried
        # to parse it, but then unconditionally assigned 3, so the parsing
        # never affected the item.  The dead parsing is removed and the
        # emitted value (3) kept; confirm the hard-coded value is intended.
        duration = 3

        # 8. duration_per
        duration_per = 1

        # 9. ucascode: first four characters of the cleaned text
        ucascode = remove_tags(
            _joined("//*[contains(text(),'UCAS code:')]//following-sibling::*")
        ).strip()[:4]

        # 10. overview_en
        overview_en = remove_class(
            _joined('//*[@id="group-description"]/div[1]//p'))

        # 11. modules_en
        modules_en = remove_class(
            _joined('//*[@id="group-duration-modules"]/*'))

        # 12. tuition_fee / 13. tuition_fee_pre
        tuition_fee = getTuition_fee(_joined('//*[@id="group-fees"]'))
        tuition_fee_pre = '£'

        # 14. apply_desc_en: block after the "Academic Requirements" heading,
        # else the whole entry-requirements group.
        apply_desc_nodes = response.xpath(
            "//*[contains(text(),'Academic Requirements')]/../../following-sibling::*[1]"
        ).extract()
        if not apply_desc_nodes:
            apply_desc_nodes = response.xpath(
                '//*[@id="group-entry-requirements"]').extract()
        apply_desc_en = remove_class(''.join(apply_desc_nodes))

        # 15-19. IELTS: first "d.d" number in the international requirements
        # (kept as the matched string), 6.5 when none is present; component
        # scores are fixed.
        ielts_text = remove_tags(
            _joined(
                "//*[contains(text(),'International Requirements')]/../../following-sibling::*[1]"
            ))
        try:
            ielts = re.findall(r'\d\.\d', ielts_text)[0]
        except IndexError:
            ielts = 6.5
        ielts_r = 5.5
        ielts_l = 5.5
        ielts_w = 5.5
        ielts_s = 5.5

        # 20. apply_proces_en
        apply_proces_en = 'https://www.uos.ac.uk/content/how-apply-0'

        # 21. apply_pre
        apply_pre = '£'

        # 22. alevel: last node mentioning "A-Level", 'N/A' when absent
        alevel_nodes = response.xpath(
            "//*[contains(text(),'A-Level')]/.").extract()
        if alevel_nodes:
            alevel = remove_tags(alevel_nodes[-1]).strip()
        else:
            alevel = 'N/A'

        # 23. career_en
        career_en = remove_class(_joined('//*[@id="group-career"]'))

        item['career_en'] = career_en
        item['alevel'] = alevel
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['location'] = location
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['apply_desc_en'] = apply_desc_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['apply_proces_en'] = apply_proces_en
        item['ucascode'] = ucascode
        yield item

Ejemplo n.º 23

Mostrar archivo

Archivo: LeedsBeckettUniversity_P.py Proyecto: histudent/python_spider

    def parse_main(self, response):
        """Scrape one Leeds Beckett University course page into an item.

        Reads the course header, attendance mode, start dates, duration,
        overview, entry requirements, IELTS scores, careers, modules and
        tuition fee from ``response`` via XPath, adds fixed application
        texts, and yields the populated item.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = 'Leeds Beckett University'
        item['url'] = response.url
        # Location is stored as a constant for every course.
        item['location'] = 'Leeds'

        degree_name = response.xpath(
            '//div[@class="course-hero__label"]/text()').extract()
        item['degree_name'] = ''.join(degree_name).strip()

        programme = response.xpath(
            '//h1[@class="course-hero__title"]/text()').extract()
        item['programme_en'] = ''.join(programme).strip()

        department = response.xpath(
            '//div[@class="course-hero__labels"]/a/text()').extract()
        item['department'] = ''.join(department)

        # teach_time is '1' when the attendance text mentions "full"
        # (case-insensitive) and '2' otherwise.
        mode = ''.join(response.xpath(
            '//div[contains(text(),"Attendance")]/following-sibling::div//text()'
        ).extract())
        item['teach_time'] = '1' if re.search(r'(?i)full', mode) else '2'

        start_date = response.xpath(
            '//div[contains(text(),"Start Date")]/following-sibling::div//text()'
        ).extract()
        # De-duplicate, then sort so the stored value does not depend on
        # set iteration order.
        item['start_date'] = ','.join(sorted(set(tracslateDate(start_date))))

        duration = response.xpath(
            '//div[contains(text(),"Duration")]/following-sibling::span//text()'
        ).extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        overview = response.xpath(
            '//h2[contains(text(),"Overview")]/../following-sibling::div'
        ).extract()
        item['overview_en'] = remove_class(overview)

        rntry = response.xpath(
            '//h2[contains(text(),"Entry Requirements")]/../following-sibling::div'
        ).extract()
        item['rntry_requirements'] = remove_class(rntry)

        ielts_text = response.xpath(
            '//div[@class="entry-ielts"]/text()').extract()
        ielts = get_ielts(ielts_text)
        # Copy only the scores that were actually parsed; a missing key no
        # longer aborts the remaining assignments.
        if isinstance(ielts, dict):
            for field, key in (('ielts_l', 'IELTS_L'),
                               ('ielts_s', 'IELTS_S'),
                               ('ielts_r', 'IELTS_R'),
                               ('ielts_w', 'IELTS_W'),
                               ('ielts', 'IELTS')):
                if key in ielts:
                    item[field] = ielts[key]

        career = response.xpath(
            '//h3[contains(text(),"Careers")]/following-sibling::div'
        ).extract()
        item['career_en'] = remove_class(career)

        modules = response.xpath(
            '//div[@class="course-modules__table-modules"]'
            '//div[@class="course-modules__dropdowns"]').extract()
        item['modules_en'] = remove_class(modules)

        # Store the largest pound amount found on the page. The pattern
        # accepts both plain ("£12500") and comma-grouped ("£12,500")
        # figures of at least three digits.
        fee_text = ''.join(
            response.xpath('//div[contains(text(),"£")]/text()').extract())
        amounts = re.findall(r'£(\d{1,3}(?:,\d{3})+|\d{3,})', fee_text)
        if amounts:
            item['tuition_fee'] = max(
                int(amount.replace(',', '')) for amount in amounts)
        item['tuition_fee_pre'] = '£'

        apply_d = [
            "Academic Certificates.",
            "Evidence of your English language ability (see below).",
            "A photocopy of your passport.",
            "A reference to support your application – either academic or professional.",
            "A completed Agent Consent Form (required if you are applying via or with the help of an agent).",
        ]
        item['apply_documents_en'] = '\n'.join(apply_d)

        apply_p = [
            "Applying for a postgraduate course",
            "Once you have found the course you want to study in our online prospectus you will then click on the ‘Apply Now’ button located at the top of the online course page. ",
            "You will be asked to create an account on our application portal and complete your application via your Leeds Beckett account. Once you have submitted your application you should receive a decision within six weeks of applying. The exception to this is if the course you have applied for has a closing date specified. In this case, we will wait until the closing date has passed before we contact you",
        ]
        item['apply_proces_en'] = '\n'.join(apply_p)

        yield item

Ejemplo n.º 24

Mostrar archivo

    def parse_data(self, response):
        """Scrape one University of Bolton course page into an item.

        Expects ``response.meta['subjectArea']``, which is mapped to a
        department name. The item is yielded only for URLs under
        ``https://courses.bolton.ac.uk/course`` that pass the postgraduate
        check at the end. Any exception raised while scraping is appended to
        a ``<university><degree_type>.txt`` log file and printed instead of
        being propagated.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Bolton"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # degree type
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        print("subjectArea===: ", response.meta['subjectArea'])
        try:
            programmeDegreetype = response.xpath(
                "//div[@class='wpb_text_column wpb_content_element  vc_custom_1506499626241']/div[@class='wpb_wrapper']/h2//text()"
            ).extract()
            programmeDegreetypeStr = ''.join(programmeDegreetype).strip()

            degree_type = response.xpath(
                "//li[@class='iconim award']//b[contains(text(),'Award:')]/..//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).replace("Award:",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # The programme name is the heading with the award name and any
            # leftover empty parentheses removed.
            programme = programmeDegreetypeStr.replace(
                item['degree_name'], '').replace("()", "").strip()
            item['programme_en'] = programme

            mode = response.xpath(
                "//b[contains(text(),'Course type:')]/..//text()").extract()
            clear_space(mode)
            item['teach_time'] = getTeachTime(''.join(mode))

            start_date = response.xpath(
                "//li[@class='iconim date']//b[contains(text(),'Start date:')]/..//text()"
            ).extract()
            clear_space(start_date)
            start_date_str = ''.join(start_date).replace("Start date:",
                                                         "").strip()
            start_date_re = re.findall(r"\d+/\d+/\d+", start_date_str)
            if start_date_re:
                # Reverse each a/b/c date into c-b-a (presumably dd/mm/yyyy
                # -> yyyy-mm-dd; confirm against the site).
                dates = []
                for s in start_date_re:
                    first, middle, last = s.split('/')
                    dates.append(last + "-" + middle + "-" + first)
                # The None check below implies the field may still be None
                # here; appending with `+=` would then raise TypeError and
                # the whole item would be lost to the except handler.
                item['start_date'] = (item['start_date'] or '') + ", ".join(dates)
            if item['start_date'] is not None:
                item['start_date'] = item['start_date'].strip().rstrip(
                    ',').strip()

            location = response.xpath(
                "//li[@class='iconim location']//b[contains(text(),'Location:')]/..//text()"
            ).extract()
            item['location'] = ''.join(location).replace("Location:",
                                                         "").strip()

            duration = response.xpath(
                "//li[@class='iconim duration']//b[contains(text(),'Duration:')]/..//text()"
            ).extract()
            clear_space(duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            overview_en = response.xpath(
                "//div[@id='course-details']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))

            entry_requirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            ielts_desc = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).strip()

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')

            career_en = response.xpath(
                "//div[@id='careers-employment']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))

            modules = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__modules']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            assessment_en = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__teaching-methods']"
                "|//div[@class='tab_content modules_tab_content tab__teaching-assessment__assessment-methods']"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            tuition_fee = response.xpath(
                "//h3[@class='table_header'][contains(text(),'International fees')]/following-sibling::div[1]/table//tr/th[contains(text(),'2018/')][1]/following-sibling::td[1]//text()"
            ).extract()
            if tuition_fee:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"

            # Subject area (passed in through response.meta) -> department.
            department_dict = {
                "Art & Design and Fine Art": "Bolton School of the Arts",
                "Textiles & Fashion": "Bolton School of the Arts",
                "Media & Photography": "Bolton School of the Arts",
                "Theatre & Performance": "Bolton School of the Arts",
                "English & Creative Writing": "Bolton School of the Arts",
                "Graphic Design": "Bolton School of the Arts",
                "Animation & Illustration": "Bolton School of the Arts",
                "Accountancy": "Institute of Management Greater Manchester",
                "Business, Retail, Logistics & Supply Chain Management":
                "Institute of Management Greater Manchester",
                "Nursing": "Faculty of Health & Wellbeing",
                "Health & Social Care": "Faculty of Health & Wellbeing",
                "Dental Sciences": "Faculty of Health & Wellbeing",
                "Early Years & Childhood Studies":
                "Faculty of Health & Wellbeing",
                "Community Work & Youth": "Faculty of Health & Wellbeing",
                "School of Sport & Biological Sciences":
                "Faculty of Health & Wellbeing",
                "Automotive Design":
                "National Centre for Motorsport Engineering",
                "Chassis Dynamics & Aerodynamics":
                "National Centre for Motorsport Engineering",
                "General Engineering":
                "National Centre for Motorsport Engineering",
                "Motorsport & Trackside Technology":
                "National Centre for Motorsport Engineering",
                "Engines & Performance Modelling":
                "National Centre for Motorsport Engineering",
                "Our Partners": "National Centre for Motorsport Engineering",
                "Computing": "School of Creative Technologies",
                "Games": "School of Creative Technologies",
                "Special & Visual Effects": "School of Creative Technologies",
                "Education & Teacher Training":
                "School of Education & Psychology",
                "Psychology": "School of Education & Psychology",
                "Access courses": "School of Education & Psychology",
                "International Foundation programmes & English Pre-Sessional courses":
                "School of Education & Psychology",
                "Construction": "School of Engineering",
                "Civil Engineering": "School of Engineering",
                "Mechanical Engineering": "School of Engineering",
                "Motorsport & Automotive Performance Engineering":
                "School of Engineering",
                "Biomedical & Medical Engineering": "School of Engineering",
                "Electrical & Electronic Engineering": "School of Engineering",
                "Mathematics": "School of Engineering",
                "Law": "School of Law",
                "Centre for Contemporary Coronial Law": "School of Law",
                "Medical Biology": "School of Sport & Biological Sciences",
                "Sports & Sport Rehabilitation":
                "School of Sport & Biological Sciences",
            }
            item['department'] = department_dict.get(
                response.meta['subjectArea'])
            print("item['department']: ", item['department'])

            item[
                'require_chinese_en'] = "<p><strong>Postgraduate</strong></p><p><em>Taught Postgraduate Programmes:</em></p><p>Bachelor degree from a recognised Chinese university.</p>"

            # Postgraduate filter. `isup_str` is built from the "more
            # information" link text only; when that link is absent, `isup`
            # is replaced by the UCAS code/points texts. The item is kept
            # when the link text mentions "postgraduate" or when neither the
            # link nor any UCAS text exists.
            isup = response.xpath(
                "//a[contains(text(),'Click here for more information on')]//text()"
            ).extract()
            isup_str = ''.join(isup)
            if len(isup) == 0:
                isup = response.xpath(
                    "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
                    "|//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/..//text()"
                ).extract()
            print("isup_str: ", isup_str)
            print("isup: ", isup)
            if "https://courses.bolton.ac.uk/course" in item['url']:
                if "postgraduate" in isup_str or len(isup) == 0:
                    print("******存到数据库*****")
                    yield item

        except Exception as e:
            # Best-effort scrape: record the failure and the URL, then move
            # on without yielding an item for this page.
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 25

Mostrar archivo

Archivo: UniversityofBedfordshire_P.py Proyecto: histudent/python_spider

    def parse_main(self, response):
        """Scrape one University of Bedfordshire course page into an item.

        The page heading is split on ``-`` into programme name and degree
        name. Tuition fee and the IELTS/TOEFL scores are fixed values; the
        remaining fields are read from ``response`` via XPath. Yields the
        populated item.
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)

        item['university'] = 'University of Bedfordshire'
        item['url'] = response.url

        title = ''.join(response.xpath(
            '//div[@id="inner-course-content"]/h1/text()').extract())
        item['tuition_fee_pre'] = '£'
        # Fixed fees: one figure for MBA titles, another for everything else.
        item['tuition_fee'] = '14000' if 'MBA' in title else '12750'

        title_parts = title.split('-')
        if len(title_parts) == 2:
            # "<programme> - <degree>"
            prog = title_parts[0].strip()
            degr = title_parts[1].strip()
            item['degree_name'] = degr
            # startswith() is safe on an empty degree string, so no
            # exception handling is needed here.
            if degr.startswith('M'):
                item['degree_type'] = '2'
            elif degr.startswith('P'):
                item['degree_type'] = '3'
        else:
            # No hyphen, or more than one: keep the heading unchanged
            # instead of gluing the pieces together without their hyphens.
            prog = title.strip()
        item['programme_en'] = prog

        location = response.xpath(
            '//strong[contains(text(),"Campus Location")]/../text()').extract()
        item['location'] = ''.join(location).replace('-', '').strip()

        duration = response.xpath(
            '//strong[contains(text(),"Duration")]/../text()').extract()
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        # teach_time is '1' when the attendance text mentions "full"
        # (case-insensitive) and '2' otherwise.
        mode = ''.join(response.xpath(
            '//strong[contains(text(),"Attendance")]/../text()').extract())
        item['teach_time'] = '1' if re.search(r'(?i)full', mode) else '2'

        start_date = response.xpath(
            '//strong[contains(text(),"Start")]/../text()').extract()
        item['start_date'] = ','.join(tracslateDate(start_date))

        overview = response.xpath('//div[@id="why_content"]').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath('//div[@id="unit_content"]').extract()
        item['modules_en'] = remove_class(modules)

        assessment_en = response.xpath('//div[@id="how_content"]').extract()
        item['assessment_en'] = remove_class(assessment_en)

        rntry = response.xpath(
            '//h2[@id="entry"]/following-sibling::div/ul[@class="tab-content"]/div[3]'
        ).extract()
        item['rntry_requirements'] = remove_class(rntry)

        # Language requirements are constants for every course.
        item['ielts'] = '6.0'
        item['ielts_l'] = '5.5'
        item['ielts_s'] = '5.5'
        item['ielts_r'] = '5.5'
        item['ielts_w'] = '5.5'
        item['toefl_l'] = '17'
        item['toefl_s'] = '20'
        item['toefl_r'] = '18'
        item['toefl_w'] = '17'

        career = response.xpath('//div[@id="career_content"]').extract()
        item['career_en'] = remove_class(career)

        # Static HTML describing how to apply, stored as two fragments
        # joined by a newline.
        apply_d = [
            '<p>There are two ways you can make a direct application to the University of Bedfordshire:</p><ul><li><a href="https://evision.beds.ac.uk/urd/sits.urd/run/siw_ipp_lgn.login?process=siw_ipp_app&amp;code1=OA_FORM&amp;code2=0007">Apply online now for 2017/18</a> Courses starting from 1 August 2017 to 31 July 2018</li><li>Download <span class="include_asset_summary"><a href="https://www.beds.ac.uk/__data/assets/pdf_file/0006/441798/International-Application-web-2018.pdf">an application form - <img src="https://www.beds.ac.uk/__data/asset_types/pdf_file/icon.png" alt="" title="" height="16" width="16"  class="sq-icon" /> PDF  1.0 MB ',
            '</a></span> and submit it to our <a href="https://www.beds.ac.uk/international/international-applications/contactus">Admissions Team</a> along with scans of your supporting documents, via email, post or in person at the International Office.</li></ul><p>You can post your completed form to:</p><p>University of Bedfordshire International Admissions/International Office/University Square/Luton/Bedfordshire/LU1 3JU/United Kingdom</p><h4>Please note</h4><ul><li><strong>BSc (Hons) Nursing Studies</strong> Level 3 and <strong>MSc Advanced Nursing Studies</strong> are available to overseas students - please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a></li><li><strong>Healthcare, Nursing and Midwifery students</strong> - many of these courses are not available to overseas students due to UK immigration law in regard to bursary funding. Please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a> to find out if you are eligible to apply.</li></ul><p>*Please note that international students studying on a Tier 4 Student Visa must choose a full-time Undergraduate or Postgraduate course and are not eligible for part-time study.</p><p>Watch some more tips and advice on making your application to Bedfordshire:</p>',
        ]
        item['apply_documents_en'] = '\n'.join(apply_d)

        yield item

Ejemplo n.º 26

Mostrar archivo

Archivo: TheUniversityofWesternAustralia_p.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Scrape one University of Western Australia course page into an item.

        Text fields are read from ``response`` via XPath. IELTS and TOEFL
        requirements are not scraped: they are looked up from the programme
        name using ordered rule tables, where the first matching rule wins.
        Yields the populated item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'The University of Western Australia'

        # 2. url
        url = response.url

        # 3. programme_en / 26. degree_name: both come from the same <h1>.
        # The degree name is the full heading; the programme name has
        # "Master of" removed.
        heading = response.xpath(
            '//*[@id="page-content"]/div[1]/div[3]/div/div/div[3]/h1'
        ).extract()
        degree_name = remove_tags(''.join(heading))
        programme_en = degree_name
        if 'Master of' in programme_en:
            programme_en = programme_en.replace('Master of', '').strip()

        # 4. overview_en
        overview_en = response.xpath(
            '//*[@id="course-details"]/div/div/div/section/div[1]/div[1]/div[1]/div/div'
        ).extract()
        overview_en = remove_class(''.join(overview_en))

        # 5. modules_en
        modules_en = response.xpath(
            "//h2[contains(text(),'Course structure details')]//following-sibling::*"
        ).extract()
        modules_en = remove_class(''.join(modules_en))

        # 6. teach_time
        # NOTE(review): other spiders store '1'/'2' in teach_time and put
        # values like 'taught' in teach_type; confirm 'coursework' is meant
        # for this field.
        teach_time = 'coursework'

        # 7. location: first sibling after the "Locations" label. A page
        # without that label yields an empty location instead of raising
        # IndexError.
        location_nodes = response.xpath(
            "//*[contains(text(),'Locations')]//following-sibling::*"
        ).extract()
        if location_nodes:
            location = clear_space_str(remove_tags(location_nodes[0]).strip())
        else:
            location = ''

        # 8. start_date
        start_date = response.xpath(
            "//*[contains(text(),'Starting dates')]//following-sibling::*"
        ).extract()
        start_date = remove_tags(''.join(start_date)).strip()
        if 'January' in start_date:
            start_date = '2019-1'
        else:
            start_date = 'Semester1,Semester2'

        # 9. career_en
        career_en = response.xpath(
            '//*[@id="careers-and-further-study"]/div/div/div/section/div[2]/div/div/div/a'
        ).extract()
        career_en = remove_class(''.join(career_en)).strip()

        # 10. tuition_fee: try the lower-case "fee" label first, then fall
        # back to "Fee" when nothing was parsed.
        tuition_fee = response.xpath(
            "//*[contains(text(),'fee')]//following-sibling::div").extract()
        tuition_fee = getTuition_fee(remove_tags(''.join(tuition_fee)))
        if tuition_fee == 0:
            tuition_fee = response.xpath(
                "//*[contains(text(),'Fee')]//following-sibling::div"
            ).extract()
            tuition_fee = getTuition_fee(''.join(tuition_fee))

        # 11. tuition_fee_pre
        tuition_fee_pre = '$'

        # 12. rntry_requirements_en
        # NOTE(review): other spiders using this item class write the key
        # 'rntry_requirements' (no '_en'); confirm this field name exists.
        rntry_requirements_en = response.xpath(
            "//*[contains(text(),'Admission Requirements')]//following-sibling::div"
        ).extract()
        rntry_requirements_en = remove_class(''.join(rntry_requirements_en))

        # 13-17. IELTS as (ielts, ielts_r, ielts_w, ielts_l, ielts_s).
        # Each rule is (substrings, exact names, scores); a rule matches
        # when the programme name contains any substring or equals any
        # exact name. Rules are checked in order and the first match wins.
        ielts_rules = (
            (('MBA', 'Health', 'Educational Leadership',
              'Forensic Odontology'),
             (),
             (7.0, 6.5, 6.5, 6.5, 6.5)),
            (('Dental Medicine', 'Clinical Dentistry', 'Podiatric Medicine',
              'Clinical Neuropsychology', 'Clinical Psychology',
              'Clinical Audiology'),
             ('Medicine', 'Industrial and Organisational Psychology',
              'Pharmacy', 'Social Work'),
             (7.0, 7.0, 7.0, 7.0, 7.0)),
            ((), ('Education',), (7.5, 6.5, 6.5, 6.5, 6.5)),
            ((), ('Teaching',), (7.5, 7.0, 7.0, 8.0, 8.0)),
            (('Law',), (), (7.0, 6.5, 6.5, 6.5, 6.5)),
            (('Juris Doctor',), (), (7.5, 7.0, 7.0, 7.0, 7.0)),
        )
        ielts_scores = (6.5, 6.0, 6.0, 6.0, 6.0)  # default
        for substrings, exact_names, scores in ielts_rules:
            if programme_en in exact_names or any(
                    text in programme_en for text in substrings):
                ielts_scores = scores
                break
        ielts, ielts_r, ielts_w, ielts_l, ielts_s = ielts_scores

        # 18-22. TOEFL as (toefl, toefl_s, toefl_l, toefl_r, toefl_w).
        # Substring rules only, checked in order; first match wins.
        toefl_rules = (
            (('Law',), (100, 28, 26, 26, 26)),
            (('Juris Doctor',), (106, 28, 26, 26, 26)),
            (('MBA',), (100, 20, 20, 20, 20)),
            (('Clinical Neuropsychology', 'Clinical Psychology',
              'Clinical Audiology', 'Industrial and Organisationa'),
             (94, 23, 24, 24, 27)),
        )
        toefl_scores = (82, 20, 20, 18, 22)  # default
        for substrings, scores in toefl_rules:
            if any(text in programme_en for text in substrings):
                toefl_scores = scores
                break
        toefl, toefl_s, toefl_l, toefl_r, toefl_w = toefl_scores

        # 23. apply_proces_en
        apply_proces_en = 'Check your chosen course is open to applications. Ensure you meet the admission requirements for this course as detailed on the previous tab. Ensure you meet our English language competency requirement and any course/major prerequisites. Apply'

        # 24. apply_pre
        apply_pre = '$'

        # 25. apply_fee
        apply_fee = 100

        # 27. degree_type
        degree_type = 2

        # 28. duration: known list-item prefixes map to fixed values;
        # otherwise the first run of digits in the markup is used. When no
        # digits exist the field is left as get_item1 initialised it
        # instead of raising IndexError.
        duration_html = ''.join(response.xpath(
            "//*[contains(text(),'duration')]//following-sibling::*[1]//ul//li"
        ).extract())
        duration_markers = (
            ('<li>1.5', 1.5),
            ('<li>1 to 2', '1/2'),
            ('<li>0.5-1.5', '0.5/1.5'),
            ('<li>2-3', '2/3'),
            ('<li>One', 1),
            ('<li>Two', 2),
        )
        duration = None
        for marker, value in duration_markers:
            if marker in duration_html:
                duration = value
                break
        else:
            digits = re.findall(r'\d+', duration_html)
            if digits:
                duration = digits[0]

        if duration is not None:
            item['duration'] = duration
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['teach_time'] = teach_time
        item['location'] = location
        item['start_date'] = start_date
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['rntry_requirements_en'] = rntry_requirements_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_l'] = toefl_l
        item['toefl_w'] = toefl_w
        item['apply_proces_en'] = apply_proces_en
        item['apply_pre'] = apply_pre
        item['apply_fee'] = apply_fee
        yield item

Ejemplo n.º 27

Mostrar archivo

Archivo: UniversityofRoehampton_p.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Scrape one University of Roehampton postgraduate course page.

        Course fields are pulled out of ``response`` with XPath, the language
        requirements and application text are filled in from hard-coded
        values, and a single populated item is yielded.

        Args:
            response: The downloaded course page. Only ``response.url`` and
                ``response.xpath()`` are used.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the scraped values assigned to it.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(xpath):
            # Concatenate the serialised markup of every node the XPath hits.
            return ''.join(response.xpath(xpath).extract())

        #1.university
        university = 'University of Roehampton'

        #2.url
        url = response.url

        #3.programme_en
        programme_en = remove_tags(
            joined('//*[@id="wrapper"]/div/figure/figcaption/div/h1')
        ).replace('Postgraduate', '')

        #4.degree_type
        degree_type = 2

        #5.degree_name
        degree_name = remove_tags(
            joined("//*[contains(text(),'Degree type')]//following-sibling::*"))

        #6.overview_en
        overview_en = remove_class(
            joined("//*[contains(text(),'Summary')]//following-sibling::*[1]"))

        #7.department
        department = remove_tags(
            joined("//h3[contains(text(),'Department')]//following-sibling::*"))

        #8.duration #9.duration_per #10.teach_time
        duration_list = remove_tags(
            joined("//h3[contains(text(),'Duration')]//following-sibling::*"))
        # The first number in the duration text is kept (as the matched
        # string); when the text holds no number the duration defaults to 1.
        duration_numbers = re.findall(r'\d+', duration_list)
        duration = duration_numbers[0] if duration_numbers else 1
        # NOTE(review): values above 10 get duration_per 4, otherwise 1 -
        # presumably unit codes of the item schema; confirm their meaning.
        duration_per = 4 if int(duration) > 10 else 1
        teach_time = 'full-time' if 'full-time' in duration_list else 'part-time'

        #11.start_date
        start_date = remove_tags(
            joined("//h3[contains(text(),'Programme start')]//following-sibling::*"))
        # Only an October-only intake maps to 2018-10; September (even when
        # October is also mentioned) and anything else map to 2018-9.
        if 'October' in start_date and 'September' not in start_date:
            start_date = '2018-10'
        else:
            start_date = '2018-9'

        #12.tuition_fee
        tuition_fee = getTuition_fee(remove_tags(joined(
            "//h3[contains(text(),'Tuition Fees (per year)')]//following-sibling::*"
        )))

        #13.tuition_fee_pre
        tuition_fee_pre = '£'

        #14.rntry_requirements
        rntry_requirements = remove_class(
            joined('//*[@id="accordion-2"]/div[1]'))

        #15.ielts and its four band scores (hard-coded, not scraped)
        ielts = 6.5
        ielts_l = 5.5
        ielts_w = 5.5
        ielts_r = 5.5
        ielts_s = 5.5

        #20.toefl and its four section scores (hard-coded, not scraped)
        toefl = 89
        toefl_r = 18
        toefl_w = 17
        toefl_l = 17
        toefl_s = 20

        #25.modules_en
        modules_en = remove_class(joined('//*[@id="accordion"]/div[2]'))

        #26.career_en
        career_en = remove_class(joined(
            "//*[contains(text(),'Career options')]/../following-sibling::*"
        )).strip()

        #27.apply_pre
        apply_pre = '£'

        #28.apply_proces_en
        apply_proces_en = '<p>You can apply to us now for any postgraduate degree starting in 2018.Postgraduate programmes You can apply to us now for any postgraduate degree starting in 2018.All postgraduate taught applications can be made via our online application form. Check our application deadlines View our entry requirements for postgraduate programmes View our general entry criteria If you need any help or advice with your application, or just want to ask us a question before you apply, please do not hesitate to contact us. International studentsPlease note that most international applicants have to pay a deposit before securing their place.</p>'

        item['apply_pre'] = apply_pre
        item['apply_proces_en'] = apply_proces_en
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['department'] = department
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['teach_time'] = teach_time
        item['start_date'] = start_date
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['rntry_requirements'] = rntry_requirements
        item['ielts'] = ielts
        item['ielts_l'] = ielts_l
        item['ielts_w'] = ielts_w
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['toefl'] = toefl
        item['toefl_l'] = toefl_l
        item['toefl_s'] = toefl_s
        item['toefl_r'] = toefl_r
        item['toefl_w'] = toefl_w
        item['modules_en'] = modules_en
        item['career_en'] = career_en
        yield item

Ejemplo n.º 28

Mostrar archivo

Archivo: AstonUniversity_P.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        """Scrape one Aston University taught-postgraduate course page.

        Fixed values (university, degree level, location, language scores,
        requirements for Chinese applicants) are assigned directly; the
        remaining fields are extracted from ``response`` with XPath and
        regular expressions.

        Args:
            response: The downloaded course page. Only ``response.url`` and
                ``response.xpath()`` are used.

        Yields:
            The populated item. If any exception is raised during extraction
            nothing is yielded; the error and URL are appended to a text file
            under ``scrapySchool_England/error/`` and printed instead.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "Aston University"
        item['url'] = response.url
        # Teaching mode.
        item['teach_type'] = 'taught'
        # Degree level.
        item['degree_type'] = 2
        item['teach_time'] = 'fulltime'
        item['location'] = "Aston University,Birmingham, B4 7ET"
        print("======================================")
        print(response.url)
        try:
            heading = ''.join(
                response.xpath("//h1[@id='skiplinks']//text()").extract())
            # The first word of the heading (with its trailing whitespace) is
            # taken as the degree name; the rest is the programme name.
            degree_prefix = ''.join(re.findall(r"^\w+\s", heading))
            item['degree_name'] = degree_prefix
            programme = heading.replace(degree_prefix, "").strip()
            # Drop only a leading connector word "in" ("MA in X" -> "X").
            # str.strip("in") would instead strip the characters 'i'/'n' from
            # both ends and mangle names such as "Product Design".
            item['programme_en'] = re.sub(r"^in\s+", "", programme).strip()
            print("item['degree_name']: ", item['degree_name'])
            print("item['programme_en']: ", item['programme_en'])

            overview = response.xpath(
                "//*[contains(text(), 'Course outline')]/../../../../../../div/following-sibling::div[1]|"
                "//*[contains(text(), 'Course Outline')]/../../../../../div/../following-sibling::div[1]//*[contains(text(), 'Modules')]/../preceding-sibling::*|"
                "//*[contains(text(),'Course Outline')]/../../../../../following-sibling::div[1]//*[contains(text(),'Sample module options')]/../preceding-sibling::*|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Sample module options')]/../preceding-sibling::*|"
                "//*[contains(text(), 'Course Outline')]/../../../../../div/../following-sibling::div[1]//*[contains(text(), 'Sample module options')]/../../preceding-sibling::*|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Sample module options')]/..|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Core modules:')]/../preceding-sibling::*|"
                "//strong[contains(text(),'Courses')]/../../following-sibling::div[1]|"
                "//*[contains(text(), 'Programme outline and modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Modules')]/..|"
                "//*[contains(text(),'Course Outline')]/../../../../../following-sibling::div[1]//*[contains(text(),'Sample Module Options')]/../preceding-sibling::*|"
                "//*[contains(text(),'Course Outline & Modules')]/../../../../../following-sibling::div[1]//*[contains(text(),'Modules')]/preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            # Module selectors are tried in order; the first one that returns
            # any node wins (the last result is kept if none match).
            module_xpaths = (
                "//*[contains(text(),'modules:')]/../..",
                "//*[contains(text(),'Modules')]/../../..",
                "//*[contains(text(),'Modules')]/../..",
                "//*[contains(text(),'Modules')]/..",
                "//*[contains(text(),'What you will study')]/../../../../../following-sibling::*",
                "//*[contains(text(), 'Subject guide and modules')]/../../../../../../div/following-sibling::div[1]",
            )
            modules_en = []
            for module_xpath in module_xpaths:
                modules_en = response.xpath(module_xpath).extract()
                if modules_en:
                    break
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))

            career_en = response.xpath(
                "//*[contains(text(),'Your future career prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Your future career opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional development programme')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career Prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional Development Programme')]/../../../../div/following-sibling::*|"
                "//*[contains(text(),'Career prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career Opportunities')]/../../../../../following-sibling::*"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))

            assessment_en = response.xpath(
                "//*[contains(text(),'Learning, teaching & assessment')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, Teaching & Assessment')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, Teaching and Assessment')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, teaching and assessment')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, teaching and assessments')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, teaching & assesment')]/../../../../../following-sibling::*"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            tuition_fee = response.xpath(
                "//*[contains(text(),'Fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'fees')]/../../../../../following-sibling::*//text()"
            ).extract()
            if not tuition_fee:
                tuition_fee = response.xpath(
                    "//strong[contains(text(),'Fees:')]/../following-sibling::*[1]//text()"
                ).extract()
            # NOTE(review): the return value is ignored, so clear_space
            # presumably cleans the list in place - confirm in its definition.
            clear_space(tuition_fee)
            tuition_fee_str = ''.join(tuition_fee)
            tuition_fee_re = re.findall(
                r"International.*?£\d+,\d+|non-EU.*?£\d+,\d+|MSc.*?£\d+,\d+|entry:£\d+,\d+|2018/2019:£\d+,\d+|£\d+,\d+\sfor\sOutside\sEU",
                tuition_fee_str, re.I)
            if tuition_fee_re:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                item['tuition_fee_pre'] = "£"

            rntry_requirements = response.xpath(
                "//*[contains(text(),'Entry requirements & fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry Requirements & Fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Key information for applicants & entry requirements')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry requirements')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry Requirements')]/../../../../../following-sibling::*//text()"
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(rntry_requirements)

            duration = response.xpath(
                "//*[contains(text(),'Duration')]/following-sibling::*//text()|"
                "//*[contains(text(),'Duration')]/..//text()").extract()
            if not duration:
                duration = response.xpath(
                    "//*[contains(text(),'Duration of course')]/../following-sibling::*[1]//text()"
                ).extract()
            clear_space(duration)
            duration_list = getIntDuration(''.join(duration))
            # Only a two-element result is used: first element is the
            # duration, last element is duration_per.
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            # The start date is searched for inside the entry-requirements
            # text nodes (the very same list object extracted above).
            start_date = rntry_requirements
            clear_space(start_date)
            start_date_str = '; '.join(start_date)
            start_date_re = re.findall(r'Start.{1,25}', start_date_str)
            item['start_date'] = getStartDate(''.join(start_date_re))

            allcontent = response.xpath(
                "//div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-sigma']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-rho']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-delta']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-sigma'][2]//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-upsilon']//text()"
            ).extract()
            clear_space(allcontent)

            # Department, attempt 1: the business-school breadcrumb link.
            department_1 = response.xpath(
                "//a[@href='/study/postgraduate/taught-programmes/abs/']//text()"
            ).extract()
            if department_1:
                item['department'] = department_1[0].strip()
            # Department, attempt 2: a known school name anywhere in the
            # accordion text. An unset department is treated as empty whether
            # get_item1 initialised it to "" or to None.
            department_re = re.findall(
                r"Life\s&\sHealth\sSciences\s-\sOSPAP|Aston\sBusiness\sSchool|Engineering\s&\sApplied\sScience|Languages\s&\sSocial\sSciences|Life\s&\sHealth\sSciences",
                ''.join(allcontent))
            if not item['department'] and department_re:
                item['department'] = department_re[0].strip()
            print("item['department']: ", item['department'])

            # Department, attempt 3: look the course title up in per-school
            # lists of lowercase titles. Schools are checked in this order and
            # the first match wins.
            department_titles = (
                ("Aston Business School", [
                    "full time mba",
                    "executive mba - part time",
                    "online mba",
                    "the executive dba",
                    "phd programme",
                    "msc business analytics",
                    "msc business & management",
                    "msc business & management (online)",
                    "msc information systems & business analysis",
                    "msc supply chain management",
                    "msc international business",
                    "msc international accounting & finance",
                    "msc international accounting & finance (online)",
                    "msc strategy and international business",
                    "msc entrepreneurship",
                    "msc accounting & finance",
                    "msc business economics & finance",
                    "msc finance",
                    "msc international accounting & finance",
                    "msc international accounting & finance (online)",
                    "msc investment analysis",
                    "msc strategic marketing management ",
                    "msc human resource management & business",
                    "msc organisational behaviour",
                    "msc work psychology & business",
                    "international pre-masters",
                ]),
                ("Engineering & Applied Science", [
                    "msc professional engineering",
                    "msc computer science",
                    "msc software engineering ",
                    "msc software project management",
                    "msc professional engineering",
                    "msc electrical power engineering and systems ",
                    "msc telecommunications systems",
                    "msc wireless communications and networking",
                    "msc smart telecom and sensing networks (smartnet)",
                    "msc photonic integrated circuits, sensors and networks (pixnet)",
                    "msc professional engineering",
                    "msc engineering management",
                    "msc supply chain management",
                    "msc engineering leadership & management",
                    "msc supply chain leadership and management",
                    "msc professional engineering",
                    "msc mechanical engineering ",
                    "msc product design ",
                    "msc professional engineering",
                ]),
                ("Languages & Social Sciences", [
                    "ma in forensic linguistics",
                    "ma in the european union & international relations",
                    "joint ma in multilevel governance & international relations",
                    "double ma in europe & the world",
                    "double ma in governance and international politics",
                    "ma in international relations and global governance",
                    "ma in sociology and social research",
                    "ma in policy and social research",
                    "ma in teaching english to speakers of other languages (tesol)",
                    "ma in tesol and translation studies",
                    "ma in tesol and translation studies",
                    "ma in translation in a european context",
                ]),
                ("Life & Health Sciences", [
                    "advanced hearing therapy practice - msc",
                    "clinical science (neurosensory sciences) - msc",
                    "doctor of hearing therapy - professional doctorate",
                    "biomedical science - msc",
                    "biomedical sciences top modules - all standalone modules",
                    "stem cells and regenerative medicine - msc",
                    "clinical neurophysiology practice - msc",
                    "clinical science (neurosensory sciences) - msc",
                    "neurophysiology - pgcert",
                    "clinical science (neurosensory sciences) - msc",
                    "doctor of optometry / doctor of ophthalmic science - professional doctorate",
                    "graduate diploma in optometry - graduate diploma",
                    "independent prescribing for optometrists - professional accreditation",
                    "optometry / ophthalmic science - msc",
                    "overseas pharmacists course (ospap) - full time pgdip / msc",
                    "pharmacist independent prescribing - pgcert",
                    "pharmacy (includes: msc pharmaceutical sciences, msc drug delivery, and msc pharmacokinetics) – msc",
                    "psychiatric pharmacy by distance learning and practice - pgdip",
                    "psychiatric pharmacy practice - msc",
                    "psychiatric therapeutics by distance learning - pgcert",
                    "cognitive neuroscience - msc",
                    "health psychology (online) - msc",
                    "health psychology (on campus) - msc",
                ]),
            )
            if not item['department']:
                # The lists hold full lowercase titles (degree included), so
                # compare case-insensitively against both the whole heading
                # and the bare programme name, ignoring stray whitespace.
                heading_key = ' '.join(heading.split()).lower()
                programme_key = ' '.join(item['programme_en'].split()).lower()
                for department_name, titles in department_titles:
                    known_titles = {title.strip() for title in titles}
                    if heading_key in known_titles or programme_key in known_titles:
                        item['department'] = department_name
                        break
            print("item['department']1: ", item['department'])

            # Keyword overrides: these replace whatever was found above.
            programme_lower = item['programme_en'].lower()
            is_engineering = ('electrical' in programme_lower
                              or 'engineering' in programme_lower)
            if 'business' in programme_lower:
                item['department'] = "Aston Business School"
            if is_engineering:
                item['department'] = "Engineering & Applied Science"
            is_ospap = item['department'] == "Life & Health Sciences - OSPAP"

            # IELTS: overall score plus one band value shared by all four
            # components, chosen by school/programme.
            if is_ospap:
                ielts_overall, ielts_band = 7, 7
            elif is_engineering:
                ielts_overall, ielts_band = 6, 5.5
            else:
                ielts_overall, ielts_band = 6.5, 6
            item['ielts'] = ielts_overall
            item['ielts_l'] = ielts_band
            item['ielts_s'] = ielts_band
            item['ielts_r'] = ielts_band
            item['ielts_w'] = ielts_band
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            # TOEFL: OSPAP has its own thresholds, everything else shares one.
            if is_ospap:
                item['toefl'] = 101
                item['toefl_l'] = 26
                item['toefl_r'] = 26
                item['toefl_s'] = 23
                item['toefl_w'] = 28
            else:
                item['toefl'] = 93
                item['toefl_l'] = 19
                item['toefl_r'] = 18
                item['toefl_s'] = 19
                item['toefl_w'] = 23
            print(
                "item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s "
                % (item['toefl'], item['toefl_l'], item['toefl_s'],
                   item['toefl_r'], item['toefl_w']))
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Postgraduate - Taught<span style="line-height: 1.4em; font-size: 16px"> </span></h3> <p>You should have a Bachelors degree from an Chinese university but the specific percentage requirement will vary depending on the course you are applying for at Aston and the Chinese university which you have graduated from. In general applicants should be scoring in the range of 75-85% average as a minimum.  <br />     <br />If you are applying for finance, engineering or science based subjects, you must have studied a similar field in your undergraduate degree.<span style="line-height: 1.4em"> </span></p> <p>There are a number of conversion courses in the Business School which will accept students from any subject background.  <span style="line-height: 1.4em"> </span></p>"""
                ]))
            yield item
        except Exception as e:
            # Best-effort boundary: a failing page is recorded (appended to a
            # per-university error file and printed) and the item is dropped.
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 29

Mostrar archivo

    def parse(self, response):
        pass
        item = get_item1(ScrapyschoolEnglandItem1)
        #1.university
        university = 'Queen Mary University of London'


        #2.location
        location = 'London'


        #3.department
        department = response.xpath('//*[@id="count"]/article/div/aside/p[3]/a[1]').extract()
        department = ''.join(department)
        department = remove_tags(department)
        # print(department)

        #4.programme_en
        programme_en = response.xpath('//*[@id="count"]/article/header/h1').extract()
        programme_en = ''.join(programme_en)
        programme_en = remove_class(programme_en)
        programme_en = remove_tags(programme_en)
        # print(programme_en)

        #5.degree_type
        degree_type = 2

        #6.degree_name  7.duration  8.duration_per
        try:
            degree_name = response.xpath('//*[@id="count"]/article/header/h2').extract()
            degree_name = ''.join(degree_name)
            degree_name = remove_tags(degree_name)
            #print(degree_name)
            duration = re.findall('\(.*\)',degree_name)
            duration = ''.join(duration)
            duration = duration.replace('(','')
            duration = duration.replace(')','')
            if 'months' in duration:
                duration = re.findall('\d',duration)[0]
                duration_per = 3
            else:
                duration = re.findall('\d',duration)[0]
                duration_per = 1
            # print('duration:',duration)
            # print('duration_per:',duration_per)
            if duration in degree_name:
                degree_name = degree_name.replace(duration,'')
                degree_name = degree_name.replace('(','')
                degree_name = degree_name.replace(')', '')
                degree_name = degree_name.split()[0]
            else:
                degree_name = 'N/A'
            # print(degree_name)
        except:
            degree_name = 'N/A'
            duration = None
            duration_per = 1

        #9.overview_en
        overview_en = response.xpath('//*[@id="first"]').extract()
        overview_en = ''.join(overview_en)
        overview_en = remove_class(overview_en)
        start = overview_en.find('Overview')
        end = overview_en.find('Why study')
        overview_en= overview_en[start: end]
        overview_en = clear_space_str(overview_en)
        # print(overview_en)

        #10.teach_time
        teach_time = 'full time'

        #11.modules_en
        try:
            modules_en = response.xpath('//*[@id="second"]').extract()
            modules_en = ''.join(modules_en)
            modules_en = remove_class(modules_en)
            if 'For more information contact' in modules_en:
                start = modules_en.find('Structure')
                end = modules_en.find('For more information contact')
                modules_en = modules_en[start:end]
                modules_en = clear_space_str(modules_en)
            else:
                modules_en = modules_en
            # print(modules_en)
        except:
            modules_en = 'N/A'

        #12.assessment_en
        try:
            assessment_en = response.xpath('//*[@id="fourth"]').extract()
            assessment_en = ''.join(assessment_en)
            assessment_en = remove_class(assessment_en)
            assessment_en = clear_space_str(assessment_en)
            # print(assessment_en)
        except:
            assessment_en = 'N/A'

        #13.career_en
        try:
            career_en = response.xpath('//*[@id="sixth"]').extract()
            career_en = ''.join(career_en)
            career_en = remove_class(career_en)
            career_en = clear_space_str(career_en)
            # print(career_en)
        except:
            career_en = 'N/A'

        #14.tuition_fee
        try:
            tuition_fee1 = response.xpath('//*[@id="fifth"]/p[2]/text()').extract()
            tuition_fee1 = ''.join(tuition_fee1)
            tuition_fee1 = remove_tags(tuition_fee1)
            tuition_fee1 = re.findall('\d{1,3},\d{3}', tuition_fee1)
            if tuition_fee1 == []:
                tuition_fee = response.xpath('//*[@id="fifth"]/p[1]/text()').extract()
                tuition_fee = ''.join(tuition_fee)
                tuition_fee = remove_tags(tuition_fee)
                tuition_fee = re.findall('\d{1,3},\d{3}', tuition_fee)[0]
                # print(tuition_fee)
            else:
                tuition_fee = tuition_fee1[0]
            tuition_fee = tuition_fee.replace(',','')
            # print(tuition_fee)
        except:
            tuition_fee = 0

        #15.tuition_fee_pre
        tuition_fee_pre = '£'

        #16.entry_requirements
        try:
            entry_requirements_list = response.xpath('//*[@id="third"]').extract()
            entry_requirements_list = ''.join(entry_requirements_list)
            entry_requirements_list = remove_class(entry_requirements_list)
            # entry_requirements_list = remove_tags(entry_requirements_list)
            if 'International applicants' in entry_requirements_list:
                start = entry_requirements_list.find('Entry requirements')
                mid = entry_requirements_list.find('International applicants')
                end = entry_requirements_list.find('For more information')
                entry_requirements = entry_requirements_list[start:mid]
                other = entry_requirements_list[mid:end]
            else:
                entry_requirements = entry_requirements_list
                other = 'N/A'
            if 'mso-fareast-language:EN-US' in entry_requirements:
                start = entry_requirements.findall('Entry requirements')
                end = 'Normal'
                entry_requirements = entry_requirements[start:end]
            else:
                entry_requirements = entry_requirements
            entry_requirements = clear_space_str(entry_requirements)
            other = clear_space_str(other)
            # print(entry_requirements)
            #print(other)
        except:
            entry_requirements = 'N/A'
            other = 'N/A'

        #17.雅思
        if department == 'School of Business and Management':
            ielts=7
            ielts_l=5.5
            ielts_s=5.5
            ielts_r=5.5
            ielts_w=6
            toefl=100
            toefl_l=17
            toefl_s=20
            toefl_r=18
            toefl_w=21
        elif department =='School of English and Drama':
            ielts = 7
            ielts_l = 7
            ielts_s = 7
            ielts_r = 7
            ielts_w = 7
            toefl = 100
            toefl_l = 22
            toefl_s = 25
            toefl_r = 24
            toefl_w = 27
        elif department =='School of Geography':
            ielts = 7
            ielts_l = 5.5
            ielts_s = 5.5
            ielts_r = 5.5
            ielts_w = 6.5
            toefl = 100
            toefl_l = 17
            toefl_s = 20
            toefl_r = 18
            toefl_w = 24
        elif department=='School of History':
            ielts = 7
            ielts_l = 5.5
            ielts_s = 5.5
            ielts_r = 5.5
            ielts_w = 6.5
            toefl = 100
            toefl_l = 17
            toefl_s = 20
            toefl_r = 18
            toefl_w = 24
        elif department =='School of Languages, Linguistics and Film':
            ielts = 7
            ielts_l = 5.5
            ielts_s = 5.5
            ielts_r = 5.5
            ielts_w = 7
            toefl = 100
            toefl_l = 17
            toefl_s = 20
            toefl_r = 18
            toefl_w = 27
        elif department=='School of Law':
            ielts = 7
            ielts_l = 5.5
            ielts_s = 5.5
            ielts_r = 5.5
            ielts_w = 7
            toefl = 100
            toefl_l = 17
            toefl_s = 20
            toefl_r = 18
            toefl_w = 24
        elif department =='School of Politics and International Relations':
            ielts = 7
            ielts_l = 5.5
            ielts_s = 5.5
            ielts_r = 5.5
            ielts_w = 6.5
            toefl = 100
            toefl_l = 17
            toefl_s = 20
            toefl_r = 18
            toefl_w = 24
        else:
            ielts = 6.5
            ielts_l = 5.5
            ielts_s = 5.5
            ielts_r = 5.5
            ielts_w = 6
            toefl = 92
            toefl_l = 17
            toefl_s = 20
            toefl_r = 18
            toefl_w = 21
        # print(ielts,ielts_l,ielts_r,ielts_s,ielts_w)

        url = response.url
        apply_documents_en = 'You must provide the following supporting documentation: Completed application form  Degree transcripts. Please provide a transcript of your degree(s). If you have not yet completed your degree please provide a transcript of your results achieved to date  If your degree was from a UK university, please upload a transcript of your marks for each year If your degree was from an overseas institution, you should supply a transcript of your marks for each year of your studies and a copy of your degree certificate together with a certified translation if the document is not in English. Please note that original documentation will be required before you enrol. International applicants are also advised to include high school transcripts Please provide the contact details of two referees on your application, at least one reference must be from an academic referee who is in a position to comment on the standard of your academic work and suitability for postgraduate level study. Where appropriate, a second referee can provide comment on your professional experience. Your academic referee(s) may already have provided you with a reference that you can use to support any application for study or research that you make. We call these ‘open’ references. Open references will normally only be accepted if they are written on headed paper, provided as a colour copy of the original, and provide the referee’s work contact details. If you have open references, please upload these at the time of application If you do not have open reference, we will contact your referee(s) via email to supply a reference, preferably electronically. Please note, we can only accept references provided by email if it is sent from a university or company email address. References from a personal email address such as Yahoo or Hotmail are not acceptable. Your referee(s) can also supply a paper reference in response to the reference request email your referee will receive. 
Paper reference forms should be endorsed by an appropriate institution/company stamp or on official institution/company letterhead, and should be provided as a scanned colour copy of the original. Curriculum Vitae (CV)/ Resume This list of documents may vary slightly from course to course, so you will need to check the guidance notes and academic school website for the programme that you are applying for.  Although not mandatory, you are encouraged to send in the following documents in support of your application:  Statement of purpose  Your statement of purpose should explain why you want to study your chosen programme and how it will help your life and career. This should typically be one side of A4 paper. IELTS/TOEFL certificate (if applicable) International applicants should provide evidence of English language ability: IELTS, TOEFL, or other acceptable proof. Please see the international students section for more details.'
        require_chinese_en = 'Taught degrees (MSc/MA: one year) For entry onto our masters level courses students should normally have achieved: Four-year bachelors degree from 211 or 985 University with 75%+ average Four-year bachelors degree from non-211 University within top 300 with average 80%+ The usual entrance requirement to a taught masters degree is a four-year bachelors degree from a 211 University. However, all applications are considered on an individual basis and students may be admitted to masters programmes with a lower level degree if they have work experience relevant to the degree applied for. Students with a three-year diploma (dazhuan) from a recognised institution may apply for the Pre-Masters Graduate Diploma, a year-long course which will gain them access to a masters programme.Research degree (MPhil/PhD: three years) For entry onto our research degree courses students should normally have a masters degree from a recognised university.'
        apply_fee = 0
        apply_pre = '£'
        item['apply_fee']  = apply_fee
        item['apply_pre']  = apply_pre
        item['require_chinese_en'] = require_chinese_en
        item['apply_documents_en'] = apply_documents_en
        item['university'] = university
        item['location'] = location
        item['department'] = department
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        item['teach_time'] = teach_time
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['rntry_requirements'] = entry_requirements
        item['ielts'] = ielts
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['toefl'] = toefl
        item['toefl_l'] = toefl_l
        item['toefl_s'] = toefl_s
        item['toefl_r'] = toefl_r
        item['toefl_w'] = toefl_w
        item['url'] = url
        yield item

Ejemplo n.º 30

Mostrar archivo

Archivo: JamesCookUniversity_p.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Build one item from a James Cook University course page.

        Each field is read from ``response`` with an XPath query.  The
        free-text start dates, the English-language band and the application
        deadlines are then mapped onto fixed values before the item is
        yielded.

        Args:
            response: Page response; must expose ``url`` and ``xpath()``.

        Yields:
            The object returned by ``get_item1(ScrapyschoolEnglandItem1)``
            with the scraped fields assigned.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(query):
            # Serialized form of every node the XPath selects, concatenated.
            return ''.join(response.xpath(query).extract())

        # NOTE(review): remove_tags / remove_class are project helpers that
        # are not visible here; presumably they strip markup and class
        # attributes respectively -- confirm against their definitions.

        page_url = response.url

        # Course title, taken from the page's <h1>.
        title = remove_tags(
            joined('//*[@id="main"]/div/div[2]/div[2]/div[1]/div/h1'))

        # Introductory paragraph(s) of the degree.
        degree_summary = remove_class(joined(
            '//*[@id="main"]/div/div[2]/div[3]/div/div/div[2]/div[2]/div[1]/p'
        ))

        # Campus name(s).
        campus = remove_tags(joined(
            "//*[contains(text(),'Campus')]//following-sibling::div"
        )).strip()

        # Intake months: the first matching marker wins; text that matches no
        # marker is stored unchanged.
        intake = remove_tags(joined(
            "//*[contains(text(),'Start date')]//following-sibling::*"
        )).strip()
        for marker, months in (
                ('February, July', '2,7'),
                ('February', '2'),
                ('March,  July,  November', '3,7,11'),
        ):
            if marker in intake:
                intake = months
                break

        # Course length, kept as the page's own wording.
        length = remove_tags(joined(
            "//*[contains(text(),'Duration')]//following-sibling::*"
        )).strip()

        # Accordion sections of the page.
        careers = remove_class(joined('//*[@id="accordion__career"]/p'))
        subjects = remove_class(joined('//*[@id="accordion__subjects"]/div'))
        how_to_apply = remove_class(
            joined('//*[@id="accordion__internationalapply"]'))
        highlights = remove_class(
            joined('//*[@id="accordion__highlights"]/*'))

        # English-language requirement: look two levels above the "Minimum"
        # label first, then one level above if that yields nothing.
        minimum_nodes = response.xpath(
            "//*[contains(text(),'Minimum')]/../../following-sibling::*"
        ).extract()
        if not minimum_nodes:
            minimum_nodes = response.xpath(
                "//*[contains(text(),'Minimum')]/../following-sibling::*"
            ).extract()
        requirement_text = remove_tags(''.join(minimum_nodes))

        # Band marker -> (IELTS, TOEFL) scores, each ordered as
        # (overall, reading, writing, listening, speaking).  Checked in this
        # order; the first marker present in the text is used.
        band_scores = (
            ('Band 2', (6.5, 6.0, 6.0, 6.0, 6.0), (90, 21, 21, 21, 21)),
            ('Band 3a', (7.0, 6.5, 6.5, 6.5, 6.5), (100, 23, 23, 23, 23)),
            ('Band 1', (6.0, 6.0, 6.0, 6.0, 6.0), (79, 19, 19, 19, 19)),
            ('Band 3c', (7.0, 7.0, 7.0, 7.0, 7.0), (100, 23, 23, 23, 23)),
        )
        # Used when none of the markers above appears.
        fallback_scores = (
            (7.5, 7.0, 7.0, 8.0, 8.0), (100, 23, 23, 23, 23))
        ielts_scores, toefl_scores = next(
            ((ielts_row, toefl_row)
             for marker, ielts_row, toefl_row in band_scores
             if marker in requirement_text),
            fallback_scores,
        )
        ielts, ielts_r, ielts_w, ielts_l, ielts_s = ielts_scores
        toefl, toefl_r, toefl_w, toefl_l, toefl_s = toefl_scores

        # Application deadline(s), mapped onto fixed calendar dates.
        deadline_text = remove_tags(
            joined('//*[@id="accordion__internationaldeadlines"]')).strip()
        both_rounds = '2019-1-31,2019-6-30'
        if '31 January' in deadline_text and '30 June' in deadline_text:
            deadline = both_rounds
        else:
            # First matching single date wins; otherwise fall back to both.
            deadline = next(
                (date for marker, date in (
                    ('31 October', '2018-10-31'),
                    ('31 January', '2019-1-31'),
                    ('15 December', '2018-12-15'),
                    ('31 December', '2018-12-31'),
                ) if marker in deadline_text),
                both_rounds,
            )

        # Programme name: the major when the title names one, otherwise the
        # title without its "Master of" prefix.
        programme = (
            re.findall(r'majoring\sin\s(.*)', title)[0]
            if 'majoring in' in title
            else title.replace('Master of', '').strip()
        )

        # Entry requirements for international applicants.
        requirements = remove_class(
            joined('//*[@id="accordion__internationalrequirements"]'))

        # Tuition fee, parsed from the costs section by the project helper.
        fee = getTuition_fee(joined('//*[@id="accordion__internationalcosts"]'))

        # Assign in a fixed order so the item is filled the same way each run.
        fields = (
            ('university', 'James Cook University'),
            ('url', page_url),
            ('degree_name', title),
            ('degree_overview_en', degree_summary),
            ('location', campus),
            ('start_date', intake),
            ('duration', length),
            ('career_en', careers),
            ('modules_en', subjects),
            ('apply_proces_en', how_to_apply),
            ('overview_en', highlights),
            ('ielts', ielts),
            ('ielts_r', ielts_r),
            ('ielts_w', ielts_w),
            ('ielts_s', ielts_s),
            ('ielts_l', ielts_l),
            ('toefl', toefl),
            ('toefl_w', toefl_w),
            ('toefl_r', toefl_r),
            ('toefl_s', toefl_s),
            ('toefl_l', toefl_l),
            ('deadline', deadline),
            ('programme_en', programme),
            ('apply_pre', '$'),
            ('tuition_fee_pre', '$'),
            ('rntry_requirements_en', requirements),
            ('tuition_fee', fee),
        )
        for key, value in fields:
            item[key] = value
        yield item