Ejemplos de get_item en Python

Lenguaje de programación: Python

Namespace/Package Name: scrapySchool_Canada_Ben.getItem

Método / Función: get_item

Ejemplos en hotexamples.com: 30

Python get_item - 30 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de scrapySchool_Canada_Ben.getItem.get_item extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

    def parse(self, response):
        """Start building a program item for the page in ``response``.

        In the lines shown, the item is created and the school name and page
        URL are captured in locals; they are not stored on the item here.
        """
        item = get_item(ScrapyschoolCanadaBenItem)

        # 1. school_name: the same fixed value for every page this callback sees
        school_name = 'Newfoundland Memorial University'
        # print(school_name)

        # 2. url: address of the response being parsed
        url = response.url

Ejemplo n.º 2

Mostrar archivo

    def parse(self, response):
        """Extract the major name and course listing from a Brock University page.

        Yields the populated item on success. Any exception raised while
        extracting is appended to a per-school error file and printed; in that
        case nothing is yielded for the page.

        Args:
            response: the downloaded page; ``response.url`` and
                ``response.xpath`` are used.

        Yields:
            The item with ``school_name``, ``url``, ``major_name_en`` and,
            when matching nodes exist, ``modules_en`` filled in.
        """
        item = get_item(ScrapyschoolCanadaBenItem)
        item['school_name'] = "Brock University"
        item['url'] = response.url
        print("===========================")
        print(response.url)

        try:
            major_name_en = response.xpath("//a[@name='sec1']/text()").extract()
            # The return value is ignored, so clear_space presumably cleans the
            # list in place -- TODO confirm against the helper.
            clear_space(major_name_en)
            item['major_name_en'] = ''.join(major_name_en).strip()
            print("item['major_name_en']: ", item['major_name_en'])

            # Three alternative selectors joined with the XPath union operator.
            # The last two literals used to be adjacent with no "|" between
            # them, which silently turned the third alternative into a
            # descendant step of the second instead of a separate selector.
            modules_en = response.xpath(
                "//a[contains(text(), 'Pass Program')]/../../preceding-sibling::*[position()<3]"
                "|//p[contains(text(),'HEALTH SCIENCES COURSES')]/following-sibling::*"
                "|//a[contains(text(), 'Bachelor of Science in Kinesiology Program')]/../../../following-sibling::*[1]"
            ).extract()
            print("modules_en:", modules_en)
            if modules_en:
                # Drop the navigation bar markup that the selectors can pick up
                # along with the course listing.
                item['modules_en'] = remove_class(clear_lianxu_space(modules_en)).replace("<div><div><ul><li>Home</li><li>Courses</li><li>Search</li><li>Contents</li><li>Previous Page</li><li>Next Page</li><li>Login</li><li>Printable<br>Version</li></ul></div></div>", "").strip()
            print("item['modules_en']: ", item['modules_en'])
            yield item

        except Exception as e:
            # Best-effort scraping: record the failure and move on to the next
            # page instead of aborting the crawl.
            with open("scrapySchool_Canada_Ben/error/" + item['school_name'] + ".txt",
                      'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 3

Mostrar archivo

Archivo: BrockUniversity_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        """Fill in a Brock University program item from a program detail page.

        School-wide values (location, codes, fees, dates and admission texts)
        are hard-coded. The department, major name, overview, career text and
        degree labels are read from ``response``. The item is yielded once per
        degree label found; failures during extraction are appended to a
        per-school error file and echoed to stdout.
        """
        item = get_item(ScrapyschoolCanadaBenItem)
        page_url = response.url
        item['school_name'] = "Brock University"
        item['url'] = page_url
        print("===========================")
        print(page_url)
        item['other'] = ('问题描述：1.没有课程设置和课程长度\n'
                         '        2.专业描述和就业为空的是详情页没有的')

        # --- Fields shared by every program of this school ---
        item['location'] = 'St. Catharines, Ontario, Canada'
        item['sat_code'] = '0895'
        item['toefl_code'] = '0895'

        # Source noted by the original author:
        # https://www.stu.ca/future-students/how-to-apply/
        item['apply_pre'] = 'CAD$'
        item['apply_fee'] = '150'
        item['start_date'] = '1月,5月,9月'
        item['deadline'] = '2018-10-01,2019-02-01,2019-05-01'

        # https://brocku.ca/safa/undergraduate-tuition-and-fees-2018-academic-year/#2017-ug-ancillary-fees7420-ef5cf380-e567
        item['tuition_fee_pre'] = 'CAD$'
        item['tuition_fee'] = '21154.24'

        # https://brocku.ca/admissions/international/international-secondary-school-student/
        item['ap'] = (
            "Advanced placement (AP)\n"
            "Advanced placement courses may be used to determine admissibility and also granting of transfer credits or exemption. If you have completed advanced placement courses as part of an appropriate secondary school credential, and submit an examination grade of 4 on individual results, you may be eligible to receive university credit to a maximum of 2.0 Brock credits. An official AP transcript is required for the evaluation process. "
        )
        item['ib'] = (
            "International Baccalaureate (IB)\n"
            "Applicants who have successfully completed the IB diploma with the appropriate prerequisite subjects will be considered for admission and may be awarded a maximum of 3.0 transfer credits for HL examinations completed at a minimum grade of 5. A scholarship worth $1,000 will also be granted. Applicants who successfully complete an IB certificate program with a minimum of six subjects, including prerequisites, may also be considered for admission and transfer credit."
        )
        item['entry_requirements_en'] = "\n".join([
            "<h2>General admission requirements:</h2>",
            "<ul>",
            "<li>Senior Secondary school credential appropriate for entry to university in your home country;</li>",
            "<li>Academically rigorous grade 12 year</li>",
            "<li>Minimum B- average (higher for some programs);</li>",
            "<li>English Language proficiency requirements must be satisfied</li>",
            "</ul>",
        ])

        # https://brocku.ca/admissions/international/requirements-by-country/
        item['require_chinese_en'] = '<p>(Grade 12) Senior High School Graduation Certificate (3 Years) plus final transcript showing Grade 12 first and second term grades</p>'
        item['alevel'] = "Five GCE/GCSE/IGCSE subjects with at least two at A-level (GCSE grades at C or above). One GCSE/IGCSE/O-level subject (graded C or above) and four AS-level subjects will be considered provided the AS-levels do not duplicate subject matter at the GCSE/IGCSE or O level. VCE A-level, VCE A-level Double Award and BTEC Certificate/Diploma qualifications."

        try:
            # --- Fields read from the page ---
            faculty_labels = response.xpath(
                "//a[contains(@class, 'btn faculty')]//text()").extract()
            item['department'] = ', '.join(faculty_labels).strip()
            print("item['department']: ", item['department'])

            title_parts = response.xpath(
                "//h1[@class='entry-title']/text()").extract()
            clear_space(title_parts)
            item['major_name_en'] = ''.join(title_parts).strip()
            print("item['major_name_en']: ", item['major_name_en'])

            overview_nodes = response.xpath(
                "//div[@class='entry-content']").extract()
            overview_html = remove_class(clear_lianxu_space(overview_nodes))
            item['overview_en'] = overview_html.replace("<p></p>", "").strip()
            # An empty overview is stored as None and flagged on stdout.
            if item['overview_en'] == "":
                item['overview_en'] = None
                print("***overview_en 为空")
            print("item['overview_en']: ", item['overview_en'])

            career_nodes = response.xpath(
                "//h2[contains(text(),'Career outcomes')]/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_nodes))
            if item['career_en'] == "":
                item['career_en'] = None
            print("item['career_en']: ", item['career_en'])

            # Language requirements depend on the major.
            # https://brocku.ca/admissions/english-proficiency/
            stricter_toefl = {
                'toefl_desc': 'overall 100, minimum 27 on writing, 27 on speaking',
                'toefl': '100',
                'toefl_s': '27',
                'toefl_w': '27',
            }
            major_title = item['major_name_en']
            if major_title == "Accounting":
                language_fields = {
                    'ielts_desc': 'overall 7.0 with no band below 6.5',
                    'ielts': '7.0',
                    'ielts_l': '6.5',
                    'ielts_s': '6.5',
                    'ielts_r': '6.5',
                    'ielts_w': '6.5',
                }
                language_fields.update(stricter_toefl)
            elif 'Teacher education' in major_title:
                language_fields = {
                    'ielts_desc': 'overall 7.0',
                    'ielts': '7.0',
                }
                language_fields.update(stricter_toefl)
            else:
                language_fields = {
                    'ielts_desc': 'overall 6.5, no band below 6.0',
                    'ielts': '6.5',
                    'ielts_l': '6.0',
                    'ielts_s': '6.0',
                    'ielts_r': '6.0',
                    'ielts_w': '6.0',
                    'toefl_desc': 'overall 88, with minimum 21 on speaking and 21 on writing',
                    'toefl': '88',
                    'toefl_s': '21',
                    'toefl_w': '21',
                }
            for field_name, field_value in language_fields.items():
                item[field_name] = field_value

            degree_labels = response.xpath(
                "//a[contains(@class, 'btn bachelor-of')]//text()").extract()
            if not degree_labels:
                # No degree label on the page: only yield when a degree name
                # is already present on the item.
                if item['degree_name'] is not None:
                    yield item
            else:
                # The item is yielded once for each degree label.
                for label in degree_labels:
                    item['degree_name'] = label.strip()
                    print("item['degree_name']: ", item['degree_name'])
                    if item['degree_name'] is not None:
                        yield item

        except Exception as e:
            # Record the failure in the per-school error file, then report it.
            error_path = ("scrapySchool_Canada_Ben/error/" +
                          item['school_name'] + ".txt")
            error_entry = (str(e) + "\n" + response.url +
                           "\n========================\n")
            with open(error_path, 'a', encoding="utf-8") as f:
                f.write(error_entry)
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 4

Mostrar archivo

Archivo: University_of_Northern_British_Columbia_U.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Build one program item for a University of Northern British Columbia page.

        Most fields are fixed, school-wide values. Only the degree name, the
        degree overview, the major name and the course table are read from
        ``response``; each of those falls back to ``None`` when extraction
        fails.

        Args:
            response: the downloaded page; ``response.xpath`` and
                ``response.url`` are used.

        Yields:
            The populated item.
        """
        item = getItem.get_item(ScrapyschoolCanadaBenItem)

        # Matches an HTML attribute such as ` class="x"` so it can be removed
        # from a serialized fragment. Raw string: the pattern contains `\-`.
        attr_pattern = r""" [a-zA-Z\-]*=['"].+?['"]"""

        # ------------------------------------------------------------------
        # Fields read from the page. Extraction is best-effort: any failure
        # (missing node, unexpected markup) leaves the field as None.
        # ------------------------------------------------------------------

        # Degree name: the text inside the parentheses of the page title.
        try:
            degree_name = response.xpath('//*[@id="page-title"]').extract()[0]
            degree_name = remove_tags(degree_name)
            degree_name = re.findall(r'\((.*)\)', degree_name)[0]
            degree_name = degree_name.replace(' Program', '')
        except Exception:
            degree_name = None

        # Degree description: text nodes following the "Major in" span.
        try:
            degree_overview_en = response.xpath('//span[contains(text(),"Major in")]/following-sibling::text()').extract()
            degree_overview_en = ''.join(degree_overview_en)
            degree_overview_en = re.sub(attr_pattern, '', degree_overview_en)
        except Exception:
            degree_overview_en = None
        print(degree_overview_en)

        # Major name: the page title with line breaks, tabs and indentation
        # runs removed before the tags are stripped.
        try:
            major_name_en = response.xpath('//*[@id="page-title"]').extract()[0]
            major_name_en = major_name_en.replace('\r\n','').replace('\n','').replace('           ','').replace('\t','').replace('     ','')
            major_name_en = remove_tags(major_name_en)
        except Exception:
            major_name_en = None

        # Major overview: same text as the degree description.
        overview_en = degree_overview_en

        # Course structure: first table body in the content area, with
        # attributes and layout whitespace removed.
        try:
            modules_en = response.xpath('//*[@id="content"]//table/tbody').extract()[0]
            modules_en = re.sub(attr_pattern, '', modules_en)
            modules_en = modules_en.replace('\r\n','').replace('\n','').replace('\t','').replace('                                                                                                                         ','').replace('                                                                       ','')
        except Exception:
            modules_en = None

        item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        item["major_name_en"] = major_name_en
        item["overview_en"] = overview_en
        item["modules_en"] = modules_en
        item["url"] = response.url

        # ------------------------------------------------------------------
        # Fixed, school-wide values.
        # ------------------------------------------------------------------

        # School, location, campus, department.
        item["school_name"] = 'University of Northern British Columbia'
        item["location"] = remove_tags('Prince George, British Columbia, Prince George, YXS')
        # No selector was ever supplied for campus (the lookup used an empty
        # XPath, so it always ended in the None fallback); no department
        # either.
        item["campus"] = None
        item["department"] = None

        # Intake dates, course length unit, application deadlines.
        item["start_date"] = '2020-01,2019-09'
        item["duration_per"] = 1
        item["deadline"] = '2019-06-01,2019-03-01'

        # Career prospects.
        item["career_en"] = '<div><h3>UNBC Alumni Services</h3><p>Through the Student Career Centre,&nbsp;UNBC&nbsp;Alumni can attend career building workshops and view job postings.&nbsp;</p><p>Alumni with jobs to fill can also list postings for their own companies.&nbsp;</p><p>These services are offered in partnership with the&nbsp;UNBC&nbsp;Alumni Association.</p></div>'

        # Tuition and application fees.
        item["tuition_fee"] = '620.39'
        item["tuition_fee_per"] = 5
        item["tuition_fee_pre"] = 'CAD$'
        item["apply_fee"] = '125'
        item["apply_pre"] = 'CAD$'

        # Entry requirements (general, for Chinese students, programme-specific).
        item["entry_requirements_en"] = '<p>Graduation Certificate - Academic Senior Middle School,High School Transcript</p>'
        item["require_chinese_en"] = '<p>Graduation Certificate - Academic Senior Middle School,High School Transcript</p>'
        item["specific_requirement_en"] = None
        item["average_score"] = '70'

        # Gaokao / Huikao fields, interview and portfolio descriptions: the
        # original looked these up with an empty XPath that could only end in
        # the None fallback, so they are set to None directly. (A gaokao_desc
        # value was computed the same way but never stored on the item; it is
        # still not stored.)
        item["gaokao_zs"] = None
        item["gaokao_score_wk"] = None
        item["gaokao_score_lk"] = None
        item["huikao_desc"] = None
        item["huikao_zs"] = None
        item["interview_desc_en"] = None
        item["portfolio_desc_en"] = None

        # Minimum language requirement and IELTS scores.
        item["min_language_require"] = remove_tags('IELTS (International English Language Testing System) Academic score of at least 6.5 overall, with not less than 6.0 in any of the four modules.')
        item["ielts_desc"] = 'IELTS (International English Language Testing System) Academic score of at least 6.5 overall, with not less than 6.0 in any of the four modules.'
        item["ielts"] = '6.5'
        item["ielts_l"] = 6.0
        item["ielts_s"] = 6.0
        item["ielts_r"] = 6.0
        item["ielts_w"] = 6.0

        # TOEFL code, description and scores.
        item["toefl_code"] = '0320'
        item["toefl_desc"] = 'TOEFL (Test of English as a Foreign Language) score of 90 or higher in the internet-based test, with not less than 20 in each of the Reading, Listening, Writing or Speaking components; Score of at least 230 in the computer based or at least 570 in the paper based test. UNBC’s institutional TOEFL code is 0320.'
        item["toefl"] = '90'
        item["toefl_l"] = 20
        item["toefl_s"] = 20
        item["toefl_r"] = 20
        item["toefl_w"] = 20

        # A-level, IB and AP policies.
        item["alevel"] = 'Possess the (International) General Certificate of Secondary Education with: Passes in at least five subjects: Two of which must be at the Advanced Level (G.C.E.) Two subjects at the Advanced Supplementary (A.S.) Level may be substituted for one subject at the Advanced Level.  For example, 4 Advanced Supplementary (A.S.) Level courses equal two A Level Courses.  The remaining three passes may be at the Ordinary Level (G.C.S.E.) Acceptable standing must be achieved in all subjects Applicants may apply for admission in the year they will be sitting for their final A-Level examinations provided they can present excellent grades in their O-Level examinations and strong predicted A-Level results. With the exception of the Faculty of Engineering, for all other programs that require "Mathematics" as a prerequisite, AS-Level Mathematics is required. Applicants presenting A-Level examinations with a minimum grade of "C" may be considered for advanced standing. In addition to the above, applicants interested in the four year Bachelor of Engineering degree program must complete the following prerequisite courses: A-Level Mathematics A-Level Physics  A-Level Chemistry is preferred; however, AS-level Chemistry will be accepted  O-Level English '
        item["ib"] = 'Students who are awarded an International Baccalaureate Diploma may be awarded up to 29 credit hours of transfer credit upon receipt of the official transcript from the International Baccalaureate headquarters. Students who are awarded the diploma must have an overall standing of four, with no course below a three. Diploma students are required to present three Higher level subjects and three Subsidiary level subjects in order to be eligible for transfer credits.'
        item["ap"] = remove_tags('Students who take the College Board Advanced Placement courses in high school may be awarded transfer credit upon receipt of the official exam results from the College Board. Courses completed with a grade of four or above will be awarded transfer credit. Students who have completed AP courses with a grade of three may be considered for Advanced Standing. Advanced Standing allows a student to register in a higher level course without the required prerequisite. However, as credit is not awarded advanced standing will not reduce the number of credits that a student must accumulate to obtain a UNBC degree. As a result a student must make up this credit by completing another course to be used towards their degree requirements. A listing of acceptable AP courses for transfer credit is available on the BC Transfer Guide\'s website at  www.bctransferguide.ca/guides/ap).')

        # SAT / ACT codes and descriptions.
        item["sat_code"] = '0320'
        item["sat1_desc"] = ''
        item["sat2_desc"] = None
        item["act_code"] = ''
        item["act_desc"] = 'The Enhanced Composite ACT with a Total Score of twenty-four (24)'

        # Free-form notes and bookkeeping fields.
        item["other"] = ''
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0

        yield item

Ejemplo n.º 5

Mostrar archivo

    def parse(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)

        #1.school_name
        school_name = 'Simon Fraser University'
        # print(school_name)

        #2.url
        url = response.url
        # print(url)

        #3.major_name_en
        major_name_en = response.xpath(
            "//div[@class='greyBanner']//h1").extract()
        major_name_en = ''.join(major_name_en)
        major_name_en = remove_tags(major_name_en)
        # print(major_name_en,url)

        #4.overview_en
        overview_en = response.xpath(
            "//div[contains(@class,'cq-dd-paragraph intro')]//div/div|//div[contains(@class,'cq-colctrl-lt5-c0')]/div/div/p"
        ).extract()
        overview_en = ''.join(overview_en)
        overview_en = remove_class(overview_en)
        # print(overview_en)

        #5.degree_name
        degree_name = response.xpath(
            "//h4[contains(text(),'VERVIEW') or contains(text(),'verview')]//following-sibling::p[last()]"
        ).extract()
        degree_name = ''.join(degree_name)
        degree_name = remove_tags(degree_name)
        degree_name = clear_space_str(degree_name).replace('Degree: ', '')
        if 'Bachelor of Arts' in degree_name and 'Bachelor of Business Administration' in degree_name:
            degree_name = 'Bachelor of Arts or Bachelor of Business Administration'
        elif 'Bachelor of Science degree' in degree_name and 'Bachelor of Business Administration' in degree_name:
            degree_name = 'Bachelor of Science degree or Bachelor of Business Administration'
        elif 'bachelor of environment' in degree_name and 'bachelor of business administration' in degree_name:
            degree_name = 'Bachelor of environment or Bachelor of business administration'
        elif 'Resource and Environmental Management' in major_name_en:
            degree_name = 'Bachelor of Environment'
        # print(degree_name)

        #6.career_en
        career_en = response.xpath(
            "//h3[contains(text(),'Career possibilities')]/../../../..//following-sibling::*"
        ).extract()
        if len(career_en) == 0:
            career_en = response.xpath(
                "//h3[contains(text(),'Are you curious?')]/../../../..//following-sibling::*"
            ).extract()
        career_en = ''.join(career_en)
        career_en = remove_class(career_en).replace(
            '<h3>Career possibilities</h3><p>The world is changing rapidly and you have no way of knowing the full range of career opportunities available to you in the future. Graduates of this program may end up in a range of occupations, including these:<br>',
            ''
        ).replace(
            'Not sure where to start? Career Services can help you explore your options and create possibilities.<br>',
            '')
        career_en = clear_space_str(career_en)
        # print(career_en)

        #7.deadline
        deadline = '2019-01-31'

        #8.apply_fee
        apply_fee = '79.5'

        #9.apply_pre
        apply_pre = '$'

        #10.location
        location = 'Vancouver, British Columbia'

        #11.entry_requirements_en
        entry_requirements_en = '<p>Graduation from a university-preparatory program at a senior high school Submit a transcript which includes grades for all courses completed and final grades for the first semester of Senior Year. (An admission average is calculated on final year results.</p>'

        #12.require_chinese_en
        require_chinese_en = "<p>Senior Middle School Graduation Diploma Submit transcript which includes grades for all courses completed and final grades for the first semester of Senior Year 3. Admission is calculated on Senior Year 3 academic subjects. GaoKao Exam Offers of admission are conditional upon receipt of the GaoKao results in July. The required results will depend on the program. In lieu of the GaoKao, you may submit SAT (score of 1130 out of 1600) or ACT (Composite Score of 22) results. If you are following an IB curriculum, or completing GCE A-Levels, you do not need to submit GaoKao results. All applicants must meet the university's Quantitative and Analytical Skills requirements. You need a minimum of 70% in Senior Year 2 or Senior Year 3 Mathematics for admission (based on a 60% pass scale).</p>"

        #13.modules_en
        modules_en_url = response.xpath(
            "//a[contains(text(),'program description') or contains(text(),'Program description') or contains(text(),'PROGRAM DESCRIPTION ')]/@href"
        ).extract()[0]
        # print(modules_en_url)
        if 'http://www.sfu.ca' in modules_en_url or 'https://www.sfu.ca' in modules_en_url:
            modules_en_url = modules_en_url.replace(
                'http://www.sfu.ca', '').replace('https://www.sfu.ca', '')
        modules_en_url = 'http://www.sfu.ca' + modules_en_url
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        data = requests.get(modules_en_url, headers=headers)
        response1 = etree.HTML(data.text)
        modules_en = response1.xpath(
            "//h2[contains(text(),'Program Requirements')]//following-sibling::div[position()<last()-2]"
        )
        doc = ""
        if len(modules_en) > 0:
            for a in modules_en:
                doc += (etree.tostring(a,
                                       encoding='unicode',
                                       pretty_print=False,
                                       method='html'))
                doc = clear_space_str(remove_class(doc))
                modules_en = doc
        else:
            modules_en = None
        # print(modules_en)
        #

        #判断是否拆分
        # modules_en_url = response.xpath(
        #     "//a[contains(text(),'program description') or contains(text(),'Program description') or contains(text(),'PROGRAM DESCRIPTION ')]/@href").extract()[
        #     0]
        # # print(modules_en_url)
        # if 'http://www.sfu.ca' in modules_en_url or 'https://www.sfu.ca' in modules_en_url:
        #     modules_en_url = modules_en_url.replace('http://www.sfu.ca', '').replace('https://www.sfu.ca', '')
        # modules_en_url = 'http://www.sfu.ca' + modules_en_url
        # headers = {
        #     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        # data = requests.get(modules_en_url, headers=headers)
        # response1 = etree.HTML(data.text)
        # judge = response1.xpath('//*[@id="page-content"]/section/div[4]/div[1]/ul[1]//li')
        # doc1 = ""
        # if len(judge) > 0:
        #     for a in judge:
        #         doc1 += (etree.tostring(a, encoding='unicode', pretty_print=False, method='html'))
        #
        #         doc1 = clear_space_str(remove_class(doc1))
        #         # doc1 = re.findall('<li>.*?</li>',doc1)
        # else:
        #     doc1 =''
        # print(doc1,url)

        #judge2
        # judge2 = response.xpath('//*[@id="page-content"]/section/div[2]/div[2]/div[1]/div[2]/div/h4').extract()
        # print(judge2,url)

        #14.ap
        ap = 'Transfer credit and/or advanced standing are granted to students who complete AP examinations in certain transferable subjects and achieve a score of 4 or 5. Course challenge (credit by examination) is also available in some disciplines. '

        #15.ib
        ib = "Completion of IB Diploma Program including English A1 or A2 or English Literature and Performance (HL or SL) with a minimum grade of 3"

        #16.ielts_desc 1718192021
        ielts_desc = 'International English Language Testing System (IELTS - Academic) with a minimum overall band score of 6.5 with no part less than 6.0.'
        ielts = 6.5
        ielts_r = 6
        ielts_w = 6
        ielts_s = 6
        ielts_l = 6

        #22.toefl_desc 2324252627
        toefl_desc = 'TOEFL iBT (Test of English as a Foreign Language internet based test) with an overall score of 88 or better with a minimum score of 20 in each of the four components (listening, speaking, writing, reading)'
        toefl = 88
        toefl_l = 20
        toefl_w = 20
        toefl_s = 20
        toefl_r = 20

        #28.tuition_fee_pre
        tuition_fee_pre = '$'

        #29.tuition_fee
        if major_name_en == 'Business':
            tuition_fee = '891.42/per unit'
        elif 'Engineering' in major_name_en:
            tuition_fee = '833.19/per unit'
        elif 'Computing' in major_name_en:
            tuition_fee = '820.69/per unit'
        else:
            tuition_fee = '808.34/per unit'

        #30.toefl_code
        toefl_code = '0999'

        #31.sat_code
        sat_code = '0999'
        #32.campus
        campus = response.xpath(
            "//h4[contains(text(),'Overview')]//following-sibling::*").extract(
            )
        campus = ''.join(campus)
        # if 'either the Burnaby or Surrey campus' in campus:
        #     # print(major_name_en,'---','either the Burnaby or Surrey campus')
        # elif 'Surrey campus' in campus:
        #     # print(major_name_en,'---','Surrey campus')
        # elif 'Vancouver campus' in campus:
        #     # print(major_name_en,'---','Vancouver campus')
        # else:
        #     pass

        item['school_name'] = school_name
        item['url'] = url
        item['major_name_en'] = major_name_en
        item['overview_en'] = overview_en
        item['degree_name'] = degree_name
        item['career_en'] = career_en
        item['deadline'] = deadline
        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        item['location'] = location
        item['entry_requirements_en'] = entry_requirements_en
        item['require_chinese_en'] = require_chinese_en
        item['modules_en'] = modules_en
        item['ap'] = ap
        item['ib'] = ib
        item['ielts_desc'] = ielts_desc
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['toefl_desc'] = toefl_desc
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_w'] = toefl_w
        item['toefl_s'] = toefl_s
        item['toefl_l'] = toefl_l
        item['tuition_fee_pre'] = tuition_fee_pre
        item['tuition_fee'] = tuition_fee
        item['toefl_code'] = toefl_code
        item['sat_code'] = sat_code
        yield item

Ejemplo n.º 6

Mostrar archivo

    def parse(self, response):
        """Scrape one Trinity Western University program page.

        The major name, career text, overview, curriculum link and degree
        names are read from ``response`` with XPath; every admission,
        language-test and fee field is a constant hard-coded below.

        Args:
            response: The downloaded program page. Only ``response.url`` and
                ``response.xpath(...).extract()`` are used.

        Yields:
            One item per ``<div>...</div>`` entry found after the "Degrees"
            heading, skipping entries that are exactly ``'Minor'`` or
            ``'Concentration'``. Each yielded item is an independent shallow
            copy, so items differ only in ``degree_name`` and later
            iterations cannot overwrite an item that was already emitted.
            Nothing is yielded when no degree entry is found.
        """
        item = get_item(ScrapyschoolCanadaBenItem)

        def cleaned_html(query):
            # Join every node matched by `query`, then run the project's
            # class-stripping and whitespace-normalising helpers over it.
            joined = ''.join(response.xpath(query).extract())
            return clear_space_str(remove_class(joined))

        # 3. major_name_en: the <h1> text; dropping 'amp;' turns '&amp;' into '&'.
        title_html = ''.join(
            response.xpath('//*[@id="title-wrapper"]/div/h1').extract())
        major_name_en = remove_tags(title_html).replace('amp;', '')

        # 4. career_en: everything following the careers title container.
        career_en = cleaned_html(
            "//div[@class='field field-name-field-careers-title field-type-text field-label-hidden']/../../following-sibling::*")

        # 6. overview_en: the program body field.
        overview_en = cleaned_html(
            "//div[@class='field field-name-body field-type-text-with-summary field-label-hidden']")

        # 8. modules_en: first link to the 2017-2018 academic file, else None.
        module_links = response.xpath(
            "//a[contains(@href,'sites/default/files/2017-2018academic')]//@href").extract()
        modules_en = module_links[0] if module_links else None

        # 22/23. The SAT code reuses the TOEFL institution code.
        toefl_code = '0876'

        # Assigned in a fixed order so the item is filled exactly as before.
        # NOTE(review): 'Vancouver, Columbia' looks like it is missing
        # 'British' — confirm before changing the stored value.
        constant_and_scraped_fields = (
            ('school_name', 'Trinity Western University'),
            ('url', response.url),
            ('major_name_en', major_name_en),
            ('career_en', career_en),
            ('overview_en', overview_en),
            ('location', 'Vancouver, Columbia'),
            ('modules_en', modules_en),
            ('deadline', '2019-01-15'),
            ('ielts_desc', 'Overall score: 6.5 with 6.0 on Writing Band'),
            ('ielts', 6.5),
            ('ielts_r', 6),
            ('ielts_w', 6),
            ('ielts_s', 6),
            ('ielts_l', 6),
            ('toefl_desc', 'Overall Score: 88 with a minimum score of 21 in each of the four areas and a TWE of at least 5.'),
            ('toefl', 88),
            ('toefl_r', 21),
            ('toefl_w', 21),
            ('toefl_s', 21),
            ('toefl_l', 21),
            ('toefl_code', toefl_code),
            ('sat_code', toefl_code),
            ('ib', ' Achieve a final grade of 3 or better on the IB English 12 A1 or A2 (HL) or a final grade of 4 or better on the IB English 12 A1 or A2 (SL) or IB English 12 B with a grade of 4 or better.'),
            ('ap', 'Achieve a final grade of 4 or better on the AP (Advanced Placement English Language and Composition or AP Literature & Composition.'),
            ('start_date', '2019-09'),
            ('tuition_fee', '22,260'),
            ('tuition_fee_pre', '$'),
            ('require_chinese_en', '<p>Students must have standing in five subjects of which at least two must be taken at the Advanced level.Students may be eligible for three semester hours of credit for each Advanced level course completed to a maximum of 12 semester hours provided that a minimum grade of C is achieved in that course. Departments will determine if credit awarded is general or course specific. The following equivalents will be considered: School Certificate: same as GCE with at least two passes at the principal level of the Higher School Certificate. Certificate of matriculation from recognized universities. International Baccalaureate with at least two subjects at the higher level. Other countries: write or email the Admissions Office for information.</p>'),
            ('entry_requirements_en', '<p>Students are required to graduate from high school or equivalent with a university preparatory program. This must include English 12 plus three additional Grade 12 academic subjects at a minimum overall average of 70%. A minimum grade of 60% or better is expected on the provincial examination portion of English 12. </p>'),
            ('apply_pre', '$'),
            ('apply_fee', '150'),
        )
        for key, value in constant_and_scraped_fields:
            item[key] = value

        # 5. degree_name: one output item per degree listed under "Degrees".
        degree_html = cleaned_html(
            "//h2[contains(text(),'Degrees')]//following-sibling::*")
        excluded = ('Minor', 'Concentration')
        # Filter out every excluded entry (list.remove only dropped the first).
        degree_names = [
            entry for entry in re.findall('<div>(.*?)</div>', degree_html)
            if entry not in excluded
        ]
        for entry in degree_names:
            # Copy before mutating: re-yielding the same object would make all
            # emitted items alias each other and end up with the last degree.
            degree_item = item.copy()
            # A nested <div> can leave a stray opening tag inside the match.
            degree_item['degree_name'] = entry.replace('<div>', '').strip()
            yield degree_item

Ejemplo n.º 7

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)
        item['school_name'] = "York University"
        item['url'] = response.url
        print("===========================")
        print(response.url)

        item['other'] = '''问题描述：1.专业描述和课程设置、就业为空的是详情页没有的
        2.entry_requirements没有找到
        3.雅思托福是根据专业名、学院或者学位判断的，存在个别的为空
        4.没有找到课程长度'''

        '''公共字段'''
        # item['campus'] = 'Toronto'
        item['location'] = 'Toronto'

        # http://futurestudents.yorku.ca/requirements/usa
        item['sat_code'] = item['toefl_code'] = '0894'
        item['act_code'] = '5250'
        item['sat1_desc'] = '1170'
        item['act_desc'] = '24'
        item['ap'] = "Transfer credit granted for final scores of 4 or 5 on the Advanced Placement (AP) exams, depending on the program (maximum 30 credits)."

        # http://futurestudents.yorku.ca/requirements/apply
        item['apply_pre'] = 'CAD$'
        item['apply_fee'] = '120'


        # item['entry_requirements_en'] = """<strong>IF YOU ARE A HIGH SCHOOL STUDENT</strong>
        # <p>If you’re coming to university directly from high school or have completed less than one year of university studies, you’ll take the direct entry route into a faculty or program: this means beginning in University 1 (U1), or applying to a program that offers a direct entry option. U1 is a unique approach to your first year at the U of M, giving you the opportunity to design an individualized schedule that meets the admission and/or first year requirements for one or more target degree programs. U1 will not add any time or cost to your degree; it serves as year 1 of any 3 or 4 year degree program</p>"""

        # item['alevel'] = 'General Certificate of Education (GCE) – 2 A Level Courses (or equivalent)'

        # http://futurestudents.yorku.ca/requirements
        item['average_score'] = '75'
        try:
            major_name_en = response.xpath("//h1[@id='page-title']//text()").extract()
            if "/" in ''.join(major_name_en).strip():
                m = ''.join(major_name_en).strip().split("/")
                item['major_name_en'] = ''.join(m[:-1]).strip()
            else:
                item['major_name_en'] = ''.join(major_name_en).strip()
            print("item['major_name_en']: ", item['major_name_en'])

            department = response.xpath("//div[@class='fs-ug-program-details']//label[contains(text(),'Offered by')]/../div//text()").extract()
            item['department'] = ', '.join(department).strip()
            print("item['department']: ", item['department'])

            degree_name = response.xpath("//div[@class='fs-ug-program-details']//label[contains(text(),'Degrees Offered')]/../div//text()").extract()
            item['degree_name'] = ''.join(degree_name).strip()
            print("item['degree_name']: ", item['degree_name'])

            # is_campus = "Biology, Business Economics, Canadian Studies, Communications, Drama Studies, Economics, Education, English, French Studies , Gender & Women’s Studies, History, International Studies, International Studies & Business Administration dual degree, emlyon Business School, Linguistics & Language Studies, Mathematics, Philosophy, Political Science, Psychology, Sexuality Studies , Sociology, Spanish (Hispanic Studies), Translation, Undecided Major"
            if "Glendon" in item['department']:
                item['campus'] = "Glendon"
            else:
                if item['degree_name'] == 'BEd':
                    item['campus'] = "Glendon"
                else:
                    item['campus'] = "Keele"

            if "Certificate" not in item['degree_name'] and "JD" not in item['degree_name']:
                # http://futurestudents.yorku.ca/requirements/deadlines
                start_date = response.xpath("//div[@class='fs-ug-program-details']//label[contains(text(),'Offered in')]/../div//text()").extract()
                start_date_tmp = ''.join(start_date).strip()
                print("start_date_tmp: ", start_date_tmp)
                start_date_list = []
                if "Fall" in start_date_tmp:
                    start_date_list.append("9月")
                if "Winter" in start_date_tmp:
                    start_date_list.append("1月")
                if "Summer" in start_date_tmp:
                    start_date_list.append("5月")
                item['start_date'] = ','.join(start_date_list).strip()
                print("item['start_date']: ", item['start_date'])

                # deadline  三种情况
                deadline_list = []
                if "9月" in item['start_date']:
                    if item['major_name_en'] == "Business Administration":
                        deadline_list.append('2019-01-30')
                    elif item['major_name_en'] == "Cinema & Media Studies" or item['major_name_en'] == "Dance" or item['major_name_en'] == "Design" or \
                            item['major_name_en'] == "Digital Media" or item['major_name_en'] == "Music" or item['major_name_en'] == "Theatre" or item['major_name_en'] == "Visual Arts":
                        deadline_list.append('2019-01-16')
                    elif item['major_name_en'] == "Social Work":
                        deadline_list.append('2019-02-06')
                    else:
                        deadline_list.append('2019-03-06')
                if "1月" in item['start_date']:
                    if item['major_name_en'] == "Media Arts" or item['major_name_en'] == "Dance" or item['major_name_en'] == "Music" \
                            or item['major_name_en'] == "Intermedia" or item['major_name_en'] == "Visual Arts":
                        deadline_list.append('2018-10-31')
                    else:
                        deadline_list.append('2019-11-15')
                if "5月" in item['start_date']:
                    if item['major_name_en'] == "Media Arts" or item['major_name_en'] == "Film Production" or item['major_name_en'] == "Dance" or item['major_name_en'] == "Music" \
                            or item['major_name_en'] == "Theatre" or item['major_name_en'] == "Intermedia" or item['major_name_en'] == "Visual Arts":
                        deadline_list.append('2019-01-16')
                    else:
                        deadline_list.append('2019-04-01')
                # print(deadline_list)
                item['deadline'] = ','.join(deadline_list).strip()
                # print("item['deadline']: ", item['deadline'])

                overview = response.xpath("//div[@class='field-item even']").extract()
                if len(overview) > 0:
                    item['overview_en'] = remove_class(clear_lianxu_space(overview)).replace("<p></p>", "").strip()
                # print("item['overview_en']: ", item['overview_en'])

                modules = response.xpath("""//h3[contains(text(),'Sample First-year Schedule')]/..|
                //strong[contains(text(),"Courses You'll Take")]/../following-sibling::ul[1]|
                //strong[contains(text(),'Some Courses You’ll Take')]/../following-sibling::ul[1]|
                //h3[contains(text(),'Some Courses You’ll Take')]/following-sibling::ul[1]""").extract()
                if len(modules) == 0 or ''.join(modules) == "<ul></ul>":
                    modules = response.xpath("//h3[contains(text(),'Some Courses You’ll Take')]/following-sibling::p[1]").extract()
                if len(modules) > 0:
                    item['modules_en'] = remove_class(clear_lianxu_space(modules)).replace("<p></p>", "").strip()
                if item['modules_en'] is None:
                    modules_url = response.xpath("//label[contains(text(),'For more information')]/following-sibling::div//a/@href").extract()
                    if len(modules_url) > 0:
                        modules_en_duration_dict = self.parse_modules(modules_url[0])
                        item['modules_en'] = modules_en_duration_dict.get('modules_en')
                        item['duration'] = modules_en_duration_dict.get('duration')
                        item['duration_per'] = modules_en_duration_dict.get('duration_per')
                print("item['modules_en']: ", item['modules_en'])
                print("item['duration']: ", item['duration'])
                print("item['duration_per']: ", item['duration_per'])

                career = response.xpath("//h3[contains(text(),'Possible Career Paths')]/..").extract()
                if len(career) > 0:
                    item['career_en'] = remove_class(clear_lianxu_space(career)).replace("<p></p>", "").strip()
                # print("item['career_en']: ", item['career_en'])

                # //a[contains(text(),'portfolio')]/..
                portfolio_desc_en = response.xpath("//a[contains(text(),'portfolio')]/..//text()").extract()
                if len(portfolio_desc_en) > 0:
                    item['portfolio_desc_en'] = clear_lianxu_space(portfolio_desc_en)
                print("item['portfolio_desc_en']: ", item['portfolio_desc_en'])

                interview_desc_en = response.xpath("//*[contains(text(),'interview')]//text()").extract()
                if len(interview_desc_en) > 0:
                    item['interview_desc_en'] = clear_lianxu_space(interview_desc_en)
                print("item['interview_desc_en']: ", item['interview_desc_en'])

                # 有多个学位的需要拆分成多条
                if "," in item['degree_name'].replace("(Bilingual, Trilingual)", "").strip():
                    degree_name_list = item['degree_name'].replace("(Bilingual, Trilingual)", "").strip().split(',')
                    for d in range(len(degree_name_list)):
                        item['degree_name'] = degree_name_list[d].strip()

                        # http://futurestudents.yorku.ca/requirements/language-tests
                        if item['department'] == "School of the Arts, Media, Performance & Design" or item['department'] == "Faculty of Environmental Studies" \
                                or item['department'] == "Liberal Arts & Professional Studies" or item['department'] == "Faculty of Science" or item['department'] == "Glendon":
                            item['ielts'] = '6.5'
                            item['toefl'] = '83'
                        elif item['department'] == "Faculty of Health":
                            if "Nursing" not in item['major_name_en']:
                                item['ielts'] = '6.5'
                                item['toefl'] = '83'
                            else:
                                item['ielts'] = '7'
                                item['toefl'] = '89'
                        elif item['department'] == "Lassonde School of Engineering":
                            if "BEng" not in item['degree_name']:
                                item['ielts'] = '6.5'
                                item['toefl'] = '83'
                            else:
                                item['ielts'] = '7.5'
                                item['toefl'] = '96-99'
                        elif item['department'] == "Schulich School of Business":
                            item['ielts'] = '7.5'
                            item['toefl'] = '100'
                        elif item['degree_name'] == "BEd":
                            item['ielts_desc'] = 'An overall score of at least 7 on the IELTS (academic test only), with scores of at least 6.5 in reading and listening and scores of at least 7 in writing and speaking.'
                            item['ielts'] = '7.0'
                            item['ielts_l'] = '6.5'
                            item['ielts_s'] = '7.0'
                            item['ielts_r'] = '6.5'
                            item['ielts_w'] = '7.0'
                            item['toefl_desc'] = 'A minimum overall score of 103, with scores of at least 23 in Listening, 24 in Reading, 28 in Writing and 28 in Speaking. '
                            item['toefl'] = '103'
                            item['toefl_l'] = '23'
                            item['toefl_s'] = '28'
                            item['toefl_r'] = '24'
                            item['toefl_w'] = '28'


                        # http://futurestudents.yorku.ca/tuition
                        item['tuition_fee_pre'] = 'CAD$'
                        if item['department'] == "Lassonde School of Engineering":
                            if item['major_name_en'] == "Computer Security" or item['major_name_en'] == "Digital Media" or item['major_name_en'] == "Earth & Atmospheric Science":
                                item['tuition_fee'] = '26,975.40'
                            elif item['major_name_en'] == "Computer Science":
                                item['tuition_fee'] = '27,206.40'
                            elif item['degree_name'] == "BEng":
                                item['tuition_fee'] = '33,880.32'
                        elif item['department'] == "Lassonde School of Engineering":
                            item['tuition_fee'] = '29,469.60'
                        elif item['major_name_en'] == "Design":
                            item['tuition_fee'] = '25,198.81'
                        else:
                            item['tuition_fee'] = "26,975"


                        # http://futurestudents.yorku.ca/requirements
                        chinese_requirement_pre = """<h4>MINIMUM REQUIREMENTS</h4>
        <ul><li>Senior Secondary School Graduation Certificate</li>
        <li>Successful completion of the final year of Senior 3/Grade 12 level of study with a minimum overall average of 75% on all academic courses.</li>
        <li>Some programs require a higher GPA. Please review the requirements for your program below.</li></ul>"""
                        # item['require_chinese_en'] = '<p></p>'
                        uuid = response.xpath("//div[@id='fs-admit-req-wrapper']/script//text()").extract()
                        clear_space(uuid)
                        uuid = ''.join(uuid).strip()
                        print("uuid2: ", uuid)
                        if "=" in uuid:
                            uuid = uuid.split("=")[-1].strip().strip("[").strip("]").strip()
                            # print("uuid1: ", uuid)
                            uuidUpdate = ""
                            ibUrl = ""
                            chineseHighSchoolUrl = ""
                            if "," in uuid:
                                uuidUpdate = uuid.split(",")
                                for i in range(len(uuidUpdate)):
                                    uuidUpdate[i] = uuidUpdate[i].strip().strip("'")
                                ibUrlTmp = ""
                                for uuidstr in uuidUpdate:
                                    ibUrlTmp += "&uuid%5B%5D=" + uuidstr
                                ibUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=ibcc" + ibUrlTmp
                                chineseHighSchoolUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=chs" + ibUrlTmp
                                alevelUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=gce" + ibUrlTmp
                            else:
                                uuidUpdate = uuid.strip().strip("'").strip()
                                ibUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=ibcc&uuid%5B%5D=" + uuidUpdate
                                chineseHighSchoolUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=chs&uuid%5B%5D=" + uuidUpdate
                                alevelUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=gce&uuid%5B%5D=" + uuidUpdate
                            # print("ibUrl: ", ibUrl)
                            # print("uuidUpdate: ", uuidUpdate)
                            # print("chineseHighSchoolUrl: ", chineseHighSchoolUrl)
                            '''ib'''
                            ib_html = etree.HTML(self.parse_IB(ibUrl).replace("\/", "/").strip())
                            # //div[@class='fs-admit-req-general-req']/following-sibling::div
                            # //div//h3[contains(text(), '"+item['degree_name']+"')]/../div
                            ib = ib_html.xpath("//div[@class='fs-admit-req-general-req']/following-sibling::div")
                            if len(ib) == 1:
                                item['ib'] = remove_tags(etree.tostring(ib[0], encoding='unicode')).replace("&#13;", "").replace("Requirements for admission", "\nRequirements for admission").replace("Transfer credit", "\nTransfer credit").strip()
                            else:
                                item['ib'] = remove_tags(etree.tostring(ib[d], encoding='unicode')).replace("&#13;", "").replace("Requirements for admission", "\nRequirements for admission").replace("Transfer credit", "\nTransfer credit").strip()

                            '''中国学生要求'''
                            # 从接口获取了require_chinese_en中国学生要求数据，需要二次解析
                            require_chinese_link = self.parse_IB(chineseHighSchoolUrl)
                            require_chinese_html = etree.HTML(require_chinese_link.replace("\/", "/").strip())
                            # //div[@class='fs-admit-req-general-req']/following-sibling::div
                            # //div//h3[contains(text(), '"+item['degree_name']+"')]/../div
                            require_chinese = require_chinese_html.xpath("//div[@class='fs-admit-req-general-req']/following-sibling::div")
                            if len(require_chinese) == 1:
                                item['require_chinese_en'] = etree.tostring(require_chinese[0], encoding='unicode')
                            else:
                                item['require_chinese_en'] = etree.tostring(require_chinese[d], encoding='unicode')

                            '''alevel'''
                            alevel_html = etree.HTML(self.parse_IB(alevelUrl).replace("\/", "/").strip())
                            alevel = alevel_html.xpath("//div[@class='fs-admit-req-general-req']/following-sibling::div")
                            if len(alevel) == 1:
                                item['alevel'] = remove_tags(etree.tostring(alevel[0], encoding='unicode'))
                            else:
                                item['alevel'] = remove_tags(etree.tostring(alevel[d], encoding='unicode'))

                        if item['require_chinese_en'] is None:
                            item['require_chinese_en'] = ""
                        item['require_chinese_en'] = chinese_requirement_pre +'\n'+ remove_class(clear_lianxu_space([item['require_chinese_en']]))
                        # print("item['require_chinese_en']: ", str(d), item['require_chinese_en'], "===\n===", item['major_name_en'])
                        # print("item['ib']2: ", str(d), item['ib'])
                        print("item['alevel']2: ", str(d), item['alevel'])

                        yield item
                else:
                    # http://futurestudents.yorku.ca/requirements/language-tests
                    if item['department'] == "School of the Arts, Media, Performance & Design" or item['department'] == "Faculty of Environmental Studies" \
                            or item['department'] == "Liberal Arts & Professional Studies" or item[
                        'department'] == "Faculty of Science" or item['department'] == "Glendon":
                        item['ielts'] = '6.5'
                        item['toefl'] = '83'
                    elif item['department'] == "Faculty of Health":
                        if "Nursing" not in item['major_name_en']:
                            item['ielts'] = '6.5'
                            item['toefl'] = '83'
                        else:
                            item['ielts'] = '7'
                            item['toefl'] = '89'
                    elif item['department'] == "Lassonde School of Engineering":
                        if "BEng" not in item['degree_name']:
                            item['ielts'] = '6.5'
                            item['toefl'] = '83'
                        else:
                            item['ielts'] = '7.5'
                            item['toefl'] = '96-99'
                    elif item['department'] == "Schulich School of Business":
                        item['ielts'] = '7.5'
                        item['toefl'] = '100'
                    elif item['degree_name'] == "BEd":
                        item[
                            'ielts_desc'] = 'An overall score of at least 7 on the IELTS (academic test only), with scores of at least 6.5 in reading and listening and scores of at least 7 in writing and speaking.'
                        item['ielts'] = '7.0'
                        item['ielts_l'] = '6.5'
                        item['ielts_s'] = '7.0'
                        item['ielts_r'] = '6.5'
                        item['ielts_w'] = '7.0'
                        item[
                            'toefl_desc'] = 'A minimum overall score of 103, with scores of at least 23 in Listening, 24 in Reading, 28 in Writing and 28 in Speaking. '
                        item['toefl'] = '103'
                        item['toefl_l'] = '23'
                        item['toefl_s'] = '28'
                        item['toefl_r'] = '24'
                        item['toefl_w'] = '28'

                    # http://futurestudents.yorku.ca/tuition
                    item['tuition_fee_pre'] = 'CAD$'
                    if item['department'] == "Lassonde School of Engineering":
                        if item['major_name_en'] == "Computer Security" or item['major_name_en'] == "Digital Media" or \
                                item['major_name_en'] == "Earth & Atmospheric Science":
                            item['tuition_fee'] = '26,975.40'
                        elif item['major_name_en'] == "Computer Science":
                            item['tuition_fee'] = '27,206.40'
                        elif item['degree_name'] == "BEng":
                            item['tuition_fee'] = '33,880.32'
                    elif item['department'] == "Lassonde School of Engineering":
                        item['tuition_fee'] = '29,469.60'
                    elif item['major_name_en'] == "Design":
                        item['tuition_fee'] = '25,198.81'
                    else:
                        item['tuition_fee'] = "26,975"

                    # http://futurestudents.yorku.ca/requirements
                    chinese_requirement_pre = """<h4>MINIMUM REQUIREMENTS</h4>
                            <ul><li>Senior Secondary School Graduation Certificate</li>
                            <li>Successful completion of the final year of Senior 3/Grade 12 level of study with a minimum overall average of 75% on all academic courses.</li>
                            <li>Some programs require a higher GPA. Please review the requirements for your program below.</li></ul>"""
                    # item['require_chinese_en'] = '<p></p>'
                    uuid = response.xpath("//div[@id='fs-admit-req-wrapper']/script//text()").extract()
                    clear_space(uuid)
                    uuid = ''.join(uuid).strip()
                    print("uuid1: ", uuid)
                    if "=" in uuid:
                        uuid = uuid.split("=")[-1].strip().strip("[").strip("]").strip()
                        # print("uuid1: ", uuid)
                        uuidUpdate = ""
                        ibUrl = ""
                        chineseHighSchoolUrl = ""
                        if "," in uuid:
                            uuidUpdate = uuid.split(",")
                            for i in range(len(uuidUpdate)):
                                uuidUpdate[i] = uuidUpdate[i].strip().strip("'")
                            ibUrlTmp = ""
                            for uuidstr in uuidUpdate:
                                ibUrlTmp += "&uuid%5B%5D=" + uuidstr
                            ibUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=ibcc" + ibUrlTmp
                            chineseHighSchoolUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=chs" + ibUrlTmp
                            alevelUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=gce" + ibUrlTmp
                        else:
                            uuidUpdate = uuid.strip().strip("'")
                            ibUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=ibcc&uuid%5B%5D=" + uuidUpdate
                            chineseHighSchoolUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=chs&uuid%5B%5D=" + uuidUpdate
                            alevelUrl = "http://futurestudents.yorku.ca/ajax/admit-req?applicant-type=highschool&cohort=gce&uuid%5B%5D=" + uuidUpdate
                        # print("ibUrl: ", ibUrl)
                        # print("uuidUpdate: ", uuidUpdate)
                        # print("chineseHighSchoolUrl: ", chineseHighSchoolUrl)
                        print("alevelUrl: ", alevelUrl)
                        '''ib'''
                        # print(self.parse_IB(ibUrl))
                        ib_html = etree.HTML(self.parse_IB(ibUrl).replace("\/", "/").strip())
                        ib = ib_html.xpath("//div[@class='fs-admit-req-general-req']/following-sibling::div")
                        if len(ib) == 1:
                            item['ib'] = clear_lianxu_space(ib[0].xpath("//text()")[ib[0].xpath("//text()").index("Requirements for admission:"):])

                        '''中国学生要求'''
                        # 从接口获取了require_chinese_en中国学生要求数据，需要二次解析
                        require_chinese_link = self.parse_IB(chineseHighSchoolUrl)
                        # print(require_chinese_link.replace("\/", "/").strip())
                        require_chinese_html = etree.HTML(require_chinese_link.replace("\/", "/").strip())
                        # //div[@class='fs-admit-req-general-req']/following-sibling::div
                        # //div//h3[contains(text(), '"+item['degree_name']+"')]/../div
                        require_chinese = require_chinese_html.xpath(
                            "//div[@class='fs-admit-req-general-req']/following-sibling::div")
                        if len(require_chinese) == 1:
                            item['require_chinese_en'] = etree.tostring(require_chinese[0], encoding='unicode')

                        '''alevel'''
                        # print(self.parse_IB(alevelUrl))
                        alevel_html = etree.HTML(self.parse_IB(alevelUrl).replace("\/", "/").strip())
                        alevel = alevel_html.xpath("//div[@class='fs-admit-req-general-req']/following-sibling::div")
                        if len(alevel) == 1:
                            item['alevel'] = remove_tags(etree.tostring(alevel[0], encoding='unicode'))

                    if item['require_chinese_en'] is None:
                        item['require_chinese_en'] = ""
                    item['require_chinese_en'] = chinese_requirement_pre + '\n' + remove_class(clear_lianxu_space([item['require_chinese_en']]))
                    # print("item['require_chinese_en']1: ", item['require_chinese_en'], "===\n===",item['major_name_en'])
                    # print("item['ib']1: ", item['ib'])
                    print("item['alevel']1: ", item['alevel'])
                    yield item
        except Exception as e:
            with open("scrapySchool_Canada_Ben/error/" + item['school_name'] + ".txt",
                      'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 8

Mostrar archivo

    def parse(self, response):
        item = getItem.get_item(ScrapyschoolCanadaBenItem)

        #1.学校名称
        school_name = 'University of Ontario Institute of Technology'

        #2.地点
        try:
            location = response.xpath(
                '//th[contains(text(),"Location")]/following-sibling::td'
            ).extract()[0]
            location = remove_tags(location)
            location = location.replace(
                '                                                     ', '')
            location = location.replace(
                '                                                  ', '')
            location = location.replace('UOIT, ', '')
            #print(location)
        except:
            location = None
            #print(location)

#3. 校区
        try:
            campus = location

            #print(campus_list)
        except:
            campus_list = None
            #print(campus_list)

#4. 学院
        try:
            department = response.xpath(
                '//th[contains(text(),"Faculty")]/following-sibling::td'
            ).extract()[0]
            department = remove_tags(department)
            department = department.replace(
                '                                                     ', '')
            department = department.replace(
                '                                                  ', '')
            #print(department)
        except:
            department = None
            #print(department)

# 4.
        try:
            degree_name_list = response.xpath(
                '//th[contains(text(),"Degree")]/following-sibling::td'
            ).extract()[0]
            degree_name_list = remove_tags(degree_name_list)
            degree_name_list = degree_name_list.replace(
                '                                                     ', '')
            degree_name_list = degree_name_list.replace(
                '                                                   ', '')
            #degree_name_list = degree_name_list.replace('\n','')
            degree_name_list = degree_name_list.replace(
                '                                                 ', '')
            degree_name_list = degree_name_list.replace(' \n', ':::')
            degree_name_list = degree_name_list.replace('\n', '')
            #degree_name_list = degree_name_list.replace('\n','')
            #degree_name_list =
            degree_name_list = degree_name_list.rstrip(':::')
            degree_name_list = degree_name_list.split(':::')
            #print(degree_name_list)
        except:
            degree_name_list = ['None']
        # print(degree_name_list)

#5.学位描述
        try:
            degree_overview_en = response.xpath(
                '//h2[contains(text(),"Additional information")]/preceding-sibling::*'
            ).extract()
            degree_overview_en = ''.join(degree_overview_en)
            #degree_overview_en = remove_tags(degree_overview_en)
            degree_overview_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '',
                                        degree_overview_en)
            #degree_overview_en = degree_overview_en.replace('\r\n','')
            #degree_overview_en = degree_overview_en.replace('\n','')
            #degree_overview_en = degree_overview_en.replace('\n','')
            #degree_overview_en = degree_overview_en.replace('  ',' ')
            #degree_overview_en = degree_overview_en.replace('					','')
            #degree_overview_en = degree_overview_en.replace('			  	','')
            #print(degree_overview_en)
        except:
            degree_overview_en = None
            #print(degree_overview_en)

#6.专业英文
        try:
            major_name_en = response.xpath('//h1').extract()[0]
            major_name_en = remove_tags(major_name_en)
            major_name_en = major_name_en.replace('\r\n', '').replace('\n', '')

        # print(major_name_en)
        except:
            major_name_en = None
        # print(major_name_en)

#7.专业介绍
        try:
            #overview_en = degree_overview_en
            overview_en = degree_overview_en
            # print(overview_en)
        except:
            overview_en = degree_overview_en
            # print(overview_en)

#8.入学时间
        try:
            start_date = response.xpath(
                '//th[contains(text(),"Start dates")]/following-sibling::td'
            ).extract()
            start_date = ','.join(start_date)
            start_date = remove_tags(start_date)
            if 'May' in start_date:
                start_date = '2019-05'
            elif 'May (part-time); and September (full-time)' in start_date:
                start_date = '2019-05,2019-09'
            elif 'Fall' in start_date:
                start_date = '2019-09'
            elif 'September' in start_date:
                start_date = '2019-09'

            #start_date = start_date.replace('Spring','').replace('Winter','').replace('Summer','').replace('Fall','')
            #start_date = start_date.replace('September 2019','2019-09').replace('May 2019','2019-05').replace('July 2019','2019-07').replace('January 2020','2020-01').replace('January 2019','2019-01')
        # print(start_date)
        except:
            start_date = None
        # print(start_date)

#9.课程长度
        try:
            duration = response.xpath(
                '//th[contains(text(),"Length")]/following-sibling::td'
            ).extract()[0]
            duration = remove_tags(duration)
            if 'semesters' in duration:
                item["duration_per"] = '2'
            elif 'year' in duration:
                item["duration_per"] = '1'
            elif 'month' in duration:
                item["duration_per"] = '3'
            else:
                item["duration_per"] = '1'

            if 'Four or five' in duration:
                duration = '4,5'
            elif 'Four years/five years' in duration:
                duration = '4,5'
            elif 'two' in duration:
                duration = '2'
            elif 'Five' in duration:
                duration = '5'
            elif 'five' in duration:
                duration = '5'
            elif 'Four' in duration:
                duration = '4'
            elif 'three' in duration:
                duration = '3'
            elif 'Six' in duration:
                duration = '6'
            elif 'Two' in duration:
                duration = '2'

            #print(duration)
        except:
            duration = None
            #print(duration)

#10.课程设置
        Module_D = {
            "Academic Learning and Success": "826",
            "Automotive Engineering": "827",
            "Biology": "828",
            "Business": "829",
            "Chemistry": "831",
            "Communication": "832",
            "Computer Science": "833",
            "Criminology and Justice": "861",
            "Curriculum Studies": "834",
            "Economics": "835",
            "Education": "836",
            "Educational Studies and Digital Technology": "825",
            "Electrical Engineering": "837",
            "Engineering": "838",
            "Environmental Science": "839",
            "Forensic Science": "840",
            "Health Science": "841",
            "Indigenous": "862",
            "Information Technology": "842",
            "Legal Studies": "843",
            "Manufacturing Engineering": "844",
            "Mathematics": "845",
            "Mechanical Engineering": "846",
            "Mechatronics": "860",
            "Medical Laboratory Science": "847",
            "Nuclear": "848",
            "Nursing": "849",
            "Physics": "850",
            "Political Science": "851",
            "Psychology": "852",
            "Radiation Science": "853",
            "Science": "855",
            "Science Co-op": "854",
            "Social Science": "858",
            "Sociology": "856",
            "Software Engineering": "857",
            "Statistics": "859"
        }
        try:
            for key in Module_D:
                #print(key)
                if major_name_en in key or key in major_name_en:
                    cc = Module_D[key]
                    # print(cc)
                    url = "http://calendar.uoit.ca/content.php?filter%5B27%5D=-1&filter%5B29%5D=&filter%5Bcourse_type%5D=" + cc + "&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid=20&expand=&navoid=824&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter"
                    headers = {
                        "User-Agent":
                        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
                    }
                    response1 = requests.get(url, headers=headers)
                    modules_en = response1.text
                    response1 = etree.HTML(modules_en)
                    modules_en = response1.xpath(
                        '//td[@class = "width"]/a/text()')
                    modules_en = '<br>'.join(modules_en)

                    #print(major_name_en)
                # print(modules_en)

                else:
                    continue
                print(modules_en)
        except:
            modules_en = None

        if len(modules_en) <= 5:
            modules_en = response.xpath(
                '//*[@id="tab_program_curriculum"]/ul/li').extract()
            modules_en = ''.join(modules_en)
            modules_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', modules_en)
        else:
            modules_en = modules_en

#11.就业方向
        try:
            career_en = response.xpath(
                '//h2[contains(text(),"Career opportunities")]/following-sibling::ul'
            ).extract()[0]
            career_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', career_en)
            #career_en = career_en.replace('\r\n','').replace('\n','').replace('\t','').replace('                                                                                                                         ','').replace('                                                                       ','')
            #print(career_en)
        except:
            career_en = None
            #print(career_en)

#12.截止日期
        try:
            deadline = '2019-01-31'
            # deadline = response.xpath('//*[@id="Admissionrequirementsanddeadlines-subsection-0"]/table/tbody/tr/td[3]').extract()
            # deadline = '---'.join(deadline)
            # deadline = remove_tags(deadline)
            # deadline = deadline.replace('Documents due: ', '')
            # deadline =  deadline.replace('Sep 1, 2018Oct 1, 2018','2018-09-01').replace('Feb 1, 2019Mar 1, 2019','2019-02-01').replace('Mar 1, 2019Apr 1, 2019','2019-03-01').replace('May 1, 2019Jun 1, 2019','2019-05-01').replace('Sep 1, 2019Oct 1, 2019','2019-09-01').replace('Feb 15, 2019Mar 1, 2019','2019-02-15').replace('---',',')
            # #deadline = remove_tags(deadline)
            # #print(deadline)
        except:
            deadline = None
            #print(deadline)
#13.学费

#14 申请费:
        apply_fee = '130'

        #15 申请要求
        try:
            entry_requirements_en = response.xpath(
                '//h2[contains(text(),"Admission requirements")]/following-sibling::*'
            ).extract()
            entry_requirements_en = ''.join(entry_requirements_en)
            entry_requirements_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '',
                                           entry_requirements_en)
            #print(entry_requirements_en)
        except:
            entry_requirements_en = None
            #print(entry_requirements_en)

#16 中国学生申请要求
        try:
            require_chinese_en = '<p>Senior Secondary School Graduation Certificate and Huikao, or similar provincial examination results. Applicants are required to submit both their academic transcript indicating all subjects taken and grades achieved and a copy of their graduation certificate and results. If your documents are issued in a language other than English, you must also provide notarized literal English translations.</p>'
            #require_chinese_en = remove_tags(require_chinese_en)
            # print(require_chinese_en)
        except:
            require_chinese_en = None
            # print(require_chinese_en)

#17 特殊专业要求
        try:
            specific_requirement_en = None
            # #specific_requirement_en = remove_tags(specific_requirement_en)
            # specific_requirement_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',specific_requirement_en)
            # specific_requirement_en = specific_requirement_en.replace('\r\n','')
            # specific_requirement_en = re.findall('Required high school classes(.*)2.',specific_requirement_en)[0]
            # specific_requirement_en = remove_tags(specific_requirement_en,keep=("li","ul"))
            #print(specific_requirement_en)
        except:
            specific_requirement_en = None
            #print(specific_requirement_en)

#18 高考(官网要求)
        try:
            gaokao_desc = None
            gaokao_desc = remove_tags(gaokao_desc)
            # print(gaokao_desc)
        except:
            gaokao_desc = None
            # print(gaokao_desc)

#19 高考(展示以及判断字段)
        try:
            gaokao_zs = response.xpath('').extract()[0]
            gaokao_zs = remove_tags(gaokao_zs)
            # print(gaokao_zs)
        except:
            gaokao_zs = None
            # print(gaokao_zs)

#20 高考分数(文科)
        try:
            gaokao_score_wk = response.xpath('').extract()[0]
            gaokao_score_wk = remove_tags(gaokao_score_wk)
            # print(gaokao_score_wk)
        except:
            gaokao_score_wk = None
            # print(gaokao_score_wk)

#21 高考分数(理科)
        try:
            gaokao_score_lk = response.xpath('').extract()[0]
            gaokao_score_lk = remove_tags(gaokao_score_lk)
            # print(gaokao_score_lk)
        except:
            gaokao_score_lk = None
            # print(gaokao_score_lk)

#22 会考描述
        try:
            huikao_desc = response.xpath('').extract()[0]
            huikao_desc = remove_tags(huikao_desc)
            # print(huikao_desc)
        except:
            huikao_desc = None
            # print(huikao_desc)

#23 会考描述
        try:
            huikao_zs = response.xpath('').extract()[0]
            huikao_zs = remove_tags(huikao_zs)
            # print(huikao_zs)
        except:
            huikao_zs = None
            # print(huikao_zs)

#24 最低语言要求
        try:
            min_language_require = 'Overall score of 6.5 with no sub-score less than 6.0 (Education and Nursing programs require an overall score of 7.0).'
            min_language_require = remove_tags(min_language_require)
            # print(min_language_require)
        except:
            min_language_require = None
            # print(min_language_require)

#25 雅思要求
        try:
            ielts_desc = 'Overall score of 6.5 with no sub-score less than 6.0 (Education and Nursing programs require an overall score of 7.0).'
            #ielts_desc = remove_tags(ielts_desc)
            # print(ielts_desc)
        except:
            ielts_desc = None
            # print(ielts_desc)

#26 ielts
        try:
            ielts = '6.5'
            #ielts = re.findall('\d\.\d',ielts)
            #ielts = remove_tags(ielts)
            #print(ielts)
        except:
            ielts = None
            #print(ielts)
#27 ielts_?

        ielts_l = 6.0
        ielts_s = 6.0
        ielts_r = 6.0
        ielts_w = 6.0

        #28 toefl_code
        try:
            toefl_code = '7178'
            #toefl_code = remove_tags(toefl_code)
            # print(toefl_code)
        except:
            toefl_code = None
            # print(toefl_code)

#29 toefl_desc
        try:
            toefl_desc = ' Internet-based test (iBT) with a total score of 83 (Education and Nursing programs require a total score of 87), and minimum scaled scores of Listening: 20; Reading: 20; Speaking: 19; and Writing: 20. The university\'s TOEFL code is 7178. Paper-delivered test results will be reviewed on an individual basis. '
            #toefl_desc = remove_tags(toefl_desc)
            # print(toefl_desc)
        except:
            toefl_desc = None
            # print(toefl_desc)

#30 toefl
        try:
            toefl = '83'
            #toefl = re.findall('\d\d',toefl)
            #toefl = remove_tags(toefl)
            #print(toefl)
        except:
            toefl = None
            #print(toefl)

#31 toefl_?
        toefl_l = 20
        toefl_s = 19
        toefl_r = 20
        toefl_w = 20

        # 32 alevel
        try:
            alevel = 'Possess the (International) General Certificate of Secondary Education with: Passes in at least five subjects: Two of which must be at the Advanced Level (G.C.E.) Two subjects at the Advanced Supplementary (A.S.) Level may be substituted for one subject at the Advanced Level.  For example, 4 Advanced Supplementary (A.S.) Level courses equal two A Level Courses.  The remaining three passes may be at the Ordinary Level (G.C.S.E.) Acceptable standing must be achieved in all subjects Applicants may apply for admission in the year they will be sitting for their final A-Level examinations provided they can present excellent grades in their O-Level examinations and strong predicted A-Level results. With the exception of the Faculty of Engineering, for all other programs that require "Mathematics" as a prerequisite, AS-Level Mathematics is required. Applicants presenting A-Level examinations with a minimum grade of "C" may be considered for advanced standing. In addition to the above, applicants interested in the four year Bachelor of Engineering degree program must complete the following prerequisite courses: A-Level Mathematics A-Level Physics  A-Level Chemistry is preferred; however, AS-level Chemistry will be accepted  O-Level English '
            #alevel = remove_tags(alevel)
            # print(alevel)
        except:
            alevel = None
            # print(alevel)

#33 ib
        try:
            ib = 'Completion of the International Baccalaureate Diploma including English SL or HL. Prerequisite courses can be presented at the Standard or Higher Level with no score lower than 4.A minimum overall score of 24 is require for admission consideration; actual admitting scores vary from program to program.For Engineering and Science degrees, applicants must present Math SL or HL and/or Further Math HL (HL is recommended).Final IB results must be sent to the university electronically by the International Baccalaureate office.If you are not completing the full IB Diploma, individual IB courses may be considered for admission. Official transcripts and proof of high school graduation must be submitted directly from your institution.'
            #print(ib)
        except:
            ib = None
            #print(ib)

#34 ap
        try:
            ap = 'Prerequisite courses, including English, should be presented at Grade 12/Senior Year/College Prep/Honors level, SAT Subject Tests or Advanced Placement (APl) exam results.'
            ap = remove_tags(ap)
            # print(ap)
        except:
            ap = None
            # print(ap)

#35 面试描述
        try:
            interview_desc_en = response.xpath('').extract()[0]
            interview_desc_en = remove_tags(interview_desc_en)
            # print(interview_desc_en
        except:
            interview_desc_en = None
            # print(interview_desc_en)

#36 作品集描述
        try:
            portfolio_desc_en = response.xpath('').extract()[0]
            portfolio_desc_en = remove_tags(portfolio_desc_en)
            # print(portfolio_desc_en
        except:
            portfolio_desc_en = None
            # print(portfolio_desc_en)

#37 other
        try:
            other = ''
            #other = remove_tags(other)
            # print(other)
        except:
            other = None
            # print(other)

        # sat act 代码 介绍
        sat_code = '4192 '
        sat1_desc = 'SAT or ACT scores should also be submitted if written. The university\'s SAT code is 4192 and the ACT code is 5265.'
        sat2_desc = None
        act_code = '5265'
        act_desc = 'SAT or ACT scores should also be submitted if written. The university\'s SAT code is 4192 and the ACT code is 5265.'

        item["ap"] = ap
        #item["duration_per"] = duration_per
        item["duration"] = duration
        item["school_name"] = school_name
        item["location"] = location
        item["campus"] = campus
        #item["degree_type"] = 1
        item["department"] = department
        #item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        item["major_name_en"] = major_name_en
        item["overview_en"] = overview_en
        #item["teach_time"] = 1
        item["start_date"] = start_date
        item["modules_en"] = modules_en
        item["career_en"] = career_en
        item["deadline"] = deadline
        item["apply_pre"] = 'CAD$'
        item["apply_fee"] = apply_fee
        item["entry_requirements_en"] = entry_requirements_en
        item["tuition_fee_pre"] = 'CAD$'
        item["require_chinese_en"] = require_chinese_en
        item["ielts_desc"] = ielts_desc
        item["ielts"] = ielts
        item["ielts_l"] = ielts_l
        item["ielts_s"] = ielts_s
        item["ielts_r"] = ielts_r
        item["ielts_w"] = ielts_w
        item["toefl_code"] = toefl_code
        item["toefl_desc"] = toefl_desc
        item["toefl_l"] = toefl_l
        item["toefl"] = toefl
        item["toefl_s"] = toefl_s
        item["toefl_r"] = toefl_r
        item["toefl_w"] = toefl_w
        item["interview_desc_en"] = interview_desc_en
        item["portfolio_desc_en"] = portfolio_desc_en
        item["other"] = other
        item["url"] = response.url
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0
        #item["duration"] = duration
        #item["tuition_fee"] = tuition_fee
        item["alevel"] = alevel
        item["ib"] = ib
        item["gaokao_zs"] = gaokao_zs
        item["gaokao_score_wk"] = gaokao_score_wk
        item["gaokao_score_lk"] = gaokao_score_lk
        item["specific_requirement_en"] = specific_requirement_en
        item["huikao_desc"] = huikao_desc
        item["huikao_zs"] = huikao_zs
        item["min_language_require"] = min_language_require
        item["sat_code"] = sat_code
        item["sat1_desc"] = sat1_desc
        item["sat2_desc"] = sat2_desc
        item["act_code"] = act_code
        item["act_desc"] = act_desc

        for i in degree_name_list:
            degree_name = i
            item["degree_name"] = degree_name
            if 'Bachelor of Engineering' in degree_name:
                item["tuition_fee"] = '26,007.42'
            elif 'Information Technology' in degree_name:
                item["tuition_fee"] = '23,091.20'
            elif 'Bachelor of Commerce' in degree_name:
                item["tuition_fee"] = '22,118.62'
            elif 'Nursing' in degree_name:
                item["tuition_fee"] = '20,333.88'
            elif 'Computer Science' in degree_name:
                item["tuition_fee"] = '21,131.22'
            else:
                item["tuition_fee"] = '19,940.96'
            #print(item["tuition_fee"])
            yield item

Ejemplo n.º 9

Mostrar archivo

    def parse(self, response):
        """Scrape one Cape Breton University programme page into items.

        Fields that are the same for every programme (fees, language
        requirements, deadline, ...) are hard-coded; the major name,
        department, overview and career text come from ``response``; the
        course list is fetched from the CBU calendar site with a Selenium
        Chrome session.

        Yields the item once per (degree name, 'Major'/'Honours') combination
        found on the page.

        NOTE(review): the same item object is mutated and re-yielded for each
        combination (as in the original code); a consumer that keeps
        references to yielded items will see the later mutations.
        """
        item = get_item(ScrapyschoolCanadaBenItem)

        # 3. major_name_en
        major_name_en = ''.join(
            response.xpath("/html/body/section[1]/div/div[2]/h1").extract())
        major_name_en = remove_tags(major_name_en).strip()

        # 5. department
        department = remove_tags(''.join(
            response.xpath(
                "/html/body/section[1]/div/div[2]/ul/li[2]").extract()))

        # 6. overview_en: everything after the "What is ..." heading, cut off
        # where the "Why Study" section begins.
        overview_en = remove_class(''.join(
            response.xpath(
                "//span[contains(text(),'What is the')]/../following-sibling::*|//span[contains(text(),'What Is The')]/../following-sibling::*|//b[contains(text(),'What is the')]/../following-sibling::*|//b[contains(text(),'What Is The')]/../following-sibling::*|//span[contains(text(),'What Is')]/../following-sibling::*|//b[contains(text(),'What Is')]/../following-sibling::*"
            ).extract()))
        why_study_at = overview_en.find('Why Study')
        # str.find returns -1 when the marker is missing; slicing with -1
        # would silently drop the last character, so only cut on a real hit.
        if why_study_at != -1:
            overview_en = overview_en[:why_study_at]
        overview_en = overview_en.replace('<h2><b>',
                                          '').replace('<h2><span>', '')

        # 7. career_en
        career_en = remove_class(''.join(
            response.xpath(
                "//*[contains(text(),'Possible Career Paths')]/../following-sibling::*|//*[contains(text(),'Career Path')]/../following-sibling::*|//strong[contains(text(),'Chemistry opens doors to career opportunities in')]/../following-sibling::*"
            ).extract()))

        # 8. modules_en: search the academic calendar for the major and wrap
        # every "CODE - TITLE" match in a <p> element.
        # NOTE(review): the find_element_by_* helpers are kept from the
        # original; confirm they exist in the installed Selenium version.
        driver = webdriver.Chrome()
        try:
            driver.get("http://calendar.cbu.ca/")
            search_box = driver.find_element_by_id('searchwords')
            search_box.send_keys(major_name_en)
            time.sleep(1)
            scope_select = Select(driver.find_element_by_id('what'))
            scope_select.select_by_value('Catalog')
            time.sleep(1)
            driver.find_element_by_id('searchBtn').click()
            # Fixed wait for the search results to be rendered.
            time.sleep(10)
            calendar_text = driver.find_element_by_xpath(
                '//*[@id="sis-wrap"]').text
            courses = re.findall(r'([A-Z\d\s]+-[A-Z\s:&,"?I]+)',
                                 calendar_text)
            # No match at all is stored as None rather than an empty value.
            modules_en = ''.join(
                '<p>' + course.replace('\n', '').strip() + '</p>'
                for course in courses) or None
        except Exception:
            # Best effort: a failed lookup must not abort the whole item.
            modules_en = None
        finally:
            # Always release the browser, even on an interrupt.
            driver.quit()

        toefl_code = '9142'
        fields = {
            # 1. school_name / 2. url
            'school_name': 'Cape Breton University',
            'url': response.url,
            'major_name_en': major_name_en,
            'department': department,
            'overview_en': overview_en,
            'career_en': career_en,
            'modules_en': modules_en,
            # 9. location
            'location': 'Sydney, Nova Scotia',
            # 10. entry_requirements_en
            'entry_requirements_en': '<p>An overall average of 65% is required for all students applying to most CBU credit programs based on high school performance. Additional requirements may apply to specific programs.</p>',
            # 11. require_chinese_en
            'require_chinese_en': '<p>Senior Middle School Graduation Certificate and Transcript</p><p>Applications from international students will be reviewed on an individual basis.We require graduation from an academic secondary school program or equivalent with an average of “C” in five, senior academic-level/university preparatory courses.</p>',
            # 12-13. TOEFL
            'toefl_desc': 'Internet-Based Test: 80',
            'toefl': 80,
            # 14-19. IELTS
            'ielts_desc': 'Overall Score: 6.5,No Band Below: 6.0',
            'ielts': 6.5,
            'ielts_r': 6.0,
            'ielts_w': 6.0,
            'ielts_s': 6.0,
            'ielts_l': 6.0,
            # 20-21. tuition fee
            'tuition_fee': '8,476.30',
            'tuition_fee_pre': '$',
            # 22-23. application fee
            'apply_fee': '103',
            'apply_pre': '$',
            # 24. alevel
            'alevel': 'General Certificate of Education; including a minimum of two Advanced Level courses (A Level) or four Advanced Supplementary levels (AS level) subjects and 5 GCSE subjects (O Level).',
            # 25. deadline
            'deadline': '2019-03-01',
            # 26-27. the SAT code reuses the TOEFL code value
            'toefl_code': toefl_code,
            'sat_code': toefl_code,
        }
        for field_name, field_value in fields.items():
            item[field_name] = field_value

        # 4. degree_name: a page may list several degrees in one string, which
        # must be split into one entry per degree. Rules are checked in order,
        # so the combined phrases come before the single-degree ones.
        degree_text = remove_tags(
            response.xpath(
                "/html/body/section[1]/div/div[2]/ul/li[1]").extract()[0])
        degree_rules = (
            ('Bachelor of Arts and Bachelor of Arts Community Studies',
             ['Bachelor of Arts', 'Bachelor of Arts Community Studies']),
            ('Bachelor of Arts, Bachelor of Science, and Bachelor of Arts Community Studies',
             ['Bachelor of Arts', 'Bachelor of Science',
              'Bachelor of Arts Community Studies']),
            ('Bachelor of Arts, Bachelor of Arts Community Studies and Bachelor of Science',
             ['Bachelor of Arts', 'Bachelor of Arts Community Studies',
              'Bachelor of Science']),
            ('Bachelor of Arts, Bachelor of Arts Community Studies and Bachelor of Arts in Environment',
             ['Bachelor of Arts', 'Bachelor of Arts Community Studies',
              'Bachelor of Arts in Environment']),
            ('Bachelor of Business Administration',
             ['Bachelor of Business Administration']),
            ('Bachelor of Arts Community Studies',
             ['Bachelor of Arts Community Studies']),
            ('Bachelor of Engineering Technology',
             ['Bachelor of Engineering Technology']),
            ('Bachelor of Science', ['Bachelor of Science']),
        )
        degree_names = ['Bachelor of Arts']  # fallback when no rule matches
        for phrase, names in degree_rules:
            if phrase in degree_text:
                degree_names = names
                break

        # The "Degree Options" entry is the same for every degree on the page,
        # so it is looked up once; a page without it counts as 'Major'.
        degree_options = response.xpath(
            "//strong[contains(text(),'Degree Options:')]/../preceding-sibling::li//following-sibling::li"
        ).extract()
        major = degree_options[-1] if degree_options else 'Major'
        if 'Honours' in major and 'Major' in major:
            options = ['Major', 'Honours']
        elif 'Honours' in major:
            options = ['Honours']
        else:
            options = ['Major']

        for degree_name in degree_names:
            item['degree_name'] = degree_name
            for option in options:
                item['other'] = option
                yield item

Ejemplo n.º 10

Mostrar archivo

    def parse(self, response):
        major_name_list = response.xpath(
            "//div[@id='contentBody']//ul[1]//li/a//text()").extract()
        clear_space(major_name_list)
        print("major_name_list==len:", len(major_name_list))

        # alllinks = response.xpath("//div[@id='contentBody']//ul//li/a/@href").extract()
        alllinks = response.xpath(
            "//div[@id='contentBody']//ul[1]//li/a/@href").extract()
        print(len(alllinks))
        # print(alllinks)
        # alllinks = list(set(alllinks))
        print(len(list(set(alllinks))))

        major_dict = {}
        if len(major_name_list) == len(alllinks):
            # 将PDF链接和专业名对应起来存进字典major_dict
            for i in range(len(major_name_list)):
                if "/factsheets" not in alllinks[i]:
                    alllinks[
                        i] = "https://www.uwinnipeg.ca/factsheets/" + alllinks[
                            i]
                else:
                    alllinks[i] = "https://www.uwinnipeg.ca" + alllinks[i]
                # if ":" in major_dict[i]:
                major_name_list[i] = major_name_list[i].replace(":",
                                                                " ").strip()
                major_dict[alllinks[i]] = major_name_list[i]
            print(major_dict)
            print(alllinks)
        #
        #     # 将PDF文件下载下来放到本地
        #     for j in range(len(major_name_list)):
        #         if major_dict.get(alllinks[j]) is not None:
        #             res = requests.get(alllinks[j])
        #             with open("D:/pycharm/hooli_scrapy_project/scrapySchool_Canada_Ben/scrapySchool_Canada_Ben/UniversityofWinnipeg/"+major_dict.get(alllinks[j])+".pdf", 'wb') as f:
        #                 f.write(res.content)
        #
        #
        #     # 将下载下来的本地文件转换成HTML文件
        #     for j in range(len(major_name_list)):
        #         if major_dict.get(alllinks[j]) is not None:
        #             try:
        #                 path = r"D:/pycharm/hooli_scrapy_project/scrapySchool_Canada_Ben/scrapySchool_Canada_Ben/UniversityofWinnipeg/"+major_dict.get(alllinks[j])+".pdf"
        #                 toPath = r"D:/pycharm/hooli_scrapy_project/scrapySchool_Canada_Ben/scrapySchool_Canada_Ben/UniversityofWinnipeg/programm_html/"+major_dict.get(alllinks[j])+".html"
        #                 self.readPDF(path, toPath)
        #             except Exception as e:
        #                 print("转换HTML失败：",str(e))

        # 找出本地专业HTML文件的路径，循环访问
        import os
        html_path = r"D:\pycharm\hooli_scrapy_project\scrapySchool_Canada_Ben\scrapySchool_Canada_Ben\UniversityofWinnipeg\programm_html"
        major_html_list = os.listdir(html_path)
        # print("==", major_html_list)
        for html_title in major_html_list:
            print("===========================")
            # 拼接每个专业的本地HTML文件的路径
            elem_path = html_path + "\\" + html_title
            print(elem_path)
            item = get_item(ScrapyschoolCanadaBenItem)

            # 公共字段
            self.parse_data(item)

            # 将HTML文件转成Element html，方便使用xpath获取数据
            # major_html = ""
            # major_text = ""
            with open(elem_path, 'r', encoding="utf-8") as f:
                major_html = etree.HTML(f.read())

            with open(elem_path, 'r', encoding="utf-8") as f:
                major_text = f.read()
            # print(major_html)

            # item['major_name']
            item['department'] = None
            department = major_html.xpath(
                "//p[contains(text(), 'FACULTY OF')]//text()")
            clear_space(department)
            # print("department: ", department)
            if len(department) > 0:
                item['department'] = department[0].replace("GUPTA FACULTY OF K INES IOLOGY AND APPL IED HEALTH", "Gupta Faculty of Kinesiology and Applied Health") \
                    .replace("EDUCAT ION", "EDUCATION").replace("SC IENCE", "SCIENCE")\
                    .replace("BUS INESS & ECONOM ICS", "BUSINESS and ECONOMICS").title().strip()
                item['department'] = item['department'].replace(
                    "Of", "of").replace("And", "and")
            # print("item['department']: ", item['department'])

            major_name_en = major_html.xpath(
                "//p[contains(text(), 'FACULTY OF')]/following-sibling::p[1]//text()"
            )
            clear_space(major_name_en)
            # print("major_name_en: ", major_name_en)
            if len(major_name_en) > 0:
                item['major_name_en'] = major_name_en[0].title().strip()
                item['major_name_en'] = item['major_name_en'].replace(
                    "Of", "of").replace("And", "and").replace("(Bsc)",
                                                              "").strip()
            if item['major_name_en'] is None:
                item['major_name_en'] = html_title.replace(
                    ".html", "").replace("(Bsc)", "").strip()
            print("item['major_name_en']: ", item['major_name_en'])

            url_dict = {
                'Anthropology':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-anthropology.pdf',
                'Applied Computer Science':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-applied-computer-sci.pdf',
                'Bioanthropology':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-bioanthropology.pdf',
                'Biochemistry':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-biochemistry.pdf',
                'Biology':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-biology.pdf',
                'Biopsychology':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-biopsychology.pdf',
                'Business & Administration':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-business-admin.pdf',
                'Chemistry':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-chemistry.pdf',
                'Classics':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-classics.pdf',
                'Conflict Resolution Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-conflict-res-studies.pdf',
                'Co-operative Education':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-co-op-education.pdf',
                'Criminal Justice':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-criminal-justice.pdf',
                'Dance Program':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-dance.pdf',
                'Developmental Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-developmental-studies.pdf',
                'Disability Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-disability-studies.pdf',
                'East Asian Languages & Cultures':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-east-asian-languages-cultures.pdf',
                'Economics':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-economics.pdf',
                'Economics & Finance':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-economics-finance.pdf',
                'Education':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-education.pdf',
                'English':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-english.pdf',
                'Environmental Studies Ba':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-env-studies-ba.pdf',
                'Environmental Sciences':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-env-science-bsc.pdf',
                'Student Designed Major':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-flexible-major.pdf',
                'French Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-french-studies.pdf',
                'Geography':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-geography.pdf',
                'German Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-german-studies.pdf',
                'German-Canadian Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-german-cdn-studies.pdf',
                'History':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-history.pdf',
                'History of Art':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-history-of-art.pdf',
                'Human Rights':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-human-rights.pdf',
                'Indigenous Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-indigenous-studies.pdf',
                'Interdisciplinary Linguistics':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-interdisc-linguistics.pdf',
                'International Development Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-international-dev-studies.pdf',
                'Bachelor of Kinesiology':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-kinesiology.pdf',
                'Mathematics':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-mathematics.pdf',
                'Mennonite Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-mennonite-studies.pdf',
                'Philosophy':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-philosophy.pdf',
                'Bachelor of Physical and Health Education':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-physical-and-health-education.pdf',
                'Physics':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-physics.pdf',
                'Medical Physics':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-medical-physics.pdf',
                'Radiation Health and Safety':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-radiation-health-and-safety.pdf',
                'Political Science':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-political-science.pdf',
                'Psychology':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-psychology.pdf',
                'Radiation Therapy':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-radiation-therapy.pdf',
                'Religion & Culture':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-religion-culture.pdf',
                'Rhetoric and Communications':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-rhetoric-and-communications.pdf',
                'Science-Business Stream':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-science-business-stream.pdf',
                'Sociology':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-sociology.pdf',
                'Spanish Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-spanish-studies.pdf',
                'Statistics':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-statistics.pdf',
                'Theatre and Film':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-theatre-and-film-stream.pdf',
                'Urban and Inner-City Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-urban-and-inner-city-studies.pdf',
                'Women’S & Gender Studies':
                'https://www.uwinnipeg.ca/factsheets/docs/factsheet-women-and-gender-studies.pdf',
            }
            item['url'] = url_dict.get(item['major_name_en'])
            print("item['url']: ", item['url'])

            if "Education" in item[
                    'major_name_en'] and item['department'] is None:
                item['department'] = "Faculty of Education"
            print("item['department']: ", item['department'])

            department_fee = {
                "Faculty of Arts": "13,695",
                "Faculty of Business and Economics": "17,010",
                "Faculty of Education": "14,259",
                "Gupta Faculty of Kinesiology and Applied Health": "14,589",
                "Faculty of Science": "16,372.5",
            }
            item['tuition_fee_pre'] = 'CAD$'
            item['tuition_fee'] = department_fee.get(item['department'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            # 专业描述
            overview_en = major_html.xpath(
                "//p[contains(text(), 'SAMPLE CAREERS')]/preceding-sibling::p[position()<last()-1]"
            )
            if len(overview_en) == 0:
                overview_en = major_html.xpath(
                    "//p[contains(text(), 'SAMPLE COURSES')]/preceding-sibling::p[position()<last()-1]|//p[contains(text(), 'CAREER OPPORTUNITIES')]/preceding-sibling::p[position()<last()-1]"
                )
            # print("overview_en", overview_en)
            overview_en_str = ""
            if len(overview_en) > 0:
                for m in overview_en:
                    overview_en_str += etree.tostring(m,
                                                      encoding='unicode',
                                                      method='html')
                item['overview_en'] = remove_class(
                    clear_lianxu_space([overview_en_str]))
            # print("item['overview_en']: ", item['overview_en'])
            '''就业信息'''
            # print("===", major_text)
            key1 = "<p>SAMPLE CAREERS"
            if key1 not in major_text:
                key1 = "<p>CAREER OPPORTUNITIES "

            key2 = "<p>YOUR EDUCATION"
            if key2 not in major_text:
                key2 = "<p>SAMPLE COURSES"
            if key1 in major_text and key2 in major_text:
                item['career_en'] = remove_class(
                    getContentToXpath(major_text, key1, key2))
            # print("item['career_en']: ", item['career_en'])
            '''课程设置'''
            major_name_key = [
                "Indigenous Studies",
                "Anthropology",
                "Classics",
                "Conflict Resolution Studies",
                "Criminal Justice",
                "English",
                "Filmmaking",
                "French Studies",
                "German Studies",
                "History",
                "Interdisciplinary Linguistics",
                "International Development Studies",
                "Philosophy",
                "Political Science",
                "Psychology",
                "Religion & Culture",
                "Rhetoric and Communications",
                "Sociology",
                "Spanish Studies",
                "Theatre and Film",
                "Urban and Inner-City Studies",
                "Women’S & Gender Studies",
                "Business & Administration",
                "Economics",
                "Economics & Finance",
                "Bachelor of Kinesiology",
                "Applied Computer Science",
                "Bioanthropology",
                "Biochemistry",
                "Biology",
                "Biopsychology",
                "Chemistry",
                "Environmental Sciences",
                "Environmental Studies",
                "Geography",
                "Mathematics",
                "Physics",
                "Statistics",
            ]
            modules_url_value = [
                "http://uwinnipeg.ca/sample-first-year-programs/arts/aboriginal-gov.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/anthropology.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/classics.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/conflict-res.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/criminal-justice.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/english.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/filmmaking.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/french.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/german.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/history.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/id-linguistics.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/intl-dev.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/philosophy.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/poli-sci.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/psychology.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/religion-culture.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/rhet-writing-comm.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/sociology.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/spanish.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/theatre.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/uic.html",
                "http://uwinnipeg.ca/sample-first-year-programs/arts/wgs.html",
                "http://uwinnipeg.ca/sample-first-year-programs/bus-econ/business.html",
                "http://uwinnipeg.ca/sample-first-year-programs/bus-econ/economics.html",
                "http://uwinnipeg.ca/sample-first-year-programs/bus-econ/econ-finance.html",
                "http://uwinnipeg.ca/sample-first-year-programs/kinesiology/index.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/acs.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/bioanthro.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/biochemistry.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/biology.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/biopsychology.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/chemistry.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/env-studies.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/env-studies.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/geography.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/mathematics.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/physics.html",
                "http://uwinnipeg.ca/sample-first-year-programs/science/statistics.html",
            ]
            modules_dict = {}
            for i in range(len(major_name_key)):
                modules_dict[major_name_key[i]] = modules_url_value[i]

            modules_url = modules_dict.get(item['major_name_en'])
            if modules_url is not None:
                item['modules_en'] = self.parse_modules(modules_url)
            # print("item['modules_en']: ", item['modules_en'])
            '''学位名称和课程长度'''
            if item['overview_en'] is not None:
                degree_name_re = re.findall(
                    r"Bachelor\sof.*?\)", item['overview_en'].replace(
                        "Bache lor",
                        "Bachelor").replace("Sc ience", "Science"))
                print("degree_name_re； ", degree_name_re)
                ''' 2种情况：（1）只有一个学位但是，课程长度有多个
                                (2)有多个学位，并且每个学位可分多个课程长度或者一个    
                '''

                if len(degree_name_re) == 1:
                    degree_name = re.findall(r"Bachelor\sof.*?\(",
                                             ''.join(degree_name_re))
                    # print("degree_name: ", degree_name)
                    item['degree_name'] = ''.join(degree_name).replace(
                        "degree", "").replace("(", "").strip()
                    if len(item['degree_name']) > 30:
                        item['degree_name'] = ''.join(
                            re.findall(
                                r"Bachelor\sof\sBusiness\sAdministration|Bachelor\sof\s\w+",
                                ''.join(degree_name_re)))
                    print("item['degree_name']: ", item['degree_name'])

                    # 正则匹配课程长度
                    duration_re = re.findall(
                        r"\d-year\sHonours|\d-year|\d\syear\sHonours|or\sHonours\)",
                        ''.join(degree_name_re))
                    print("duration_re: ", duration_re)
                    if len(duration_re) > 0:
                        item['duration_per'] = 1
                        for duration in duration_re:
                            if "Honours" in duration:
                                item['degree_name'] = item[
                                    'degree_name'] + " Honours"
                                item['duration'] = '4'
                            else:
                                item['duration'] = ''.join(
                                    re.findall(r"\d", duration))
                            print("item['degree_name']: ", item['degree_name'])
                            print("item['duration']: ", item['duration'])
                            print("item['duration_per']: ",
                                  item['duration_per'])
                            if item['department'] is None:
                                item['department'] = "Faculty of " + item[
                                    'degree_name'].replace("Bachelor of ",
                                                           "").strip()
                                if item['tuition_fee'] is None:
                                    item['tuition_fee'] = department_fee.get(
                                        item['department'])
                            yield item
                    else:
                        if item['department'] is None:
                            item['department'] = "Faculty of " + item[
                                'degree_name'].replace("Bachelor of ",
                                                       "").strip()
                            if item['tuition_fee'] is None:
                                item['tuition_fee'] = department_fee.get(
                                    item['department'])
                        yield item

                elif len(degree_name_re) > 1:
                    for degree_name_duration in degree_name_re:
                        degree_name = re.findall(
                            r"Bachelor\sof.*?\(",
                            remove_tags(degree_name_duration))
                        # print("degree_name--: ", degree_name)
                        item['degree_name'] = ''.join(degree_name).replace(
                            "Degree", "").replace("degree",
                                                  "").replace("(", "").strip()
                        if len(item['degree_name']) > 30:
                            item['degree_name'] = ''.join(
                                re.findall(
                                    r"Bachelor\sof\sBusiness\sAdministration|Bachelor\sof\s\w+",
                                    degree_name_duration.replace(
                                        "Adm in istration",
                                        "Administration").replace(
                                            "</p><p>", " ")))
                        print("item['degree_name']2: ", item['degree_name'])

                        # 正则匹配课程长度
                        duration_re = re.findall(
                            r"\d-year\sHonours|\d-year|\d\syear\sHonours|or\sHonours\)",
                            remove_tags(degree_name_duration))
                        print("duration_re2: ", duration_re)
                        if len(duration_re) > 0:
                            item['duration_per'] = 1
                            for duration in duration_re:
                                if "Honours" in duration:
                                    item['degree_name'] = item[
                                        'degree_name'] + " Honours"
                                    item['duration'] = '4'
                                else:
                                    item['duration'] = ''.join(
                                        re.findall(r"\d", duration))
                                print("item['degree_name']2: ",
                                      item['degree_name'])
                                print("item['duration']2: ", item['duration'])
                                print("item['duration_per']2: ",
                                      item['duration_per'])
                                if item['department'] is None:
                                    item['department'] = "Faculty of " + item[
                                        'degree_name'].replace(
                                            "Bachelor of ", "").strip()
                                    if item['tuition_fee'] is None:
                                        item[
                                            'tuition_fee'] = department_fee.get(
                                                item['department'])
                                yield item
                        else:
                            if item['department'] is None:
                                item['department'] = "Faculty of " + item[
                                    'degree_name'].replace("Bachelor of ",
                                                           "").strip()
                                if item['tuition_fee'] is None:
                                    item['tuition_fee'] = department_fee.get(
                                        item['department'])
                            yield item
                else:
                    if "Bachelor of" in item['major_name_en']:
                        item['degree_name'] = item['major_name_en']
                    yield item
            else:
                yield item

Ejemplo n.º 11

Mostrar archivo

Archivo: UniversityofPrinceEdwardIsland_U.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Build the item for one University of Prince Edward Island program page.

        The program name, overview, careers and course list are scraped from
        ``response``; every other field is a constant, some of them selected
        by the faculty that is inferred from the program name.

        Yields:
            The item created by ``get_item(ScrapyschoolCanadaBenItem)`` with
            all fields below filled in.
        """
        item = get_item(ScrapyschoolCanadaBenItem)

        # 1. school_name
        school_name = 'University of Prince Edward Island'

        # 2. url
        url = response.url

        # 3. major_name_en
        major_name_en = response.xpath('/html[1]/body[1]/div[2]/div[1]/div[3]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]//h2').extract()
        major_name_en = remove_tags(''.join(major_name_en)).strip()

        # 4. overview_en
        overview_en = response.xpath("//div[@id='quicktabs-tabpage-view__flex_tabs__block_1-0']//div[@class='tabcontent']").extract()
        overview_en = remove_class(''.join(overview_en)).strip()

        # 5. career_en -- best effort: None when the careers block is absent.
        career_en = response.xpath("//div[contains(@class,'view-programpageinfo')]//span[@class='views-label views-label-field-careers'][contains(text(),'Careers:')]//../div").extract()
        try:
            career_en = remove_class(career_en[0]).strip()
        except Exception:
            career_en = None

        # 6. modules_en -- when the dedicated course tab yields fewer than 50
        # characters, fall back to the second generic "tabcontent" block.
        modules_en = response.xpath("//div[contains(@id,'quicktabs-tabpage-view__coursefields__block')]//div[@class='views-field views-field-field-courses']").extract()
        modules_en = remove_class(''.join(modules_en)).strip()
        if len(modules_en) < 50:
            tab_blocks = response.xpath("//div[@class='tabcontent']").extract()
            try:
                modules_en = remove_class(tab_blocks[1])
            except Exception:
                modules_en = None

        # 7. department -- inferred from keywords in the program name; the
        # checks run in this order and the first match wins.
        science_keywords = (
            'Psychology',
            'Pre-Veterinary Medicine Stream',
            'Wildlife Conservation',
            'Radiography',
            'Physics',
            'Paramedicine',
            'Medical and Biological Physics',
            'Mathematical and Computational Sciences',
            'Kinesiology',
            'Foods and Nutrition',
            'Family Science',
            'Environmental Studies',
            'Dietetic Internship (Foods and Nutrition)',
            'Child and Family Studies',
            'Chemistry',
            'Biotechnology',
            'Biology',
            'Applied Climate Change and Adaptation',
        )
        business_keywords = (
            'Bachelor of Business Studies',
            'Bachelor of Business in Tourism and Hospitality',
            'Accelerated Bachelor of Business Administration',
            'Tourism and Hospitality',
            'Organizational Management',
            'Marketing',
            'International Business',
            'Finance',
            'Entrepreneurship',
            'Accounting',
            'Bachelor of Business Administration',
        )
        if 'Sustainable Design Engineering' in major_name_en:
            department = 'Faculty of Sustainable Design Engineering'
        elif any(keyword in major_name_en for keyword in science_keywords):
            department = 'Faculty of Science'
        elif 'Nursing' in major_name_en:
            department = 'Faculty of Nursing'
        elif 'Bachelor of Education' in major_name_en:
            department = 'Faculty of Education'
        elif any(keyword in major_name_en for keyword in business_keywords):
            department = 'Faculty of Business'
        else:
            department = 'Faculty of Arts'

        # 8. tuition_fee
        if 'Education' in department:
            tuition_fee = '10,692'
        else:
            tuition_fee = '7,176'

        # 9. tuition_fee_pre
        tuition_fee_pre = '$'

        # 10. entry_requirements_en
        entry_requirements_en = '<p>Average of 75-80% on 5 academic subjects from Grade 11 and 12, and average B on the Huikao.</p>'

        # 11. require_chinese_en
        require_chinese_en = '<p>Average of 75-80% on 5 academic subjects from Grade 11 and 12, and average B on the Huikao.</p>'

        # 12./13. sat1_desc, act_desc
        sat1_desc = "Minimum overall 'B' average (GPA 2.8 on a 4.0 scale) in a recognized academic grade 12 program. At least 4 different subjects at the matriculation level must be represented in the high school diploma. SAT or ACT results are not required, but can be used for scholarship reference.  "
        act_desc = sat1_desc

        # 14. alevel
        alevel = 'General Certificate of Secondary Education (GCSE or IGSCE) with five O-level subjects with minimum C grade or better; General Certificate of Education Advanced or Advanced Supplementary Levels (GCE-A or AICE or GCE-AS) with at least 2 A-levels (two AS subjects may be substituted for one A-level). Minimum C in each and all AS and A-levels.'

        # 15. location
        location = 'Prince Edward Island'

        # Arts, Science and Business share one set of language thresholds;
        # every other department gets the stricter set.
        # NOTE(review): 'Faculty of Sustainable Design Engineering' matches
        # none of the three names and therefore lands in the stricter tier --
        # confirm this is intended.
        standard_tier = ('Arts' in department or 'Business' in department
                         or 'Science' in department)

        # 16.-21. ielts_desc and the IELTS band scores
        ielts_desc = 'arts,science,business Overall score of 6.5 with 6.5 in writing no other band below 6;nursing,education Overall score of 7 with 7 in writing and speaking; 6.5 in reading and listening'
        if standard_tier:
            ielts = 6.5
            ielts_w = 6.5
            ielts_r = 6
            ielts_l = 6
            ielts_s = 6
        else:
            ielts = 7
            ielts_w = 7
            ielts_r = 6.5
            ielts_l = 6.5
            ielts_s = 7

        # 22.-27. toefl_desc and the TOEFL section scores
        toefl_desc = 'arts,science,business 80 with minimum of 20 in each category; nursing,education 100 with a minimum of 25 in speaking and writing, 22 in reading and listening'
        if standard_tier:
            toefl = 80
            toefl_w = 20
            toefl_s = 20
            toefl_l = 20
            toefl_r = 20
        else:
            toefl = 100
            toefl_w = 25
            toefl_s = 25
            # toefl_desc states 22 for reading and listening in this tier.
            toefl_l = 22
            toefl_r = 22

        # 28./29. toefl_code, sat_code
        toefl_code = '0941'
        sat_code = '0941'

        # 30. act_code
        act_code = '7935'

        # 31. apply_fee
        apply_fee = 50

        # 32. apply_pre
        apply_pre = '$'

        item['school_name'] = school_name
        item['url'] = url
        item['major_name_en'] = major_name_en
        item['overview_en'] = overview_en
        item['career_en'] = career_en
        item['modules_en'] = modules_en
        item['department'] = department
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['entry_requirements_en'] = entry_requirements_en
        item['require_chinese_en'] = require_chinese_en
        item['sat1_desc'] = sat1_desc
        item['act_desc'] = act_desc
        item['alevel'] = alevel
        item['location'] = location
        item['ielts_desc'] = ielts_desc
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['toefl_desc'] = toefl_desc
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_l'] = toefl_l
        item['toefl_w'] = toefl_w
        item['toefl_code'] = toefl_code
        item['sat_code'] = sat_code
        item['act_code'] = act_code
        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        yield item

Ejemplo n.º 12

Mostrar archivo

    def parse(self, response):
        """Fill an item for one University of the Fraser Valley program page.

        The page heading selects the major and degree name from a fixed
        table; campus, start date, overview and careers are scraped from
        ``response``; all remaining fields are constants.

        An unknown heading no longer raises: the stripped heading itself is
        used as the major name and ``degree_name`` is left as ``None``.
        Likewise a missing "Location:" label gives ``campus = None`` and a
        missing "Start date:" label falls through to the default start date.
        """
        item = get_item(ScrapyschoolCanadaBenItem)

        # 1. school_name
        school_name = 'Fraser Valley University'

        # 2. url
        url = response.url

        # 3./4. major_name_en and degree_name, both keyed by the page heading.
        major_name_en_a = response.xpath(
            '//*[@id="pl_inner"]/div/div[2]/h1').extract()
        major_name_en_a = ''.join(major_name_en_a)
        major_name_en_a = remove_tags(major_name_en_a)
        # heading -> (major name, degree name)
        program_by_heading = {
            "Bachelor of Agricultural Science, Horticulture major":
            ("Horticulture", "Bachelor of Agricultural Science"),
            "Bachelor of Arts in Adult Education (BA AE)":
            ("Adult Education", "Bachelor of Arts"),
            "Anthropology — Bachelor of Arts":
            ("Anthropology", "Bachelor of Arts"),
            "Biology":
            ("Biology", "Bachelor of Science"),
            "Bachelor of Fine Arts, Visual Arts":
            ("Visual Arts", "Bachelor of Fine Arts"),
            "Aviation — Bachelor of Business Administration":
            ("Aviation", "Bachelor of Business Administration"),
            "Chemistry":
            ("Chemistry", "Bachelor of Science"),
            "Bachelor of Computer Information Systems":
            ("Computer Information Systems",
             "Bachelor of Computer Information Systems"),
            "Education — bachelor's degree":
            ("Education", "Bachelor of Education"),
            "Computing Science":
            ("Computing Science", "Bachelor of Science"),
            "Bachelor of General Studies":
            ("General Studies", "Bachelor of General Studies"),
            "Creative Writing — Bachelor of Arts":
            ("Creative Writing", "Bachelor of Arts"),
            "Economics":
            ("Economics", "Bachelor of Arts"),
            "French — Bachelor of Arts":
            ("French", "Bachelor of Arts"),
            "Global Development Studies":
            ("Global Development Studies", "Bachelor of Arts"),
            "Indigenous Studies — Bachelor of Arts degree":
            ("Indigenous Studies", "Bachelor of Arts"),
            "Kinesiology":
            ("Kinesiology", "Bachelor of Kinesiology"),
            "English — Bachelor of Arts":
            ("English", "Bachelor of Arts"),
            "Mathematics major (BA/BSc), extended minor (BA), or minor (BA/BSc/BCIS/BGS)":
            ("Mathematics", "Bachelor of Arts/Bachelor of Science"),
            "Media Arts":
            ("Media Arts", "Bachelor of Arts"),
            "Geography — Bachelor of Arts":
            ("Geography", "Bachelor of Arts"),
            "Bachelor of Science in Nursing":
            ("Nursing", "Bachelor of Science"),
            "Peace and Conflict Studies":
            ("Peace and Conflict Studies", "Bachelor of Arts"),
            "Physics":
            ("Physics", "Bachelor of Science"),
            "Political Science — Bachelor of Arts":
            ("Political Science", "Bachelor of Arts"),
            "Media &amp; Communications — Bachelor of Arts":
            ("Media &amp; Communications", "Bachelor of Arts"),
            "Psychology":
            ("Psychology", "Bachelor of Arts"),
            "Sociology — Bachelor of Arts":
            ("Sociology", "Bachelor of Arts"),
            "Bachelor of Arts in Theatre":
            ("Theatre", "Bachelor of Arts"),
            "Philosophy — Bachelor of Arts":
            ("Philosophy", "Bachelor of Arts"),
            "Social Work — bachelor's degree":
            ("Social Work", "Bachelor of Social Work"),
            "Physical Geography — Bachelor of Science":
            ("Physical Geography", "Bachelor of Science"),
        }
        # Strip before the lookup so surrounding whitespace in the heading
        # does not defeat the match; unknown headings fall back to themselves.
        heading = major_name_en_a.strip()
        major_name_en, degree_name = program_by_heading.get(
            heading, (heading, None))
        major_name_en = major_name_en.replace('&amp; ', '')

        # 5. location
        location = 'Vancouver'

        # 6./7. campus and start_date are parsed from the same paragraph,
        # so it is extracted once.
        info_text = response.xpath(
            '//*[@id="pl_inner"]/div/div[2]/div/div[3]/div[2]/p').extract()
        info_text = remove_tags(''.join(info_text))

        # 6. campus -- the text between "Location:" and "Cost".
        campus_hits = re.findall(r'Location:\s(.*)Cost', info_text, re.S)
        if campus_hits:
            campus = campus_hits[0].replace('\r', '').replace('\n',
                                                              '').strip()
        else:
            campus = None

        # 7. start_date -- map the listed intake months to fixed dates.
        # Groups are tried in order and the first match wins; anything
        # unmatched keeps the '2019-9' default.
        start_hits = re.findall(r'Start date:(.*)', info_text)
        start_text = start_hits[0].replace('\xa0\r', '') if start_hits else ''
        intake_table = (
            (('January, May, September', 'September, January, May',
              'September, May, January', 'May, January, September'),
             '2019-1,2019-5,2019-9'),
            (('January, September', 'September, January'), '2019-1,2019-9'),
            (('August',), '2019-8'),
        )
        start_date = '2019-9'
        for month_lists, dates in intake_table:
            if any(months in start_text for months in month_lists):
                start_date = dates
                break

        # 8. duration
        duration = 4

        # 9. duration_per
        duration_per = 1

        # 10. overview_en
        overview_en = response.xpath(
            "//h2[contains(text(),'PROGRAM DESCRIPTION')]/../following-sibling::*[1]"
        ).extract()
        overview_en = remove_class(''.join(overview_en))

        # 11. career_en
        career_en = response.xpath(
            "//h2[contains(text(),'CAREER EXPECTATIONS')]/../following-sibling::*[1]"
        ).extract()
        career_en = remove_class(''.join(career_en))

        # 12. tuition_fee
        tuition_fee = '17,160'

        # 13. tuition_fee_pre
        tuition_fee_pre = '$'

        # 14. tuition_fee_per (currently not written to the item, see below)
        tuition_fee_per = 1

        # 15. ap
        ap = 'A C+ final grade in one of English 12, English 12 First Peoples, AP English, or IB English.'

        # 16. ib
        ib = 'A C+ final grade in one of English 12, English 12 First Peoples, AP English, or IB English.'

        # 17.-22. ielts_desc and the IELTS band scores
        ielts_desc = 'IELTS (Academic) - score of 6.5 or higher with a minimum band score of 6.0'
        ielts = 6.5
        ielts_w = 6
        ielts_r = 6
        ielts_l = 6
        ielts_s = 6

        # 23.-28. toefl_desc and the TOEFL section scores
        toefl_desc = 'TOEFL - score of 88 or higher (iBT) with no section below 20'
        toefl = 88
        toefl_r = 20
        toefl_w = 20
        toefl_s = 20
        toefl_l = 20

        # 29. toefl_code
        toefl_code = '9736'

        # 30. sat_code
        sat_code = '9736'

        # 31. apply_fee
        apply_fee = 150

        # 32. apply_pre
        apply_pre = '$'

        # 33. entry_requirements_en
        entry_requirements_en = '<p>The following are the minimum requirements for admission to UFV.</p><p></p><p>All applicants must meet ONE of the following:</p><p></p><p>B.C. high school graduation or equivalent;</p><p>Or completion of a minimum of nine UFV or transferable post-secondary credits with a minimum 2.00 GPA (C average) based on all credits </p>attempted;<p>Or a minimum of 19 years of age by the start of the first class;</p><p>Or, for admission into preparatory level programs only, a minimum of 17 years of age and out of high school for at least one year by the </p>start of the semester.<p>English requirements</p><p>English is the language of instruction at UFV. To be successful, applicants must demonstrate language proficiency by meeting the </p>following requirement:<p></p><p>A C+ final grade in one of English 12, English 12 First Peoples, AP English, or IB English.</p>'

        # 34. require_chinese_en
        require_chinese_en = entry_requirements_en

        # 35. deadline
        deadline = '2019-05-01,2019-10-01'

        # The following was used to scrape the course-setup fields
        # department = response.xpath('//*[@id="sb-site"]/div[2]/div/div[1]/h1/a').extract()
        # department = ''.join(department)
        # department = remove_tags(department)
        # # print(department)
        # other = response.xpath('//h2').extract()[:-2]
        # other = ''.join(other)
        # other = remove_class(other)
        # # print(other,response.url)
        # item['department'] = department
        # item['other'] = other
        # item['school_name'] = 'sss'
        # yield item

        item['school_name'] = school_name
        item['url'] = url
        item['major_name_en'] = major_name_en
        item['degree_name'] = degree_name
        item['location'] = location
        item['campus'] = campus
        item['start_date'] = start_date
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        # item['tuition_fee_per'] = tuition_fee_per
        item['ap'] = ap
        item['ib'] = ib
        item['ielts_desc'] = ielts_desc
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['toefl_desc'] = toefl_desc
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_l'] = toefl_l
        item['toefl_w'] = toefl_w
        item['toefl_code'] = toefl_code
        item['sat_code'] = sat_code
        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        item['entry_requirements_en'] = entry_requirements_en
        item['require_chinese_en'] = require_chinese_en
        item['deadline'] = deadline
        # NOTE(review): the visible code populates ``item`` but never yields
        # or returns it -- confirm whether a trailing ``yield item`` was lost.

Ejemplo n.º 13

Mostrar archivo

    def parse(self, response):
        item = getItem.get_item(ScrapyschoolCanadaBenItem)



        try:
            major_name_en = response.xpath('//*[@id="framework"]/section[1]/div[1]/h1').extract()[0]
            #major_name_en = ''.join(major_name_en)
            #major_name_en = major_name_en.replace('\r\n','').replace('\n','').replace('           ','').replace('\t','').replace('     ','')
            major_name_en = remove_tags(major_name_en)
          #  print(major_name_en)
        except:
            major_name_en = None
            #print(major_name_en)
#1.学校名称
        school_name = 'Carleton University'

#2.地点
        try:
            location = 'Ontario'
            #location = remove_tags(location)
            #print(location)
        except:
            location = None
# 3. 校区
        try:
            campus = response.xpath('//h2[contains(text(),"Campus")]/following-sibling::div[1]').extract()[0]
            campus = remove_tags(campus)
            campus = campus.replace(', Online', '')
            campus = campus.replace(' ', '')
            campus = campus.split(',')
            # print(campus_list)
        except:
            campus = None
            # print(campus_list)

                # 4. 学院
        try:
            department = response.xpath('//h2[contains(text(),"Department")]/following-sibling::div[2]').extract()[0]

            department = remove_tags(department)
                # print(department)
        except:
            department = None
            # print(department)

# 4. 学位名称
        try:
            degree_name =  response.xpath('//h2[contains(text(),"Degrees")]/following-sibling::div[1]').extract()[0]
            degree_name = remove_tags(degree_name, keep=('li', 'br', ''))
            degree_name = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', degree_name)
            degree_name = degree_name.replace('<br>', '---')
            degree_name = degree_name.replace('<li>', '').replace('</li>', '---')
            degree_name = degree_name.replace('<span>', '').replace('</span>', '---')
            degree_name = degree_name.split('---')
            # print(degree_name)
        except:
            degree_name = None
            # print(degree_name)

# 5.学位描述
        try:
            degree_overview_en = response.xpath('//div[2]/div[1]/div/div/div/p|//div/div[1]/div/div/div/p').extract()
            degree_overview_en = ''.join(degree_overview_en)
            #degree_overview_en = remove_tags(degree_overview_en)
            degree_overview_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',degree_overview_en)
            #degree_overview_en = degree_overview_en.replace('\r\n','')
            #degree_overview_en = degree_overview_en.replace('\n','')
            #degree_overview_en = degree_overview_en.replace('\n','')
            #degree_overview_en = degree_overview_en.replace('  ',' ')
            #degree_overview_en = degree_overview_en.replace('					','')
            #degree_overview_en = degree_overview_en.replace('			  	','')
            #print(degree_overview_en)
        except:
            degree_overview_en = None
            #print(degree_overview_en)

#6.专业英文


#7.专业介绍
        try:
            #overview_en = degree_overview_en
            overview_en = degree_overview_en
            # print(overview_en)
        except:
            overview_en = degree_overview_en
            # print(overview_en)

#8.入学时间
        try:
            start_date = '2019-09'
            # start_date = ','.join(start_date)
            # start_date = remove_tags(start_date)
            # start_date = start_date.replace('Spring','').replace('Winter','').replace('Summer','').replace('Fall','')
            # start_date = start_date.replace('September 2019','2019-09').replace('May 2019','2019-05').replace('July 2019','2019-07').replace('January 2020','2020-01').replace('January 2019','2019-01')
            # #print(start_date)
        except:
            start_date = None
            #print(start_date)

#9.课程长度
        # try:
        #     duration = response.xpath('').extract()[0]
        #     duration = remove_tags(duration)
        #     # print(duration)
        # except:
        #     duration = None
        #     # print(duration)

#10.课程设置
        try:
            modules_en_dict = {"Aerospace Engineering":"AERO/",
    "African Studies":"AFRI/",
    "American Sign Language":"ASLA/",
    "Anthropology":"ANTH/",
    "Applied Linguistics and Discourse Studies":"ALDS/",
    "Arabic":"ARAB/",
    "Theory/History":"ARCH/",
    "Technical":"ARCC/",
    "Urban":"ARCU/",
    "Techniques":"ARCN/",
    "Design Studios/Design Thesis/Research":"ARCS/",
    "Art History":"ARTH/",
    "Biochemistry":"BIOC/",
    "Biology":"BIOL/",
    "Business":"BUSI/",
    "Canadian Studies":"CDNS/",
    "Centre for Initiatives in Education":"CIED/",
    "Chemistry":"CHEM/",
    "Child Studies":"CHST/",
    "Chinese":"CHIN/",
    "Civil Engineering":"CIVE/",
    "Classical Civilization":"CLCV/",
    "Co-operative Education":"COOP/",
    "Cognitive Science":"CGSC/",
    "Communication and Media Studies":"COMS/",
    "Communication Courses for Disciplines and Professions":"CCDP/",
    "Computer Science":"COMP/",
    "Criminology and Criminal Justice":"CRCJ/",
    "Digital Humanities":"DIGH/",
    "Disability Studies":"DBST/",
    "Earth Sciences":"ERTH/",
    "Economics":"ECON/",
    "Electronics":"ELEC/",
    "Engineering Common Core Courses":"ECOR/",
    "English":"ENGL/",
    "English as a Second Language":"ESLA/",
    "Environmental Engineering":"ENVE/",
    "Environmental Science":"ENSC/",
    "Environmental Studies":"ENST/",
    "European, Russian and Eurasian Studies":"EURR/",
    "Film Studies":"FILM/",
    "First-Year Seminars":"FYSM/",
    "Food Science":"FOOD/",
    "French":"FREN/",
    "French Interdisciplinary Studies":"FINS/",
    "Geography":"GEOG/",
    "Geomatics":"GEOM/",
    "German":"GERM/",
    "Global and International Studies":"GINS/",
    "Global Politics":"GPOL/",
    "Greek":"GREK/",
    "Health Sciences":"HLTH/",
    "Hebrew":"HEBR/",
    "History":"HIST/",
    "Human Rights":"HUMR/",
    "Humanities":"HUMS/",
    "Indigenous Studies":"INDG/",
    "Industrial Design":"IDES/",
    "Information Resource Management":"IRM/",
    "Interactive Media and Design":"IMD/",
    "Network Technology":"NET/",
    "Photonics":"PLT",
    "Information Technology":"ITEC/",
    "Integrated Science":"INSC/",
    "Interdisciplinary Public Affair":"IPAF/",
    "Interdisciplinary Science":"ISCI/",
    "Interdisciplinary Studies":"DIST/",
    "International Affairs":"INAF/",
    "Italian":"ITAL/",
    "Japanese":"JAPA/",
    "Journalism and Communication":"JOUR/",
    "Korean":"KORE/",
    "Language Studies":"LANG/",
    "Latin":"LATN/",
    "Latin American and Caribbean Studies":"LACS/",
    "Law":"LAWS/",
    "Linguistics":"LING/",
    "Mathematics":"MATH/",
    "Mechanical Engineering":"MECH/",
    "Mechanical and Aerospace Engineering":"MAAE/",
    "Media Production and Design":"MPAD/",
    "Medieval and Early Modern Studies":"MEMS/",
    "Migration and Diaspora Studies":"MGDS/",
    "Music":"MUSI/",
    "Natural Sciences":"NSCI/",
    "Neuroscience":"NEUR/",
    "Philosophy":"PHIL/",
    "Physics":"PHYS/",
    "Political Managemen":"POLM/",
    "Political Science":"PSCI/",
    "Portuguese":"PORT/",
    "Psychology":"PSYC/",
    "Public Affairs and Policy Management":"PAPM/",
    "Public Policy and Administration":"PADM/",
    "Religion":"RELI/",
    "Russian":"RUSS/",
    "Sexuality Studies":"SXST/",
    "Social Work":"SOWK/",
    "Sociology":"SOCI/",
    "South Asian Studies":"SAST/",
    "Spanish":"SPAN/",
    "Statistics":"STAT/",
    "Sustainable and Renewable Energy Engineering":"SREE/",
    "Systems and Computer Engineering":"SYSC/",
    "Technology, Society, Environment Studies":"TSES/",
    "Women’s and Gender Studies":"WGST/"}
            modules_en_val = re.sub(' \(.*\)','',major_name_en)
           # print(modules_en_val)
            modules_en_val = modules_en_dict[modules_en_val]
           # print(modules_en_val)
            url = 'http://calendar.carleton.ca/undergrad/courses/' + modules_en_val
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
            response2 = etree.HTML(requests.get(url, headers=headers).text)
            response2 = response2.xpath('//div[@class="courses"]')
            modules_en = []
            # print(response2)
            for rea in response2:
                modules_en += etree.tostring(rea, method='html', encoding='unicode')
                modules_en = ''.join(modules_en)
            # print(modules_en,'------------')
            modules_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',modules_en)
            modules_en = modules_en.replace('\r\n','').replace('\n','').replace('\t','').replace('                                                                                                                         ','').replace('                                                                       ','')
            print(modules_en)
        except:
            modules_en = None
            #print(modules_en)

#11.就业方向
        try:
            career_en = response.xpath('//*[@id="main-content"]/div/section/div/div/div[2]/div[3]/div/ul/li/a|//*[@id="main-content"]/div/section/div/div/div[2]/div[3]|//div[@class = "careers"]|//*[@id="future-opportunities"]').extract()
            career_en = ''.join(career_en)
            career_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',career_en)
            career_en = career_en.replace('\r\n','').replace('\n','').replace('\t','').replace('                                                                                                                         ','').replace('                                                                       ','')
           # print(career_en)
        except:
            career_en = None
         #   print(career_en)

#12.截止日期
        try:
            deadline = '2019-04-01'
            # deadline = response.xpath('//*[@id="Admissionrequirementsanddeadlines-subsection-0"]/table/tbody/tr/td[3]').extract()
            # deadline = '---'.join(deadline)
            # deadline = remove_tags(deadline)
            # deadline = deadline.replace('Documents due: ', '')
            # deadline =  deadline.replace('Sep 1, 2018Oct 1, 2018','2018-09-01').replace('Feb 1, 2019Mar 1, 2019','2019-02-01').replace('Mar 1, 2019Apr 1, 2019','2019-03-01').replace('May 1, 2019Jun 1, 2019','2019-05-01').replace('Sep 1, 2019Oct 1, 2019','2019-09-01').replace('Feb 15, 2019Mar 1, 2019','2019-02-15').replace('---',',')
            # #deadline = remove_tags(deadline)
            #print(deadline)
        except:
            deadline = None
            #print(deadline)
#13.学费
        try:
            tuition_fee = '34,221'
            tuition_fee = remove_tags(tuition_fee)
            tuition_fee = tuition_fee.replace('$','')
            #print(tuition_fee)
        except:
            tuition_fee = None
            #print(tuition_fee)
#14 申请费:
        apply_fee = '166'

#15 申请要求
        try:
            entry_requirements_en = 'General Requirements:<br>Senior High School (3 years of study) <br>Final Chinese Upper Middle School transcript and graduation diploma for verification by China Academic Degrees and Graduate Education Development Centre (CDGDC) or China Credentials Verification (CHESICC-Parchment Portal Service).'
        except:
            entry_requirements_en = None
            #print(entry_requirements_en)
            #print(abc)

#16 中国学生申请要求
        try:
            require_chinese_en = entry_requirements_en
            #require_chinese_en = remove_tags(require_chinese_en)
            # print(require_chinese_en)
        except:
            require_chinese_en = None
            # print(require_chinese_en)

#17 特殊专业要求
        try:
            specific_requirement_en = None
            # #specific_requirement_en = remove_tags(specific_requirement_en)
            # specific_requirement_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',specific_requirement_en)
            # specific_requirement_en = specific_requirement_en.replace('\r\n','')
            # specific_requirement_en = re.findall('Required high school classes(.*)2.',specific_requirement_en)[0]
            # specific_requirement_en = remove_tags(specific_requirement_en,keep=("li","ul"))
            #print(specific_requirement_en)
        except:
            specific_requirement_en = None
            #print(specific_requirement_en)

#18 高考(官网要求)
        try:
            gaokao_desc = response.xpath('').extract()[0]
            gaokao_desc = remove_tags(gaokao_desc)
            # print(gaokao_desc)
        except:
            gaokao_desc = None
            # print(gaokao_desc)

#19 高考(展示以及判断字段)
        try:
            gaokao_zs = response.xpath('').extract()[0]
            gaokao_zs = remove_tags(gaokao_zs)
            # print(gaokao_zs)
        except:
            gaokao_zs = None
            # print(gaokao_zs)

#20 高考分数(文科)
        try:
            gaokao_score_wk = response.xpath('').extract()[0]
            gaokao_score_wk = remove_tags(gaokao_score_wk)
            # print(gaokao_score_wk)
        except:
            gaokao_score_wk = None
            # print(gaokao_score_wk)

#21 高考分数(理科)
        try:
            gaokao_score_lk = response.xpath('').extract()[0]
            gaokao_score_lk = remove_tags(gaokao_score_lk)
            # print(gaokao_score_lk)
        except:
            gaokao_score_lk = None
            # print(gaokao_score_lk)

#22 会考描述
        try:
            huikao_desc = response.xpath('').extract()[0]
            huikao_desc = remove_tags(huikao_desc)
            # print(huikao_desc)
        except:
            huikao_desc = None
            # print(huikao_desc)

#23 会考描述
        try:
            huikao_zs = response.xpath('').extract()[0]
            huikao_zs = remove_tags(huikao_zs)
            # print(huikao_zs)
        except:
            huikao_zs = None
            # print(huikao_zs)

#24 最低语言要求
        try:
            min_language_require = '6.5 IELTS (min 6.0 each band)'
            min_language_require = remove_tags(min_language_require)
            # print(min_language_require)
        except:
            min_language_require = None
            # print(min_language_require)

#25 雅思要求
        try:
            ielts_desc = '6.5 IELTS (min 6.0 each band)'
            #ielts_desc = remove_tags(ielts_desc)
            # print(ielts_desc)
        except:
            ielts_desc = None
            # print(ielts_desc)

#26 ielts
        try:
            ielts = '6.5'
            #ielts = re.findall('\d\.\d',ielts)
            #ielts = remove_tags(ielts)
            #print(ielts)
        except:
            ielts = None
            #print(ielts)
#27 ielts_?

        ielts_l = 6.0
        ielts_s = 6.0
        ielts_r = 6.0
        ielts_w = 6.0

#28 toefl_code
        try:
            toefl_code = '0854'
            #toefl_code = remove_tags(toefl_code)
            # print(toefl_code)
        except:
            toefl_code = None
            # print(toefl_code)

#29 toefl_desc
        try:
            toefl_desc = '86 (22 in writing and speaking, 20 reading and listening)'
            #toefl_desc = remove_tags(toefl_desc)
            # print(toefl_desc)
        except:
            toefl_desc = None
            # print(toefl_desc)

#30 toefl
        try:
            toefl = '86'
            #toefl = re.findall('\d\d',toefl)
            #toefl = remove_tags(toefl)
            #print(toefl)
        except:
            toefl = None
            #print(toefl)

#31 toefl_?
        toefl_l = 20
        toefl_s = 22
        toefl_r = 20
        toefl_w = 22

# 32 alevel
        try:
            alevel = ''
            #alevel = remove_tags(alevel)
            # print(alevel)
        except:
            alevel = None
            # print(alevel)

#33 ib
        try:
            ib = 'You will need the full IB diploma (three subsidiary [SL] and three higher level [HL] subjects), with a minimum of 28 points (please note that some programs are more competitive, so will require higher scores). You may have one subject with a grade of 3, provided it is offset by a grade of 5 or better. Prerequisite subjects must have a grade of 4 or better. Early/conditional offers may be available with predicted results. IB students may be awarded advanced standing (transfer) credit for HL subjects with a grade of 5 or better subject to the discretion of the appropriate faculty, to a maximum of 3.0 credits. Prerequisite Equivalencies  Math: SL or HL Math Chemistry: SL or HL Chemistry Physics: SL or HL Physics English: SL or HL English '
            #print(ib)
        except:
            ib = None
            #print(ib)

#34 ap
        try:
            ap = 'Advanced standing (transfer) credit may be awarded for Advanced Placement “AP” exams with a minimum grade of 4, subject to the discretion of the appropriate faculty, to a maximum of 3.0 credits.'
            ap = remove_tags(ap)
            # print(ap)
        except:
            ap = None
            # print(ap)

#35 面试描述
        try:
            interview_desc_en = response.xpath('').extract()[0]
            interview_desc_en = remove_tags(interview_desc_en)
            # print(interview_desc_en
        except:
            interview_desc_en = None
            # print(interview_desc_en)

#36 作品集描述
        try:
            portfolio_desc_en = response.xpath('').extract()[0]
            portfolio_desc_en = remove_tags(portfolio_desc_en)
            # print(portfolio_desc_en
        except:
            portfolio_desc_en = None
            # print(portfolio_desc_en)

#37 other
        try:
            other = ''
            #other = remove_tags(other)
            # print(other)
        except:
            other = None
            # print(other)

#38 average_score
        try:
            average_score = response.xpath('//span[@class = "listing-text-icon"]/text()').extract()[0]
            average_score = remove_tags(average_score)
            average_score = average_score.replace(' ','')
         #   print(average_score)
        except:
            average_score = None
          #  print(average_score)

        # sat act 代码 介绍
        sat_code = '0854'
        sat1_desc = 'The Grade 12 program must include at least four academic units and a minimum of 16 academic units completed in Grades 9 to 12. A minimum average in your final years of B- or better is required for admission. For Honours or some limited enrolment programs, a higher average may be required. You are encouraged to submit SAT or ACT scores, school grading information including pass marks, and rank in class to support your application.'
        sat2_desc = None
        act_code = '5376'
        act_desc = 'The Grade 12 program must include at least four academic units and a minimum of 16 academic units completed in Grades 9 to 12. A minimum average in your final years of B- or better is required for admission. For Honours or some limited enrolment programs, a higher average may be required. You are encouraged to submit SAT or ACT scores, school grading information including pass marks, and rank in class to support your application.'

        item["ap"] = ap
        item["duration_per"] = 1
        #item["duration"] = duration
        item["school_name"] = school_name
        item["location"] = location
        item["campus"] = campus
        #item["degree_type"] = 1
        item["department"] = department
        item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        item["major_name_en"] = major_name_en
        item["overview_en"] = overview_en
        #item["teach_time"] = 1
        item["start_date"] = start_date
        item["modules_en"] = modules_en
        item["career_en"] = career_en
        item["deadline"] = deadline
        item["apply_pre"] = 'CAD$'
        item["apply_fee"] = apply_fee
        item["entry_requirements_en"] = entry_requirements_en
        item["tuition_fee_pre"] = 'CAD$'
        item["require_chinese_en"] = require_chinese_en
        item["ielts_desc"] = ielts_desc
        item["ielts"] = ielts
        item["ielts_l"] = ielts_l
        item["ielts_s"] = ielts_s
        item["ielts_r"] = ielts_r
        item["ielts_w"] = ielts_w
        item["toefl_code"] = toefl_code
        item["toefl_desc"] = toefl_desc
        item["toefl_l"] = toefl_l
        item["toefl"] = toefl
        item["toefl_s"] = toefl_s
        item["toefl_r"] = toefl_r
        item["toefl_w"] = toefl_w
        item["interview_desc_en"] = interview_desc_en
        item["portfolio_desc_en"] = portfolio_desc_en
        item["other"] = other
        item["url"] = response.url
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0
        #item["duration"] = duration
        item["tuition_fee"] = tuition_fee
        item["alevel"] = alevel
        item["ib"] = ib
        item["gaokao_zs"] = gaokao_zs
        item["gaokao_score_wk"] = gaokao_score_wk
        item["gaokao_score_lk"] = gaokao_score_lk
        item["specific_requirement_en"] = specific_requirement_en
        item["huikao_desc"] = huikao_desc
        item["huikao_zs"] = huikao_zs
        item["min_language_require"] = min_language_require
        item["sat_code"] = sat_code
        item["sat1_desc"] = sat1_desc
        item["sat2_desc"] = sat2_desc
        item["act_code"] = act_code
        item["act_desc"] = act_desc
        item["average_score"] = average_score
        #yield item

Ejemplo n.º 14

Mostrar archivo

Archivo: McGillUniversity_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)
        item['school_name'] = "McGill University"
        # item['campus'] = 'Montreal, Quebec, Canada'
        # item['location'] = 'Montreal, Quebec, Canada'
        item['url'] = response.url
        print("===========================")
        print(response.url)

        try:
            major_name_en = response.xpath(
                "//div[@class='details']/h1//text()").extract()
            clear_space(major_name_en)
            item['major_name_en'] = ''.join(major_name_en).strip()
            print("item['major_name_en']: ", item['major_name_en'])

            department = response.xpath(
                "//span[@class='value faculty']//text()").extract()
            item['department'] = department
            # if item['department'] == "":
            #     print("***department 为空")
            print("item['department']: ", item['department'])
            if len(item['department']) > 0:
                for dep in item['department']:
                    if dep in item['major_name_en']:
                        item['major_name_en'] = item['major_name_en'].replace(
                            dep, '').replace('(', '').replace(')', '')
            item['major_name_en'] = item['major_name_en'].strip().strip(
                '-').strip()
            print("item['major_name_en']2== ", item['major_name_en'])

            degree_name = response.xpath(
                "//span[@class='value degree']//text()").extract()
            item['degree_name'] = ', '.join(degree_name).replace(
                "Concurrent", '').strip()
            print("item['degree_name']: ", item['degree_name'])
            if item['degree_name'] == "Bachelor of Science, Bachelor of Science in Agricultural and Environmental Sciences" or item[
                    'degree_name'] == "Bachelor of Science in Agricultural and Environmental Sciences":
                item['degree_name'] = 'Bachelor of Science'

            overview = response.xpath(
                "//p[contains(text(),'DETAILED PROGRAM OUTLINE')]/preceding-sibling::p[position()>1]"
            ).extract()
            if len(overview) == 0:
                overview = response.xpath(
                    "//div[@class='description']/p[position()<last()-1]"
                ).extract()

            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # if item['overview_en'] == "":
            #     print("***overview_en 为空")
            # print("item['overview_en']: ", item['overview_en'])

            # 第一种情况 页面含有major的学位
            # modules_en_major = response.xpath("//a[contains(text(),'Major ')]//text()").extract()
            modules_en_a = response.xpath(
                "//a[contains(text(),'Major ')]/@href").extract()
            # print("***********")
            # print(len(modules_en_major))
            # print(modules_en_major)
            # # 每个专业分为多个专业，专业名不一样，需要匹配
            # if len(modules_en_major) > 0:
            #     for m in range(len(modules_en_major)):
            #         if '- Major ' + item['major_name_en'] + ' - ' not in modules_en_major[m] or '- Major ' + item['major_name_en'] + ': ' not in modules_en_major[m]:
            #             modules_en_major[m] = 'nono'
            # print(modules_en_major)
            # print('modules_en_a==', modules_en_a)
            # if len(modules_en_a) == 1:
            #     modules_en = response.xpath(
            #         "//a[contains(text(),'Major ')]/following-sibling::div//h3[contains(text(),'Program Requirement: ')]/preceding-sibling::*[1]/following-sibling::*[position()<last()]").extract()
            #     item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            modules_en_list = []
            major_en_list = []
            if len(modules_en_a) > 0:
                for modules_a in modules_en_a:
                    # print("***************", modules_a, "**************")
                    if "https://mcgill.ca/study/2018-2019/courses" not in modules_a:
                        modules_en_major_dict = self.parse_modules(modules_a)
                        modules_en = modules_en_major_dict['modules_en']
                        major_name_del = modules_en_major_dict['major_en_del']
                        modules_en_list.append(modules_en)
                        major_en_list.append(major_name_del)
            else:
                modules_en_a = response.xpath(
                    "//a[contains(text(),'Bachelor of')]/@href").extract()
                if len(modules_en_a) > 0:
                    # print(len(modules_en_a))
                    # print('1==', modules_en_a)
                    for i in range(len(modules_en_a)):
                        if "/undergraduate-admissions/" in modules_en_a[i]:
                            modules_en_a[i] = 'yes'
                    total_len = len(modules_en_a)
                    if 'yes' in modules_en_a:
                        modules_en_a.remove('yes')
                    # print(len(modules_en_a))
                    # print('2==', modules_en_a)
                    if len(modules_en_a) > 0:
                        if modules_en_a[0] == "yes":
                            modules_en_major_dict = self.parse_modules(
                                modules_en_a[1])
                            modules_en = modules_en_major_dict['modules_en']
                            major_name_del = modules_en_major_dict[
                                'major_en_del']
                        else:
                            # modules_en = self.parse_modules(modules_en_a[0])
                            modules_en_major_dict = self.parse_modules(
                                modules_en_a[0])
                            modules_en = modules_en_major_dict['modules_en']
                            major_name_del = modules_en_major_dict[
                                'major_en_del']
                        modules_en_list.append(modules_en)
                        major_en_list.append(major_name_del)
            print('modules_en_list=', len(modules_en_list))
            print('major_en_list=', len(major_en_list))

            # if item['modules_en'] == "":
            #     print("***modules_en 为空")
            # print("item['modules_en']: ", item['modules_en'])

            # https://mcgill.ca/applying/requirements/international/china#Process
            item['start_date'] = '9月'
            item['deadline'] = '2019-01-15'

            # https://mcgill.ca/applying/requirements/international/china#Process
            item[
                'entry_requirements_en'] = """<h2>Admission review process</h2>
<p>Applicants will be considered for admission on their high school transcript (Grades 1, 2 and midyear grade 3) and all available results of the Huikao exams. Note that SAT cannot be used as a substitute for the Huikao/Academic Proficiency Test (APT).</p>
<p>Applicants from Chinese provinces where the Huikao is not offered must present additional external information of their academic credentials, such as SATI and SATII scores. If admitted to McGill, you must arrange for your school to send to McGill University an official final transcript of your complete high school record, the graduation certificate, and all final HUIKAO results.</p>
<p>If you write the GAOKAO, you must make arrangements to forward to us the final official results.</p>
<p>If admitted, you are expected to maintain your level of academic performance through to the completion of your pre-McGill studies.</p>
<h2><span>Minimum grades & prerequisites</span></h2>
<ul><li>The minimum requirements normally are averages of 85% or higher in each year and in all prerequisite courses. Many programs are more competitive and will require higher grades; applicants who present the minimum requirements are not guaranteed admission.</li>
</ul>"""
            item['average_score'] = 85
            # https://mcgill.ca/music/admissions/undergraduate/prepare
            item['require_chinese_en'] = '''
<div><ul><li>You need a high school diploma.</li>
<li>Your high school average must be 75% or higher if you're applying to a performance program, 80% or higher if you're applying to a music research program.</li>
<li>If you receive an offer of admission, it will be conditional upon successful graduation from high school and your final grades.</li>
<li>If you are admitted, we will determine whether you are eligible for advanced standing in your program by evaluating your placement exams and, if applicable, your:
	<ul><li>IB Diploma</li>
		<li>A levels</li>
		<li>French Baccaleaurate</li>
	</ul></li>
</ul></div>'''
            # https://mcgill.ca/applying/requirements/international/ib
            item[
                'ib'] = '''Applicants will be considered for admission on their high school transcript and predicted IB results or, if already completed, on the final IB Diploma results. The Diploma with grades of 5 or better on each Higher and Standard Level subject is the minimum expected for most programs. Many programs are more competitive and will require higher grades.
Note: The Math Studies course is not acceptable for programs where math is a required prerequisite.
If admitted, you are expected to maintain your level of academic performance through to the completion of your pre-McGill studies.
A maximum of 30 credits of advanced standing may be granted for the International Baccalaureate Diploma.'''

            # https://www.mcgill.ca/applying/requirements/prep
            item[
                'ielts_desc'] = 'The regular Academic test and the test for UKVI are both accepted. A band score of 6.5 or better; individual component scores of 6.0 or better. '
            item['ielts'] = '6.5'
            item['ielts_l'] = '6.0'
            item['ielts_s'] = '6.0'
            item['ielts_r'] = '6.0'
            item['ielts_w'] = '6.0'

            # https://www.mcgill.ca/applying/requirements/usa#SAT
            # https://mcgill.ca/transfercredit/prospective/ap
            # item['sat1_desc'] = ''
            # item['act_desc'] = ''
            item['toefl_code'] = item['sat_code'] = '0935'
            item['act_code'] = '5231'
            item['ap'] = '0935-00'
            item['other'] = '''问题清单：1.有些专业对应多个学位，意味着对应多个课程设置，三者之间关系的匹配比较复杂
                                    2.学费在单独页面，需要选择各个学位区分出学费，然后再匹配,导致有些学费是空的
                                    3.统一没有找到课程长度
                                    4.有些专业需要分多条，专业名不一样，没法做到每个拆分的专业的名都能准确的采集下来
                                    5.专业描述和课程设置、就业为空的是详情页没有的
                                    '''

            # 一个专业处于几个学院的情况
            if len(department) == 1:
                item['department'] = ''.join(item['department']).replace(
                    '(Macdonald Campus)', '').strip()
                if "Faculty of Agricultural & Environmental Sciences" in item[
                        'department']:
                    item['campus'] = 'Macdonald Campus'
                    item['location'] = "Ste. Anne de Bellevu"
                elif "Faculty of Science" in item['department'] and item[
                        'major_name_en'] == "Human Nutrition":
                    item['campus'] = 'Macdonald Campus'
                    item['location'] = "Ste. Anne de Bellevu"
                else:
                    item['campus'] = 'Downtown Campus'
                    item['location'] = "Montreal"

                if "Bachelor of Education" in item['degree_name'] or item[
                        'department'] == "Desautels Faculty of Management":
                    item['toefl'] = "100"
                elif "Bachelor of Music" in item['degree_name']:
                    item['toefl'] = "79-80"
                else:
                    # item['toefl_desc'] = 'minimum component score of 21 in each of reading, writing, listening, and speaking'
                    item['toefl'] = "90"
                    item['toefl_l'] = "21"
                    item['toefl_s'] = "21"
                    item['toefl_r'] = "21"
                    item['toefl_w'] = "21"

                item['apply_pre'] = 'CAD$'
                if item['department'] == "Faculty of Medicine":
                    item['apply_fee'] = '154.56'
                else:
                    item['apply_fee'] = '110.40'

                # https://www.mcgill.ca/undergraduate-admissions/yearly-costs
                item['tuition_fee_pre'] = 'CAD$'
                if "Bachelor of Music" in item[
                        'degree_name'] and "Bachelor of Education" in item[
                            'degree_name']:
                    item['tuition_fee'] = '17,799.30'
                elif "Bachelor of Science" in item[
                        'degree_name'] and "Bachelor of Education" in item[
                            'degree_name']:
                    item['tuition_fee'] = '17,799.30'
                elif "Bachelor of Arts and Science" in item['degree_name']:
                    item['tuition_fee'] = '17,799.30'
                elif item['major_name_en'] == "Kinesiology" or item[
                        'degree_name'] == "Bachelor of Arts" or "Bachelor of Social Work" in item[
                            'degree_name'] or "Bachelor of Theology" in item[
                                'degree_name']:
                    item['tuition_fee'] = '16,815.6'
                elif "Licentiate in Music" in item['major_name_en'] or "Bachelor of Nursing (Integrated)" in item['degree_name'] or \
                        "(BScN)" in item['degree_name'] or "Occupational Therapy" in item['major_name_en'] or "Physical Therapy" in item['major_name_en'] or\
                        "Bachelor of Science in Agricultural and Environmental Sciences" in item['degree_name'] or "Bachelor of Science in Architecture" in item['degree_name'] or\
                        "Bachelor of Science in Food Science" in item['degree_name'] or "Bachelor of Science in Nutritional Science" in item['degree_name']:
                    item['tuition_fee'] = '18,782.7'

                elif "Bachelor of Engineering" in item['degree_name'] or item[
                        'degree_name'] == "Bachelor of Science" or item[
                            'degree_name'] == "Concurrent Bachelor of Civil Law (B.C.L) and Bachelor of Laws (LL.B)":
                    item['tuition_fee'] = '39,361.2'
                elif item['degree_name'] == "Bachelor of Commerce":
                    item['tuition_fee'] = '45,262.8'
                if item['tuition_fee'] is None and item[
                        'degree_name'] == 'Bachelor of Music':
                    item['tuition_fee'] = "18,782.7"
                if item['tuition_fee'] is None and item[
                        'degree_name'] == 'Bachelor of Education':
                    item['tuition_fee'] = "17,799.30"
                print("item['tuition_fee']: ", item['tuition_fee'])

                # 判断不是minor的课程
                is_minor = response.xpath(
                    "//span[@class='value option']//text()").extract()

                if "Bachelor of Science in Agricultural and Environmental Sciences" in item['degree_name'] or \
                        "Bachelor of Engineering in Bioresource Engineering" in item['degree_name'] or "Bachelor of Science in Food Science" in item['degree_name']:
                    item[
                        'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and two science courses in biology, chemistry, or physics as well as</li>
<li>Huikao exams in these subjects</li></ul>'''
                elif "Bachelor of Science in Architecture" in item[
                        'degree_name']:
                    item[
                        'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics, physics and chemistry in the Grade 2 or 3 level</li>
<li>Huikao exams in mathematics and physics or chemistry</li>
<li>Applicants must submit a portfolio which will be taken into account during the admission process.</li></ul>'''
                elif "Bachelor of Arts and Science" in item['degree_name']:
                    item[
                        'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and two science courses in biology, chemistry, or physics in the Grade 2 or 3 level</li>
                    <li>Huikao exams in mathematics and two of biology, chemistry or physics</li></ul>'''
                elif "Secondary - Mathematics" in item['major_name_en']:
                    item[
                        'specific_requirement_en'] = '''<ul><li>Mathematics, at Senior Grade 2 or 3.</li></ul>'''
                elif "Secondary - Science and Technology" in item[
                        'major_name_en']:
                    item[
                        'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and at least two of biology, chemistry, or physics in the Grade 2 or 3 level; Huikao exams in these subjects. </li></ul>'''
                elif item['major_name_en'] == 'Bioengineering' or item['major_name_en'] == 'Chemical Engineering' or item[
                    'major_name_en'] == 'Civil Engineering' or item['major_name_en'] == 'Software Engineering' or \
                        item['major_name_en'] == 'Computer Engineering' or item['major_name_en'] == 'Electrical Engineering' or 'Mechanical Engineering' in item['major_name_en'] or \
                        'Materials Engineering' in item['major_name_en'] or 'Mining Engineering' in item['major_name_en']:
                    item[
                        'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics, physics and chemistry in the Grade 2 or 3 level</li>
<li>Huikao exams in mathematics and physics or chemistry</li>
<li>Biology cannot be used as prerequisite</li></ul>'''
                elif "Human Nutrition" in item['major_name_en']:
                    item[
                        'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and two science courses in biology, chemistry, or physics as well as/li>
                                        <li>Huikao exams in these subjects</li></ul>'''
                elif item['major_name_en'] == 'Kinesiology':
                    item[
                        'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and at least two of biology, chemistry, or physics in the Grade 2 or 3 level/li>
                                        <li>Huikao exams in these subjects</li></ul>'''
                elif "Management" in item['major_name_en']:
                    item[
                        'specific_requirement_en'] = '''Subjects must include mathematics at grade 3 level</li>
                                        <li>Huikao exam in mathematics</li></ul>'''
                elif "Nursing" in item['major_name_en']:
                    item[
                        'specific_requirement_en'] = '''Subjects must include mathematics and at least two of biology, chemistry, or physics in the Grade 2 or 3 level</li>
                                        <li>Huikao exams in these subjects</li></ul>'''
                elif "Bachelor of Theology" in item['degree_name']:
                    item[
                        'specific_requirement_en'] = '''<h2>Admission Requirements</h2>
<p>The B.Th. program has three points of entry:</p>
<ol><li>To enter the 120-credit degree program from outside Quebec, the applicant must hold a high school diploma, with a minimum average of 75%, or the equivalent. A maximum of 60 credits from another institution of higher learning can be considered for transfer into the 120-credit program.</li>
<li>To enter the 90-credit first-degree program, the applicant is expected to have completed the Diploma of Collegial Studies (DCS) of a Quebec CEGEP with a minimum average Cote R of 24, or the equivalent elsewhere. A maximum of 30 credits from another institution of higher learning can be considered for transfer into this program.</li>
<li>To enter the 60-credit program, the applicant must have completed a B.A. or other Bachelor’s degree with a minimum CGPA of 2.7 (B-). No credits can be transferred from another institution of higher learning into the 60-credit program.</li>
</ol><p>Any McGill student in good standing, with a minimum of 30 credits, may apply for transfer from their current degree program into the B.Th. program. B.Th. students entering the 120- or 90-credit programs are free to pursue Minors in other departments, schools, or faculties, in consultation with their B.Th. adviser(s).</p>
<p>The B.Th. program extends over three academic years of full time studies for those admitted with a Diploma of Collegial Studies and over two academic years for those admitted with a Bachelor's degree. For all other students it requires four years. The normal load consists of five 3-credits courses (15 credits) each term.</p>'''
                elif 'Biological, Biomedical & Life Sciences' in ','.join(is_minor) or \
                        'Physical, Earth, Math & Computer Sciences' in ','.join(is_minor) or \
                        'Physical, Earth, Math and Computer Sciences' in ','.join(is_minor) or \
                        'Bio-Physical-Computational Sciences' in ','.join(is_minor):
                    item[
                        'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and at least two of biology, chemistry, or physics in the Grade 2 or 3 level</li>
<li>Huikao exams in these subjects</li></ul>'''

                # print("item['specific_requirement_en']: ", item['specific_requirement_en'])

                # Water Environments and Ecosystems
                if len(modules_en_list) > 0:
                    if len(modules_en_list) == len(major_en_list):
                        for m in range(len(modules_en_list)):
                            item['modules_en'] = modules_en_list[m]
                            item['major_name_en'] = major_en_list[m]
                            print('isminor: ', is_minor)
                            if 'online' not in item[
                                    'major_name_en'] and 'Diploma' not in item[
                                        'major_name_en'] and 'Diploma' not in item[
                                            'degree_name'] and 'Minor' not in item[
                                                'major_name_en'] and ''.join(
                                                    is_minor
                                                ) != 'Minor Concentration' and ''.join(
                                                    is_minor) != 'Minor':
                                print('筛选之后的数据')
                                yield item
                            else:
                                if item['major_name_en'] == "Materials Engineering (Co-op & Minor)":
                                    yield item
                    else:
                        for m in modules_en_list:
                            item['modules_en'] = m
                            print('isminor: ', is_minor)
                            if 'online' not in item[
                                    'major_name_en'] and 'Diploma' not in item[
                                        'major_name_en'] and 'Diploma' not in item[
                                            'degree_name'] and 'Minor' not in item[
                                                'major_name_en'] and ''.join(
                                                    is_minor
                                                ) != 'Minor Concentration' and ''.join(
                                                    is_minor) != 'Minor':
                                print('筛选之后的数据')
                                yield item
                            else:
                                if item['major_name_en'] == "Materials Engineering (Co-op & Minor)":
                                    yield item
            else:
                for dep in item['department']:
                    item['department'] = dep.replace('(Macdonald Campus)',
                                                     '').strip()
                    if "Faculty of Agricultural & Environmental Sciences" in item[
                            'department']:
                        item['campus'] = 'Macdonald Campus'
                        item['location'] = "Ste. Anne de Bellevu"
                    elif "Faculty of Science" in item['department'] and item[
                            'major_name_en'] == "Human Nutrition":
                        item['campus'] = 'Macdonald Campus'
                        item['location'] = "Ste. Anne de Bellevu"
                    else:
                        item['campus'] = 'Downtown Campus'
                        item['location'] = "Montreal"

                    # item['department'] = ''.join(item['department'])
                    if "Bachelor of Education" in item['degree_name'] or item[
                            'department'] == "Desautels Faculty of Management":
                        item['toefl'] = "100"
                    elif "Bachelor of Music" in item['degree_name']:
                        item['toefl'] = "79-80"
                    else:
                        # item['toefl_desc'] = 'minimum component score of 21 in each of reading, writing, listening, and speaking'
                        item['toefl'] = "90"
                        item['toefl_l'] = "21"
                        item['toefl_s'] = "21"
                        item['toefl_r'] = "21"
                        item['toefl_w'] = "21"

                    item['apply_pre'] = 'CAD$'
                    if item['department'] == "Faculty of Medicine":
                        item['apply_fee'] = '154.56'
                    else:
                        item['apply_fee'] = '110.40'

                    # https://www.mcgill.ca/undergraduate-admissions/yearly-costs
                    item['tuition_fee_pre'] = 'CAD$'
                    if "Bachelor of Music" in item[
                            'degree_name'] and "Bachelor of Education" in item[
                                'degree_name']:
                        item['tuition_fee'] = '17,799.30'
                    elif "Bachelor of Science" in item[
                            'degree_name'] and "Bachelor of Education" in item[
                                'degree_name']:
                        item['tuition_fee'] = '17,799.30'
                    elif "Bachelor of Arts and Science" in item['degree_name']:
                        item['tuition_fee'] = '17,799.30'
                    elif item['major_name_en'] == "Kinesiology" or item[
                            'degree_name'] == "Bachelor of Arts" or "Bachelor of Social Work" in item[
                                'degree_name'] or "Bachelor of Theology" in item[
                                    'degree_name']:
                        item['tuition_fee'] = '16,815.6'
                    elif "Licentiate in Music" in item['major_name_en'] or "Bachelor of Nursing (Integrated)" in item[
                        'degree_name'] or \
                            "(BScN)" in item['degree_name'] or "Occupational Therapy" in item[
                        'major_name_en'] or "Physical Therapy" in item['major_name_en'] or \
                            "Bachelor of Science in Agricultural and Environmental Sciences" in item[
                        'degree_name'] or "Bachelor of Science in Architecture" in item['degree_name'] or \
                            "Bachelor of Science in Food Science" in item[
                        'degree_name'] or "Bachelor of Science in Nutritional Science" in item['degree_name']:
                        item['tuition_fee'] = '18,782.7'

                    elif "Bachelor of Engineering" in item['degree_name'] or item[
                            'degree_name'] == "Bachelor of Science" or item[
                                'degree_name'] == "Concurrent Bachelor of Civil Law (B.C.L) and Bachelor of Laws (LL.B)":
                        item['tuition_fee'] = '39,361.2'
                    elif item['degree_name'] == "Bachelor of Commerce":
                        item['tuition_fee'] = '45,262.8'

                    if item['tuition_fee'] is None and item[
                            'degree_name'] == 'Bachelor of Music':
                        item['tuition_fee'] = "18,782.7"
                    if item['tuition_fee'] is None and item[
                            'degree_name'] == 'Bachelor of Education':
                        item['tuition_fee'] = "17,799.30"
                    print("item['tuition_fee']: ", item['tuition_fee'])

                    # 判断不是minor的课程
                    is_minor = response.xpath(
                        "//span[@class='value option']//text()").extract()

                    if "Bachelor of Science in Agricultural and Environmental Sciences" in item['degree_name'] or \
                            "Bachelor of Engineering in Bioresource Engineering" in item[
                        'degree_name'] or "Bachelor of Science in Food Science" in item['degree_name']:
                        item[
                            'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and two science courses in biology, chemistry, or physics as well as</li>
                    <li>Huikao exams in these subjects</li></ul>'''
                    elif "Bachelor of Science in Architecture" in item[
                            'degree_name']:
                        item[
                            'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics, physics and chemistry in the Grade 2 or 3 level</li>
                    <li>Huikao exams in mathematics and physics or chemistry</li>
                    <li>Applicants must submit a portfolio which will be taken into account during the admission process.</li></ul>'''
                    elif "Bachelor of Arts and Science" in item['degree_name']:
                        item[
                            'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and two science courses in biology, chemistry, or physics in the Grade 2 or 3 level</li>
                                        <li>Huikao exams in mathematics and two of biology, chemistry or physics</li></ul>'''
                    elif "Secondary - Mathematics" in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = '''<ul><li>Mathematics, at Senior Grade 2 or 3.</li></ul>'''
                    elif "Secondary - Science and Technology" in item[
                            'major_name_en']:
                        item[
                            'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and at least two of biology, chemistry, or physics in the Grade 2 or 3 level; Huikao exams in these subjects. </li></ul>'''
                    elif item['major_name_en'] == 'Bioengineering' or item['major_name_en'] == 'Chemical Engineering' or \
                            item[
                                'major_name_en'] == 'Civil Engineering' or item[
                        'major_name_en'] == 'Software Engineering' or \
                            item['major_name_en'] == 'Computer Engineering' or item[
                        'major_name_en'] == 'Electrical Engineering' or 'Mechanical Engineering' in item[
                        'major_name_en'] or \
                            'Materials Engineering' in item['major_name_en'] or 'Mining Engineering' in item[
                        'major_name_en']:
                        item[
                            'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics, physics and chemistry in the Grade 2 or 3 level</li>
                    <li>Huikao exams in mathematics and physics or chemistry</li>
                    <li>Biology cannot be used as prerequisite</li></ul>'''
                    elif "Human Nutrition" in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and two science courses in biology, chemistry, or physics as well as/li>
                                                            <li>Huikao exams in these subjects</li></ul>'''
                    elif item['major_name_en'] == 'Kinesiology':
                        item[
                            'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and at least two of biology, chemistry, or physics in the Grade 2 or 3 level/li>
                                                            <li>Huikao exams in these subjects</li></ul>'''
                    elif "Management" in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = '''Subjects must include mathematics at grade 3 level</li>
                                                            <li>Huikao exam in mathematics</li></ul>'''
                    elif "Nursing" in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = '''Subjects must include mathematics and at least two of biology, chemistry, or physics in the Grade 2 or 3 level</li>
                                                            <li>Huikao exams in these subjects</li></ul>'''
                    elif "Bachelor of Theology" in item['degree_name']:
                        item[
                            'specific_requirement_en'] = '''<h2>Admission Requirements</h2>
                    <p>The B.Th. program has three points of entry:</p>
                    <ol><li>To enter the 120-credit degree program from outside Quebec, the applicant must hold a high school diploma, with a minimum average of 75%, or the equivalent. A maximum of 60 credits from another institution of higher learning can be considered for transfer into the 120-credit program.</li>
                    <li>To enter the 90-credit first-degree program, the applicant is expected to have completed the Diploma of Collegial Studies (DCS) of a Quebec CEGEP with a minimum average Cote R of 24, or the equivalent elsewhere. A maximum of 30 credits from another institution of higher learning can be considered for transfer into this program.</li>
                    <li>To enter the 60-credit program, the applicant must have completed a B.A. or other Bachelor’s degree with a minimum CGPA of 2.7 (B-). No credits can be transferred from another institution of higher learning into the 60-credit program.</li>
                    </ol><p>Any McGill student in good standing, with a minimum of 30 credits, may apply for transfer from their current degree program into the B.Th. program. B.Th. students entering the 120- or 90-credit programs are free to pursue Minors in other departments, schools, or faculties, in consultation with their B.Th. adviser(s).</p>
                    <p>The B.Th. program extends over three academic years of full time studies for those admitted with a Diploma of Collegial Studies and over two academic years for those admitted with a Bachelor's degree. For all other students it requires four years. The normal load consists of five 3-credits courses (15 credits) each term.</p>'''
                    elif 'Biological, Biomedical & Life Sciences' in ','.join(is_minor) or \
                            'Physical, Earth, Math & Computer Sciences' in ','.join(is_minor) or \
                            'Physical, Earth, Math and Computer Sciences' in ','.join(is_minor) or \
                            'Bio-Physical-Computational Sciences' in ','.join(is_minor):
                        item[
                            'specific_requirement_en'] = '''<ul><li>Subjects must include mathematics and at least two of biology, chemistry, or physics in the Grade 2 or 3 level</li>
                    <li>Huikao exams in these subjects</li></ul>'''

                    # print("item['specific_requirement_en']: ", item['specific_requirement_en'])

                    if len(modules_en_list) > 0:
                        if len(modules_en_list) == len(major_en_list):
                            for m in range(len(modules_en_list)):
                                item['modules_en'] = modules_en_list[m]
                                item['major_name_en'] = major_en_list[m]
                                print('isminor: ', is_minor)
                                if 'online' not in item[
                                        'major_name_en'] and 'Diploma' not in item[
                                            'major_name_en'] and 'Diploma' not in item[
                                                'degree_name'] and 'Minor' not in item[
                                                    'major_name_en'] and ''.join(
                                                        is_minor
                                                    ) != 'Minor Concentration' and ''.join(
                                                        is_minor) != 'Minor':
                                    print('筛选之后的数据')
                                    yield item
                                else:
                                    if item['major_name_en'] == "Materials Engineering (Co-op & Minor)":
                                        yield item
                        else:
                            for m in modules_en_list:
                                item['modules_en'] = m
                                print('isminor: ', is_minor)
                                if 'online' not in item[
                                        'major_name_en'] and 'Diploma' not in item[
                                            'major_name_en'] and 'Diploma' not in item[
                                                'degree_name'] and 'Minor' not in item[
                                                    'major_name_en'] and ''.join(
                                                        is_minor
                                                    ) != 'Minor Concentration' and ''.join(
                                                        is_minor) != 'Minor':
                                    print('筛选之后的数据')
                                    yield item
                                else:
                                    if item['major_name_en'] == "Materials Engineering (Co-op & Minor)":
                                        yield item

        except Exception as e:
            with open("scrapySchool_Canada_Ben/error/" + item['school_name'] +
                      ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 15

Mostrar archivo

Archivo: EmilyCarrUniversity_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)
        item['school_name'] = "Emily Carr University"
        item['url'] = response.url
        print("===========================")
        print(response.url)
        item['other'] = '''问题描述：1.没有课程长度，其他没有问题'''
        '''公共字段'''
        item['location'] = '520 East 1st Ave Vancouver, BC V5T 0H2'

        # item['act_code'] = '0719'

        # https://www.ecuad.ca/admissions/application-info/undergraduate-applications/how-to-apply-application-process
        item['apply_pre'] = 'CAD$'
        item['apply_fee'] = '70'
        item['start_date'] = "2019-01,2019-09"
        item['deadline'] = '2019-01-15,2019-10-01'

        # https://www.ecuad.ca/assets/pdf-attachments/Undergraduate-Expenses-Overview-June-2018.pdf
        item['tuition_fee_pre'] = 'CAD$'
        item['tuition_fee'] = '15,966'

        # https://www.ecuad.ca/admissions/application-info/undergraduate-applications/english-language-proficiency
        item['sat_code'] = item['toefl_code'] = '0032'
        item[
            'ielts_desc'] = 'minimum band 6.5, with no component less than 6.0'
        item['ielts'] = '6.5'
        item['ielts_l'] = '6.0'
        item['ielts_s'] = '6.0'
        item['ielts_r'] = '6.0'
        item['ielts_w'] = '6.0'
        item[
            'toefl_desc'] = 'minimum of 84 out of 120 total points including a minimum score in each of the four skills; Speaking 20/30, Reading 20/30, Writing 18/30, and Listening 20/30.'
        item['toefl'] = '84'
        item['toefl_l'] = '20'
        item['toefl_s'] = '20'
        item['toefl_r'] = '20'
        item['toefl_w'] = '18'

        #         item['act_desc'] = item['sat1_desc'] = "SAT or ACT scores will also be considered"

        # https://www.ecuad.ca/admissions/application-info/undergraduate-applications/first-year-academic-requirements
        item[
            'require_chinese_en'] = '''<p>Completion of the highest level of secondary education available in your home country, in a program leading directly to university entrance. You require at least a C+ (67) average in the five courses that most closely match the British Columbia requirements.</p>
<p>Applicants from China (excluding Hong Kong) must verify their educational documents through China Credentials Verification (CHESICC).Send your documents to this Chinese agency for verification in English. Ask the agency to send your official academic transcript and English verification report directly to:</p>
<blockquote> Emily Carr University of Art + Design<br>520 East 1st Ave,<br>Vancouver, BC,V5T 0H2<br>Canada</blockquote>'''
        item['ap'] = """AP Applicants
Emily Carr University recognizes the value of AP courses. First year university transfer credit will be awarded to students who achieve a grade of 4 or higher in courses that are approved as equivalent to Emily Carr University required courses.
        When ordering transcripts from the College Board website, the four digit code is 4148."""
        item[
            'alevel'] = """Emily Carr welcomes applications from students who have completed their GCSEs (O Levels) and GCEs (A Levels), either in Great Britain or at one of many British Pattern schools around the world.
Minimum requirements:
Graduation from a university-preparatory program at a senior secondary school with standing in at least five subject areas, including English plus two more academic subjects, with at least three approved academic GCE (A Level) subjects; or with standing in at least six subjects at the Advanced Subsidiary Level.  A subject may not be counted at both the GCE (A Level) and the GCSE (O Level) levels.
Your admission average is calculated on your final year academic courses/exams and must include at least two GCSEs (O Levels) and three GCEs (A Levels), or must include at least six Advanced Subsidiary Levels."""
        item['ib'] = """International Baccalaureate (IB)
3 Higher Level and 3 Standard Level subjects. A minimum requirement of 24 points is recommended to be considered for admission.
First year university transfer credit will be awarded to students who achieve a grade of at least 5 or higher in Higher Level courses in courses that are approved as equivalent to Emily Carr University required courses."""
        item[
            'entry_requirements_en'] = """<h1>First Year Academic Requirements</h1>
          <h2>To be successful with your application, meet our academic requirements.</h2>
          <p><strong>Here is what you need to know to ensure that you satisfy our academic requirements.</strong></p>
<p>The minimum academic requirement for admission to Emily Carr University undergraduate programs is graduation from grade 12 secondary school, with five grade 12 subjects, including English 12 with a minimum grade of 'C', two other grade 12 academic courses and two grade 12 elective courses. The minimum overall grade point average required for admission is 2.5 or C+ or 67%.  All elective courses must be grade 12 and can be Ministry Approved, Board Approved or Locally Developed.</p>
"""

        # https://www.ecuad.ca/admissions/application-info/undergraduate-applications/portfolio-requirements
        item['portfolio_desc_en'] = """<div>
                    <a>
                        <span>First Year Foundation Applicant Portfolios</span>
                        <i></i>
                    </a>
                    <div>
                        <h3><strong></strong><strong>Foundation Portfolio Requirements</strong>
  </h3>
<p>Your portfolio is a collection of work and ideas that demonstrate the state of your creative development. We want to see what you create to imagine how you might succeed at Emily Carr University of Art + Design. A strong portfolio includes a diverse array of artwork, experimental processes, material techniques, observation skills, and creative thinking. There is no rigid or fail-safe formula for a good portfolio: we are looking for that unique combination of creativity, engagement, and inspiration that makes art, design, and media education suitable for you. 
  </p>
<h2>Part 1: Examples of your Creative Practice</h2>
<p>We are interested in seeing a wide range of examples of your creative practice. We encourage you to submit not only visual arts projects, but also sound, time-based, craft-based, design, 3D, illustration, and animation work. All media, mixed or otherwise - from a video or a song, to a comic strip or a zine - are welcome. Please submit only your best work in its final state from the last two years. 
  </p>
<p><strong>We will only evaluate 10 samples maximum</strong>. We are asking that you be critical in your selection. 
  </p>
<h2>Part 2: Process Projects </h2>
<p>We want to know how you create, how you problem-solve, how you ask questions, and where your process takes you. Please respond to all the prompts provided below by submitting <strong>1 sample of your original work&nbsp;<b>per prompt&nbsp;</b>(or one 20-second video)</strong>. Be sure to label your slide with the relevant prompt and feel free to add a title should you feel so inspired.
  &nbsp;
  </p>
<ol><li>What keeps you up at      night?
</li><li>Describe where you live      without showing any images of your home.
</li><li>You have been asked to      design a town square; what would you put in the centre of it?
</li></ol>
<p>After responding to the three prompts artistically, please tell us how you feel about that entire creative process by writing <strong>no more than 50 words</strong>.
  </p>
<h2>Part 3: Written Responses</h2>
<p>Respond to the following three questions. We want to hear your voice and learn about you through your writing. This is not a formal essay, but an opportunity for personal reflection and intellectual honesty. Please keep your answers to <strong>75 - 100 words each</strong>.&nbsp;
  </p>
<ol><li>What makes a problem      interesting to you?
</li><li>Make a list of all the      things you’d like to learn at Emily Carr.</li><li>If you could change one thing in the world, what would it be? 
</li> <span></span></ol>
                    </div>
                </div>"""

        try:
            item['degree_name'] = response.meta.get("degree_name")
            print("item['degree_name']: ", item['degree_name'])

            item['degree_overview_en'] = response.meta.get(
                "degree_overview_en")
            # print("item['degree_overview_en']: ", item['degree_overview_en'])

            major_name_en = response.xpath(
                "//section[@class='main-body-content']/h1//text()").extract()
            clear_space(major_name_en)
            item['major_name_en'] = ''.join(major_name_en).strip()
            if "Major" in item['major_name_en']:
                item['major_name_en'] = item['major_name_en'].replace(
                    "Major", "").strip().strip(",").strip()
            print("item['major_name_en']: ", item['major_name_en'])

            overview = response.xpath(
                "//a[@class='tab-link is-active']/../div/*[position()<last()]"
            ).extract()
            if len(overview) > 0:
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview)).replace("<p></p>",
                                                          "").strip()
            # if item['overview_en'] is None:
            #     print("***overview_en 为空")
            # print("item['overview_en']: ", item['overview_en'])

            career_en = response.xpath(
                "//a[contains(text(),'Pathways')]/../div").extract()
            if len(career_en) > 0:
                item['career_en'] = remove_class(
                    clear_lianxu_space(overview)).replace("<p></p>",
                                                          "").strip()
            # if item['career_en'] is None:
            #     print("***career_en 为空")
            # print("item['career_en']: ", item['career_en'])

            # if len(modules_url) == 0:
            modules_url = response.xpath(
                "//div[contains(@class,'small-12 medium-4 columns sidbar-container')]//ul[@class='side-nav']//a[contains(@href,'courses/')]/@href"
            ).extract()
            # print(modules_url)

            item['modules_en'] = None
            # print("***modules_en 为空")
            # print("item['modules_en']: ", item['modules_en'])

            yield item

        except Exception as e:
            with open("scrapySchool_Canada_Ben/error/" + item['school_name'] +
                      ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 16

Mostrar archivo

    def parse(self, response):
        """Fill a ``ScrapyschoolCanadaBenItem`` for one Mount Allison programme page.

        Location, department, degree name, degree overview, major name,
        modules and careers are scraped from ``response`` with XPath; each
        falls back to ``None`` when its node (or the expected text inside it)
        is missing.  All remaining fields (fees, deadlines, language scores,
        admission texts) are hard-coded values.

        Args:
            response: Scrapy response for a programme page; only ``xpath()``
                and ``url`` are used.

        NOTE(review): the populated item is neither returned nor yielded in
        this block as visible here -- confirm whether a trailing
        ``yield item`` was lost.
        """
        item = getItem.get_item(ScrapyschoolCanadaBenItem)

        # Matches one ` name="value"` / ` name='value'` HTML attribute.
        attr_re = re.compile(r""" [a-zA-Z\-]*=['"].+?['"]""")

        def first(query):
            """Return the first node matched by *query*, or None if none match."""
            found = response.xpath(query).extract()
            return found[0] if found else None

        def squeeze(html):
            """Remove line breaks, tabs and two fixed-width runs of spaces."""
            html = html.replace('\r\n', '').replace('\n', '').replace('\t', '')
            for run in (
                '                                                                                                                         ',
                '                                                                       ',
            ):
                html = html.replace(run, '')
            return html

        # 1. School name
        school_name = 'Mount Allison University'

        # 2. Location (footer address link, tags stripped)
        location = first(
            '//*[@id="page-wrapper"]/footer/div/div[1]/div/address/a')
        if location is not None:
            location = remove_tags(location)

        # 3. Campus
        campus = 'main'

        # 9. Course length: not scraped (see the note stored in `other`).
        duration = None

        # 4. Faculty: the first text node of the curriculum paragraph, kept
        #    from its first "F" onwards; None when there is no "F".
        department = first(
            '//*[@id="programs_curriculum_holder"]/p[1]/text()[1]')
        if department is not None:
            found = re.search('(F.*)', department)
            department = found.group(1) if found else None

        # 4b. Degree name: text up to and including the first space of the
        #     second text node, then normalised.
        degree_name = first(
            '//*[@id="programs_curriculum_holder"]/p[1]/text()[2]')
        if degree_name is not None:
            found = re.search('(.*? ).*', degree_name.lstrip(' '))
            if found:
                degree_name = found.group(1).replace(':', 'BSc').replace(
                    ';', '').replace('BA', 'Bachelor of Arts')
            else:
                degree_name = None

        # 5. Degree overview: intro block with HTML attributes stripped.
        degree_overview_en = attr_re.sub(
            '', ''.join(
                response.xpath('//*[@id="programs_intro_text"]').extract()))

        # 6. Major name (English)
        major_name_en = first(
            '//*[@id="page-wrapper"]/div/div[1]/div/div/div/h1')
        if major_name_en is not None:
            for junk in ('\r\n', '\n', '           ', '\t', '     '):
                major_name_en = major_name_en.replace(junk, '')
            major_name_en = remove_tags(major_name_en)
            major_name_en = major_name_en.replace('&amp; ', '')
            major_name_en = major_name_en.replace('Ancient Greek', 'Greek')

        # 7. Major overview: the page has no separate text, reuse the degree one.
        overview_en = degree_overview_en

        # 8. Start dates
        start_date = '2019-01,2019-09'

        # 10. Modules: curriculum block minus its first paragraph.
        #     NOTE(review): when the block has no <p>...</p> the whole field is
        #     dropped (None); kept as in the original -- confirm this is intended.
        modules_en = first('//*[@id="programs_curriculum_holder"]')
        if modules_en is not None:
            modules_en = squeeze(attr_re.sub('', modules_en))
            lead = re.search('<p>.*?</p>', modules_en)
            modules_en = modules_en.replace(lead.group(),
                                            '') if lead else None
        print(modules_en)

        # 11. Careers
        career_en = first('//*[@id="programs_careers_holder"]')
        if career_en is not None:
            career_en = squeeze(attr_re.sub('', career_en))

        # 12. Application deadline; unknown when the degree name is unknown.
        if degree_name is None:
            deadline = None
        elif 'BMus' in degree_name:
            deadline = '2019-02-10'
        elif 'finearts' in response.url:
            deadline = '2019-02-15'
        else:
            deadline = '2019-03-01'

        # 13. Tuition fee; unknown when the major name is unknown.
        if major_name_en is None:
            tuition_fee = None
        elif 'Aviation' in major_name_en:
            tuition_fee = '32120'
        else:
            tuition_fee = '17,600'

        # 14. Application fee
        apply_fee = '50'

        average_score = 'An average of 3.5 / 70%'

        # 15. Entry requirements
        entry_requirements_en = (
            '<div><div><p><span>International admission requirements:</span></p><p>To be considered for admission, a university-preparatory language arts course and generally a minimum of four additional university-preparatory courses are required. A university-preparatory math (pre-calculus) is required for students applying to the Bachelor of Science or Commerce programs.</p><p>As admissions requirements vary by country and each application to Mount Allison is assessed on an individual basis, you are encouraged to contact your admissions counsellor ([email protected]) to discuss requirements as they apply to your specific academic background. Be sure to include your country, desired program of study, current grade, and age in your email so we can provide accurate and detailed advice.</p><p><strong>Official transcripts</strong>: To be considered \'official,\' transcripts must be forwarded directly to Mount Allison\'s registrar\'s office by the issuing institution. Please note that notarized English translations of required documents should be included if applicable.</p><p><strong>Regular admission</strong>: To be considered for admission, a minimum final grade of 65% or the equivalent is required in all university-preparatory courses reviewed.</p><p><strong>Early admission</strong>: Applicants may be considered for early admission based on their final grade 11 transcripts. A minimum admissions average of 80% is required for consideration.</p><p><strong>Conditional offers of admission</strong>: If an applicant meets Mount Allison’s academic requirements but has not provided proof of English language proficiency they may be granted a conditional offer of admission. '
            'Students must complete a university-preparatory English language program at one of <a>Mount Allison’s partner language training institutes</a><strong> </strong>or provide sufficient <a>proof of English language proficiency</a> before of full offer of admission will be granted.</p> </div><div><p>&nbsp;</p> </div><div></div></div>'
        )

        # 16. Entry requirements for Chinese students
        require_chinese_en = '<p>Minimum entry requirements:<br>Senior Middle School Graduation Certificate<br>High school studies should be university preparatory <br>An average of 3.5 / 70%<br><a>An accepted proof of English proficiency</a><br><br>Students studying at vocational/technical high schools may be considered if they have strong grades and their educational background is relevant to their intended field of study.</p>'

        # 17. Programme-specific requirements: none recorded.
        specific_requirement_en = None

        # 18-23. Gaokao / Huikao fields and 34. AP: the original looked these
        # up with an empty XPath expression, which can never produce a node,
        # so they were always None.
        gaokao_zs = None
        gaokao_score_wk = None
        gaokao_score_lk = None
        huikao_desc = None
        huikao_zs = None
        ap = None

        # 25. IELTS requirement
        ielts_desc = 'International English Language Testing System – Academic (IELTS) score of 6.5 with no band score lower than 6*'

        # 24. Minimum language requirement: same text as the IELTS one.
        min_language_require = remove_tags(ielts_desc)

        # 26-27. IELTS overall and per-band scores
        ielts = '6.5'
        ielts_l = 6.0
        ielts_s = 6.0
        ielts_r = 6.0
        ielts_w = 6.0

        # 28-31. TOEFL code, requirement, overall and per-band scores
        toefl_code = '0939'
        toefl_desc = 'TOEFL score of 90 (internet test), 580 (paper test), 213 (computer test) (DI Code 0939) with no band score lower than 20'
        toefl = '90'
        toefl_l = 20
        toefl_s = 20
        toefl_r = 20
        toefl_w = 20

        # 32. A-level
        alevel = 'Students who have completed their ‘A’ Level exams with a minimum grade of D may qualify for up to 30 transfer credits.'

        # 33. IB
        ib = (
            'Students who successfully complete the IB diploma may receive up to 30 transfer credits comprised of higher level (HL) or standard level (SL) courses with a score of five (5) or higher, and theory of knowledge with a grade of C or higher.   IB certificate students who complete higher level (HL) courses with a score of five (5) or higher will be assessed for transfer credit for those HL courses to a maximum of 18 credits. Standard level (SL) courses completed as an IB certificate are not eligible for transfer credits.   Below is a list of IB courses that have transferred to Mount Allison in the past to give you an idea of the credits you may receive for successfully completed IB courses. An official transfer credit assessment will be completed for each individual upon the submission and receipt of the official International Baccalaureate Organization (IBO) transcript in early summer.   It is the student\'s responsibility to arrange for an official transcript to be sent directly from the IBO to Mount Allison University. The listing of credit transfers noted below does not guarantee the granting of credit in a specific situation as additional factors may apply. While every effort is made to ensure the currency and accuracy of the data found within the list below, errors may occur. It is the responsibility of the applicant to verify this information with the admitting institution. Not every IB course has been evaluated for credit transfer. Therefore if your courses do not appear in the list below, it may still be possible to receive credit for them. Please contact your admissions team for more information.    '
            'Mount Allison equivalencies of select IB courses        Arabic A Lang & Lit HL — 6 non-designated Arabic Language credits at the 1000 level (3 credits count as distribution under Arts) Biology HL — BIOL 1001 & BIOL 1501 (BIOL 1001 counts as distribution under Natural Science) Biology SL — 3 non-designated Biology credits at the 1000 level (counts as distribution under Natural Science) Business Management HL — COMM 1011 & 3 non-designated Commerce credits at the 2000 level Business Management SL — COMM 1011 Economics HL — ECON 1001 & ECON 1011 (3 credits from ECON 1001 or 1011 counts as distribution under Social World) Economics SL — 3 non-designated Economics credits at the 1000 level (counts as distribution under Social World) English A  Literature HL — ENGL 1201 & 3 non-designated English credits at the 1000 level (ENGL 1201 counts as distribution under Arts) English A Literature SL — 3 non-designated English credits at the 1000 level (counts as distribution under Arts) English A Language & Literature HL — ENGL 1201 & 3 non-designated English credits at the 1000 level (ENGL 1201 counts as distribution under Arts) English A Language & Literature SL — 3 non-designated English credits at the 1000 level (counts as distribution under Arts) Environment & Society SL — GENS 1401 (counts as distribution under Natural Science) French A Literature HL — 6 non-designated French Language credits at the 1000 level (3 credits count as distribution under Arts) French A Literature SL — 3 non-designated French Language credits at the 1000 level (counts as distribution under Arts) French A Language & Literature HL — 6 non-designated French Language credits at the 1000 level (3 credits count as distribution under Arts) French A Language & Literature SL — 3 non-designated French Language credits at the 1000 level (counts as distribution under Arts) French AB SL — 3 non-designated French Language credits at the 1000 level (counts as distribution under Arts) French B HL — 6 '
            'non-designated French Language credits at the 1000 level (3 credits count as distribution under Arts) French B SL — 3 non-designated French Language credits at the 1000 level (3 credits count as distribution under Arts) Geography HL — GENV 1201 & GENS 1401 (GENV 1201 counts as distribution under Social World; GENS 1401 counts as Natural Science) Geography SL — GENS 1401 (counts as distribution under Natural Science) Hindi B HL — 6 non-designated Hindi Language credits at the 1000 level (3 credits count as distribution under Arts) History of Africa & the Middle East HL — 6 non-designated History credits at the 1600 level (3 credits count as distribution under Humanities) History of the Americas HL — 6 non-designated History credits at the 1600 level (3 credits count as distribution under Humanities) History of Asia & Oceania HL — 6 non-designated History credits at the 1600 level (3 credits count as distribution under Humanities) History of Europe HL — 6 non-designated History credits at the 1600 level (3 credits count as distribution under Humanities) History SL — 3 non-designated History credits at the 1600 level (counts as distribution under Humanities) Italian B SL — 3 non-designated Italian Language credits at the 1000 level (counts as distribution under Arts) Mathematics HL — MATH 1111 & 3 non-designated Math credits at the 1000 level (MATH 1111 counts as distribution under Natural Science) Mathematics SL — MATH 1111 (counts as distribution under Natural Science) Norwegian B HL — 6 non-designated Norwegian Language credits at the 1000 level (counts as distribution under Arts) Philosophy HL — 6 non-designated Philosophy credits at the 1000 level Philosophy SL — 3 non-designated Philosophy credits at the 1000 level Physics HL — PHYS 1051 & PHYS 1551 (PHYS 1051 counts as distribution under Natural Science) Physics SL — 3 non-designated Physics credits at the 1000 level (counts as distribution under Natural Science) Psychology HL — PSYC 1011 & 3 non-designated '
            'Psychology credits at the 1000 level Psychology SL — 3 non-designated Psychology credits at the 1000 level Swahili B SL — 3 non-designated Swahili Language credits at the 1000 level (counts as distribution under Arts) Theory of Knowledge — 3 non-designated Elective credits at the 1000 level'
        )

        # 35. Interview description: only for Bachelor of Music programmes.
        if degree_name is not None and 'BMus' in degree_name:
            interview_desc_en = 'Students applying to the Bachelor of Music or Bachelor of Arts with honours/major in Music programs must submit an application for admission to the University as well as a Music Application Form to the Music Department. Bachelor of Music applicants are required to complete an audition. Applicants to the Bachelor of Music or Bachelor of Arts with honours/major in Music programs are required to complete a personal interview with the department. All applicants to the Music program (including Bachelor of Arts with minor in Music) must complete an entrance assessment. Music Application Forms are due February 10. For more information, visit the Music admissions pages.'
        else:
            interview_desc_en = None

        # 36. Portfolio description: only for Bachelor of Fine Arts programmes.
        if degree_name is not None and 'BFA' in degree_name:
            portfolio_desc_en = 'Students applying to the Bachelor of Fine Arts or Bachelor of Arts with a major/minor in Fine Arts programs must submit an application for admission to the University as well as an Art Information Sheet, brief written statement, digital portfolio, and list of works to the Fine Arts Department. Fine Arts application packages are due February 15. For more information, visit the Fine Arts admission pages. '
        else:
            portfolio_desc_en = None

        # 37. Free-form note for the data team (reads: "no course-length field").
        other = '没有课程时长字段'

        # SAT / ACT codes and descriptions
        sat_code = '0939'
        sat1_desc = 'Required: Senior-level English and a minimum of four additional university-preparatory courses. Pre-calculus is required for students applying to the Bachelor of Science or Commerce programs. SAT/ACT scores are not required for admissions purposes but students are encouraged to submit their results if available. '
        sat2_desc = None
        act_code = None
        act_desc = None

        item["ap"] = ap
        item["duration_per"] = 1
        item["duration"] = duration
        item["school_name"] = school_name
        item["location"] = location
        item["campus"] = campus
        item["department"] = department
        item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        item["major_name_en"] = major_name_en
        item["overview_en"] = overview_en
        item["start_date"] = start_date
        item["modules_en"] = modules_en
        item["career_en"] = career_en
        item["deadline"] = deadline
        item["apply_pre"] = 'CAD$'
        item["apply_fee"] = apply_fee
        item["entry_requirements_en"] = entry_requirements_en
        item["tuition_fee_pre"] = 'CAD$'
        item["require_chinese_en"] = require_chinese_en
        item["ielts_desc"] = ielts_desc
        item["ielts"] = ielts
        item["ielts_l"] = ielts_l
        item["ielts_s"] = ielts_s
        item["ielts_r"] = ielts_r
        item["ielts_w"] = ielts_w
        item["toefl_code"] = toefl_code
        item["toefl_desc"] = toefl_desc
        item["toefl_l"] = toefl_l
        item["toefl"] = toefl
        item["toefl_s"] = toefl_s
        item["toefl_r"] = toefl_r
        item["toefl_w"] = toefl_w
        item["interview_desc_en"] = interview_desc_en
        item["portfolio_desc_en"] = portfolio_desc_en
        item["other"] = other
        item["url"] = response.url
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0
        item["tuition_fee"] = tuition_fee
        item["alevel"] = alevel
        item["ib"] = ib
        item["gaokao_zs"] = gaokao_zs
        item["gaokao_score_wk"] = gaokao_score_wk
        item["gaokao_score_lk"] = gaokao_score_lk
        item["specific_requirement_en"] = specific_requirement_en
        item["huikao_desc"] = huikao_desc
        item["huikao_zs"] = huikao_zs
        item["min_language_require"] = min_language_require
        item["sat_code"] = sat_code
        item["sat1_desc"] = sat1_desc
        item["sat2_desc"] = sat2_desc
        item["act_code"] = act_code
        item["act_desc"] = act_desc
        item["average_score"] = average_score

Ejemplo n.º 17

Mostrar archivo

Archivo: MountSaintVincentUniversity_U.py Proyecto: histudent/python_spider

    def parse(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)
        item['school_name'] = "Mount Saint Vincent University"
        item['url'] = response.url
        print("===========================")
        print(response.url)

        item['other'] = '''问题描述：1. 没有课程长度，申请要求
        2.专业描述和课程设置、就业为空的是页面没有的'''
        '''公共字段'''
        # item['campus'] = 'Hamilton'
        item['location'] = 'Halifax, Nova Scotia,Canada'
        # item['sat_code'] = item['toefl_code'] = '5076'
        # item['act_code'] = '0719'

        # item['duration'] = '4'
        # item['duration_per'] = 1
        # http://www.msvu.ca/en/home/beamountstudent/money/tuitionfees/detailedfeeinformation.aspx
        item['apply_pre'] = 'CAD$'
        item['apply_fee'] = '40'

        # http://www.msvu.ca/en/home/beamountstudent/internationaleducationcentre/AdmissionRequirements/ApplicationDeadlines.aspx
        item['start_date'] = '9月'
        item['deadline'] = '2019-06-21'

        # http://www.msvu.ca/en/home/beamountstudent/money/tuitionfees/default.aspx
        item['tuition_fee_pre'] = 'CAD$'
        item['tuition_fee'] = '16,586 - 16,969'

        # http://www.msvu.ca/en/home/programsdepartments/academiccalendars/undergradprograms/admissions/requirements/internationalrequirements.aspx#lang
        item['ielts_desc'] = '6.5 (no individual score below 6.0)'
        item['ielts'] = '6.5'
        item['ielts_l'] = '6.0'
        item['ielts_s'] = '6.0'
        item['ielts_r'] = '6.0'
        item['ielts_w'] = '6.0'
        item['toefl_desc'] = '86 - 92 (no individual score below 21)'
        item['toefl'] = '86-92'
        item['toefl_l'] = '21'
        item['toefl_s'] = '21'
        item['toefl_r'] = '21'
        item['toefl_w'] = '21'

        # http://www.msvu.ca/en/home/programsdepartments/academiccalendars/undergradprograms/admissions/requirements/highschoolrequirements.aspx
        item['ap'] = """Advanced Placement Program (AP)
Mount Saint Vincent University participates in the Advanced Placement Program administered by the College Board (Princeton, New Jersey).
Upon presentation of Advanced Placement credentials, students may receive up to a maximum of 5.0 units of transfer credits for Advanced Placement Examinations provided that they have achieved grades of 4 or 5."""
        item['ib'] = """International Baccalaureate (IB)
Mount Saint Vincent University welcomes applicants holding the International Baccalaureate (IB) diploma. Students enrolled in the IB program may receive transfer credits for a maximum of 5.0 units for a combination of the following:
 Course Type	            Value
Higher Level IB 	        1.0 unit at the 1000 level for each with a final grade of five or higher upon presentation of the final transcript or completed diploma. 
Standard Level IB	        0.5 unit at the 1000 level for each with a final grade of five or higher upon presentation of the final transcript or completed diploma.
Theory of Knowledge (ToK)	1.0 unit of ARTS elective at the 1000 level with a final grade of “B” or higher upon presentation of the final transcript or completed diploma.
Students receiving transfer credit for IB courses are advised to contact the departments or academic advising to determine the effect of those credits on their plans for future study and their career goals."""
        # item['act_desc'] = item['sat1_desc'] = "SAT or ACT scores will also be considered"

        # http://www.msvu.ca/en/home/beamountstudent/internationaleducationcentre/AdmissionRequirements/default.aspx
        item[
            'require_chinese_en'] = '<p>Senior Middle Two and Three results as well as a graduation certificate. Students may also be asked to report scores from the National College Entrance Examination with grades of 70% or above.</p><p>English Language test scores required</p>'
        item['average_score'] = '70'
        item['specific_requirement_en'] = "70% with no mark below 60%"
        item['entry_requirements_en'] = """<div>
<h3>International Applicants</h3>
<p>International applicants are expected to have completed a preparatory program that
leads to university entrance in their own country. <br /> In general,&nbsp;secondary school&nbsp;applicants have an average of 70% (5 best scoring academic subjects&nbsp;considered) during their final year of schooling. Applicants looking to transfer from another post-secondary institution must demonstrate an overall GPA of 2.0. Refer to chart below for program specific requirements.</p><p>Possession of minimum entrance requirements does not
guarantee admission to the University. Applicants must submit proof of
ability to follow a university program taught entirely in English.
If English is not your first language, please submit official reports with acceptable scores.</p><p>Applications are considered on an individual basis.</p></div>"""
        try:

            major_name_en = response.xpath(
                "//h1[@class='no-margin']//text()").extract()
            clear_space(major_name_en)
            item['major_name_en'] = ''.join(major_name_en).replace(
                'Bachelor of Arts -', '').strip()
            # print("item['major_name_en']: ", item['major_name_en'])

            degree_name = response.xpath(
                "//div[@class='breadcrumbs']//ul//a[@href='http://www.msvu.ca/en/home/programsdepartments/BA/default.aspx'][contains(text(),'Bachelor of Arts')]//text()|"
                "//div[@class='breadcrumbs']//ul//a[@href='http://www.msvu.ca/en/home/programsdepartments/bachelorofscience/default.aspx'][contains(text(),'Bachelor of Science')]//text()"
            ).extract()
            clear_space(degree_name)
            item['degree_name'] = ''.join(degree_name).strip()
            if "BSc" in item['major_name_en']:
                item['degree_name'] = "BSc"
                item['major_name_en'] = item['major_name_en'].replace(
                    "BSc", "").strip()
            if "Bachelor of Arts" in ''.join(major_name_en):
                item['degree_name'] = "Bachelor of Arts"
            print("item['major_name_en']: ", item['major_name_en'])
            print("item['degree_name']: ", item['degree_name'])
            '''overview_en'''
            # //span[@class='h2_inside']
            # overview_en = response.xpath("//span[@class='h2_inside']|//div[contains(@id,'tmpl_cbins')]").extract()
            overview_en = response.xpath(
                "//h1[contains(text(),'Highlights')]/following-sibling::p[position()<3]|"
                "//span[@class='h1_inside']/../following-sibling::p[position()<3]|"
                "//h2[contains(text(),'French Program Highlights')]/preceding-sibling::h4[1]|"
                "//h2[contains(text(),'Program Highlights')]/following-sibling::p[1]"
            ).extract()
            if len(overview_en) == 0:
                overview_en = response.xpath(
                    "//span[contains(text(),'Program Highlights')]/../following-sibling::p[position()<3]"
                ).extract()
            if len(overview_en) > 0:
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en)).replace(
                        "<p>Admissions Requirements »Tuition &amp; Fees »</p>",
                        "").strip()
            print("item['overview_en']: ", item['overview_en'])
            '''modules'''
            modules_url = response.xpath(
                "//a[contains(text(),'Courses')]/@href").extract()
            # print(modules_url)
            if len(modules_url) > 0:
                item['modules_en'] = self.parse_modules(modules_url[0])
            # print("item['modules_en']: ", item['modules_en'])
            '''career_en'''
            career_en = response.xpath(
                "//h1[contains(text(),'Future Possibilities')]/following-sibling::p[1]|"
                "//h1[contains(text(),'Our Graduates')]/following-sibling::p[position()<3]|"
                "//h1[contains(text(),'Advancing your Career')]/following-sibling::p[1]|"
                "//h1[contains(text(),'Career Options')]/..").extract()
            if len(career_en) == 0:
                career_en = response.xpath(
                    "//span[contains(text(),'Program Highlights')]/../following-sibling::p[position()<3]"
                ).extract()
            if len(career_en) > 0:
                item['career_en'] = remove_class(clear_lianxu_space(career_en)).replace("Read what our students say »", "") \
                    .replace("Read about some of their", "").replace("career paths »", "").strip()
            print("item['career_en']: ", item['career_en'])
            yield item

        except Exception as e:
            with open("scrapySchool_Canada_Ben/error/" + item['school_name'] +
                      ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 18

Mostrar archivo

Archivo: UniversityofManitoba_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)
        item['school_name'] = "University of Manitoba"
        item['url'] = response.url
        print("===========================")
        print(response.url)
        item['other'] = '''问题描述：1.学位展示不规律，拆分可能存在不准确的情况
        2.有课程设置和就业信息、课程长度为空的是详情页没有的
        3.学费为空是没有匹配上的专业'''
        '''公共字段'''
        # item['campus'] = 'Hamilton'
        # item['location'] = 'Fredericton, NB, Canada'
        item['sat_code'] = item['toefl_code'] = '0973'

        # http://umanitoba.ca/student/admissions/finances/tuition-fees.html
        item['apply_pre'] = 'CAD$'
        item['apply_fee'] = '120'

        # http://umanitoba.ca/student/records/deadlines/index.html
        # item['start_date'] = '1月,5月,9月'
        # item['deadline'] = '2018-12-31,2019-03-01'
        # http://umanitoba.ca/student/admissions/international/asia.html
        item['tuition_fee_pre'] = 'CAD$'
        # item['tuition_fee'] = ''

        # http://umanitoba.ca/student/admissions/media/pdf/Requirement_sheet_-_General.pdf
        item['ielts_desc'] = '6.5 overall band score'
        item['ielts'] = '6.5'
        item[
            'toefl_desc'] = '(86 total score with a min. of 20 in each component) '
        item['toefl'] = '86'
        item['toefl_l'] = '20'
        item['toefl_s'] = '20'
        item['toefl_r'] = '20'
        item['toefl_w'] = '20'
        item[
            'entry_requirements_en'] = """<strong>IF YOU ARE A HIGH SCHOOL STUDENT</strong>
<p>If you’re coming to university directly from high school or have completed less than one year of university studies, you’ll take the direct entry route into a faculty or program: this means beginning in University 1 (U1), or applying to a program that offers a direct entry option. U1 is a unique approach to your first year at the U of M, giving you the opportunity to design an individualized schedule that meets the admission and/or first year requirements for one or more target degree programs. U1 will not add any time or cost to your degree; it serves as year 1 of any 3 or 4 year degree program</p>"""

        # http://umanitoba.ca/student/admissions/international/common-international-curriculums.html
        item[
            'alevel'] = 'Minimum 2 courses at Advanced (A) level and 3 courses at Ordinary (O) level: no grades below a ‘D’ (GCE)'
        item[
            'ib'] = "An IB diploma with 3 courses at the higher level (HL) and 3 courses at the standard level (SL): no grades below a ‘4’ (IB)"

        # http://umanitoba.ca/student/admissions/media/pdf/Requirement_sheet_-_USA.pdf
        item['act_desc'] = item['sat1_desc'] = "NO ACT/SAT REQUIRED"
        item[
            'ap'] = """Advanced Placement (Ap) And International Baccalaureate (IB)
        The University of Manitoba recognizes the Advanced Placement (AP) and International Baccalaureate (IB) programs for admission, scholarships, and university transfer credit. Credits will be transferred as follows:
        AP: 4=B; 5=A; 5 and 95% on the final exam=A+
        IB: 4=B; 5=B+; 6=A; 7=A+"""

        # http://umanitoba.ca/student/admissions/media/pdf/Requirement_sheet_-_General.pdf
        item[
            'require_chinese_en'] = '<p>High school graduation with a minimum average of 70% in three (3) senior level courses, with at least 60% in first language literature. </p>'
        try:
            departmentMajor = response.xpath(
                "//div[@id='centerHeader']//text()").extract()
            departmentMajor_str = ''.join(departmentMajor).strip()
            print(departmentMajor_str)
            if "Diploma" not in departmentMajor_str:
                if "- " in departmentMajor_str:
                    programmeDepartment = departmentMajor_str.split("- ")
                    # 专业
                    item['major_name_en'] = programmeDepartment[-1]
                    # 学院
                    item['department'] = programmeDepartment[0]
                elif ":" in departmentMajor_str:
                    programmeDepartment = departmentMajor_str.split(":")
                    # 专业
                    item['major_name_en'] = programmeDepartment[-1]
                    # 学院
                    item['department'] = programmeDepartment[0]
                elif "&" in departmentMajor_str:
                    programmeDepartment = departmentMajor_str.split("&")
                    # 专业
                    item['major_name_en'] = programmeDepartment[-1]
                    # 学院
                    item['department'] = programmeDepartment[0]
                else:
                    item['department'] = item[
                        'major_name_en'] = departmentMajor_str
                item['department'] = item['department'].strip()
                print("item['department']: ", item['department'])

                item['major_name_en'] = item['major_name_en'].strip()
                print("item['major_name_en']: ", item['major_name_en'])

                overview = response.xpath(
                    "//*[contains(text(),'Program description')]/../../following-sibling::p[1]"
                ).extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview)).replace("<p></p>",
                                                          "").strip()
                if item['overview_en'] == "<p><strong><span>Program options</span></strong></p>":
                    overview_xpath = "//span[contains(text(),'Program description')]/../../../text()"
                    overview = response.xpath(overview_xpath).extract()
                    item['overview_en'] = "<p>" + remove_class(
                        clear_lianxu_space(overview)).strip() + "</p>"
                if item['overview_en'] == "":
                    overview_xpath = "//span[contains(text(),'Program description')]/../following-sibling::p[1]"
                    overview = response.xpath(overview_xpath).extract()
                    item['overview_en'] = remove_class(
                        clear_lianxu_space(overview)).replace("<p></p>",
                                                              "").strip()
                if item['overview_en'] == "":
                    item['overview_en'] = None
                # print("item['overview_en']: ", item['overview_en'])

                tmp_html = response.text
                # 就业
                career = response.xpath(
                    "//strong[contains(text(),'Professional opportunities')]/..|"
                    "//strong[contains(text(),'Professional opportunities')]/../following-sibling::ul[1]"
                ).extract()
                if len(career) == 0:
                    career = response.xpath(
                        "//*[contains(text(),'Professional opportunities')]/../..|"
                        "//*[contains(text(),'Professional opportunities')]/../../following-sibling::ul[1]"
                    ).extract()
                career_end = remove_class(clear_lianxu_space(career))
                # print("career_end: ", career_end)
                if ''.join(career_end).replace("<ul></ul>", "").strip(
                ) == '<p><strong><span>Professional opportunities</span></strong></p>' or '>Program description' in career_end or ''.join(
                        career_end
                ) == '<span><strong>Professional opportunities</strong></span>':
                    career_key1 = r'<p><strong><span style="font-size: 14pt">Professional opportunities'
                    if career_key1 not in tmp_html:
                        career_key1 = r'<strong><span style="font-size: 14pt">Professional opportunities'
                        if career_key1 not in tmp_html:
                            career_key1 = r'<p><span style="font-size: 14pt"><strong>Professional opportunities'
                    career_key2 = r'<p><strong><span style="font-size: 14pt">Admission '
                    if career_key2 not in tmp_html:
                        career_key2 = r'<p><span style="font-size: 14pt"><strong>Admission '
                    career_end = getContentToXpath(tmp_html, career_key1,
                                                   career_key2)
                    career_end = remove_class(clear_lianxu_space([career_end]))

                    if len(career_end) == 0:
                        career = response.xpath(
                            "//*[contains(text(),'Professional opportunities')]/../..|"
                            "//*[contains(text(),'Professional opportunities')]/../../following-sibling::*[1]"
                        ).extract()
                        career_end = remove_class(clear_lianxu_space(career))
                # print("career_end2: ", career_end)
                if career_end == "":
                    item['career_en'] = None
                else:
                    item['career_en'] = career_end
                # print("item['career_en']: ", item['career_en'])
                '''modules'''
                modules_key1 = r'<p><strong><span style="font-size: 14pt">Interesting courses and unique opportunities'
                modules_key2 = r'<p><strong><span style="font-size: 14pt">Professional opportunities'
                if modules_key2 not in tmp_html:
                    modules_key2 = r'<p><span style="font-size: 14pt"><strong>Professional opportunities'
                if modules_key1 in tmp_html and modules_key2 in tmp_html:
                    modules_list1 = getContentToXpath(tmp_html, modules_key1,
                                                      modules_key2)
                    if len(modules_list1) > 0:
                        item['modules_en'] = remove_class(
                            clear_lianxu_space([modules_list1]))
                if item['modules_en'] is None:
                    modules = response.xpath(
                        "//span[contains(text(),'Interesting courses and unique opportunities')]/..|"
                        "//span[contains(text(),'Interesting courses and unique opportunities')]/../../following-sibling::ul[1]"
                    ).extract()
                    if len(modules) > 1:
                        item['modules_en'] = remove_class(
                            clear_lianxu_space(modules))
                if item['modules_en'] is None:
                    modules = response.xpath(
                        "//span[contains(text(),'Interesting courses and unique opportunities')]/..|"
                        "//span[contains(text(),'Interesting courses and unique opportunities')]/../following-sibling::ul[1]|"
                        "//strong[contains(text(),'Interesting courses and unique opportunities')]/..|"
                        "//strong[contains(text(),'Interesting courses and unique opportunities')]/../../following-sibling::ul[1]"
                    ).extract()
                    if len(modules) > 0:
                        item['modules_en'] = remove_class(
                            clear_lianxu_space(modules))
                # print("item['modules_en']: ", item['modules_en'])
                '''学费'''
                tuition_fee_dict = {}
                tuition_fee_key = [
                    "University 1",
                    "Faculty of Agricultural",
                    "Faculty of Agricultural & Food Sciences",
                    "Faculty of Architecture",
                    "Faculty of Arts",
                    "Asper School of Business",
                    "Faculty of Education",
                    "Faculty of Engineering",
                    "Clayton H. Riddell Faculty of  Environment, Earth, & Resources",
                    "School of Art",
                    "Health Studies",
                    "Health Sciences",
                    "Faculty of Kinesiology & Recreation Management",
                    "Faculty of Law",
                    "Desautels Faculty of Music",
                    "Nursing",
                    "Faculty of Science",
                    "Faculty of Social Work",
                ]
                tuition_fee_value = [
                    "16,000",
                    "18,000",
                    "18,000",
                    "18,500",
                    "14,500",
                    "20,000",
                    "16,000",
                    "21,500",
                    "17,500",
                    "18,500",
                    "16,500",
                    "16,500",
                    "19,500",
                    "26,500",
                    "18,000",
                    "18,500",
                    "17,500",
                    "19,000",
                ]
                for i in range(len(tuition_fee_key)):
                    tuition_fee_dict[tuition_fee_key[i]] = tuition_fee_value[i]
                item['tuition_fee'] = tuition_fee_dict.get(item['department'])
                if item['major_name_en'] == "College of Nursing":
                    item['tuition_fee'] = "18,500"
                # print("item['tuition_fee']: ", item['tuition_fee'])

                if item['major_name_en'] == "Health Sciences" or item[
                        'major_name_en'] == "Health Studies" or item[
                            'major_name_en'] == "Family Social Sciences":
                    item['department'] = "Rady Faculty of Health Sciences"
                '''start_date'''
                start_date_dict = {
                    "University 1":
                    "1月,5月,9月",
                    "Faculty of Agricultural & Food Sciences":
                    "9月",
                    "Faculty of Architecture":
                    "9月",
                    "Faculty of Arts":
                    "1月,5月,9月",
                    "Asper School of Business":
                    "9月",
                    "Faculty of Education":
                    "9月",
                    "Faculty of Engineering":
                    "9月",
                    "Clayton H. Riddell Faculty of  Environment, Earth, & Resources":
                    "1月,5月,9月",
                    "School of Art":
                    "9月",
                    "Rady Faculty of Health Sciences":
                    "9月",
                    "Faculty of Kinesiology & Recreation Management":
                    "9月",
                    "Faculty of Law":
                    "9月",
                    "Desautels Faculty of Music":
                    "9月",
                    "Faculty of Science":
                    "1月,5月,9月",
                    "Faculty of Social Work":
                    "9月",
                }
                item['start_date'] = start_date_dict.get(item['department'])
                print("item['start_date']: ", item['start_date'])
                '''deadline'''
                deadline_dict = {
                    "University 1":
                    "2019-12-01,2019-04-01,2019-03-01",
                    "Faculty of Agricultural & Food Sciences":
                    "2019-03-01",
                    "Faculty of Architecture":
                    "2019-03-01",
                    "Faculty of Arts":
                    "2019-10-01,2019-03-01,2019-03-01",
                    "Asper School of Business":
                    "2019-03-01",
                    "Faculty of Education":
                    "2019-03-01",
                    "Faculty of Engineering":
                    "2019-03-01",
                    "Clayton H. Riddell Faculty of  Environment, Earth, & Resources":
                    "2019-10-01,2019-03-01,2019-03-01",
                    "School of Art":
                    "2019-03-01",
                    "Rady Faculty of Health Sciences":
                    "2019-03-01",
                    "Faculty of Kinesiology & Recreation Management":
                    "2019-03-01",
                    "Faculty of Law":
                    "2019-03-01",
                    "Desautels Faculty of Music":
                    "2019-01-15",
                    "Faculty of Science":
                    "2019-10-01,2019-03-01,2019-03-01",
                    "Faculty of Social Work":
                    "2019-03-01",
                }
                item['deadline'] = deadline_dict.get(item['department'])
                print("item['deadline']: ", item['deadline'])

                # degree_name_list = response.xpath("//strong[contains(text(),'Degree options')]/..//text()").extract()
                # print("degree_name_list: ", degree_name_list)
                # 学位名称

                d_key1 = r"<p><strong>Degree options"
                d_key2 = r"<p><strong>Program options"
                if d_key2 not in tmp_html:
                    d_key2 = r'<p><strong><span style="font-size: 14pt">Interesting courses and unique opportunities'
                degree_name_list1 = getContentToXpath(tmp_html, d_key1, d_key2)
                # print(degree_name_list1)
                degree_name_list_str = remove_tags(degree_name_list1).replace(
                    "Degree options", "").strip()
                # print("degree_name_list_str: ", degree_name_list_str)
                degree_name_list = degree_name_list_str.split('\n')
                # print("degree_name_list===: ", degree_name_list)
                if len(degree_name_list_str) == 0:
                    degree_name_list = response.xpath(
                        "//strong[contains(text(),'Degree options')]/..//text()"
                    ).extract()
                    if ''.join(degree_name_list).strip() == "Degree options":
                        degree_name_list = response.xpath(
                            "//strong[contains(text(),'Degree options')]/../following-sibling::p[1]//text()"
                        ).extract()
                    if len(degree_name_list) == 0:
                        degree_name_list = response.xpath(
                            "//strong[contains(text(),'Degree options')]/../following-sibling::ul[1]//text()"
                        ).extract()
                    clear_space(degree_name_list)
                    # degree_name_list.remove('Degree options')
                print("degree_name_list: ", degree_name_list)

                if len(degree_name_list) > 0:
                    for d in degree_name_list:
                        # 两种情况，一种包含bachelor of，一种不包含只有学位简写
                        if "Bachelor of" in d or "B." in d:
                            print("d===========", d)
                            degree_name_pre = re.findall(
                                r"[\w\W]+Bachelor\sof", d)
                            # print(degree_name_pre, "---")
                            if "–" in d:
                                duration_degree_name = d.split('– ')
                                duration_re = re.findall(
                                    r"[\.\w\s]+year", duration_degree_name[-1])
                                print(duration_re, "===")
                                if len(duration_re) > 0:
                                    item['duration'] = ''.join(
                                        duration_re).replace("year",
                                                             "").replace(
                                                                 " or",
                                                                 ",").strip()
                                    item['duration_per'] = 1
                                item['degree_name'] = duration_degree_name[
                                    0].replace(
                                        ''.join(degree_name_pre).replace(
                                            "Bachelor of", "").strip(),
                                        "").strip()
                            elif "- " in d or "- ":
                                duration_degree_name = d.split('- ')
                                duration_re = re.findall(
                                    r"[\.\w\s]+year|[\.\w\s]+Year",
                                    duration_degree_name[-1])
                                print(duration_re, "===")
                                if len(duration_re) > 0:
                                    item['duration'] = ''.join(
                                        duration_re).replace("year",
                                                             "").replace(
                                                                 " or",
                                                                 ",").strip()
                                    item['duration_per'] = 1
                                item['degree_name'] = duration_degree_name[
                                    0].replace(
                                        ''.join(degree_name_pre).replace(
                                            "Bachelor of", "").strip(),
                                        "").strip()
                            else:
                                item['degree_name'] = d.replace(
                                    ''.join(degree_name_pre).replace(
                                        "Bachelor of", "").strip(),
                                    "").strip()
                            if item['major_name_en'] == "Bachelor of Education":
                                item['degree_name'] = "Bachelor of Education"
                            if item['major_name_en'] == "Health Studies":
                                item[
                                    'degree_name'] = "Bachelor of Health Studies"
                            print("item['degree_name']: ", item['degree_name'])
                            print("item['duration']: ", item['duration'])
                            print("item['duration_per']: ",
                                  item['duration_per'])

                            yield item
                # else:
                #     if item['major_name_en'] == "Bachelor of Education":
                #         item['degree_name'] = "Bachelor of Education"
                #     if item['major_name_en'] == "Health Studies":
                #         item['degree_name'] = "Bachelor of Health Studies"
                #     yield item
        except Exception as e:
            with open("scrapySchool_Canada_Ben/error/" + item['school_name'] +
                      ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 19

Mostrar archivo

Archivo: Nova_Soctia_College_of_Arts_and_Design_U.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Fill a ``ScrapyschoolCanadaBenItem`` from one programme page.

        Page-specific fields are scraped from ``response`` on a best-effort
        basis: if a node is missing or cannot be processed, the field is
        stored as ``None`` instead of aborting the page.  School-wide values
        (fees, dates, language scores, admission text) are hard-coded.

        Args:
            response: Page response; only ``xpath()`` and ``url`` are used.

        Returns:
            None.  The item is populated and every programme link found on
            the page is printed, but nothing is emitted.
        """
        item = getItem.get_item(ScrapyschoolCanadaBenItem)

        # Matches one ` name="value"` attribute inside a serialized HTML tag.
        attr_pattern = re.compile(r""" [a-zA-Z\-]*=['"].+?['"]""")

        def strip_layout(html):
            """Drop tag attributes, line breaks, tabs and long indent runs."""
            html = attr_pattern.sub('', html)
            return html.replace('\r\n', '').replace(
                '\n', ''
            ).replace('\t', '').replace(
                '                                                                                                                         ',
                ''
            ).replace(
                '                                                                       ',
                '')

        # 1. School name
        school_name = 'Nova Soctia College of Arts and Design'

        # 2. Location
        try:
            location = remove_tags(
                response.xpath(
                    '//*[contains(text(),"Location Offered")]/following-sibling::span'
                ).extract()[0])
        except Exception:
            location = None

        # 3. Campus: the page exposes no separate campus, so reuse location.
        campus = location

        # 4. Department: text after the last icon in the fifth tab panel.
        try:
            department = response.xpath(
                '//*[@id="tabbed-content"]/div[5]').extract()[0]
            department = department.replace(
                ' class="fa fa-graduation-cap"', '').replace(
                    ' class="fa fa-university"', '')
            department = department.replace(' <i> </i> ', '<i></i>').replace(
                '<i> </i> ', '')
            department = department.split('<i></i>')[-1]
        except Exception:
            department = None

        # Degree name: the first keyword found in the page title or the
        # first <h2 id=...> decides the degree.  Order matters.  'BBA' and
        # 'Digital Media Studies' sit directly before 'BA' and
        # 'Media Studies', which contain/are contained in them and would
        # otherwise make those two entries unreachable.
        degree_keywords = (
            ('Bachelor of Education', 'Bachelor of Education'),
            ('BSc', 'Bachelor of Science'),
            ('BBA', 'Bachelor of Business Administration'),
            ('BA', 'Bachelor of Arts'),
            ('Bachelor of Hospitality Management',
             'Bachelor of Hospitality Management'),
            ('Bachelor of Tourism Management',
             'Bachelor of Tourism Management'),
            ('Minor', 'Minor Pass'),
            ('Bachelor of Science', 'Bachelor of Science'),
            ('Geoscience', 'Bachelor of Science'),
            ('Bachelor of Arts', 'Bachelor of Arts'),
            ('Bachelor of Natural Resource Protection',
             'Bachelor of Natural Resource Protection'),
            ('Bachelor of Social Work', 'Bachelor of Social Work'),
            ('Education', 'Bachelor of Education'),
            ('Studies in Women and Gender', 'Bachelor of Arts'),
            ('Bachelor of Anthropology', 'Bachelor of Anthropology'),
            ('Political Studies', 'Bachelor of Arts'),
            ('Visual Art', 'Bachelor of Art'),
            ('Philosophy', 'Bachelor of Arts'),
            ('Sociology', 'Bachelor of Arts'),
            ('Liberal', 'Bachelor of Arts'),
            ('Physical', 'Bachelor of Education'),
            ('Digital Media Studies', 'Digital Media Studies'),
            ('Media Studies', 'Bachelor of Arts'),
            ('Economics', 'Bachelor of Arts'),
            ('Global', 'Bachelor of Arts'),
            ('History', 'Bachelor of Arts'),
            ('Business', 'Bachelor of Arts'),
            ('Bachelor of Music', 'Bachelor of Music'),
            ('Earth Science', 'Bachelor of Arts'),
            ('Creative Writing', 'Bachelor of Arts'),
            ('Bachelor of Interior Design', 'Bachelor of Interior Design'),
            ('Bachelor of Design', 'Bachelor of Design'),
        )
        try:
            page_title = response.xpath('//*[@id="page-title"]').extract()[0]
            first_heading = response.xpath('//h2[@id][1]').extract()[0]
            page_title = remove_tags(page_title)
            first_heading = remove_tags(first_heading)
            degree_name = next(
                (degree for keyword, degree in degree_keywords
                 if keyword in page_title or keyword in first_heading),
                None)
            # No keyword matched: fall back on the URL section.
            if degree_name is None and (
                    'https://www.viu.ca/programs/arts-humanities-social-sciences'
                    in response.url):
                degree_name = 'Bachelor of Arts'
        except Exception:
            degree_name = None

        # Programme length, with the " Years" suffix removed.
        try:
            duration = remove_tags(
                response.xpath(
                    '//*[contains(text(),"Program Length")]/following-sibling::span'
                ).extract()[0])
            duration = duration.replace(' Years', '').replace('\n', '')
        except Exception:
            duration = None

        # 5. Degree overview (HTML kept, attributes stripped)
        try:
            degree_overview_en = attr_pattern.sub(
                '', ''.join(
                    response.xpath(
                        '//h2[contains(text(),"Program")]/following-sibling::div/span'
                    ).extract()))
        except Exception:
            degree_overview_en = None

        # 7. Major overview: same text as the degree overview.
        overview_en = degree_overview_en

        # 10. Modules
        try:
            modules_en = strip_layout(
                response.xpath(
                    '//*[@id="program-outline"]/div[1]/span').extract()[0])
        except Exception:
            modules_en = None

        # 11. Career prospects
        try:
            career_en = strip_layout(
                response.xpath(
                    '//a[contains(text(),"Career Opportunities")]/../../following-sibling::div'
                ).extract()[0])
        except Exception:
            career_en = None

        # 15. Entry requirements (plain text)
        try:
            entry_requirements_en = remove_tags(
                response.xpath(
                    '//a[contains(text(),"Admission Requirements")]/../../following-sibling::div'
                ).extract()[0])
        except Exception:
            entry_requirements_en = None

        # 16. Requirements for Chinese applicants: fixed admission text.
        # Written as adjacent literals; the pieces join with no separator.
        require_chinese_en = (
            '<div><h2><a></a>Admission</h2><p>Vancouver Island University accepts international students under <em>one</em> of the following conditions:</p><ul><li>direct applications (online or paper applications);</li><li>applications through authorized representatives;</li><li>institution-to-institution exchange programs;</li><li>sponsorship through inter-governmental agreements;</li><li>winner of academic competitions or international scholarships.</li></ul><h2>Admission to English Language Studies in the English Language Centre</h2><p>All students must:</p><ul><li>be an international visitor or hold a&nbsp;<a>study permit</a>;</li><li>complete a <a>Vancouver Island University ESL language assessment test (or equivalent)</a></li></ul><p>Please keep in mind that students applying for conditional admission to an undergraduate (or diploma, certificate or trades) program at VIU, or thinking of applying upon completion of ESL programming, will still be required to provide proof of high school graduation and their secondary school transcripts.</p><h2>The High School</h2><p>All students must:</p><ul><li>have completed Grade 9, or equivalent;</li><li>turn 15 years of age by December 31 following a September enrollment;</li><li>be under 19 years of age.</li></ul><p>Admission is selective and is based on academic and leadership potential. For more info please visit <a>The High School at VIU</a>.</p><h2>Undergraduate Academic and Vocational Programs</h2><ul><li>Graduation from high school or equivalent.</li><li>Proof of English language proficiency is required from applicants for whom English is not their first language or from those whose education was completed in any country where English is not the official language.</li><li>For students who meet the academic requirements but not the English language requirement, conditional acceptance to VIU may be granted. '
            'In this case, admission is granted pending completion of <a>VIU’s English Language Centre program</a> (level AP5). Exceptions may apply.</li><li>Up to 30 advanced credits may be granted for <a>Advanced Placement</a> and <a>International Baccalaureate</a> (higher level) courses.</li><li>Academic programs may have additional program admission requirements and course prerequisites. Please check your specific program requirements from&nbsp;<a>VIU programs</a>.</li><li>International students who are BC high school graduates must have a minimum grade of “C” in English 12 for direct entry into an academic program.</li></ul><h2><a></a>English Language Requirements for Academic Programs</h2><p>To meet the English language requirements for academic programs, students must have completed one of the following:</p><table><tbody><tr><td><h4><strong>Test</strong></h4></td><td><h4><strong>Minimum score</strong></h4></td></tr><tr><td><ul><li><h5>TOEFL Paper Based Test</h5></li></ul></td><td><h5><strong>550 </strong>(no section below 56)</h5></td></tr><tr><td><ul><li><h5>TOEFL IBT</h5></li></ul></td><td><h5><strong>88 </strong>(no section below 20)</h5></td></tr><tr><td><ul><li><h5>IELTS (Academic)</h5></li></ul></td><td><h5><strong>6.5 </strong>(no band below 6.0)</h5></td></tr><tr><td><ul><li><h5>CAEL</h5></li></ul></td><td><h5><strong>60</strong></h5></td></tr><tr><td><ul><li><h5>Cambridge Certificate of Proficiency in English (CPE)</h5></li></ul></td><td><h5><strong>176</strong> overall</h5></td></tr><tr><td><ul><li><h5>Cambridge Certificate of Advanced English (CAE)</h5></li></ul></td><td><h5><strong>176</strong> overall</h5></td></tr><tr><td><ul><li><h5>English 12 (BC)</h5></li></ul></td><td><h5>Min. '
            '“<strong>C</strong>”</h5></td></tr><tr><td><ul><li><h5>Pearson (PTE)</h5></li></ul></td><td><h5><strong>60 </strong>(no section below 60)</h5></td></tr><tr><td><ul><li><h5>International Baccalaureate English A1/A2</h5></li></ul></td><td><h5>Higher Level (HL)/ Standard Level (SL) grade <strong>3</strong> or higher</h5></td></tr><tr><td><ul><li><h5>VIU English Language Centre</h5></li></ul></td><td><h5>Successful completion of University Preparation <strong>Level 5</strong></h5></td></tr><tr><td><ul><li><h5>Advanced Placement (AP) English Language and Composition or English Literature and Composition</h5></li></ul></td><td><h5>Grade<strong> 2</strong> or higher</h5></td></tr><tr><td><ul><li><h5>Recognized university where English is the language of instruction.</h5></li></ul></td><td><h5>Completion of six credits of post-secondary English composition and literature with a minimum grade of “<strong>C</strong>”</h5></td></tr><tr><td><ul><li><h5>Language Proficiency Index (LPI)</h5></li></ul></td><td><h5>Score<strong> 5</strong> or higher</h5></td></tr><tr><td><ul><li><h5>General Certificate of Secondary Education (GCSE)</h5></li></ul></td><td><h5>English at the O-level with a minimum grade of <strong>C or 4</strong></h5></td></tr></tbody></table><h2>Admission to Post-Degree Diploma in Business Studies</h2><ul><li>Business or non-business Bachelor\'s degree from a recognized institution.</li><li>Meet VIU’s <a>English language requirements for academic programs</a>. '
            'Individual course pre-requisites also apply.</li></ul><h2><a></a>Admission to Graduate Programs</h2><ul><li>Minimum “B” average in the final 2 years of a Bachelor’s degree.</li><li>Meet <strong>one</strong> of the following VIU English language requirements:</li></ul><table><tbody><tr><td><h3><strong>Test</strong></h3></td><td><h3><strong>Minimum score</strong></h3></td></tr><tr><td><h5>TOEFL iBT</h5></td><td><h5><strong>93 </strong>(no band below 20)</h5></td></tr><tr><td><h5>IELTS</h5></td><td><h5><strong>7.0 </strong>(no band below 6.5)</h5></td></tr><tr><td><h5><spanHelvetica Neue\', Helvetica, Arial, sans-serif; font-size: 13px;">VIU\'s&nbsp;</span><aHelvetica Neue\', Helvetica, Arial, sans-serif; font-size: 13px;">English Language Centre</a><spanHelvetica Neue\', Helvetica, Arial, sans-serif; font-size: 13px;">&nbsp;</span></h5></td><td><h5>Successful completion of&nbsp;<span>Graduate Preparation Program (Grad Prep)</span></h5></td></tr><tr><td><h5>CAEL</h5></td><td><h5><strong>70 (no band below 60)</strong></h5></td></tr><tr><td><h5>Pearson (PTE)</h5></td><td><h5><strong>65 </strong>(no section below 60)</h5></td></tr><tr><td><h5>Cambridge Certificate of Proficiency in English (CPE)</h5></td><td><h5><strong>185 overall</strong></h5></td></tr><tr><td><h5>Cambridge Certificate of Advanced English (CAE)</h5></td><td><h5><strong>185 overall</strong></h5></td></tr></tbody></table><p>&nbsp;</p><ul><li>No GMAT or GRE scores required</li><li>For other documents and work experience requirements, please see program specific page.</li></ul><h2>Transfer Credit</h2><p>Students enrolled in a University program may receive transfer credit for up to 50 percent of their program. Students seeking credit for courses from other institutions will need to submit detailed course outlines and a credit transfer request form. Please note, it can take 6-8 weeks after admissions received your documents for credit to appear. '
            'Students are welcome to find transfer credit resources by looking up <a>individual courses</a> or searching the BC Council on Admission and Transfer (BCCAT).</p><h2><a></a>Deferring Admission</h2><p><strong>For students wanting to change their program to a later start date:</strong></p><p>If for any reason you need to change when you start your program at VIU, or you haven’t been successful in obtaining a visa and wish to cancel your enrollment, please contact us right away at&nbsp;<a>[email protected]</a>&nbsp; or <a>[email protected]</a> for students entering Masters programs.&nbsp; Click here for more information on <a>VIU International Tuition Fee Deferral and Refund Policy.</a></p></div>'
        )

        # 17. Programme-specific requirements (HTML kept, attributes stripped)
        try:
            specific_requirement_en = attr_pattern.sub(
                '',
                response.xpath(
                    '//*[@id="admission-requirements"]/div[1]/span/span'
                ).extract()[0])
        except Exception:
            specific_requirement_en = None

        # 24. Minimum language requirement (IELTS / TOEFL)
        try:
            min_language_require = remove_tags(
                '6.5 (no band below 6.0)/88 (no section below 20)')
        except Exception:
            min_language_require = None

        # Store the fields.  Values not scraped above are constants.  The
        # gaokao, huikao, interview and portfolio fields were looked up with
        # an empty XPath in the original, which never yields a node, so they
        # are stored as None directly.
        item["ap"] = 'A grade of 4 or higher on the exam is required for credit consideration. Courses with exam grades of 3 or less will not be eligible for credit or advance standing.'
        item["duration_per"] = 1
        item["duration"] = duration
        item["school_name"] = school_name
        item["location"] = location
        item["campus"] = campus
        item["department"] = department
        item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        item["overview_en"] = overview_en
        item["start_date"] = '2019-09'
        item["modules_en"] = modules_en
        item["career_en"] = career_en
        item["deadline"] = '2019-03-31'
        item["apply_pre"] = 'CAD$'
        item["apply_fee"] = '150'
        item["entry_requirements_en"] = entry_requirements_en
        item["tuition_fee_pre"] = 'CAD$'
        item["require_chinese_en"] = require_chinese_en
        item["ielts_desc"] = '6.5 (no band below 6.0)'
        item["ielts"] = '6.5'
        item["ielts_l"] = 6.0
        item["ielts_s"] = 6.0
        item["ielts_r"] = 6.0
        item["ielts_w"] = 6.0
        item["toefl_code"] = '9581'
        item["toefl_desc"] = '88 (no section below 20)'
        item["toefl_l"] = 20
        item["toefl"] = '88'
        item["toefl_s"] = 20
        item["toefl_r"] = 20
        item["toefl_w"] = 20
        item["interview_desc_en"] = None
        item["portfolio_desc_en"] = None
        item["other"] = ''
        item["url"] = response.url
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0
        item["tuition_fee"] = '15,240.00'
        item["alevel"] = None
        item["ib"] = 'VIU recognizes the International Baccalaureate program. Courses at the Subsidiary Level are roughly equivalent to enriched versions of B.C. Grade 11 courses in academic disciplines. Courses at the Higher Levelare roughly equivalent to enriched B.C. Grade 12.'
        item["gaokao_zs"] = None
        item["gaokao_score_wk"] = None
        item["gaokao_score_lk"] = None
        item["specific_requirement_en"] = specific_requirement_en
        item["huikao_desc"] = None
        item["huikao_zs"] = None
        item["min_language_require"] = min_language_require
        item["sat_code"] = '9581'
        item["sat1_desc"] = None
        item["sat2_desc"] = None
        item["act_code"] = None
        item["act_desc"] = None

        # Major name: taken from the "LEARN MORE" / "SEE MORE" link targets.
        # NOTE(review): the original had `yield item` commented out inside
        # this loop, so each link only overwrites the field and is printed;
        # no item is ever emitted.  Confirm whether that is intentional.
        try:
            major_links = response.xpath(
                '//*[contains(text(),"LEARN MORE")]/../../a/@href|//*[contains(text(),"SEE MORE")]/../../a/@href'
            ).extract()
            for href in major_links:
                item["major_name_en"] = href
                print(item["major_name_en"])
        except Exception:
            item["major_name_en"] = None
            print(None)

Ejemplo n.º 20

Mostrar archivo

    def parse(self, response):
        item = getItem.get_item(ScrapyschoolCanadaBenItem)

        try:
            if 'business/bba' in response.url or 'criminology-and-criminal-justice' in response.url or 'gender-equality-and-social-justice' in response.url:
                major_name_en_list = response.xpath(
                    '//*[@id="tabs-1"]/div/div/div/div/div/div/h3/a|//*[@id="tabs-1"]/div/div/div/div/div/div/h4/a'
                ).extract()
                major_name_en_list = ''.join(major_name_en_list)
                #major_name_en = major_name_en.replace('\r\n','').replace('\n','').replace('           ','').replace('\t','').replace('     ','')
                major_name_en_list = remove_tags(major_name_en_list,
                                                 keep=('h3', 'h4', 'a'))
                major_name_en_list = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '',
                                            major_name_en_list)
                major_name_en_list = major_name_en_list.replace(
                    '</a><a>', ':::').replace('<a>', '').replace('</a>', '')
                major_name_en_list = major_name_en_list.split(':::')
            # major_name_en = re.findall('( .*?[a-z])[A-Z]',major_name_en)
            else:
                major_name_en_list = response.xpath(
                    '//*[@id="block-nu-page-title"]/div/h1/span').extract()
                major_name_en_list = ''.join(major_name_en_list)
                major_name_en_list = remove_tags(major_name_en_list)
                major_name_en_ = []
                major_name_en_.append(major_name_en_list)
                major_name_en_list = major_name_en_
                #print(major_name_en_list)
        except:
            major_name_en = None
        # print(major_name_en)
#1.学校名称
        school_name = 'Nipissing University'

        #2.地点
        try:
            location = None
            location = remove_tags(location)
            #print(location)
        except:
            location = None
            #print(location)

#3. 校区
        try:
            campus_list = response.xpath('').extract()[0]
            campus_list = remove_tags(campus_list)
            campus_list = campus_list.replace(', Online', '')
            campus_list = campus_list.replace(' ', '')
            campus_list = campus_list.split(',')
            #print(campus_list)
        except:
            campus_list = None
            #print(campus_list)

#4. 学院
        try:
            department = response.xpath(
                '//*[@id="block-nu-breadcrumbs"]/div/nav/ol/li[4]/a').extract(
                )[0]
            department = remove_tags(department, keep=("i"))

            #print(len(department))
        # print(department)
        #print(response.url)
        except:
            department = None
            #print(department)

# 4.
        try:
            degree_name = response.xpath(
                '//*[@id="tabs-1"]/div/div/div/div/div/div/h3').extract()[0]
            #degree_name_list = remove_tags(degree_name_list,keep=('li','ul'))
            #degree_name_list = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',degree_name_list)
            #degree_name_list = degree_name_list.replace('\t','').replace('\n','').replace('\xa0','').replace(' class="list-inline uofs-cta-list','')
            # degree_name_list = degree_name_list.replace('<li>','').replace('</li>','---')
            # degree_name_list = degree_name_list.replace('<span>','').replace('</span>','---')
            # degree_name_list = degree_name_list.split('</li><li>')
            degree_name = remove_tags(degree_name)
            #print(degree_name)
            #print(response.url)
        except:

            degree_name = None
        #   print(degree_name)
#
#5.学位描述
        try:
            degree_overview_en = response.xpath(
                '//h2[contains(text(),"Welcome to")]/following-sibling::p'
            ).extract()
            degree_overview_en = ''.join(degree_overview_en)
            #degree_overview_en = remove_tags(degree_overview_en)
            degree_overview_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '',
                                        degree_overview_en)
            #degree_overview_en = degree_overview_en.replace('\r\n','')
            #degree_overview_en = degree_overview_en.replace('\n','')
            #degree_overview_en = degree_overview_en.replace('\n','')
            #degree_overview_en = degree_overview_en.replace('  ',' ')
            #degree_overview_en = degree_overview_en.replace('					','')
            #degree_overview_en = degree_overview_en.replace('			  	','')
            #print(degree_overview_en)
        except:
            degree_overview_en = None
            #print(degree_overview_en)

#6.专业英文

#7.专业介绍
        try:
            #overview_en = degree_overview_en
            overview_en = degree_overview_en
            # print(overview_en)
        except:
            overview_en = degree_overview_en
            # print(overview_en)

#8.入学时间
        try:
            start_date = '2019-01,2019-09'
            # start_date = ','.join(start_date)
            # start_date = remove_tags(start_date)
            # start_date = start_date.replace('Spring','').replace('Winter','').replace('Summer','').replace('Fall','')
            # start_date = start_date.replace('September 2019','2019-09').replace('May 2019','2019-05').replace('July 2019','2019-07').replace('January 2020','2020-01').replace('January 2019','2019-01')
            # #print(start_date)
        except:
            start_date = None
            #print(start_date)

#9.课程长度
# try:
#     duration = response.xpath('').extract()[0]
#     duration = remove_tags(duration)
#     # print(duration)
# except:
#     duration = None
#     # print(duration)

#10.课程设置
        try:
            modules_en = response.xpath(
                '//h3[contains(text(),"Courses")]/following-sibling::*'
            ).extract()
            modules_en = ''.join(modules_en)
            modules_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', modules_en)
            modules_en = modules_en.replace('\r\n', '').replace(
                '\n', ''
            ).replace('\t', '').replace(
                '                                                                                                                         ',
                ''
            ).replace(
                '                                                                       ',
                '')
        #print(modules_en)
        except:
            modules_en = None
        #print(modules_en)

#11.就业方向
        try:
            career_en = response.xpath('//*[@id="tabs-7"]').extract()[0]
            career_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', career_en)
            career_en = career_en.replace('\r\n', '').replace(
                '\n', ''
            ).replace('\t', '').replace(
                '                                                                                                                         ',
                ''
            ).replace(
                '                                                                       ',
                '')
            #print(career_en)
        except:
            career_en = None
        # print(career_en)

#12.截止日期
        try:
            deadline = '2019-03-01'
            # deadline = response.xpath('//*[@id="Admissionrequirementsanddeadlines-subsection-0"]/table/tbody/tr/td[3]').extract()
            # deadline = '---'.join(deadline)
            # deadline = remove_tags(deadline)
            # deadline = deadline.replace('Documents due: ', '')
            # deadline =  deadline.replace('Sep 1, 2018Oct 1, 2018','2018-09-01').replace('Feb 1, 2019Mar 1, 2019','2019-02-01').replace('Mar 1, 2019Apr 1, 2019','2019-03-01').replace('May 1, 2019Jun 1, 2019','2019-05-01').replace('Sep 1, 2019Oct 1, 2019','2019-09-01').replace('Feb 15, 2019Mar 1, 2019','2019-02-15').replace('---',',')
            # #deadline = remove_tags(deadline)
            # #print(deadline)
        except:
            deadline = None
            #print(deadline)
#13.学费
        try:
            tuition_fee = '21,257.50'
            tuition_fee = remove_tags(tuition_fee)
            tuition_fee = tuition_fee.replace('$', '')
            #print(tuition_fee)
        except:
            tuition_fee = None
            #print(tuition_fee)
#14 申请费:
        apply_fee = '90'

        #15 申请要求
        try:
            url = ''
            body = {
                "studenttype": "HS",
                "institution": "INT-BR",
                "program": "en"
            }
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
            }

        except:
            entry_requirements_en = None
            #print(entry_requirements_en)
            #print(abc)

#16 中国学生申请要求
        try:
            require_chinese_en = entry_requirements_en
            #require_chinese_en = remove_tags(require_chinese_en)
            # print(require_chinese_en)
        except:
            require_chinese_en = None
            # print(require_chinese_en)

#17 特殊专业要求
        try:
            specific_requirement_en = None
            # #specific_requirement_en = remove_tags(specific_requirement_en)
            # specific_requirement_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',specific_requirement_en)
            # specific_requirement_en = specific_requirement_en.replace('\r\n','')
            # specific_requirement_en = re.findall('Required high school classes(.*)2.',specific_requirement_en)[0]
            # specific_requirement_en = remove_tags(specific_requirement_en,keep=("li","ul"))
            # #print(specific_requirement_en)
        except:
            specific_requirement_en = None
            #print(specific_requirement_en)

#18 高考(官网要求)
        try:
            gaokao_desc = response.xpath('').extract()[0]
            gaokao_desc = remove_tags(gaokao_desc)
            # print(gaokao_desc)
        except:
            gaokao_desc = None
            # print(gaokao_desc)

#19 高考(展示以及判断字段)
        try:
            gaokao_zs = response.xpath('').extract()[0]
            gaokao_zs = remove_tags(gaokao_zs)
            # print(gaokao_zs)
        except:
            gaokao_zs = None
            # print(gaokao_zs)

#20 高考分数(文科)
        try:
            gaokao_score_wk = response.xpath('').extract()[0]
            gaokao_score_wk = remove_tags(gaokao_score_wk)
            # print(gaokao_score_wk)
        except:
            gaokao_score_wk = None
            # print(gaokao_score_wk)

#21 高考分数(理科)
        try:
            gaokao_score_lk = response.xpath('').extract()[0]
            gaokao_score_lk = remove_tags(gaokao_score_lk)
            # print(gaokao_score_lk)
        except:
            gaokao_score_lk = None
            # print(gaokao_score_lk)

#22 会考描述
        try:
            huikao_desc = response.xpath('').extract()[0]
            huikao_desc = remove_tags(huikao_desc)
            # print(huikao_desc)
        except:
            huikao_desc = None
            # print(huikao_desc)

#23 会考描述
        try:
            huikao_zs = response.xpath('').extract()[0]
            huikao_zs = remove_tags(huikao_zs)
            # print(huikao_zs)
        except:
            huikao_zs = None
            # print(huikao_zs)

#24 最低语言要求
        try:
            min_language_require = 'Overall Band Score: 6.5 With minimum individual scores of Reading: 6 Listening: 6 Speaking: 6 Writing: 6 or Minimum Score: 86 (no component score less than 19)'
            min_language_require = remove_tags(min_language_require)
            # print(min_language_require)
        except:
            min_language_require = None
            # print(min_language_require)

#25 雅思要求
        try:
            ielts_desc = 'Overall Band Score: 6.5 With minimum individual scores of Reading: 6 Listening: 6 Speaking: 6 Writing: 6'
            #ielts_desc = remove_tags(ielts_desc)
            # print(ielts_desc)
        except:
            ielts_desc = None
            # print(ielts_desc)

#26 ielts
        try:
            ielts = '6.5'
            #ielts = re.findall('\d\.\d',ielts)
            #ielts = remove_tags(ielts)
            #print(ielts)
        except:
            ielts = None
            #print(ielts)
#27 ielts_?

        ielts_l = 6.0
        ielts_s = 6.0
        ielts_r = 6.0
        ielts_w = 6.0

        #28 toefl_code
        try:
            toefl_code = '0847'
            #toefl_code = remove_tags(toefl_code)
            # print(toefl_code)
        except:
            toefl_code = None
            # print(toefl_code)

#29 toefl_desc
        try:
            toefl_desc = ''
            #toefl_desc = remove_tags(toefl_desc)
            # print(toefl_desc)
        except:
            toefl_desc = None
            # print(toefl_desc)

#30 toefl
        try:
            toefl = '80'
            #toefl = re.findall('\d\d',toefl)
            #toefl = remove_tags(toefl)
            #print(toefl)
        except:
            toefl = None
        # print(toefl)

#31 toefl_?
        toefl_l = 19
        toefl_s = 19
        toefl_r = 19
        toefl_w = 19

        # 32 alevel
        try:
            alevel = 'Possess the (International) General Certificate of Secondary Education with: Passes in at least five subjects: Two of which must be at the Advanced Level (G.C.E.) Two subjects at the Advanced Supplementary (A.S.) Level may be substituted for one subject at the Advanced Level.  For example, 4 Advanced Supplementary (A.S.) Level courses equal two A Level Courses.  The remaining three passes may be at the Ordinary Level (G.C.S.E.) Acceptable standing must be achieved in all subjects Applicants may apply for admission in the year they will be sitting for their final A-Level examinations provided they can present excellent grades in their O-Level examinations and strong predicted A-Level results. With the exception of the Faculty of Engineering, for all other programs that require "Mathematics" as a prerequisite, AS-Level Mathematics is required. Applicants presenting A-Level examinations with a minimum grade of "C" may be considered for advanced standing. In addition to the above, applicants interested in the four year Bachelor of Engineering degree program must complete the following prerequisite courses: A-Level Mathematics A-Level Physics  A-Level Chemistry is preferred; however, AS-level Chemistry will be accepted  O-Level English '
            #alevel = remove_tags(alevel)
            # print(alevel)
        except:
            alevel = None
            # print(alevel)

#33 ib
        try:
            ib = 'Applicants who have successfully completed the International Baccalaureate (IB) with at least six subjects, including three at the higher level, with a minimum final grade total of 24 (not including bonus points) will be considered for admission. ,Advanced standing, to a maximum of 30 credits, may be granted for courses completed at the higher level with a grade of 5 or higher. Applicants need to present courses in specific subject areas as outlined on the Admissions Chart.'
        except:
            ib = None
            #print(ib)

#34 ap
        try:
            ap = 'Applicants who have completed Advanced Placement (AP) courses are encouraged to submit their examination results. Official AP score reports must be sent directly to Nipissing University. Advanced standing will be granted for most AP courses completed with a grade of 4 or higher, to a maximum of 18 credits. Nipissing\'s DI code is 4149.'
            ap = remove_tags(ap)
            # print(ap)
        except:
            ap = None
            # print(ap)

#35 面试描述
        try:
            interview_desc_en = response.xpath('').extract()[0]
            interview_desc_en = remove_tags(interview_desc_en)
            # print(interview_desc_en
        except:
            interview_desc_en = None
            # print(interview_desc_en)

#36 作品集描述
        try:
            portfolio_desc_en = response.xpath('').extract()[0]
            portfolio_desc_en = remove_tags(portfolio_desc_en)
            # print(portfolio_desc_en
        except:
            portfolio_desc_en = None
            # print(portfolio_desc_en)

#37 other
        try:
            other = ''
            #other = remove_tags(other)
            # print(other)
        except:
            other = None
            # print(other)

        # sat act 代码 介绍
        sat_code = '0847'
        sat1_desc = 'The SAT with the following scores: Reading* = five hundred and fifty (550) Math = five hundred and fifty (550) Total score = one thousand and one hundred (1100)'
        sat2_desc = None
        act_code = None
        act_desc = 'The Enhanced Composite ACT with a Total Score of twenty-four (24)'

        item["ap"] = ap
        item["duration_per"] = 1
        #item["duration"] = duration
        item["school_name"] = school_name
        item["location"] = location
        #item["campus"] = campus
        #item["degree_type"] = 1
        item["department"] = department
        item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        item["overview_en"] = overview_en
        #item["teach_time"] = 1
        item["start_date"] = start_date
        item["modules_en"] = modules_en
        item["career_en"] = career_en
        item["deadline"] = deadline
        item["apply_pre"] = 'CAD$'
        item["apply_fee"] = apply_fee
        item["entry_requirements_en"] = entry_requirements_en
        item["tuition_fee_pre"] = 'CAD$'
        item["require_chinese_en"] = require_chinese_en
        item["ielts_desc"] = ielts_desc
        item["ielts"] = ielts
        item["ielts_l"] = ielts_l
        item["ielts_s"] = ielts_s
        item["ielts_r"] = ielts_r
        item["ielts_w"] = ielts_w
        item["toefl_code"] = toefl_code
        item["toefl_desc"] = toefl_desc
        item["toefl_l"] = toefl_l
        item["toefl"] = toefl
        item["toefl_s"] = toefl_s
        item["toefl_r"] = toefl_r
        item["toefl_w"] = toefl_w
        item["interview_desc_en"] = interview_desc_en
        item["portfolio_desc_en"] = portfolio_desc_en
        item["other"] = other
        item["url"] = response.url
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0
        #item["duration"] = duration
        item["tuition_fee"] = tuition_fee
        item["alevel"] = alevel
        item["ib"] = ib
        item["gaokao_zs"] = gaokao_zs
        item["gaokao_score_wk"] = gaokao_score_wk
        item["gaokao_score_lk"] = gaokao_score_lk
        item["specific_requirement_en"] = specific_requirement_en
        item["huikao_desc"] = huikao_desc
        item["huikao_zs"] = huikao_zs
        item["min_language_require"] = min_language_require
        item["sat_code"] = sat_code
        item["sat1_desc"] = sat1_desc
        item["sat2_desc"] = sat2_desc
        item["act_code"] = act_code
        item["act_desc"] = act_desc
        for zhuanye in major_name_en_list:
            item["major_name_en"] = zhuanye
            # print(zhuanye)
            yield item

Ejemplo n.º 21

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)
        item['school_name'] = "Laurentian University"
        item['campus'] = 'Sudbury campus'
        item['url'] = response.url
        print("===========================")
        print(response.url)

        try:
            major_name_en = response.xpath(
                "//div[@class='header-image-text']/h1//text()").extract()
            clear_space(major_name_en)
            item['major_name_en'] = ''.join(major_name_en).strip()
            print("item['major_name_en']: ", item['major_name_en'])

            if 'online' not in item['major_name_en']:
                item['department'] = response.meta.get(item['major_name_en'])
                # print("item['department']: ", item['department'])

                overview = response.xpath(
                    "//section[@id='prog_overview']").extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # if item['overview_en'] == "":
                #     print("***overview_en 为空")
                # print("item['overview_en']: ", item['overview_en'])

                career = response.xpath(
                    "//h1[contains(text(),'Career Opportunities')]/following-sibling::*"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # if item['career_en'] == "":
                #     print("***career_en 为空")
                # print("item['career_en']: ", item['career_en'])

                # https://laurentian.ca/international/undergraduate-admissions
                item['deadline'] = '2019-02-01'

                # https://laurentian.ca/fees-financing/undergraduate-tuition
                item['tuition_fee_pre'] = '$'
                if 'engineer' in item['major_name_en'].lower():
                    item['tuition_fee'] = 30627
                elif 'architectur' in item['major_name_en'].lower():
                    item['tuition_fee'] = 31027
                else:
                    item['tuition_fee'] = 24104
                # print("item['tuition_fee']: ", item['tuition_fee'])

                # https://laurentian.ca/international/undergraduate-admissions
                item[
                    'require_chinese_en'] = '<p>Senior School Leaving Certificate;  Senior Upper Middle School.  Huikao (Senior School Graduation Exam) is not required.</p>'

                if item['department'] == "Faculty of Arts":
                    item['specific_requirement_en'] = 'Senior level English'
                elif item[
                        'department'] == "Faculty of Science, Engineering and Architecture":
                    if 'architectur' in item['major_name_en'].lower():
                        item[
                            'specific_requirement_en'] = 'Senior level English and two senior level math'
                    elif 'Computer Science' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English and two math'
                    elif 'engineering' in item['major_name_en'].lower():
                        item[
                            'specific_requirement_en'] = 'Senior level English, math, physics and chemistry'
                    elif 'Forensic Science' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English, math, biology and chemistry'
                    elif 'Radiation Therapy' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = '''<ul><li>Senior level English, math, physics, chemistry and biology</li><li>Select group of candidates will be invited to interview</li></ul>'''
                    elif 'Sciences' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = '<ul><li>Senior level English</li><li>one senior level math & two senior level sciences OR two senior level math & on senior level science</li></ul>'
                elif item['department'] == "Faculty of Management":
                    if 'Business Administration' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English and one senior level math'
                    elif 'Sports Administration' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English and two senior level math'
                elif item['department'] == "Faculty of Health":
                    if 'Nursing' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English, math, chemistry and biology'
                    elif 'Midwifery' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English, Biology OR Chemistry and one social science'
                    elif 'Gerontology' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English'
                    elif 'Social Work' in item[
                            'major_name_en'] or 'Indigenous Social Work' in item[
                                'major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English'
                    elif 'Sport and Physical Education' in item[
                            'major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English and biology'
                    elif 'Kinesiology' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English, math and chemistry'
                    elif 'Outdoor Adventure Leadership' in item[
                            'major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English and biology'
                    elif 'Health Promotion' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English, chemistry and biology'
                    elif 'Sport Psychology' in item['major_name_en']:
                        item[
                            'specific_requirement_en'] = 'Senior level English'

                item[
                    'ielts_desc'] = 'International English Language Testing System (IELTS - Academic)(6.5 minimum overall for direct academic entry with no band score lower than 6)'
                item['ielts'] = '6.5'
                item['ielts_l'] = '6.0'
                item['ielts_s'] = '6.0'
                item['ielts_r'] = '6.0'
                item['ielts_w'] = '6.0'
                item[
                    'toefl_desc'] = 'Test of English as a Foreign Language (TOEFL) (minimum score: 230 computer-based, 88 Internet-based)'
                item['toefl'] = '88'

                item['start_date'] = '9月'
                item[
                    'alevel'] = '5 GCE/GCSE/IGCSE subjects with at least 2 at A-Level.  4 AS-Level subjects with 1 GCSE/IGCSE/O-Level subject will be considered provided the AS-levels do not duplicate subject matter at the GCSE/IGCSE or O Level. (Minimum grade of C or better required)'
                item['sat1_desc'] = '1650'
                item['act_desc'] = '24'
                item['toefl_code'] = item['sat_code'] = item['act_code'] = None
                item['apply_pre'] = 'CAD$'
                item['apply_fee'] = '150'
                # //a[@class='btn btn-default full'][contains(text(),'Program Details')]/@href

                item[
                    'other'] = '''问题清单：1.duration页面展示不一致，有些出现在一段话中导致抓取缺少,需要详细核对
                2.学位名称下面有些存在划分的专业，需要找出特殊的那几条然后拆分，不一定全部能拆完全以及准确
                3.因为正则匹配问题，学位名可能不全或者不准确
                4.专业描述和课程设置、就业为空的是详情页没有的
                '''
                detail_url = response.xpath(
                    "//a[@class='btn btn-default full'][contains(text(),'Program Details')]/@href"
                ).extract()
                if len(detail_url) > 0:
                    print(''.join(detail_url))
                    detail_dict = self.parse_detail_ziduan(
                        ''.join(detail_url), item['major_name_en'])
                    item['entry_requirements_en'] = detail_dict[
                        'entrey_requirements_en']
                    item['ap'] = detail_dict['ap']
                    item['ib'] = detail_dict['ib']
                    item['modules_en'] = detail_dict['modules_en']

                    degree_name_list = detail_dict['degree_name_list']
                    major_name_en_list = detail_dict['major_name_en_list']
                    duration_text = detail_dict['duration_text']
                    item['other'] = item['other'] + '\n'.join(
                        major_name_en_list)
                    print("degree_name_list: ", degree_name_list)
                    print("major_name_en_list: ", major_name_en_list)
                    print("duration_text: ", duration_text)
                    if len(degree_name_list) == 0:
                        yield item
                    else:
                        # enducation 专业特殊需要处理
                        if item['major_name_en'] == 'Education':
                            degree_name_list = ''.join(degree_name_list).split(
                                ';')
                        # print("degree_name_list: ", degree_name_list)
                        # if len(major_name_en_list) > 1:
                        #     if "<h4>Fourth year</h4>" in item['modules_en']:
                        #         item['duration'] = '4'
                        #         item['duration_per'] = 1
                        #     if item['duration'] is None and "<h4>Third year</h4>" in item['modules_en']:
                        #         item['duration'] = '3'
                        #         item['duration_per'] = 1
                        #     if item['duration'] is None and "<h4>Second year</h4>" in item['modules_en']:
                        #         item['duration'] = '2'
                        #         item['duration_per'] = 1
                        #     if item['duration'] is None and "<h4>First year</h4>" in item['modules_en']:
                        #         item['duration'] = '1'
                        #         item['duration_per'] = 1
                        #     item['degree_name'] = degree_name_list[-1].replace('(4 Year)', '').replace('in ' + ''.join(item['major_name_en']), '').strip()
                        #     degree_name_list = degree_name_list[:-1]
                        #     for major_name in major_name_en_list:
                        #         item['major_name_en'] = major_name.replace('Specialization in', '').replace('Option in', '').replace('(27 credits)', '').replace('(24 credits)', '').strip()
                        #         print("item['major_name_en']==:", item['major_name_en'])
                        #         yield item
                        # print("degree_name_list2==: ", degree_name_list)
                        # if len(degree_name_list) > 0:
                        for degree_name in degree_name_list:
                            print("**************", item['major_name_en'])
                            # print(degree_name)
                            duration_re = re.findall(
                                r"\(\d\syear.*?\)|\(\d-year.*?\)", degree_name,
                                re.I)
                            print('duration_re: ', duration_re)
                            if len(duration_re) > 0:
                                duration_re1 = re.findall(
                                    r"\d", ''.join(duration_re))
                                item['duration'] = ''.join(
                                    duration_re1[0]).strip()
                                item['duration_per'] = 1
                            if item['duration'] is None and "<h4>Fourth year</h4>" in item[
                                    'modules_en']:
                                item['duration'] = '4'
                                item['duration_per'] = 1
                            if item['duration'] is None and "<h4>Third year</h4>" in item[
                                    'modules_en']:
                                item['duration'] = '3'
                                item['duration_per'] = 1
                            if item['duration'] is None and "<h4>Second year</h4>" in item[
                                    'modules_en']:
                                item['duration'] = '2'
                                item['duration_per'] = 1
                            if item['duration'] is None and "<h4>First year</h4>" in item[
                                    'modules_en']:
                                item['duration'] = '1'
                                item['duration_per'] = 1
                            print("item['duration']: ", item['duration'])
                            print("item['duration_per']: ",
                                  item['duration_per'])

                            # 匹配无用的括号里面的东西
                            unuse_re = re.findall(r"\([^B][\w\W]*\)",
                                                  degree_name)
                            item['degree_name'] = degree_name.replace(
                                '  ',
                                ' ').replace(''.join(duration_re), '').replace(
                                    'Concentration -',
                                    '').replace('- 120 credits', '').replace(
                                        ''.join(unuse_re), '').replace(
                                            ' in ' + item['major_name_en'], ''
                                        ).replace(
                                            'in Women’s and Gender Studies',
                                            '').strip()
                            # 学位缩写
                            if item['major_name_en'] == 'Forensic Science':
                                item[
                                    'degree_name'] = 'Bachelor of Forensic Science'
                            elif '(B' in degree_name:
                                degree_name_re = re.findall(
                                    r"\(B[\w\W]*?\)", degree_name)
                                if 'Honours' in degree_name:
                                    item['degree_name'] = 'Honours' + ''.join(
                                        degree_name_re).replace(
                                            '(', '').replace(')', '').strip()
                                else:
                                    item['degree_name'] = ''.join(
                                        degree_name_re).replace(
                                            '(', '').replace(')', '').strip()
                                item['major_name_en'] = degree_name.replace(
                                    'Honours', '').replace(
                                        ''.join(degree_name_re), '').replace(
                                            'Specialization in',
                                            '').replace(''.join(unuse_re),
                                                        '').strip()
                            # BBA 的特殊情况
                            if 'BBA IN' in item['degree_name']:
                                item['major_name_en'] = item[
                                    'degree_name'].replace('BBA IN',
                                                           '').strip().title()
                                item[
                                    'degree_name'] = "BACHELOR OF BUSINESS ADMINISTRATION".title(
                                    )
                            # print("item['degree_name']: ", item['degree_name'])
                            item['degree_name'] = item['degree_name'].replace(
                                ',', ' or ')
                            item['degree_name'] = item['degree_name'].replace(
                                '  ',
                                ' ').replace(''.join(duration_re), '').replace(
                                    'Concentration -',
                                    '').replace('- 120 credits', '').replace(
                                        ''.join(unuse_re), '').replace(
                                            ' in ' + item['major_name_en'], ''
                                        ).replace(
                                            'in Women’s and Gender Studies',
                                            '').replace('Major -', '').replace(
                                                'Specialization  -',
                                                '').strip()
                            print("item['degree_name']: ", item['degree_name'])
                            # 如果含有分专业

                            # 含有 or的学位需要拆分成两条
                            if ' or ' in item['degree_name']:
                                degree_name_list_again = item[
                                    'degree_name'].split(' or ')
                                print('degree_name_list_again：',
                                      degree_name_list_again)
                                for deg in degree_name_list_again:
                                    if deg == 'Science':
                                        item[
                                            'degree_name'] = 'Bachelor of ' + deg.strip(
                                            ).strip('or').strip()
                                    else:
                                        item['degree_name'] = deg.strip(
                                        ).strip('or').strip()
                                    if len(major_name_en_list
                                           ) > 1 and '(B' not in ''.join(
                                               major_name_en_list
                                           ) and item['duration'] == '4':
                                        # item['degree_name'] = degree_name_list[-1]
                                        # degree_name_list = degree_name_list[:-1]
                                        for major_name in major_name_en_list:
                                            item[
                                                'major_name_en'] = major_name.replace(
                                                    'Specialization in',
                                                    '').replace(
                                                        'Option in',
                                                        '').replace(
                                                            '(27 credits)',
                                                            '').replace(
                                                                '(24 credits)',
                                                                '').strip()
                                            print("item['major_name_en']==:",
                                                  item['major_name_en'])
                                            yield item
                                    else:
                                        yield item
                            elif 'B.Eng.' in item['degree_name']:
                                item['degree_name'] = 'B.Eng.'
                                yield item
                            else:
                                if len(major_name_en_list
                                       ) > 1 and '(B' not in ''.join(
                                           major_name_en_list
                                       ) and item['duration'] == '4':
                                    # item['degree_name'] = degree_name_list[-1]
                                    # degree_name_list = degree_name_list[:-1]
                                    for major_name in major_name_en_list:
                                        item[
                                            'major_name_en'] = major_name.replace(
                                                'Specialization in',
                                                '').replace(
                                                    'Option in', '').replace(
                                                        '(27 credits)',
                                                        '').replace(
                                                            '(24 credits)',
                                                            '').strip()
                                        print("item['major_name_en']==:",
                                              item['major_name_en'])
                                        yield item
                                else:
                                    yield item
                else:
                    yield item

        except Exception as e:
            with open("scrapySchool_Canada_Ben/error/" + item['school_name'] +
                      ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 22

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)
        item['school_name'] = "McMaster University"
        item['url'] = response.url
        print("===========================")
        print(response.url)
        item['other'] = '''问题描述：1.学院和学费是根据先前的表格匹配获得的，不是所有的专业都有，能够匹配上
        2.没有entry_requirements
        3.课程长度是根据先前的表格赋值的
        4.专业描述和课程设置、就业为空的是详情页没有的'''
        '''公共字段'''
        item['campus'] = 'Hamilton'
        item['location'] = 'Hamilton'
        item['sat_code'] = item['toefl_code'] = '0936'
        item['act_code'] = '5326'
        item['start_date'] = '9月'
        item['duration'] = '4'
        item['duration_per'] = 1
        item['apply_pre'] = 'CAD$'
        item['apply_fee'] = '100'
        item['tuition_fee_pre'] = 'CAD$'

        # https://future.mcmaster.ca/admission/language-2/
        item[
            'ielts_desc'] = '6.5 Overall with a minimum of 6.0 in each of the four components (Reading, Writing, Speaking, Listening); results valid for 2 years'
        item['ielts'] = '6.5'
        item['ielts_l'] = '6.0'
        item['ielts_s'] = '6.0'
        item['ielts_r'] = '6.0'
        item['ielts_w'] = '6.0'
        item[
            'toefl_desc'] = 'IBT: 86 Overall with a minimum score of 20 in each of the four components (Reading, Writing, Speaking, Listening); valid for 2 years'
        item['toefl'] = '86'
        item['toefl_l'] = '20'
        item['toefl_s'] = '20'
        item['toefl_r'] = '20'
        item['toefl_w'] = '20'

        # https://future.mcmaster.ca/admission/requirements/
        item[
            'require_chinese_en'] = '''<p>Senior High School (Upper Middle School) Graduation Diploma and Academic Proficiency test/Huikao and Gaokao.</p>
<p>NOTE: Applicants who do not present Gaokao must provide a letter stating the reason(s) for not sitting Gaokao. We require transcripts for the last three years of Upper Middle School, results of standardized tests (eg. SAT, ACT, IB, AP, GCE) and school profile.</p>
<p>NOTE: Applicants presenting a combination of Chinese curriculum and British Pattered A Levels must present three different A(2) Level subjects required for the program of application.</p>'''
        try:
            major_name = response.xpath(
                "//h1[@class='banner-title banner-title-lg banner-title-line banner-text-rev']//text()"
            ).extract()
            item['major_name_en'] = ''.join(major_name).replace(
                "(BHSc)", "").replace("(Bachelor Fine Arts)", "").strip()
            print("item['major_name_en']: ", item['major_name_en'])

            department_res = response.xpath(
                "//h2[contains(text(),'Web Links')]/following-sibling::ul//a[contains(text(),'Faculty of')]//text()|"
                "//h2[contains(text(),'Web Links')]/following-sibling::ul//a[contains(text(),'DeGroote School of Medicine')]//text()"
            ).extract()
            print("department_res: ", department_res)
            if len(department_res) > 0:
                item['department'] = department_res[0].replace(
                    "Book a", "").replace("Tour", "").strip()
            department_list = [
                "DeGroote School of Business",
                "DeGroote School of Business",
                "Faculty of Engineering",
                "Faculty of Engineering",
                "Faculty of Engineering",
                "Faculty of Engineering",
                "Faculty of Engineering",
                "Faculty of Engineering",
                "Faculty of Engineering",
                "Faculty of Engineering",
                "Faculty of Engineering",
                "Faculty of Health Sciences",
                "Faculty of Health Sciences",
                "Faculty of Humanities",
                "Faculty of Humanities",
                "Faculty of Humanities",
                "Faculty of Science",
                "Faculty of Science",
                "Faculty of Science",
                "Faculty of Science",
                "Faculty of Science",
                "Faculty of Science",
                "Faculty of Social Sciences",
            ]
            major_name_list = [
                "Business",
                "Integrated Business & Humanities",
                "Engineering",
                "Engineerin",
                "Integrated Biomedical Engineering & Health Sciences",
                "Integrated Biomedical Engineering & Health Sciences",
                "Computer Science",
                "Computer Science-Co op",
                "Automotive & Vehicle Engineering Technology",
                "Bachelor of Technology",
                "Automation Engineering Technology",
                "Health Sciences",
                "Nursing",
                "Humanities",
                "Music",
                "Studio Art",
                "Chemical and Physical Sciences",
                "Environmental & Earth Sciences Gateway",
                "Integrated Science",
                "Kinesiology",
                "Life Sciences Gateway",
                "Mathematics & Statistics Gateway",
                "Social Sciences",
            ]
            department_dict = {}
            for d in range(len(department_list)):
                department_dict[major_name_list[d]] = department_list[d]
            if item['department'] is None:
                item['department'] = department_dict.get(item['major_name_en'])
            print("item['department']: ", item['department'])

            tuition_fee_list = [
                "34119",
                "34119",
                "40008",
                "40008",
                "40008",
                "40008",
                "30691",
                "30691",
                "31373",
                "31373",
                "31373",
                "28484",
                "31433",
                "27151",
                "27151",
                "27151",
                "28505",
                "28505",
                "24902",
                "28505",
                "28505",
                "28505",
                "27156",
                "28000",
            ]
            major_name_tuition_fee_list = [
                "Business",
                "Integrated Business & Humanities",
                "Engineering",
                "Engineerin",
                "Integrated Biomedical Engineering & Health Sciences",
                "Integrated Biomedical Engineering & Health Sciences",
                "Computer Science",
                "Computer Science-Co op",
                "Automotive & Vehicle Engineering Technology",
                "Bachelor of Technology",
                "Automation Engineering Technology",
                "Health Sciences",
                "Nursing",
                "Humanities",
                "Music",
                "Studio Art",
                "Chemical and Physical Sciences",
                "Environmental & Earth Sciences Gateway",
                "Integrated Science",
                "Kinesiology",
                "Life Sciences Gateway",
                "Mathematics & Statistics Gateway",
                "Social Sciences",
                "Arts & Science",
            ]
            tuition_fee_dict = {}
            for d in range(len(tuition_fee_list)):
                tuition_fee_dict[
                    major_name_tuition_fee_list[d]] = tuition_fee_list[d]
            item['tuition_fee'] = tuition_fee_dict.get(item['major_name_en'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            overview_en = response.xpath(
                "//div[@class='entry-content']//div//*[contains(text(),'Why ')]/preceding-sibling::*"
            ).extract()
            if len(overview_en) == 0:
                overview_en = response.xpath(
                    "//h2[contains(text(),'Overview')]/following-sibling::div[1]/p"
                ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            if item['overview_en'] == "":
                item['overview_en'] = None
                print("overview_en 为空")
            # print("item['overview_en']: ", item['overview_en'])

            # modules_en    //h5[@class='mb-0']//button[contains(@class,'btn btn-link')]
            modules_en = response.xpath(
                "//div[@id='first-year-courses-content']//h5[@class='mb-0']//button[contains(@class,'btn btn-link')]"
            ).extract()
            modules_en = ''.join(modules_en).replace("<button", '<p').replace(
                "</button>", '</p>')
            item['modules_en'] = remove_class(clear_lianxu_space([modules_en]))
            if item['modules_en'] == "":
                item['modules_en'] = None
                print("modules_en 为空")
            # print("item['modules_en']: ", item['modules_en'])

            # career_en     //h3[contains(text(),'Careers or Options Beyond This Program')]/following-sibling::*[1]
            career_en = response.xpath(
                "//h3[contains(text(),'Careers or Options Beyond This Program')]/following-sibling::*[1]|"
                "//h3[contains(text(),'Career or Options Beyond This Program')]/following-sibling::*[1]|"
                "//h3[contains(text(),'Careers/Options Beyond This Program')]/following-sibling::*[1]|"
                "//h3[contains(text(),'Careers/Opportunities Beyond This Program')]/following-sibling::*[1]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            if item['career_en'] == "":
                item['career_en'] = None
                print("career_en 为空")
            # print("item['career_en']: ", item['career_en'])

            # 以下值不一样
            # "student-program-preference": "450",
            post_dict = {
                "student-status": "attending-international",
                "student-province-territory": "alberta",
                "student-filterby": "filterby-curriculum",
                "student-background": "cegep",
                "student-type-as": "american_style_curriculum",
                "student-type-gce": "general_certificate_education",
                "student-type-ib": "international_baccalaureate",
                "student-type--ap": "advanced_placement",
                "action": "admission_form_process",
            }
            key_value = response.xpath(
                '//input[@name="student-program-preference"]/@value').extract(
                )
            key_value = ''.join(key_value).strip()
            # print(key_value)
            post_dict['student-program-preference'] = key_value
            '''post 请求详细信息'''
            detail_dict = self.parse_detail_data(post_dict)

            item['deadline'] = detail_dict['deadline']
            # print("item['deadline']: ", item['deadline'])

            item['act_desc'] = detail_dict['act_desc'].replace(
                "Submit an ACT composite score of at least", "").strip()
            # print("item['act_desc']: ", item['act_desc'])

            item['sat1_desc'] = detail_dict['sat1_desc']
            print("item['sat1_desc']: ", item['sat1_desc'])

            item['alevel'] = detail_dict['alevel']
            # print("item['alevel']: ", item['alevel'])

            item['ib'] = detail_dict['IB']
            # print("item['ib']: ", item['ib'])

            item['ap'] = detail_dict['ap']
            # print("item['ap']: ", item['ap'])

            degree_name = detail_dict['degree_name']
            print("degree_name: ", degree_name)
            if "," in degree_name:
                degree_name_list = degree_name.split(',')
                for d in degree_name_list:
                    item['degree_name'] = d.strip()
                    print("item['degree_name']: ", item['degree_name'])
                    yield item
            else:
                item['degree_name'] = degree_name
                print("item['degree_name']: ", item['degree_name'])
                # Bachelor of Technology的特殊情况
                if response.url == "https://future.mcmaster.ca/programs/btech/":
                    item['degree_name'] = item[
                        'major_name_en'] = "Bachelor of Technology"
                    # spe_major_name_list = ["Automotive and Vehicle Engineering Technology", "Biotechnology", "Automation Engineering Technology"]
                    spe_career_en_list = [
                        "<p>Graduates could be involved in the automotive industry with research and technology applications related to:</p> <ul> <li>development of new automotive products and revision of existing ones</li> <li>collaboration in research and development</li> <li>production planning and designing new production processes</li> <li>conducting and developing test procedures</li> <li>automotive product design, manufacturing and quality improvement</li> </ul> <p>Some careers our recent grads are pursuing include:</p> <ul> <li>Design Engineer (Honda)</li> <li>Research Engineer (Ford)</li> <li>Management Associate (US Steel)</li> <li>M.Eng Design, McMaster University</li> <li>MASc, Mechanical Engineering, McMaster University</li> </ul>",
                        "<p>Graduates will qualify for positions in government, university and industry. They will also strengthen the competitiveness of businesses in biotechnology with research and technology applications related to:</p> <ul> <li>biotechnology</li> <li>genetic engineering</li> <li>pharmaceuticals</li> <li>food production</li> <li>analytical and testing services</li> <li>policies and regulations</li> </ul> <p>Some careers our recent grads are pursuing include:</p> <ul> <li>Chemist, Esteè Lauder</li> <li>Production Supervisor, (Bungee)</li> <li>M.Sc. Ontario Institute for Cancer Research, UOIT</li> <li>Master of Biotechnology, University of Toronto</li> <li>PhD in Biomedical Engineering, McMaster University</li> </ul>",
                        "<p>Graduates can work for companies in various industrial processing and manufacturing sectors related to:</p> <ul> <li>primary steel</li> <li>chemicals</li> <li>petrochemicals</li> <li>pharmaceuticals</li> <li>power generation</li> </ul> <p>Some careers our recent grads are pursuing include:</p> <ul> <li>Process Controls Specialist, GE Water and Process Technologies</li> <li>Senior Automation Analyst, Arcelor-Mittal Dofasco</li> <li>M.A.Sc. Electrical &amp; Computer Engineering, McMaster University</li> <li>M.Eng Manufacturing, McMaster University</li> </ul>"
                    ]
                    # for i in range(len(spe_major_name_list)):
                    #     item['major_name_en'] = spe_major_name_list[i]
                    item['career_en'] = ''.join(spe_career_en_list)
                    yield item
                else:
                    if item['degree_name'] == "":
                        item['degree_name'] = None
                    yield item

        except Exception as e:
            with open("scrapySchool_Canada_Ben/error/" + item['school_name'] +
                      ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 23

Mostrar archivo

Archivo: ConcordiaUniversity_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)
        item['school_name'] = "Concordia University"
        # item['campus'] = 'Antigonish'
        # item['location'] = 'Antigonish'
        item['url'] = response.url
        print("===========================")
        print(response.url)
        item['other'] = '''1.Concordia申请专业没有校区，根据选课的课程所在的校区上课
        2.开学时间空着是详情页上没有的'''
        try:
            # degree_type
            department = response.xpath(
                "//div[@class='span8 ordinal-group-1']/a[contains(text(), 'Faculty of')]//text()|"
                "//div[@class='span8 ordinal-group-1']/a[contains(text(), 'Gina Cody School of Engineering and Computer Science')]//text()|"
                "//div[@class='span8 ordinal-group-1']/a[contains(text(), 'John Molson School of Business')]//text()"
            ).extract()
            print(department)
            # if len(department) > 0:
            #     item['department'] = department[0]
            item['department'] = ', '.join(department)
            print("item['department']: ", item['department'])

            # 专业
            programme = response.xpath(
                "//div[@class='span8 ordinal-group-1']/h1//text()").extract()
            programme = ''.join(programme)
            degree_type = re.findall(r"\([\w\s,/]+\)", programme)
            degree_type = ''.join(degree_type)
            # print("degree_type: ", degree_type)
            degree_type_str = degree_type.strip().strip("(").strip(")")
            # print("degree_type_str: ", degree_type_str)

            programme = programme.strip()
            if degree_type != "":
                programme = programme.split(degree_type)
            item['major_name_en'] = ''.join(programme).replace(
                "(BA)", "").replace("(BFA)",
                                    "").replace("(BA. Cert)",
                                                "").replace(" (BEd)",
                                                            "").strip()
            print("item['major_name_en']: ", item['major_name_en'])

            if "Minor" not in degree_type_str:
                # AP
                # apDict = {'Art History': 'ARTH 200 (6)* or GFAR (6)', 'Biology': 'BIOL 201 (3) and BIOL 1st year level (3)',
                #           'Calculus AB\xa0': 'MATH 203 (3), with exemption from MATH 201, 206 and 209*',
                #           'Calculus BC': 'MATH 203 (3) and MATH 205 (3), with exemption from MATH 201, 206 and 209*',
                #           'Chemistry': 'CHEM 205 (3) and CHEM 206 (3)', 'Chinese': 'MCHI 1st year level (6)',
                #           'Computer Science A': 'COMP 248 (3)', 'Economics: Macroeconomics': 'ECON 203 (3)',
                #           'Economics: Microeconomics': 'ECON 201 (3)',
                #           'English Language and Composition': 'ENGL 1st year level (6)\xa0',
                #           'English Literature and Composition': 'ENGL 1st year level (6)\xa0',
                #           'Environmental Science': 'GEOG 1st year level (3)', 'French Language': 'FRAN 211 (6)\xa0',
                #           'French Literature': 'FRAN 1st year level (6)',
                #           'German Language and Culture': 'GERM 200 (6) with exemptions for GERM 201 and GERM 202',
                #           'Government and Politics: Comparative': 'POLI 203 (3)',
                #           'Government and Politics: United States': 'POLI 1st year level (3), with an exemption for POLI\xa0310',
                #           'History: European': 'HIST 1st year level (6)',
                #           'History: United States': 'HIST 251 (3) and HIST 253 (3)',
                #           'Human Geography': 'GEOG 1st year level (3)',
                #           'Italian Language and Culture': 'ITAL 200 (6) with exemptions for ITAL 201 and ITAL 202',
                #           'Japanese': 'MODL 1st year level (6)', 'Latin': 'CLAS 1st year level (6)',
                #           'Music Theory': 'MUSI A (3)\xa0', 'Physics 1': 'No transfer credit awarded',
                #           'Physics 2': 'No transfer credit awarded',
                #           'Physics 1 and Physics 2': 'No transfer credit awarded', 'Physics C (Mechanics)': 'PHYS 204 (3)',
                #           'Physics C\xa0(Electricity and Magnetics)': 'PHYS 205 (3)', 'Psychology': 'PSYC 200 (6)',
                #           'Spanish Language and Culture': 'SPAN 200 (6) with exemptions for SPAN 201 and SPAN 202',
                #           'Spanish Literature and Culture': 'SPAN 200 (6) with exemptions for SPAN 201 and SPAN 202',
                #           'Statistics': 'MATH 1st year level (6)', 'Studio Art: Drawing': 'SFAR A (6)',
                #           'Studio Art :2-D Design:': 'SFAR A (6)', 'Studio Art: 3-D Design:': 'SFAR A (6)',
                #           'World History': 'HIST 1st year level (6)'}
                # item['ap'] = apDict.get(item['major_name_en'])
                # print("item['ap']: ", item['ap'])

                # overview  //h4[contains(text(),'Why study')]/..
                overview = response.xpath(
                    "//h4[contains(text(),'Program details')]/../../preceding-sibling::*[position()<last()]|"
                    "//h4[contains(text(),'Program Details')]/../../preceding-sibling::*[position()<last()]"
                ).extract()
                if len(overview) == 0:
                    overview = response.xpath(
                        "//h4[contains(text(),'Why study')]/..").extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # if item['overview_en'] == "":
                #     print("overview_en 为空")
                # print("item['overview_en']: ", item['overview_en'])

                # 正则从全文中匹配duration
                duration_dict = {
                    "One": "1",
                    "Two": "2",
                    "Three": "3",
                    "Four": "4",
                    "Five": "5",
                    "Six": "6",
                    "Seven": "7",
                    "Eight": "8",
                    "Nine": "9",
                    "Ten": "10",
                    "one": "1",
                    "two": "2",
                    "three": "3",
                    "four": "4",
                    "five": "5",
                    "six": "6",
                    "seven": "7",
                    "eight": "8",
                    "nine": "9",
                    "ten": "10",
                }
                duration_re = re.findall(
                    r"[a-z]+\sor\s[a-z]+\syears|[a-z]+\sto\s[a-z]+\syears|of\s[a-z]+\syears",
                    remove_tags(response.text))
                # print("duration_re: ", duration_re)
                duration_re_1 = re.findall(r"three|four|five",
                                           ''.join(duration_re))
                # print(duration_re_1)
                if len(duration_re_1) > 0:
                    item['duration_per'] = '1'
                    d_tmp_str = ""
                    for duration in duration_re_1:
                        d_tmp_str += duration_dict.get(duration) + "-"
                    item['duration'] = d_tmp_str.strip().strip('-').strip()
                # print("item['duration']: ", item['duration'])

                # portfolio作品集描述
                # //a[@name='legend-expand'][contains(text(),'Portfolio')]/../../following-sibling::div[1]
                portfolio = response.xpath(
                    "//a[@name='legend-expand'][contains(text(),'Portfolio')]/../../following-sibling::div[1]"
                ).extract()
                # clear_space(entry_requirements)
                portfolio = remove_class(clear_lianxu_space(portfolio))
                item['portfolio_desc_en'] = portfolio
                # print("item['portfolio_desc_en']: ", item['portfolio_desc_en'])

                # 开学日期
                # //a[@name='legend-expand'][contains(text(),'Application deadlines')]/../../following-sibling::div[1]//table//tr[1]/td[position()>1]
                start_date = response.xpath(
                    "//a[@name='legend-expand'][contains(text(),'Application deadlines')]/../../following-sibling::div[1]//table//tr[1]/td[position()>1]//text()|//a[@name='legend-expand'][contains(text(),'Application deadlines')]/../../following-sibling::div[1]//table//th//text()|//a[@name='legend-expand'][contains(text(),'Applications deadlines')]/../../following-sibling::div[1]//table//tr[1]/td[position()>1]//text()"
                ).extract()
                start_date = ', '.join(start_date).strip()
                # print(start_date)
                sd = ""
                if len(start_date) != 0:
                    if "Winter" in start_date:
                        sd += "1月"
                    if "Fall" in start_date:
                        sd += ",9月"
                item['start_date'] = sd
                # print("item['start_date']: ", item['start_date'])

                # 截止日期
                # //a[@name='legend-expand'][contains(text(),'Application deadlines')]/../../following-sibling::div[1]//table//tr[3]/td[position()>1]
                deadline = response.xpath(
                    "//a[@name='legend-expand'][contains(text(),'Application deadlines')]/../../following-sibling::div[1]//table//tr[3]/td[position()>1]//text()|"
                    "//a[@title='Open Applications deadlines']/../../following-sibling::div[1]//table//tr[3]/td[position()>1]//text()|"
                    "//b[contains(text(),'outside')]/../following-sibling::td//b//text()"
                ).extract()
                deadline_str = ', '.join(deadline).strip()
                # print(deadline_str)
                monthDict = {
                    "january": "01",
                    "february": "02",
                    "march": "03",
                    "april": "04",
                    "may": "05",
                    "june": "06",
                    "july": "07",
                    "august": "08",
                    "september": "09",
                    "october": "10",
                    "november": "11",
                    "december": "12",
                    "jan": "01",
                    "feb": "02",
                    "mar": "03",
                    "apr": "04",
                    "jun": "06",
                    "jul": "07",
                    "aug": "08",
                    "sep": "09",
                    "oct": "10",
                    "nov": "11",
                    "dec": "12",
                    "sept": "09",
                }
                month_re = re.findall(
                    r"january|february|march|april|may|june|july|august|september|october|november|december",
                    deadline_str, re.I)
                # print("month_re: ", month_re)
                day_re = re.findall(r"\d+", deadline_str)
                # print("day_re: ", day_re)

                deadline_tmp_str = ""
                if len(month_re) > 0:
                    for m in range(len(month_re)):
                        month_re1 = monthDict.get(month_re[m].strip().lower())
                        # print("month_re1: ", month_re1)

                        day_re1 = day_re[m]
                        if day_re1 != "" and int(day_re1) < 10:
                            day_re1 = "0" + day_re1
                        # print("day_re1: ", day_re1)

                        deadline_tmp_str += "2019" + "-" + month_re1 + "-" + day_re1 + ","
                item['deadline'] = deadline_tmp_str.strip().strip(',').strip()
                # print("item['deadline']: ", item['deadline'])

                # 就业
                # //a[@name='legend-expand'][contains(text(),'Career opportunities')]/../../following-sibling::div[1]
                career = response.xpath(
                    "//a[@name='legend-expand'][contains(text(),'Career opportunities')]/../../following-sibling::div[1]|"
                    "//a[@name='legend-expand'][contains(text(),'After your degree')]/../../following-sibling::div[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # if item['career_en'] == "":
                #     print("career 为空")
                # print("item['career_en']: ", item['career_en'])
                '''课程设置不跳转'''
                # modules_url = response.xpath("//a[contains(text(),'Consult the Undergraduate Calendar')]/@href|"
                #                              "//a[contains(text(),'consult the Undergraduate Calendar')]/@href|"
                #                              "//a[contains(text(),'Consult the undergraduate calendar')]/@href").extract()
                # modules_url = response.xpath("//a[contains(@href,'/academics/undergraduate/calendar/current/sec')]/@href|"
                #                              "//a[contains(@adhocenable,'false')][contains(@href,'/jmsb/programs/undergraduate/bachelor/majors/')]/@href").extract()
                # print("modules_url: ", modules_url)
                # if len(modules_url) > 0:
                #     modules_url_parse = "https://www.concordia.ca" + modules_url[0]
                #     print("**************", modules_url_parse)
                #     item['modules_en'] = self.parse_modules(modules_url_parse, item['major_name_en'])
                modules_en = response.xpath(
                    "//a[contains(@title,'Open Course curriculum')]/../../following-sibling::div|"
                    "//a[contains(@title,'Open Sample classes')]/../../following-sibling::div|"
                    "//a[contains(@title,'Open Curriculum')]/../../following-sibling::div"
                ).extract()
                item['modules_en'] = remove_class(
                    clear_lianxu_space(modules_en))
                if item['modules_en'] == "":
                    # print("modules_en 为空")
                    modules_url = response.xpath(
                        "//a[contains(@adhocenable,'false')][contains(@href,'/jmsb/programs/undergraduate/bachelor/majors/')]/@href"
                    ).extract()
                    if len(modules_url) == 0:
                        modules_url = response.xpath(
                            "//a[contains(@adhocenable,'false')][contains(@href,'/jmsb/programs/undergraduate/bachelor/program-structure/core-courses')]/@href"
                        ).extract()

                    # print("modules_url: ", modules_url)
                    if len(modules_url) > 0:
                        modules_url_parse = "https://www.concordia.ca" + modules_url[
                            0]
                        print("**************", modules_url_parse)
                        item['modules_en'] = self.parse_modules(
                            modules_url_parse, item['major_name_en'])
                print("item['modules_en']: ", item['modules_en'])

                # //b[contains(text(),'Jazz Studies:')]/../b
                # 商学院   //div[@class='content-main parsys']//div//div[1]//div[1]//div[1]//div[1]//h3[1]
                '''公共字段'''
                # https://www.concordia.ca/admissions/undergraduate/requirements/international.html
                item[
                    'ap'] = "If you have successfully passed Advanced Placement examinations in appropriate subjects with a grade of 3* or better (exceptions noted with *), you may be granted some advanced standing. We will notify you if you’ve been given advanced standing in your Offer of Admission."
                item[
                    'require_chinese_en'] = "<p>Senior Middle School Diploma plus Chinese National University Entrance Examinations (if available)</p>"
                item['apply_fee'] = "100"
                item['apply_pre'] = "$"

                # https://www.concordia.ca/admissions/undergraduate/requirements/english-language-proficiency.html
                item['ielts_desc'] = "IELTS score of 7 or higher"
                item['ielts'] = "7"
                item['toefl_desc'] = "TOEFL iBT score 90 or higher"
                item['toefl'] = "90"
                item['sat_code'] = item['toefl_code'] = "0956"

                # 学位名称
                degree_name = response.xpath(
                    "//div[@class='span8 ordinal-group-1']/h5//text()"
                ).extract()
                # degree_name_str = ''.join(degree_name).strip()

                if "," in ''.join(degree_name).strip():
                    degree_name = ''.join(degree_name).strip().split(',')
                print(degree_name)

                if len(degree_name) > 0:
                    for degree in degree_name:
                        degree_del = re.findall(r"\([\w\s,]+\)", degree)
                        item['degree_name'] = degree.replace(
                            ''.join(degree_del), '').strip()
                        print("item['degree_name']: ", item['degree_name'])
                        if item['degree_name'] != "Certificate":

                            # 有多个学位的专业有多个entry_requirements、ib
                            is_entry_ib = response.xpath(
                                "//a[@name='legend-expand'][contains(text(),'Admission requirements')]/span[contains(text(), 'BA')]//text()|"
                                "//a[@name='legend-expand'][contains(text(),'Admission requirements')]/span[contains(text(), 'BSc')]//text()"
                            ).extract()
                            print("is_entry_ib: ", is_entry_ib)
                            entry_requirements = response.xpath(
                                "//a[@name='legend-expand'][contains(text(),'Admission requirements')]/../../following-sibling::div[1]"
                            ).extract()
                            entry_requirements_str = remove_tags(
                                ''.join(entry_requirements))
                            if len(is_entry_ib) == 2:
                                if item['degree_name'] == "Bachelor of Science":
                                    item[
                                        'entry_requirements_en'] = remove_class(
                                            clear_lianxu_space(
                                                [entry_requirements[0]]))
                                elif item['degree_name'] == "Bachelor of Arts":
                                    item[
                                        'entry_requirements_en'] = remove_class(
                                            clear_lianxu_space(
                                                [entry_requirements[-1]]))
                            else:
                                item['entry_requirements_en'] = remove_class(
                                    clear_lianxu_space(entry_requirements))
                            print("item['entry_requirements_en']: ",
                                  item['entry_requirements_en'])

                            IB = re.findall(
                                r"International\sBacc.\s\(IB\):.{1,300}",
                                entry_requirements_str)
                            # print("IB: ", IB)
                            IB_str = ''.join(IB)
                            if len(is_entry_ib) == 2:
                                if item['degree_name'] == "Bachelor of Science":
                                    IB_str = IB[0]
                                elif item['degree_name'] == "Bachelor of Arts":
                                    IB_str = IB[-1]
                            ib_re = re.findall(r":[\w\W].*", IB_str)
                            item['ib'] = ''.join(ib_re).strip().strip(
                                ":").strip()
                            item['ib'] = item['ib'].replace(
                                "International Bacc. (IB):", ";")
                            # print("item['ib']: ", item['ib'])

                            # 判断获取item['alevel']
                            item['alevel'] = self.get_alevel(
                                item['degree_name'], item['department'])
                            # print("item['alevel']: ", item['alevel'])

                            # 判断获取学费
                            item['tuition_fee_pre'] = "$"
                            item['tuition_fee'] = self.get_tuition_fee(
                                item['degree_name'], item['department'],
                                item['major_name_en'])
                            # print("item['tuition_fee']: ", item['tuition_fee'])

                            if item['degree_name'] == "Bachelor of Computer Science" and "Gina Cody School of Engineering and Computer Science" in item[
                                    'department']:
                                item[
                                    'department'] = "Gina Cody School of Engineering and Computer Science"
                            elif item[
                                    'degree_name'] == "Bachelor of Science" and "Faculty of Arts & Science" in item[
                                        'department']:
                                item[
                                    'department'] = "Faculty of Arts & Science"
                            elif item[
                                    'degree_name'] == "Bachelor of Artse" and "Faculty of Arts & Science" in item[
                                        'department']:
                                item[
                                    'department'] = "Faculty of Arts & Science"

                            yield item

        except Exception as e:
            with open("scrapySchool_Canada_Ben/error/" + item['school_name'] +
                      ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 24

Mostrar archivo

Archivo: lakehead_university_U.py Proyecto: histudent/python_spider

    def parse(self, response):
        item = getItem.get_item(ScrapyschoolCanadaBenItem)

        #1.学校名称
        school_name = 'Lakehead University'

        #2.地点
        try:
            location = response.xpath('').extract()[0]
            location = remove_tags(location)
            #print(location)
        except:
            location = None
            #print(location)

#3. 校区
        try:
            campus_list = response.xpath(
                '//h2[contains(text(),"Campus")]/following-sibling::div[1]'
            ).extract()[0]
            campus_list = remove_tags(campus_list)
            campus_list = campus_list.replace(', Online', '')
            campus_list = campus_list.replace(' ', '')
            campus_list = campus_list.split(',')
            #print(campus_list)
        except:
            campus_list = None
            #print(campus_list)

#4. 学院
        try:
            department = response.xpath(
                '//h2[contains(text(),"Department")]/following-sibling::div[2]'
            ).extract()[0]
            department = remove_tags(department)
            #print(department)
        except:
            department = None
            #print(department)

# 4. 学位名称列表,需要拆分,在下方yield 处写循环.此处是拆分存入列表
#'https://www.lakeheadu.ca/academics/undergraduate-programs/engineering'特殊情况
#'https://www.lakeheadu.ca/academics/undergraduate-programs/forestry'
        try:
            degree_name_list = response.xpath(
                '//h2[contains(text(),"Degrees")]/following-sibling::div[1]'
            ).extract()[0]
            degree_name_list = remove_tags(degree_name_list,
                                           keep=('li', 'br', ''))
            degree_name_list = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '',
                                      degree_name_list)
            degree_name_list = degree_name_list.replace('<br>', '---')
            degree_name_list = degree_name_list.replace('<li>', '').replace(
                '</li>', '---')
            degree_name_list = degree_name_list.replace('<span>', '').replace(
                '</span>', '---')
            degree_name_list = degree_name_list.split('---')
        # print(degree_name_list)
        except:

            degree_name_list = None
            #print(degree_name_list)

#5.学位描述
        try:
            degree_overview_en = response.xpath(
                '//div[2]/div[1]/div/div/div/p|//div/div[1]/div/div/div/p'
            ).extract()
            degree_overview_en = ''.join(degree_overview_en)
            #degree_overview_en = remove_tags(degree_overview_en)
            degree_overview_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '',
                                        degree_overview_en)
            #degree_overview_en = degree_overview_en.replace('\r\n','')
            #degree_overview_en = degree_overview_en.replace('\n','')
            #degree_overview_en = degree_overview_en.replace('\n','')
            #degree_overview_en = degree_overview_en.replace('  ',' ')
            #degree_overview_en = degree_overview_en.replace('					','')
            #degree_overview_en = degree_overview_en.replace('			  	','')
            #print(degree_overview_en)
        except:
            degree_overview_en = None
            #print(degree_overview_en)

#6.专业英文
        try:
            major_name_en = response.xpath('//h1').extract()[0]
            major_name_en = remove_tags(major_name_en)
            #print(major_name_en)
        except:
            major_name_en = None
            #print(major_name_en)

#7.专业介绍
        try:
            #overview_en = degree_overview_en
            overview_en = degree_overview_en
            # print(overview_en)
        except:
            overview_en = degree_overview_en
            # print(overview_en)

# #8.入学时间
#         try:
#             start_date = response.xpath('').extract()[0]
#             start_date = remove_tags(start_date)
#             # print(start_date)
#         except:
#             start_date = None
#             # print(start_date)

#9.课程长度
# try:
#     duration = response.xpath('').extract()[0]
#     duration = remove_tags(duration)
#     # print(duration)
# except:
#     duration = None
#     # print(duration)

#10.课程设置
        try:
            modules_en = response.xpath(
                '//div[@class = "field field-name-field-first-year field-type-text-long field-label-hidden"]'
            ).extract()[0]
            modules_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', modules_en)
            #print(modules_en)
        except:
            modules_en = None
            #print(modules_en)

#11.就业方向
        try:
            career_en = response.xpath(
                '//h2[contains(text(),"Future Careers")]/following-sibling::div[1]'
            ).extract()[0]
            career_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', career_en)
            #print(career_en)
        except:
            career_en = None
            #print(career_en)

#12.截止日期
        try:
            deadline = '2018-11-02,2019-03-01'
            #deadline = remove_tags(deadline)
            # print(career_en)
        except:
            deadline = None
            # print(deadline)
#13.学费
        try:
            tuition_fee = response.xpath(
                '//tbody/tr[1]/td[2]/div/div').extract()[-1]
            tuition_fee = remove_tags(tuition_fee)
            tuition_fee = tuition_fee.replace('$', '')
            #print(tuition_fee)
        except:
            tuition_fee = None
            #print(tuition_fee)
#14 申请费:
        apply_fee = '135'

        #15 申请要求

        try:
            entry_requirements_en = '<div><h3>General Admission Requirements</h3><p>To be considered for admission as an International Student, you must currently reside outside of Canada and you are not a Canadian citizen.&nbsp;<span class="s1">To meet the General Admission Requirements to Lakehead, you will need the following:</span></p><ul><li>Successful graduation from an academic secondary school program or equivalent (For most countries, the same academic preparation as is required for university entrance in that country is required for consideration. Refer to&nbsp;Admission Requirements by Country.</a>)&nbsp;</li><li>Program-Specific Prerequisite courses completed at the senior level (e.g. Grade 12);</li><li>Equivalent of a minimum 70% overall final average (Canadian)</li></ul><p>Note: Meeting the minimum admission requirements is not a guarantee of admission.</p></div>'
            #entry_requirements_en = remove_tags(entry_requirements_en)
            # print(entry_requirements_en)
        except:
            entry_requirements_en = None
            # print(entry_requirements_en)

#16 中国学生申请要求
        try:
            require_chinese_en = '<p>Senior High School (Upper Middle School) Graduation Diploma</p><div><h3>General Admission Requirements</h3><p>To be considered for admission as an International Student, you must currently reside outside of Canada and you are not a Canadian citizen.&nbsp;<span class="s1">To meet the General Admission Requirements to Lakehead, you will need the following:</span></p><ul><li>Successful graduation from an academic secondary school program or equivalent (For most countries, the same academic preparation as is required for university entrance in that country is required for consideration. Refer to&nbsp;Admission Requirements by Country.</a>)&nbsp;</li><li>Program-Specific Prerequisite courses completed at the senior level (e.g. Grade 12);</li><li>Equivalent of a minimum 70% overall final average (Canadian)</li></ul><p>Note: Meeting the minimum admission requirements is not a guarantee of admission.</p></div>'
            #require_chinese_en = remove_tags(require_chinese_en)
            # print(require_chinese_en)
        except:
            require_chinese_en = None
            # print(require_chinese_en)

#17 特殊专业要求
        try:
            specific_requirement_en = 'https://www.lakeheadu.ca/studentcentral/applying/general-admission-requirements/international-student'
            specific_requirement_en = remove_tags(specific_requirement_en)
            ## print(specific_requirement_en)
        except:
            specific_requirement_en = None
            # print(specific_requirement_en)

#18 高考(官网要求)
        try:
            gaokao_desc = response.xpath('').extract()[0]
            gaokao_desc = remove_tags(gaokao_desc)
            # print(gaokao_desc)
        except:
            gaokao_desc = None
            # print(gaokao_desc)

#19 高考(展示以及判断字段)
        try:
            gaokao_zs = response.xpath('').extract()[0]
            gaokao_zs = remove_tags(gaokao_zs)
            # print(gaokao_zs)
        except:
            gaokao_zs = None
            # print(gaokao_zs)

#20 高考分数(文科)
        try:
            gaokao_score_wk = response.xpath('').extract()[0]
            gaokao_score_wk = remove_tags(gaokao_score_wk)
            # print(gaokao_score_wk)
        except:
            gaokao_score_wk = None
            # print(gaokao_score_wk)

#21 高考分数(理科)
        try:
            gaokao_score_lk = response.xpath('').extract()[0]
            gaokao_score_lk = remove_tags(gaokao_score_lk)
            # print(gaokao_score_lk)
        except:
            gaokao_score_lk = None
            # print(gaokao_score_lk)

#22 会考描述
        try:
            huikao_desc = response.xpath('').extract()[0]
            huikao_desc = remove_tags(huikao_desc)
            # print(huikao_desc)
        except:
            huikao_desc = None
            # print(huikao_desc)

#23 huikao_zs: Huikao display/matching field (label presumably parallels #19 gaokao_zs; TODO confirm)
        try:
            huikao_zs = response.xpath('').extract()[0]
            huikao_zs = remove_tags(huikao_zs)
            # print(huikao_zs)
        except:
            huikao_zs = None
            # print(huikao_zs)

#24 最低语言要求
        try:
            min_language_require = 'Applicants will automatically be considered for admission through Lakehead University\'s Academic English Program if they meet Lakehead University\'s admission requirements but do not meet English language proficiency requirements. Applicants who complete Lakehead University\'s Academic English Program will meet the English language proficiency requirement for programs that require an IELTS score of 6.5 (no minimum score less than 6.0) or a TOEFL score of 80 (no minimum score less than 19). As a result, applicants interested in the: Nursing programs, Juris Doctor program, One-Year Social Work program, or the Two-Year Bachelor of Education program will need to successfully complete one of the recognized tests with the appropriate minimum scores as outlined in Option 1.'
            min_language_require = remove_tags(min_language_require)
            # print(min_language_require)
        except:
            min_language_require = None
            # print(min_language_require)

#25 雅思要求
        try:
            ielts_desc = 'IELTS score of 6.5 (no minimum score less than 6.0) '
            #ielts_desc = remove_tags(ielts_desc)
            # print(ielts_desc)
        except:
            ielts_desc = None
            # print(ielts_desc)

#26 ielts
        try:
            ielts = '6.5'
            #ielts = remove_tags(ielts)
            # print(ielts)
        except:
            ielts = None
            # print(ielts)
#27 ielts_?

        ielts_l = 6.0
        ielts_s = 6.0
        ielts_r = 6.0
        ielts_w = 6.0

        #28 toefl_code
        try:
            toefl_code = '0888'
            #toefl_code = remove_tags(toefl_code)
            # print(toefl_code)
        except:
            toefl_code = None
            # print(toefl_code)

#29 toefl_desc
        try:
            toefl_desc = 'Minimum Score: 80 (no component score less than 19)'
            #toefl_desc = remove_tags(toefl_desc)
            # print(toefl_desc)
        except:
            toefl_desc = None
            # print(toefl_desc)

#30 toefl
        try:
            toefl = '80'
            #toefl = remove_tags(toefl)
            # print(toefl)
        except:
            toefl = None
            # print(toefl)

#31 toefl_?
        toefl_l = 19
        toefl_s = 19
        toefl_r = 19
        toefl_w = 19

        # 32 alevel
        try:
            alevel = 'Possess the (International) General Certificate of Secondary Education with: Passes in at least five subjects: Two of which must be at the Advanced Level (G.C.E.) Two subjects at the Advanced Supplementary (A.S.) Level may be substituted for one subject at the Advanced Level.  For example, 4 Advanced Supplementary (A.S.) Level courses equal two A Level Courses.  The remaining three passes may be at the Ordinary Level (G.C.S.E.) Acceptable standing must be achieved in all subjects Applicants may apply for admission in the year they will be sitting for their final A-Level examinations provided they can present excellent grades in their O-Level examinations and strong predicted A-Level results. With the exception of the Faculty of Engineering, for all other programs that require "Mathematics" as a prerequisite, AS-Level Mathematics is required. Applicants presenting A-Level examinations with a minimum grade of "C" may be considered for advanced standing. In addition to the above, applicants interested in the four year Bachelor of Engineering degree program must complete the following prerequisite courses: A-Level Mathematics A-Level Physics  A-Level Chemistry is preferred; however, AS-level Chemistry will be accepted  O-Level English '
            #alevel = remove_tags(alevel)
            # print(alevel)
        except:
            alevel = None
            # print(alevel)

#33 ib
        try:
            if 'Biology' in major_name_en:
                ib = 'Biology 1-00 (Unspecified First Year Full Credit), Credit Weight:1.0'
            elif 'Chemistry' in major_name_en:
                ib = 'Chemistry 1110 Chemistry 1130,Credit Weight:0.5,0.5'
            elif 'Computer Science' in major_name_en:
                ib = 'Computer Science 1_10 (if score of 5,Unspecified First Year Half Credit)Credit Weight:0.5;if score of 6 or better,Computer Science 1_00 (Unspecified First Year Full Credit) Credit Weight:1.0'
            elif 'Economics' in major_name_en:
                ib = 'Economics 1100 ,Credit Weight:1.0'
            elif 'English' in major_name_en:
                ib = 'English 1115 Arts 1-10 (Unspecified First Year Half Credit),Credit Weight:0.5,0.5'
            elif 'French' in major_name_en:
                ib = 'French 1200,Credit Weight: 1.0'
            elif 'Geography' in major_name_en:
                ib = 'Geography 1150,Credit Weight:0.5'
            elif 'Global Politics' in major_name_en:
                ib = 'Political Science 2611,Credit Weight:0.5'
            elif 'History' in major_name_en:
                ib = 'History 1100 (Allows you to begin completing Year 2 History course requirements),Credit Weight:1.0'
            elif 'Information Technology in a Global Society' in major_name_en:
                ib = 'Sociology 2_00 (Unspecified Second Year Full Credit),Credit Weight:1.0'
            elif 'Language' in major_name_en:
                ib = 'Each language will be assessed on an individual basis.,Credit Weight: up to 1.0'
            elif 'Mathematics' in major_name_en:
                ib = 'Math 1-00 (Unspecified First Year Full Credit),Credit Weight: 1.0'
            elif 'Music' in major_name_en:
                ib = 'Assessed on an individual basis.,Credit Weight:up to 1.0'
            elif 'Philosophy' in major_name_en:
                ib = 'Philosophy 1-10 (Unspecified First Year Half Credit),Credit Weight:0.5'
            elif 'Social and Cultural Anthropology' in major_name_en:
                ib = 'Anthropology 1034,Credit Weight:0.5'
            elif 'Social and Cultural Anthropology' in major_name_en:
                ib = 'Anthropology 1034,Credit Weight:0.5'
            elif 'Sports' in major_name_en:
                ib = 'Kinesiology 1010,Credit Weight:0.5'
            elif 'Exercise and Health Science' in major_name_en:
                ib = 'Kinesiology 1_10 (Unspecified First Year Half Credit),Credit Weight:0.5'
            elif 'Visual Arts' in major_name_en:
                ib = 'Visual Arts 1-00 (Unspecified First Year Full Credit),Credit Weight:1.0'
            else:
                ib = 'IB Diploma with a total score of 28* in six subjects, three of which must be at the Higher Level (HL) with no score less than 4 in any subjectProgram-Specific Prerequisite courses *Higher scores may be required for admission to programs in which the demand for places by qualified applicants exceeds the supply of available spaces.'

            #print(ib)
        except:
            ib = None
            #print(ib)

#34 ap
        try:
            ap = response.xpath('').extract()[0]
            ap = remove_tags(ap)
            # print(ap)
        except:
            ap = None
            # print(ap)

#35 面试描述
        try:
            interview_desc_en = response.xpath('').extract()[0]
            interview_desc_en = remove_tags(interview_desc_en)
            # print(interview_desc_en
        except:
            interview_desc_en = None
            # print(interview_desc_en)

#36 作品集描述
        try:
            portfolio_desc_en = response.xpath('').extract()[0]
            portfolio_desc_en = remove_tags(portfolio_desc_en)
            # print(portfolio_desc_en
        except:
            portfolio_desc_en = None
            # print(portfolio_desc_en)

#37 other
        try:
            other = '(1).专业特殊要求字段中,匹配不了专业+学位名(因专业详情页中的学位名与专业特殊要求页面中的学位名不相同,)故无法匹配.网站:https://www.lakeheadu.ca/studentcentral/applying/general-admission-requirements/international-student,duration,(2).地点 location,gaokao_desc，huikao_desc，duration， ,sat2_desc,interview_desc,protfolio_desc字段已经与老师核对确认为空,(3).toefl_code 和 sat_code,act_code,每所学校都相同,其他公共字段相同也同老师核对完毕(4)课程字段每个专业进去的页面并不相同,抓取很有难度.故跟老师反映后并获得同意后,只抓能抓到的第一年的课程'
            #other = remove_tags(other)
            # print(other
        except:
            other = None
            # print(other)

        # sat act 代码 介绍
        sat_code = '0888'
        sat1_desc = 'The SAT with the following scores: Reading* = five hundred and fifty (550) Math = five hundred and fifty (550) Total score = one thousand and one hundred (1100)'
        sat2_desc = None
        act_code = '5190*'
        act_desc = 'The Enhanced Composite ACT with a Total Score of twenty-four (24)'

        item["ap"] = ap
        item["duration_per"] = 1
        #item["duration"] = duration
        item["school_name"] = school_name
        item["location"] = location
        #item["campus"] = campus
        #item["degree_type"] = 1
        item["department"] = department
        #item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        item["major_name_en"] = major_name_en
        item["overview_en"] = overview_en
        #item["teach_time"] = 1
        #item["start_date"] = start_date
        item["modules_en"] = modules_en
        item["career_en"] = career_en
        item["deadline"] = deadline
        item["apply_pre"] = 'CAD$'
        item["apply_fee"] = apply_fee
        item["entry_requirements_en"] = entry_requirements_en
        item["tuition_fee_pre"] = 'CAD$'
        item["require_chinese_en"] = require_chinese_en
        item["ielts_desc"] = ielts_desc
        item["ielts"] = ielts
        item["ielts_l"] = ielts_l
        item["ielts_s"] = ielts_s
        item["ielts_r"] = ielts_r
        item["ielts_w"] = ielts_w
        item["toefl_code"] = toefl_code
        item["toefl_desc"] = toefl_desc
        item["toefl_l"] = toefl_l
        item["toefl"] = toefl
        item["toefl_s"] = toefl_s
        item["toefl_r"] = toefl_r
        item["toefl_w"] = toefl_w
        item["interview_desc_en"] = interview_desc_en
        item["portfolio_desc_en"] = portfolio_desc_en
        item["other"] = other
        item["url"] = response.url
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0
        #item["duration"] = duration
        item["tuition_fee"] = tuition_fee
        item["alevel"] = alevel
        item["ib"] = ib
        item["gaokao_zs"] = gaokao_zs
        item["gaokao_score_wk"] = gaokao_score_wk
        item["gaokao_score_lk"] = gaokao_score_lk
        item["specific_requirement_en"] = specific_requirement_en
        item["huikao_desc"] = huikao_desc
        item["huikao_zs"] = huikao_zs
        item["min_language_require"] = min_language_require
        item["sat_code"] = sat_code
        item["sat1_desc"] = sat1_desc
        item["sat2_desc"] = sat2_desc
        item["act_code"] = act_code
        item["act_desc"] = act_desc

        for i in campus_list:
            item["campus"] = i
            if 'ThunderBay' in i:
                item["start_date"] = '2019-01,2019-09'
            if 'Orillia' in i:
                item["start_date"] = '2019-01,2019-09'

            for b in degree_name_list:
                #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                if ' in ' in b:
                    item["major_name_en"] = re.findall('in (.*)', b)[0]
                    if ')' in item["major_name_en"]:
                        # print(item["major_name_en"] + "---" + response.url)
                        item["major_name_en"] = re.findall('\((.*)\)', b)[0]
                #print(item["major_name_en"])
                    else:
                        pass
                elif '(' in b:
                    item["major_name_en"] = re.findall('\((.*)\)', b)[0]
                    # print(item["major_name_en"])
                    if 'Co-op program available' in item["major_name_en"]:
                        item[
                            "major_name_en"] = major_name_en + "(Co-op program available)"
                    elif 'Co-op option available' in item["major_name_en"]:
                        item[
                            "major_name_en"] = major_name_en + "(Co-op option available)"
                    elif 'year' in item["major_name_en"]:
                        item["major_name_en"] = major_name_en
                item["major_name_en"] = item["major_name_en"].replace(
                    ' Major', '')
                #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

                if '(' in b:
                    item["degree_name"] = re.sub('\(.*\)', '', b)
                    if ' in ' in item["degree_name"]:
                        item["degree_name"] = re.sub(' in .*', '', b)
                elif ' in ' in b:
                    item["degree_name"] = re.sub(' in .*', '', b)
                else:
                    item["degree_name"] = b
                print(item["degree_name"])

                #print(item["major_name_en"])
                if '1' in b:
                    duration = '1'
                    item["duration"] = duration
                elif '2' in b:
                    duration = '2'
                    item["duration"] = duration
                elif '3' in b:
                    duration = '3'
                    item["duration"] = duration
                elif '4' in b:
                    duration = '4'
                    item["duration"] = duration
                elif '5' in b or 'five' in b or 'Five' in b:
                    duration = '5'
                    item["duration"] = duration
                else:
                    item["duration"] = None
                #print(b)
                if 'Minor' not in major_name_en:
                    yield item

Ejemplo n.º 25

Mostrar archivo

    def parse(self, response):
        """Populate a ScrapyschoolCanadaBenItem from one program page.

        Fields are filled from three sources: XPath lookups on ``response``,
        one extra blocking HTTP request for the linked careers page, and
        hard-coded University of British Columbia admission constants.
        Any scraped field whose lookup fails is stored as ``None``.

        Args:
            response: the page response handed to this callback; it must
                offer ``xpath(...)`` and ``url``.

        Returns:
            None. NOTE(review): the closing ``yield item`` is commented out
            in the original, so the populated item is currently discarded.
            That is left unchanged here -- confirm whether it is intended.
        """
        item = getItem.get_item(ScrapyschoolCanadaBenItem)

        # Matches ` name="value"` style attributes so they can be stripped
        # from serialized HTML. Raw string: the original non-raw literal
        # relied on the invalid escape sequence `\-`.
        attr_pattern = re.compile(r''' [a-zA-Z\-]*=['"].+?['"]''')

        def strip_tags(text):
            """Return remove_tags(text), or None if that call raises."""
            try:
                return remove_tags(text)
            except Exception:
                return None

        def first_text(query):
            """Return the tag-stripped first match of *query*, or None."""
            try:
                return remove_tags(response.xpath(query).extract()[0])
            except Exception:
                return None

        # Major name (page heading).
        major_name_en = first_text('//h1')

        # 1. School name
        school_name = 'University of British Columbia'

        # 2. Location
        location = first_text('//*[@id="program-vitals"]/ul[1]/li[1]/strong')

        # 3. Campus -- taken directly from the location value.
        campus = location

        # 4. Department / faculty
        department = first_text('//*[@id="program-vitals"]/ul[1]/li[2]/strong')

        # 4. Degree name
        degree_name = first_text('//*[@id="program-vitals"]/ul[1]/li[3]/strong')

        # 5. Degree overview: summary HTML with attributes stripped.
        try:
            degree_overview_en = attr_pattern.sub(
                '', ''.join(response.xpath('//*[@id="program-summary"]/div').extract()))
        except Exception:
            degree_overview_en = None

        # 7. Major overview -- same content as the degree overview.
        overview_en = degree_overview_en

        # 8. Start date -- placeholder: no selector has been written for it.
        start_date = None

        # 9. Duration. The markers are tested against the raw (tag-bearing)
        # HTML of the first match; first hit wins, so the order below is
        # significant and is kept exactly as in the original chain.
        duration_rules = (
            ('4.5', '1'),
            ('4', '1'),
            ('2', '1'),
            ('5', '1'),
            ('3', '1'),
            ('11', '2'),
            ('16', '2'),
        )
        try:
            duration_html = response.xpath('//*[@id="program-vitals"]/ul[2]/li[1]/strong').extract()[0]
        except Exception:
            # NOTE(review): this fallback is the int 1 while every other
            # path stores the string '1'; kept as-is to preserve behaviour.
            duration, duration_per = None, 1
        else:
            duration, duration_per = None, '1'
            for marker, unit in duration_rules:
                if marker in duration_html:
                    duration, duration_per = marker, unit
                    break

        # 10. Modules: first matching block, attributes and layout
        # whitespace removed.
        try:
            modules_en = response.xpath('//h3[contains(text(),"What you will learn")]/following-sibling::div').extract()[0]
            modules_en = attr_pattern.sub('', modules_en)
            modules_en = modules_en.replace('\r\n','').replace('\n','').replace('\t','').replace('                                                                                                                         ','').replace('                                                                       ','')
        except Exception:
            modules_en = None

        # 11. Careers: fetched from a second page linked in the navigation.
        try:
            careers_url = response.xpath('//*[@id="container"]/div/nav/div/ul/li[5]/a/@href').extract()[0]
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
            # A timeout keeps a stalled server from blocking the callback
            # forever; a timeout simply leaves career_en as None.
            careers_page = etree.HTML(
                requests.get(careers_url, headers=headers, timeout=30).text)
            career_nodes = careers_page.xpath('//div[@id="career-options"]')
            if career_nodes:
                career_html = ''.join(
                    etree.tostring(node, method='html', encoding='unicode')
                    for node in career_nodes)
                career_en = attr_pattern.sub('', career_html)
            else:
                # No career section on the page: the original also ended up
                # with None here (via an exception on an empty list).
                career_en = None
        except Exception:
            career_en = None

        # 12. Application deadline
        deadline = '2019-01-15'

        # 13. Tuition fee, looked up by campus and then by degree name.
        # Both levels are substring tests evaluated in order (first match
        # wins), so the ordering below mirrors the original if/elif chains.
        tuition_tables = (
            ('Okanagan', (
                ('Bachelor of Applied Science', '48,196.15'),
                ('Bachelor of Arts', '39,082.15'),
                ('Bachelor of Education', '31,655.15'),
                ('Bachelor of Fine Arts', '38,882.15'),
                ('Bachelor of Human Kinetics', '40,425.37'),
                ('Bachelor of Management', '40,389.99'),
                ('Bachelor of Media Studies', '37,982.15'),
                ('Bachelor of Science', '40,344.15'),
            )),
            ('Vancouver', (
                ('Bachelor of Applied Science', '50,820.79'),
                ('Bachelor of Arts', '39,746.83'),
                ('Bachelor of Education', '52,777.83'),
                ('Bachelor of Fine Arts', '39,226.83'),
                ('Bachelor of Human Kinetics', None),
                ('Bachelor of Management', '40,150.83'),
                ('Bachelor of Media Studies', '39,746.83'),
                ('Bachelor of Environmental Design', '49,663.83'),
                ('Science', '55,959.22'),
                ('Bachelor of Midwifery', '48,836.83'),
                ('Bachelor of Urban Forestry', '40,200.83'),
                ('Bachelor of Social Work', '38,503.83'),
                ('Bachelor of International Economics', '48,878.31'),
                ('Bachelor of Kinesiology', '40,930.83'),
                ('Bachelor of Music', '37,250.83'),
            )),
        )
        tuition_fee = None
        # A missing campus or degree name means no fee can be determined.
        if campus is not None and degree_name is not None:
            for campus_key, fee_table in tuition_tables:
                if campus_key in campus:
                    for degree_key, fee in fee_table:
                        if degree_key in degree_name:
                            tuition_fee = fee
                            break
                    break
        tuition_fee_per = '5'

        # 14. Application fee
        apply_fee = '90'

        # 15. Entry requirements: requirement block HTML, attributes stripped.
        try:
            entry_requirements_en = attr_pattern.sub(
                '', ''.join(response.xpath('//*[@id="requirement_countries"]').extract()))
        except Exception:
            entry_requirements_en = None

        # 16. Requirements for Chinese applicants
        require_chinese_en = 'Graduation from a university-preparatory program at a senior secondary school: <br><br>Upper Middle School Report of Grades.<br>Huikao or Academic proficiency test results provided by China Credentials Verification (CHESICC) or China Qualifications Verification (CQV).<br>Gaokao examinations results provided by CHESICC or CQV. UBC expects students to achieve  a minimum provincial Tier 1 university cut-off score.<br>If you are not writing the Gaokao, you must submit three or more Advanced Placement (AP) exam results in three or more distinct subjects. Competitive results must be achieved in these examinations.<br>Admission average calculated on final year academic courses/exams:<br><br>Grades required for admission will vary by program, but based on the China grading scale, the minimum average needed to fall within the competitive range is approximately 84% (where the minimum pass grade is 60%).'

        # 17. Degree-specific requirements: first sibling after the heading.
        try:
            specific_requirement_en = response.xpath('//h4[contains(text(),"Degree-specific")]/following-sibling::*').extract()[0]
            specific_requirement_en = attr_pattern.sub('', specific_requirement_en)
        except Exception:
            specific_requirement_en = None

        # 18. Gaokao (as stated on the official site).
        # NOTE(review): this value is computed but never written to the item
        # below -- confirm whether a "gaokao_desc" field should be set.
        gaokao_desc = strip_tags('Gaokao examinations results provided by CHESICC or CQV. UBC expects students to achieve  a minimum provincial Tier 1 university cut-off score.')

        # 19-21. Gaokao display value and arts / science scores --
        # placeholders: no selectors have been written for them.
        gaokao_zs = None
        gaokao_score_wk = None
        gaokao_score_lk = None

        # 22. Huikao description
        huikao_desc = strip_tags('Huikao or Academic proficiency test results provided by China Credentials Verification (CHESICC) or China Qualifications Verification (CQV)')

        # 23. Huikao display value -- placeholder, no selector written.
        huikao_zs = None

        # 24. Minimum language requirement
        min_language_require = strip_tags('6.5, with no part less than 6.0')

        # 25-27. IELTS description, overall score and per-section scores
        ielts_desc = '6.5, with no part less than 6.0'
        ielts = '6.5'
        ielts_l = 6.0
        ielts_s = 6.0
        ielts_r = 6.0
        ielts_w = 6.0

        # 28-31. TOEFL code, description, overall score and section scores
        toefl_code = '0965'
        toefl_desc = 'Overall: 90,Reading: 22,Listening: 22,Writing: 21,Speaking: 21'
        toefl = '90'
        toefl_l = 22
        toefl_s = 21
        toefl_r = 22
        toefl_w = 21

        # 32. A-level
        alevel = None

        # 33. IB
        ib = 'English-language requirements,English is the language of instruction at UBC. All prospective students must demonstrate English-language competency prior to admission. There are numerous ways to meet the English Language Admission Standard.,General admission requirements,Completion of the IB Diploma with a minimum score of 24 points, including at least three Higher Level courses and additional points for Extended Essay and Theory of Knowledge.,Completion of Standard Level or Higher Level English A at a minimum score of 3, where English is the primary language of instruction.,Degree-specific requirements: Arts,No specific courses required beyond those needed for general admission'

        # 34. AP
        ap = strip_tags('AP courses completed as part of the high school curriculum may be used to meet admission requirements')

        # 35-36. Interview and portfolio descriptions -- placeholders, no
        # selectors have been written for them.
        interview_desc_en = None
        portfolio_desc_en = None

        # 37. Other notes
        other = ''

        # SAT / ACT codes and descriptions
        sat_code = '0965'
        sat1_desc = 'SAT 1 or ACT + Writing. In countries where the SAT and ACT are unavailable, exemptions may be granted. UBC’s SAT institution code is 0965 and its ACT institution code is 5259. The optional SAT essay section is recommended, but not required.'
        sat2_desc = None
        act_code = '5259'
        act_desc = 'SAT 1 or ACT + Writing. In countries where the SAT and ACT are unavailable, exemptions may be granted. UBC’s SAT institution code is 0965 and its ACT institution code is 5259. The optional SAT essay section is recommended, but not required.'

        # Store everything on the item. "degree_type" and "teach_time" were
        # deliberately left unset (commented out) in the original.
        item["ap"] = ap
        item["duration_per"] = duration_per
        item["duration"] = duration
        item["school_name"] = school_name
        item["location"] = location
        item["campus"] = campus
        item["department"] = department
        item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        item["major_name_en"] = major_name_en
        item["overview_en"] = overview_en
        item["start_date"] = start_date
        item["modules_en"] = modules_en
        item["career_en"] = career_en
        item["deadline"] = deadline
        item["apply_pre"] = 'CAD$'
        item["apply_fee"] = apply_fee
        item["entry_requirements_en"] = entry_requirements_en
        item["tuition_fee_pre"] = 'CAD$'
        item["require_chinese_en"] = require_chinese_en
        item["ielts_desc"] = ielts_desc
        item["ielts"] = ielts
        item["ielts_l"] = ielts_l
        item["ielts_s"] = ielts_s
        item["ielts_r"] = ielts_r
        item["ielts_w"] = ielts_w
        item["toefl_code"] = toefl_code
        item["toefl_desc"] = toefl_desc
        item["toefl_l"] = toefl_l
        item["toefl"] = toefl
        item["toefl_s"] = toefl_s
        item["toefl_r"] = toefl_r
        item["toefl_w"] = toefl_w
        item["interview_desc_en"] = interview_desc_en
        item["portfolio_desc_en"] = portfolio_desc_en
        item["other"] = other
        item["url"] = response.url
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0
        item["tuition_fee"] = tuition_fee
        item["alevel"] = alevel
        item["ib"] = ib
        item["gaokao_zs"] = gaokao_zs
        item["gaokao_score_wk"] = gaokao_score_wk
        item["gaokao_score_lk"] = gaokao_score_lk
        item["specific_requirement_en"] = specific_requirement_en
        item["huikao_desc"] = huikao_desc
        item["huikao_zs"] = huikao_zs
        item["min_language_require"] = min_language_require
        item["sat_code"] = sat_code
        item["sat1_desc"] = sat1_desc
        item["sat2_desc"] = sat2_desc
        item["act_code"] = act_code
        item["act_desc"] = act_desc
        item["tuition_fee_per"] = tuition_fee_per
        # NOTE(review): emitting the item is disabled in the original; left
        # disabled here so the callback's return value does not change.
        #yield item

Ejemplo n.º 26

Mostrar archivo

Archivo: RyersonUniversity_U.py Proyecto: histudent/python_spider

    def parse(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)

        #1.school_name
        school_name = 'Ryerson University'
        # print(school_name)

        #2.url
        url = response.url
        # print(url)

        #3.major_name_en
        major_name_en_a = response.xpath("//div[contains(@class,'resPageHeadin')]//h1").extract()
        major_name_en_a = ''.join(major_name_en_a)
        major_name_en_a = remove_tags(major_name_en_a).strip()
        if '(' in major_name_en_a:
            major_name_en = re.findall('(.*?)\(',major_name_en_a)[0].strip().replace('amp;','')
        else:
            major_name_en = None
        # print(major_name_en)

        #4.degree_name
        degree_name = response.xpath("//strong[contains(text(),'Degree Earned')]//following-sibling::text()").extract()
        degree_name = ''.join(degree_name)
        degree_name = remove_tags(degree_name).strip()
        # print(degree_name,'----------')

        #5.overview_en
        overview_en = response.xpath("//h2[contains(text(),'Is It for You?')]//following-sibling::*").extract()
        overview_en = ''.join(overview_en)
        overview_en = remove_class(overview_en)
        # print(overview_en)

        #6.department
        department = response.xpath("//strong[contains(text(),'Faculty')]//following-sibling::*").extract()
        department = ''.join(department)
        department = remove_class(department).strip().replace('<br>','').replace('amp;','')
        # print(department)

        #7.duration #8.duration_per
        duration = 4
        duration_per = 1



        #10.tuition_fee
        tuition_fee = response.xpath("//div[contains(@class,'stackparsys')]//div[contains(@class,'parbase')]//div[contains(@class,'richtextContent')]").extract()
        tuition_fee = ''.join(tuition_fee)
        tuition_fee = re.findall('\$(\d{2},\d{3})',tuition_fee)
        if len(tuition_fee)==1:
            tuition_fee =  tuition_fee[0]
        else:
            tuition_fee = str(tuition_fee[-2]) + '-' + str(tuition_fee[-1])
        # print(tuition_fee)

        #11.tuition_fee_pre
        tuition_fee_pre = '$'

        #12.entry_requirements_en
        entry_requirements_en = response.xpath("//a[contains(text(),'Requirements')]/../../following-sibling::*").extract()[0]
        entry_requirements_en = remove_class(entry_requirements_en).strip()
        entry_requirements_en = clear_space_str(entry_requirements_en)
        # print(entry_requirements_en)

        #13.modules_en
        modules_en_url = response.xpath("//span[contains(text(),'Program Courses')]//..//..//preceding-sibling::*//@href").extract()
        modules_en_url = ''.join(modules_en_url)
        modules_en_url = 'https://www.ryerson.ca' + modules_en_url
        headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        data = requests.get(modules_en_url,headers=headers)
        response1 = etree.HTML(data.text)
        modules_en = response1.xpath("//a[@class='defaultSubTitle' and contains(text(),'Semester')]/../following-sibling::*//tbody")
        doc = ""
        if len(modules_en) > 0:
            for a in modules_en:
                doc += (etree.tostring(a, encoding='unicode', pretty_print=False, method='html'))
        modules_en = doc
        modules_en = clear_space_str(modules_en)
        modules_en = remove_class(modules_en)
        # print(modules_en)

        # 9.career_en
        career_en = response.xpath("//h2[contains(text(),'After Graduation')]//following-sibling::p").extract()
        career_en = ''.join(career_en)
        career_en = remove_class(career_en)
        # print(career_en,'--------------')

        #14.ielts 15161718
        ielts = 6.5
        ielts_r = 6
        ielts_w = 6
        ielts_s = 6
        ielts_l = 6

        #19.toefl
        if 'Engineering' in department:
            toefl = '83-87'
        elif 'Faculty of Science' in department:
            toefl = '83-87'
        else:
            toefl = '92-93'

        #20.toefl_code,sat_coade
        toefl_code = '0886'
        sat_code = '0886'

        #22.require_chinese_en
        require_chinese_en = '<p>Senior High School (Upper Middle School) Graduation Diploma, Academic Proficiency Test/Upper Middle School Graduation (Hui Kao) Exam, and Chinese National University Entrance Examinations (Gao Kao). Applicants who have not written, or do not intend on writing the Gao Kao examination must submit a signed and dated letter providing the reason(s). Additional requirements include a notarized copy of Hui Kao (or equivalent test) results and results from any other standardized academic tests (e.g. SAT, ACT or AP tests). Copies of awards received for academic achievement as well as reference letters from school officials highlighting academic accomplishments are encouraged. Document Requirements:Interim Secondary A current transcript including any mid-term/mid-year results available, the grading scale, and the name of the diploma to be awarded upon completion of your studies. A school profile should accompany your transcript. Please also provide a letter indicating the date you will write the Gao Kao and proof of registration (if available).  If you have not written, or do not intend on writing the Gao Kao examination, you must submit a signed and dated letter providing the reason(s). Additional requirements include a notarized copy of your Hui Kao (or equivalent test) results and results from any other standardized academic tests (e.g. SAT, ACT or AP tests). Copies of awards received for academic achievement as well as reference letters from school officials highlighting academic accomplishments are encouraged.</p>'

        #23.ap
        ap = 'Graduation from Grade 12 of an academic program at an accredited secondary school with high academic standing including minimum B grades in the program-specific subject prerequisites and a minimum B overall average. Subject to competition, applicants may be required to present averages/grades above the minimum. In most cases, subject prerequisites should be completed at the AP level and/or Grade 12 senior academic level (some exceptions apply). The high school profile (including accreditation, grading scheme, etc.) must accompany the academic record. While we do not have minimum SAT or ACT score requirements, strong performance on a standardized test can strengthen an application. If SAT or ACT examinations have been written, the results should be submitted. Advanced Placement (AP) examination results will also be considered. AP courses with examination scores of 4 or higher will be considered for transfer credit on an individual basis. Engineering students are not eligible for transfer credits for core and professional engineering courses using AP examinations.'

        #24.alevel
        alevel = 'Ryerson University will accept GCE A/AS Levels and GCSE O Levels, as well as the Cambridge Pre-University Certificates or Diploma. BTEC qualifications [BTEC Higher Nationals Level 3, BTEC Higher National Certificate (HNC) Level 4, BTEC Higher National Diploma (HND) Level 5] will be considered for admission on an individual basis provided program specific subject requirements have been completed/are being completed at an appropriate level and the qualifications include sufficient academic content.GCE A Levels with grades of C or better or Pre-U Certificate (Principal Subjects) with grades of M3 or higher, may be considered for transfer credit on an individual basis. No transfer credit is given for AS Levels. Engineering students are not eligible for transfer credits for core and professional engineering courses using GCE A Levels or Pre-U Certificate (Principal Subjects).'

        #25.deadline
        deadline = '2019-02-01'

        #26.location
        location = 'Toronto'

        #27.average_score
        average_score = response.xpath("//div[@class='res-text richtextContent background-opaque']/p[contains(text(),'%')]").extract()
        average_score = ''.join(average_score)
        average_score = remove_tags(average_score)
        try:
            average_score = re.findall('(\d+.*)',average_score)[0]
        except:
            average_score = None

        #28.gaokao_desc 29.huikao_desc
        gaokao_desc = 'Senior High School (Upper Middle School) Graduation Diploma, Academic Proficiency Test/Upper Middle School Graduation (Hui Kao) Exam, and Chinese National University Entrance Examinations (Gao Kao). Applicants who have not written, or do not intend on writing the Gao Kao examination must submit a signed and dated letter providing the reason(s). Additional requirements include a notarized copy of Hui Kao (or equivalent test) results and results from any other standardized academic tests (e.g. SAT, ACT or AP tests). Copies of awards received for academic achievement as well as reference letters from school officials highlighting academic accomplishments are encouraged.A notarized copy of your final Chinese Upper Middle School transcript and graduation diploma are required along with a notarized copy of your Hui Kao (or equivalent test) results and your Gao Kao results verified by China Academic Degrees and Graduate Education Development Centre (CDGDC), external link or China Credentials Verification (CHESICC-Parchment Portal Service), external link. Results from any other standardized academic tests written (e.g. SAT, ACT or AP tests) must also be submitted. If you have not, and will not be sitting for the Gao Kao exam, you must submit a signed and dated letter providing the reason(s).'
        huikao_desc = gaokao_desc




        item['average_score'] = average_score
        item['gaokao_desc'] = gaokao_desc
        item['huikao_desc'] = huikao_desc
        item['location'] = location
        item['school_name'] = school_name
        item['url'] = url
        item['major_name_en'] = major_name_en
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['department'] = department
        item['duration_per'] = duration_per
        item['duration'] = duration
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['entry_requirements_en'] = entry_requirements_en
        item['modules_en'] = modules_en
        item['career_en'] = career_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['toefl'] = toefl
        item['toefl_code'] = toefl_code
        item['sat_code'] = sat_code
        item['require_chinese_en'] = require_chinese_en
        item['ap'] = ap
        item['alevel'] = alevel
        item['deadline'] = deadline
        yield item

Ejemplo n.º 27

Mostrar archivo

    def parse(self, response):
        """Yield one item per campus offering found on a program page.

        The callback fills the fields that are the same for the whole page
        (school, location, language scores, fees, codes, IB text, major name),
        then walks four parallel XPath result lists -- campus heading, degree
        name, first admission requirement and "Learn more about" link -- and
        yields the item once per row after filling the row-specific fields
        (campus, department, degree, requirements, overview, modules, career,
        Chinese requirements, deadline and tuition fee).

        For the St. George, Mississauga and Scarborough campuses the linked
        page is downloaded with a blocking ``requests.get`` call and parsed
        with ``lxml``; any other campus gets ``None`` for overview, modules
        and career.

        NOTE(review): the same ``item`` object is mutated and yielded again on
        every iteration. That is only safe if each yielded item is fully
        consumed before the next iteration runs -- confirm against the
        pipelines in use.
        NOTE(review): ``zip`` stops at the shortest of the four lists, so rows
        are silently dropped when the lists differ in length.
        """
        item = get_item(ScrapyschoolCanadaBenItem)

        url = response.url

        # Major name: text content of the program heading link.
        major_name_en = remove_tags(''.join(
            response.xpath(
                '//*[@class="node node-program-details clearfix"]/h2/a'
            ).extract())).strip()

        # Page-wide constants (IB text, institution codes, fees, IELTS/TOEFL).
        item['ib'] = 'If you are currently  enrolled in or have completed the International Baccalaureate Diploma, here’s what you need to know about our admission requirements and transfer credit you may be eligible for.An International Baccalaureate Diploma, including English HL or SL is required for admission.Prerequisite courses can be presented at either the Standard or Higher Level. For programs with a Math prerequisite, Math SL or HL is required. “Math Studies” is not acceptable. A total score of 27, not including bonus points, is required for admission consideration. More competitive programs require a significantly higher score. If you are currently enrolled in the IB Diploma program, you must submit your predicted IB results (1-7 scale), confirmation that you are completing the full IB Diploma, a current transcript including any mid-term/mid-year results available, and complete the online self-reported grades form. If you are currently enrolled in the IB Certificate/Course program, you must submit your predicted IB results (1-7 scale), confirmation that you are completing the IB Certificate/Course program, a current transcript including any mid-term/mid-year results available; and importantly, the name of the matriculation certificate/diploma to be awarded upon completion of your studies.Final IB results must be sent to the University electronically by the International Baccalaureate Organization (IBO). '
        item['toefl_code'] = '0982'
        item['sat_code'] = '0982'
        item['apply_fee'] = 180
        item['apply_pre'] = '$'
        item['tuition_fee_pre'] = '$'
        item["ielts_desc"] = 'The minimum requirement is an overall band of 6.5, with no band below 6.0.'
        item["ielts"] = 6.5
        item["ielts_l"] = 6
        item["ielts_s"] = 6
        item["ielts_r"] = 6
        item["ielts_w"] = 6
        item["toefl_desc"] = 'Minimum Requirement: total score of 100 + 22 on Writing'
        item["toefl"] = 100
        item["toefl_w"] = 22
        item['school_name'] = 'University of Toronto'
        item['url'] = url
        item['location'] = 'Toronto'
        item['major_name_en'] = major_name_en

        # The remaining fields (campus, department, degree_name,
        # entry_requirements_en, overview_en, career_en, modules_en) differ
        # per offering and are filled inside the loop before each yield.
        campus_nodes = response.xpath(
            "//div/h3[@class='field-item even']").extract()

        degree_nodes = response.xpath(
            "//h3[contains(text(),'Program')]//preceding-sibling::*[1]"
        ).extract()
        if not degree_nodes:
            degree_nodes = response.xpath(
                "//*[contains(text(),'Program')]//preceding-sibling::*[1]"
            ).extract()

        # Try progressively looser selectors for the first requirement entry.
        requirement_nodes = []
        for requirement_xpath in (
                "//h3[contains(text(),'Admissions Requirements')]//following-sibling::ul//li[1]",
                "//h3[contains(text(),'Admissions Requirements')]/../following-sibling::*//ul//li//p[1]",
                "//*[contains(text(),'Admissions Requirements')]//following-sibling::*//li[1]",
        ):
            requirement_nodes = response.xpath(requirement_xpath).extract()
            if requirement_nodes:
                break

        program_links = response.xpath(
            "//div[@class='field-items']//div[@class='field-item even']//*[contains(text(),'Learn more about')]//@href"
        ).extract()

        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }

        def _fetch_tree(link):
            # Blocking download of a linked page, parsed into an lxml tree.
            return etree.HTML(requests.get(link, headers=headers).text)

        def _render(nodes):
            # Serialise lxml nodes to one HTML string. remove_class is
            # re-applied to the accumulated string after every node, exactly
            # as the original inline loops did.
            html = ''
            for node in nodes:
                html += etree.tostring(
                    node,
                    encoding='unicode',
                    pretty_print=False,
                    method='html')
                html = remove_class(html)
            return html

        is_engineering_page = 'engineering' in url

        # An empty campus list makes zip() produce nothing, so no separate
        # emptiness guard is required.
        for campus_html, degree_html, requirement_html, program_url in zip(
                campus_nodes, degree_nodes, requirement_nodes, program_links):
            campus_text = remove_tags(campus_html)

            # "Campus, Faculty" headings carry the department after the comma.
            if ',' in campus_text:
                response_campus = campus_text.split(',')[0].strip()
                if 'John H' in campus_text:
                    # This faculty name itself contains commas, so it cannot
                    # be recovered by splitting.
                    department = 'John H. Daniels Faculty of Architecture, Landscape and Design'
                else:
                    department = campus_text.split(
                        ',')[-1].strip().replace('amp;', '')
            else:
                response_campus = campus_text.strip()
                department = ''

            response_degree_name = remove_tags(degree_html).strip().replace(
                'amp;', '')
            response_entry_requirements_en = remove_class(
                requirement_html).strip()

            if 'St. George Campus' in response_campus:
                page = _fetch_tree(program_url)

                overview_nodes = page.xpath(
                    "//*[contains(text(),'Introduction')]//following-sibling::p[position()<3]"
                )
                if not overview_nodes:
                    overview_nodes = page.xpath(
                        '//*[@id="block-system-main"]/div/div/div[1]/div/div[2]/div/p'
                    )
                if not overview_nodes:
                    overview_nodes = page.xpath(
                        "//div[@class='content clearfix']//div[1]/div/p[1]"
                    )
                response_overview = _render(overview_nodes)
                if is_engineering_page:
                    # Engineering pages get a fixed overview text instead of
                    # whatever was scraped.
                    response_overview = '<p>U of T Engineering is a world-renowned faculty known for leading-edge research and discovery. Regardless of the program you choose, you will have access to U of T’s top-ranked professors and facilities, as well as a curriculum that is constantly evolving. We offer you the most interdisciplinary engineering education in Canada. You can tailor your degree through academic options, minors and certificates, adding breadth and depth to your studies. This academic flexibility starts in first year, with a choice of three different entry points: Core Programs (Core 8) TrackOne, Undeclared Engineering Science</p>'

                response_modules_en = _render(
                    page.xpath(
                        "//div[contains(text(),'Completion')]//following-sibling::*"
                    ))
                response_career_en = None

            elif 'Mississauga Campus' in response_campus:
                page = _fetch_tree(program_url)

                response_overview = _render(
                    page.xpath(
                        "//h2[contains(text(),'Programs & Requirements*')]//preceding-sibling::*"
                    ))
                response_career_en = _render(
                    page.xpath(
                        "//*[contains(text(),'Careers by Major')]//following::*[position()<2]"
                    ))
                response_modules_en = ''

            elif 'Scarborough Campus' in response_campus:
                page = _fetch_tree(program_url)

                response_overview = _render(
                    page.xpath("//div[@class='program_summary indent']//p"))

                career_links = page.xpath(
                    "//*[contains(text(),'Career Options')]//following-sibling::*//@href"
                )
                if career_links:
                    career_page = _fetch_tree(career_links[0])
                    response_career_en = _render(
                        career_page.xpath(
                            "//*[contains(text(),'Entry-Level')]//following-sibling::ul[1]"
                        ))
                else:
                    # No career link on the page: fall back to the same empty
                    # value used when the career page has no matching list,
                    # instead of raising IndexError and losing every row.
                    response_career_en = ''

                response_modules_en = None

            else:
                response_career_en = None
                response_modules_en = None
                response_overview = None

            item['modules_en'] = response_modules_en
            item['career_en'] = response_career_en
            item['overview_en'] = response_overview
            item['campus'] = response_campus
            item['department'] = department
            item['degree_name'] = response_degree_name
            item['entry_requirements_en'] = response_entry_requirements_en

            is_engineering = ('Engineering' in department
                              or 'Engineering' in major_name_en)

            if is_engineering:
                require_chinese_en = 'Candidates studying in the Chinese High School system are required to present: Senior 3 level Math, Chemistry and Physics — if students are permitted to take only one science subject within their Gao Kao, we recommend Physics; Chemistry should be presented in Senior Year 3  Hui Kao: Chinese Upper Middle School Graduation Exam results (if available in your province)  Gao Kao: Chinese National University Entrance Examinations  Proof of English Facility may be required*'
            else:
                require_chinese_en = 'Senior High School (Upper Middle School) Graduation Diploma and Academic Proficiency Test/Upper Middle School Graduation (Hui Kao) Exam and  Chinese National University Entrance Examinations (Gao Kao)'
            item['require_chinese_en'] = require_chinese_en

            # Application deadline; the first matching rule wins.
            if any(keyword in major_name_en
                   for keyword in ('Applied Science and Engineering',
                                   'Architecture', 'Landscape', 'Design')):
                deadline = '2019-01-10'
            elif 'Bachelor of Information' in response_degree_name:
                # Checked against this row's degree text; the raw XPath result
                # list only supports exact-element membership and so could not
                # match a substring.
                deadline = '2019-01-15'
            elif 'Medical Radiation Sciences' in major_name_en:
                deadline = '2019-02-01'
            elif ('Nursing' in major_name_en
                  or 'Physician Assistant' in major_name_en):
                deadline = '2019-01-15'
            else:
                deadline = '2019-01-16'
            item['deadline'] = deadline

            item['tuition_fee'] = '59230' if is_engineering else '54280'
            yield item

Ejemplo n.º 28

Mostrar archivo

    def parses(self, response):
        item = getItem.get_item(ScrapyschoolCanadaBenItem)
        #1.学校名称
        Didian = response.meta['didian']
        Didian = remove_tags(Didian)
        #print(Didian)
        school_name = 'University of Alberta'
        #2.地点
        try:
            location = Didian
            location = re.findall('\((.*)\)', location)[0]
            #print(location)
        except:
            location = None
            #print(location)

#3. 校区
        try:
            campus = re.findall('(.*) \(.*\)', Didian)[0]
            #campus = remove_tags(campus)
            #campus = campus.replace(', Online','')
            #campus = campus.replace(' ','')
            #campus = campus.split(',')
            #print(campus)
        except:
            campus = None
            #print(campus)

#4. 学院
        try:
            department = response.xpath(
                '//*[@id="page-content"]/div[1]/div[1]/p[2]').extract()[0]
            department = remove_tags(department)
            #department = department.replace(' class="fa fa-graduation-cap"','').replace(' class="fa fa-university"','')
            #department = department.replace(' <i> </i> ','<i></i>').replace('<i> </i> ','')
            #department = department.split('<i></i>')[-1]
            #print(len(department))
            #print(department)
            #print(response.url)
        except:
            department = None
            #print(department)

# 4.
        try:
            degree_name = response.xpath(
                '//*[@id="page-content"]/div[1]/div[1]/p[1]').extract()[0]
            degree_name = remove_tags(degree_name)

            #degree_name = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',degree_name_list)
            #degree_name = degree_name.replace('\t','').replace('\n','').replace('\xa0','').replace(' class="list-inline uofs-cta-list','')
            # degree_name = degree_name.replace('<li>','').replace('</li>','---')
            # degree_name = degree_name.replace('<span>','').replace('</span>','---')
            #degree_name = degree_name.split('</li><li>')
            #print(degree_name)
            #print(response.url)
        except:

            degree_name = None
            #print(degree_name)

#5.学位描述
        try:
            degree_overview_en = response.xpath(
                '//*[@id="page-content"]/div[3]/div[1]/h4[1]/following-sibling::p'
            ).extract()
            degree_overview_en = ''.join(degree_overview_en)
            #degree_overview_en = remove_tags(degree_overview_en)
            degree_overview_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '',
                                        degree_overview_en)
            degree_overview_en = re.sub('<p(https:.*)">?', '',
                                        degree_overview_en)
            degree_overview_en = degree_overview_en.replace(
                'https://www.ualberta.ca/agriculture-life-environment-sciences/"',
                '')
            degree_overview_en = degree_overview_en.replace(
                'https://www.ualberta.ca/campus-saint-jean"', '')
            degree_overview_en = degree_overview_en.replace(
                'https://www.ualberta.ca/en/campus-saint-jean"', '')
            degree_overview_en = degree_overview_en.replace(' xmlns=""', '')
            #degree_overview_en = degree_overview_en.replace('https://www.ualberta.ca/arts"','')
            #degree_overview_en = degree_overview_en.replace('\r\n','')
            degree_overview_en = degree_overview_en.replace('\n', '')
            #degree_overview_en = degree_overview_en.replace('\n','')
            #degree_overview_en = degree_overview_en.replace('  ',' ')
            degree_overview_en = degree_overview_en.replace(
                '                           ', '')
            degree_overview_en = degree_overview_en.replace('   ', '')
            #print(degree_overview_en)
        except:
            degree_overview_en = None
            #print(degree_overview_en)
        #//*[@id="page-content"]/div[3]/div[1]/*

#6.专业英文
        try:
            major_name_en = response.xpath('//h1').extract()[1]
            major_name_en = major_name_en.replace('\r\n', '').replace(
                '\n', '').replace('           ',
                                  '').replace('\t', '').replace('     ', '')
            major_name_en = remove_tags(major_name_en)
        # print(major_name_en)
        #print(major_name_en)
        except:
            major_name_en = None
            #print(major_name_en)

#7.专业介绍
        try:
            overview_en = degree_overview_en
            # overview_en = response.xpath('').extract()
            # overview_en = ''.join(overview_en)
            # overview_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',overview_en)

            # print(overview_en)
        except:
            overview_en = degree_overview_en
            # print(overview_en)

#8.入学时间
        try:
            start_date = '2019-09'
            #start_date = ','.join(start_date)
            #start_date = remove_tags(start_date)
            #start_date = start_date.replace('Spring','').replace('Winter','').replace('Summer','').replace('Fall','')
            #start_date = start_date.replace('September 2019','2019-09').replace('May 2019','2019-05').replace('July 2019','2019-07').replace('January 2020','2020-01').replace('January 2019','2019-01')
            #print(start_date)
        except:
            start_date = None
            #print(start_date)

#9.课程长度
# try:
#     duration = response.xpath('').extract()[0]
#     duration = remove_tags(duration)
#     # print(duration)
# except:
#     duration = None
#     # print(duration)

#10.课程设置
        try:
            modules_en_dict = {
                "Augustana FacultyAccounting": "AUACC",
                "Augustana FacultyArt": "AUART",
                "Augustana FacultyBiology": "AUBIO",
                "Augustana FacultyChemistry": "AUCHE",
                "Augustana FacultyClassical Studies": "AUCLA",
                "Augustana FacultyCrime and Community": "AUCRI",
                "Augustana FacultyComputing Science": "AUCSC",
                "Augustana FacultyCommunity Service Learning": "AUCSL",
                "Augustana FacultyDrama": "AUDRA",
                "Augustana FacultyEnglish for Academic Purposes": "AUEAP",
                "Augustana FacultyEconomics": "AUECO",
                "Augustana FacultyEducational Computing": "AUEDC",
                "Augustana FacultyEducation Field Experience": "AUEFX",
                "Augustana FacultyEnglish": "AUENG",
                "Augustana FacultyEnvironmental Studies": "AUENV",
                "Augustana FacultyEducational Psychology": "AUEPS",
                "Fine Arts Option": "AUFAR",
                "Augustana FacultyFrench": "AUFRE",
                "Augustana FacultyGlobal and Development Studies": "AUGDS",
                "Augustana FacultyGeography": "AUGEO",
                "Augustana FacultyGerman": "AUGER",
                "Augustana FacultyGreek": "AUGRE",
                "Augustana FacultyHistory": "AUHIS",
                "Augustana FacultyInterdisciplinary Studies": "AUIDS",
                "Augustana FacultyIndigenous Studies": "AUIND",
                "Augustana FacultyLanguage Studies": "AULAN",
                "Augustana FacultyLatin": "AULAT",
                "Augustana FacultyWorld Literatures": "AULIT",
                "Augustana FacultyMathematics": "AUMAT",
                "Augustana FacultyManagement": "AUMGT",
                "Augustana FacultyMusic": "AUMUS",
                "Augustana FacultyPhysical Activity": "AUPAC",
                "Augustana FacultyPhysical Education": "AUPED",
                "Augustana FacultyPhilosophy": "AUPHI",
                "Augustana FacultyPhysics": "AUPHY",
                "Augustana FacultyPolitical Studies": "AUPOL",
                "Augustana FacultyPsychology": "AUPSY",
                "Augustana FacultyReligion": "AUREL",
                "Augustana FacultyScandinavian": "AUSCA",
                "Augustana FacultySociology": "AUSOC",
                "Augustana FacultySpanish": "AUSPA",
                "Augustana FacultyStatistics": "AUSTA",
                "Agriculture, Forestry, and Home Economics": "AFHE",
                "Agricultural, Food and Nutritional Science": "AFNS",
                "Agricultural, Life and Environmental Sciences": "ALES",
                "Animal Science": "AN SC",
                "Agricultural and Resource Economics": "AREC",
                "Environmental and Conservation Sciences": "ENCS",
                "Forest Science": "FOR",
                "Forest Economics": "FOREC",
                "Human Ecology": "HECOL",
                "Nutrition and Food Sciences": "NU FS",
                "Nutrition": "NUTR",
                "Plant Science": "PL SC",
                "Rural Sociology": "R SOC",
                "Renewable Resources": "REN R",
                "University": "UNIV",
                "Astronomy": "ASTRO",
                "Bioinformatics": "BIOIN",
                "Biology": "BIOL",
                "Botany": "BOT",
                "Chemistry": "CHEM",
                "Computing Science": "CMPUT",
                "Earth and Atmospheric Sciences": "EAS",
                "Engineering Physics": "EN PH",
                "Entomology": "ENT",
                "Environmental Physical Sciences": "ENVPS",
                "Genetics": "GENET",
                "Geophysics": "GEOPH",
                "Immunology and Infection": "IMIN",
                "Integrated Petroleum Geosciences": "IPG",
                "Mathematical Physics": "MA PH",
                "Marine Science": "MA SC",
                "Mathematics": "MATH",
                "Microbiology": "MICRB",
                "Master of Internetworking": "MINT",
                "Multimedia": "MM",
                "Paleontology": "PALEO",
                "Physics": "PHYS",
                "Urban and Regional Planning": "PLAN",
                "Psychology": "PSYCO",
                "Science": "SCI",
                "Statistics": "STAT",
                "Work Experience": "WKEXP",
                "Zoology": "ZOOL",
                "Pharmacy": "PHARM",
                "Nursing": "NURS",
                "Sciences infirmières": "SC INF",
                "Native Studies": "NS",
                "Anaesthesia": "ANAES",
                "Anatomy": "ANAT",
                "Biochemistry": "BIOCH",
                "Biomedical Engineering": "BME",
                "Cell Biology": "CELL",
                "Dental Hygiene": "D HYG",
                "Dentistry": "DDS",
                "Dentistry": "DENT",
                "Dentistry/Medicine": "DMED",
                "Electrical and Computer Engineering/Biomedical Eng": "EE BE",
                "Family Medicine": "F MED",
                "Laboratory Medicine and Pathology": "LABMP",
                "Medical Genetics": "MDGEN",
                "Medicine": "MED",
                "Medical Laboratory Science": "MLSCI",
                "Medical Microbiology and Immunology": "MMI",
                "Neuroscience (Centre for)": "NEURO",
                "Obstetrics and Gynaecology": "OB GY",
                "Oral Biology": "OBIOL",
                "Oncology": "ONCOL",
                "Ophthalmology": "OPHTH",
                "Paediatrics": "PAED",
                "Postgraduate Dental Education": "PGDE",
                "Postgraduate Medical Education": "PGME",
                "Physiology": "PHYSL",
                "Pharmacology": "PMCOL",
                "Psychiatry": "PSYCI",
                "Radiology and Diagnostic Imaging": "RADDI",
                "Radiation Therapy": "RADTH",
                "Surgery": "SURG",
                "Law": "LAW",
                "Dance Activity": "DAC",
                "Dance": "DANCE",
                "Health Education": "HE ED",
                "Kinesiology": "KIN",
                "Kinesiology, Recreation, Leisure and Sport": "KRLS",
                "Physical Activity": "PAC",
                "Physical Education and Sport": "PEDS",
                "Physical Education, Recreation and Leisure Studies": "PERLS",
                "Recreation and Leisure Studies": "RLS",
                "Chemical Engineering": "CH E",
                "Civil Engineering": "CIV E",
                "Chemical and Materials Engineering": "CME",
                "Computer Engineering": "CMPE",
                "Electrical Engineering": "E E",
                "Electrical and Computer Engineering": "ECE",
                "Engineering, Computer": "ENCMP",
                "Engineering Management": "ENG M",
                "Engineering, General": "ENGG",
                "Environmental Engineering": "ENV E",
                "Materials Engineering": "MAT E",
                "Mechanical Engineering": "MEC E",
                "Mining Engineering": "MIN E",
                "Petroleum Engineering": "PET E",
                "EducationAdult": "EDAE",
                "EducationBusiness": "EDBU",
                "EducationCareer Technology Studies": "EDCT",
                "EducationElementary": "EDEL",
                "EducationElementary and Secondary": "EDES",
                "EducationField Experience": "EDFX",
                "EducationInstructional Technology": "EDIT",
                "EducationPolicy Studies": "EDPS",
                "EducationPsychology": "EDPY",
                "EducationSecondary": "EDSE",
                "Education": "EDU",
                "Library and Information Studies": "LIS",
                "Accounting": "ACCTG",
                "Business Law": "B LAW",
                "Business Economics": "BUEC",
                "Business": "BUS",
                "Finance": "FIN",
                "Industrial Relations": "IND R",
                "Marketing": "MARK",
                "Management Science": "MGTSC",
                "Management Information Systems": "MIS",
                "Operations Management": "OM",
                "Organizational Analysis": "ORG A",
                "Strategic Management and Organization": "SMO",
                "Anthropology": "ANTHR",
                "Arabic": "ARAB",
                "Art": "ART",
                "Art History": "ART H",
                "American Sign Language": "ASL",
                "Comparative Literature": "C LIT",
                "Chinese": "CHINA",
                "Classics": "CLASS",
                "Community Service-Learning": "CSL",
                "Danish": "DANSK",
                "Design": "DES",
                "Drama": "DRAMA",
                "East Asian Studies": "EASIA",
                "Economics": "ECON",
                "English": "ENGL",
                "French Language and Literature": "FREN",
                "Film Studies": "FS",
                "German": "GERM",
                "Greek": "GREEK",
                "Gender and Social Justice": "GSJ",
                "History of Art, Design, and Visual Culture": "HADVC",
                "Hebrew": "HEBR",
                "Human Geography and Planning": "HGP",
                "Hindi": "HINDI",
                "History": "HIST",
                "Humanities Computing": "HUCO",
                "Hungarian": "HUNG",
                "Interdisciplinary Undergraduate & Graduate Courses": "INT D",
                "Italian": "ITAL",
                "Japanese": "JAPAN",
                "Korean": "KOREA",
                "Latin American Studies": "LA ST",
                "Latin": "LATIN",
                "Linguistics": "LING",
                "Middle Eastern and African Studies": "MEAS",
                "Modern Languages and Cultural Studies": "MLCS",
                "Music": "MUSIC",
                "Norwegian": "NORW",
                "Persian": "PERS",
                "Philosophy": "PHIL",
                "Political Science": "POL S",
                "Polish": "POLSH",
                "Portuguese": "PORT",
                "Punjabi": "PUNJ",
                "Religious Studies": "RELIG",
                "Russian": "RUSS",
                "Scandinavian": "SCAND",
                "Slavic and East European Studies": "SLAV",
                "Sociology": "SOC",
                "Spanish": "SPAN",
                "Science, Technology, and Society": "STS",
                "Swahili": "SWAH",
                "Swedish": "SWED",
                "Theatre Design": "T DES",
                "Ukrainian": "UKR",
                "Women's Studies": "W ST",
                "Women's and Gender Studies": "WGS",
                "Writing (Creative Writing)": "WRITE",
                "Writing Studies": "WRS"
            }
            if 'Augustana Faculty' in department:
                modules_en_val = 'Augustana Faculty' + major_name_en
                modules_en_val = modules_en_val.replace(' Co-operative', '')
                print('第一:' + modules_en_val)
            else:
                modules_en_val = major_name_en
                modules_en_val = modules_en_val.replace(' Co-operative', '')
                print("第一:" + modules_en_val)
            modules_en_val = modules_en_dict[modules_en_val]
            url = 'https://catalogue.ualberta.ca/Course/Subject?subjectCode=' + modules_en_val + '&all=True'
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
            }
            response2 = etree.HTML(requests.get(url, headers=headers).text)
            response2 = response2.xpath('//div[@class = "claptrap-course"]')
            modules_en = []
            for rea in response2:
                modules_en += etree.tostring(rea,
                                             method='html',
                                             encoding='unicode')
                modules_en = ''.join(modules_en)
            modules_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', modules_en)
            modules_en = modules_en.replace('\r\n', '').replace(
                '\n', '').replace('            ',
                                  '').replace('        ',
                                              '').replace('    ', '')
            print(modules_en)
        except:
            modules_en = None
            print(modules_en)

#11.就业方向
        try:
            career_en = response.xpath(
                '//*[@id="page-content"]/div[3]/div[1]/h4/following-sibling::ul'
            ).extract()[0]
            career_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', career_en)
            career_en = career_en.replace('\r\n', '').replace(
                '\n', ''
            ).replace('\t', '').replace(
                '                                                                                                                         ',
                ''
            ).replace(
                '                                                                       ',
                '')
            career_en = career_en.replace('                              ', '')
            #print(career_en)
        except:
            career_en = None
            #print(career_en)

#12.截止日期
        try:
            deadline = '2019-03-01'
            #deadline = response.xpath('//h3[contains(text(),"Deadline")]/following-sibling::p[1]').extract()[0]
            #deadline = remove_tags(deadline)
            #deadline = deadline.replace('Documents due: ', '')
            #deadline = remove_tags(deadline)
            #print(deadline)
        except:
            deadline = None
            #print(deadline)
#13.学费
        try:
            abaa = 'https://apps.admissions.ualberta.ca/costcalculator/faculties/%s/international/off-campus?pttool=true'
            acc = response.xpath(
                '//*[@id="get-program-costs"]/@href').extract()[0]
            acc = re.findall('\d', acc)[0]
            #print(acc)
            url = abaa % acc
            #print(url)
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
            }
            response1 = requests.get(url, headers=headers)
            #print(response1.text)
            tuition_fee = response1.text
            tuition_fee = remove_tags(tuition_fee)
            tuition_fee = tuition_fee.replace('\n', '').replace(
                '\r\n', '').replace('                           ', '')
            tuition_fee = re.findall('\$(\d+\.\d+)', tuition_fee)[0]
            #print(tuition_fee)
        except:
            tuition_fee = None
            #print(tuition_fee)
#14 申请费:
        apply_fee = '100'

        #15 申请要求
        try:

            entry_requirements_en = '<div><div><div><h3><em> <strong></strong></em>Competitive<em><strong> </strong></em>Admission</h3><p><em><strong>We encourage you to apply early, as admission is competitive, and space in each program is limited.</strong></em>&nbsp;</p><p>The University of Alberta provides a wide range of programs, from highly accessible to highly competitive.<br><br>In order to be considered for admission, you need to present a competitive average for your faculty/program of choice, based on the required courses for that program. <br><br>The competitive average for each faculty/program may change throughout the year, based on the competitiveness of the applicant pool.<br><br>In addition to presenting a competitive average when you apply, you must also meet the university’s minimum admission requirements after receiving an admission offer in order to remain eligible.</p></div></div><div><div><h3>Minimum Requirements</h3><p>Admission is competitive; meeting the minimum requirements does not guarantee admission. </p><p>You must meet the minimum requirements in all subjects, even after receiving an admission offer, in order to remain admissible. (See your admission offer letter for more details.)</p><p>&nbsp;</p><table><tbody><tr><td><strong>Minimum requirements</strong><br></td><td><strong>&nbsp;Grade 11 final marks</strong></td><td><strong>&nbsp;Grade 12 first semester / interim marks</strong></td><td><strong>&nbsp;Grade 12 final marks</strong><br></td></tr><tr><td>Minimum grade for each of the five required courses<br></td><td>&nbsp;60%+</td><td>&nbsp;50%+<br></td><td>&nbsp;50%+</td></tr><tr><td>Minimum overall average across all five required courses<br></td><td>&nbsp;70%+</td><td>&nbsp;70%+&nbsp;</td><td>&nbsp;70%+</td></tr></tbody></table></div></div></div>'
            #entry_requirements_en = ''.join(entry_requirements_en)
            #entry_requirements_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',entry_requirements_en)
            #entry_requirements_en = remove_tags(entry_requirements_en)
            #print(entry_requirements_en)
        except:
            entry_requirements_en = None
            #print(entry_requirements_en)
            #print(abc)

#16 中国学生申请要求
        try:
            require_chinese_en = '<p>A combination of A’s and B’s on the Joint Graduation Exam (Hui Kao) or a competitive score on the University Entrance Exam (Gao Kao) that would normally be required for admission to a key university in China. Results must be issued by the governing authority, not by the school.</p>'
            #require_chinese_en = remove_tags(require_chinese_en)
            # print(require_chinese_en)
        except:
            require_chinese_en = None
            # print(require_chinese_en)

#17 特殊专业要求
        try:
            specific_requirement_en = response.xpath(
                '//div[@class = "row-fluid required-courses"]').extract()
            specific_requirement_en = ''.join(specific_requirement_en)
            specific_requirement_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]',
                                             '', specific_requirement_en)
            #print(specific_requirement_en)
        except:
            specific_requirement_en = None
            #print(specific_requirement_en)

#18 高考(官网要求)
        try:
            gaokao_desc = 'A combination of A’s and B’s on the Joint Graduation Exam (Hui Kao) or a competitive score on the University Entrance Exam (Gao Kao) that would normally be required for admission to a key university in China. Results must be issued by the governing authority, not by the school.'
            #gaokao_desc = remove_tags(gaokao_desc)
            # print(gaokao_desc)
        except:
            gaokao_desc = None
            # print(gaokao_desc)

#19 高考(展示以及判断字段)
        try:
            gaokao_zs = None
            #gaokao_zs = remove_tags(gaokao_zs)
            # print(gaokao_zs)
        except:
            gaokao_zs = None
            # print(gaokao_zs)

#20 高考分数(文科)
        try:
            gaokao_score_wk = response.xpath('').extract()[0]
            gaokao_score_wk = remove_tags(gaokao_score_wk)
            # print(gaokao_score_wk)
        except:
            gaokao_score_wk = None
            # print(gaokao_score_wk)

#21 高考分数(理科)
        try:
            gaokao_score_lk = response.xpath('').extract()[0]
            gaokao_score_lk = remove_tags(gaokao_score_lk)
            # print(gaokao_score_lk)
        except:
            gaokao_score_lk = None
            # print(gaokao_score_lk)

#22 会考描述
        try:
            huikao_desc = 'A combination of A’s and B’s on the Joint Graduation Exam (Hui Kao) or a competitive score on the University Entrance Exam (Gao Kao) that would normally be required for admission to a key university in China. Results must be issued by the governing authority, not by the school.'
            #huikao_desc = remove_tags(huikao_desc)
            # print(huikao_desc)
        except:
            huikao_desc = None
            # print(huikao_desc)

#23 会考描述
        try:
            huikao_zs = response.xpath('').extract()[0]
            huikao_zs = remove_tags(huikao_zs)
            # print(huikao_zs)
        except:
            huikao_zs = None
            # print(huikao_zs)

#24 最低语言要求
        try:
            min_language_require = 'At least 6.5 with no band less than 5.5'
            min_language_require = remove_tags(min_language_require)
            # print(min_language_require)
        except:
            min_language_require = None
            # print(min_language_require)

#25 雅思要求
        try:
            ielts_desc = 'At least 6.5 with no band less than 5.5'
            #ielts_desc = remove_tags(ielts_desc)
            # print(ielts_desc)
        except:
            ielts_desc = None
            # print(ielts_desc)

#26 ielts
        try:
            ielts = '6.5'
            #ielts = re.findall('\d\.\d',ielts)
            #ielts = remove_tags(ielts)
            #print(ielts)
        except:
            ielts = None
            #print(ielts)
#27 ielts_?

        ielts_l = 5.5
        ielts_s = 5.5
        ielts_r = 5.5
        ielts_w = 5.5

        #28 toefl_code
        try:
            toefl_code = '0963'
            #toefl_code = remove_tags(toefl_code)
            # print(toefl_code)
        except:
            toefl_code = None
            # print(toefl_code)

#29 toefl_desc
        try:
            toefl_desc = 'iBT: At least 90 with a minimum score of 21 points in each section Note: the PBT is no longer accepted'
            #toefl_desc = remove_tags(toefl_desc)
            # print(toefl_desc)
        except:
            toefl_desc = None
            # print(toefl_desc)

#30 toefl
        try:
            toefl = '90'
            #toefl = re.findall('\d\d',toefl)
            #toefl = remove_tags(toefl)
            #print(toefl)
        except:
            toefl = None
            #print(toefl)

#31 toefl_?
        toefl_l = 21
        toefl_s = 21
        toefl_r = 21
        toefl_w = 21

        # 32 alevel
        try:
            alevel = ''
            #alevel = remove_tags(alevel)
            # print(alevel)
        except:
            alevel = None
            # print(alevel)

#33 ib
        try:
            ib = 'Approximately 30–35 IB predicted points on the full diploma, including bonus points.Full IB Diploma students may be eligible to receive admission based on predicted scores. Competitive predicted IB scores for admission vary by Faculty. Final IB grades in the range of 4 to 7 are considered competitive. IB Grade:7(Percent Equivalent:98%),6(Percent Equivalent:90%),5(Percent Equivalent:82%),4(Percent Equivalent:73%),3(Percent Equivalent:55%),2(Percent Equivalent:not accepted),1(Percent Equivalent:not accepted)'
            #print(ib)
        except:
            ib = None
            #print(ib)

#34 ap
        try:
            ap = 'A combination of grades of 4 and 5.AP Result:5(Percent Equivalent:96%),4(Percent Equivalent:86%),3(Percent Equivalent:76%),2(Percent Equivalent:65%)'
            #ap = remove_tags(ap)
            # print(ap)
        except:
            ap = None
            # print(ap)

#35 面试描述
        try:
            interview_desc_en = response.xpath('').extract()[0]
            interview_desc_en = remove_tags(interview_desc_en)
            # print(interview_desc_en
        except:
            interview_desc_en = None
            # print(interview_desc_en)

#36 作品集描述
        try:
            portfolio_desc_en = response.xpath('').extract()[0]
            portfolio_desc_en = remove_tags(portfolio_desc_en)
            # print(portfolio_desc_en
        except:
            portfolio_desc_en = None
            # print(portfolio_desc_en)

#37 other
        try:
            other = '1.列表页有校区信息,从列表页进入详情页.2.学位是js加载需要get获取.3.课程界面需要匹配详情页内的学院+专业名称跳转三次获取课程信息4.部分课程数据需要根据业务老师提供链接手动抓取'
            #other = remove_tags(other)
            # print(other)
        except:
            other = None
            # print(other)

        # sat act 代码 介绍
        sat_code = '0963'
        sat1_desc = 'SAT Score of 1200 with a minimum 600 in each section. See International Course Equivalencies for more details.'
        sat2_desc = None
        act_code = ''
        act_desc = ''
        if 'Faculty of Agricultural' in department:
            average_score = 'At least 80% of students admitted into the Faculty of ALES for Fall 2018 had admission averages in the mid-70s or higher.'
        elif 'Arts' in department:
            average_score = 'At least 75% of students admitted into the Faculty of Arts for Fall 2018 had admission averages in the 80s or higher.'
        elif 'Augustana' in department:
            average_score = 'At least 60% of students admitted into the Augustana Faculty for Fall 2018 had admission averages in the 80s or higher.'
        elif 'Engineering' in department:
            average_score = 'At least 70% of students admitted into the Faculty of Engineering for Fall 2018 had admission averages in the mid-80s or higher.'
        elif 'Education' in department:
            average_score = 'At least 77% of students admitted into the Faculty of Education for Fall 2018 had admission averages in the 80s or higher.'
        elif 'Native Studies' in department:
            average_score = 'At least 70% of students admitted into the Faculty of Native Studies for Fall 2018 had admission averages in the mid-70s or higher.'
        elif 'Faculty of Science' in department:
            average_score = 'At least 70% of students admitted into the Faculty of Science for Fall 2018 had admission averages in the mid-80s or higher.'
        elif 'Faculty of Nursing' in department:
            average_score = 'At least 73% of students admitted into the Faculty of Nursing for Fall 2018 had admission averages in the 90s.'
        elif 'Kinesiology' in department:
            average_score = 'At least 79% of students admitted into the Faculty of Kinesiology, Sport, and Recreation for Fall 2018 had admission averages in the mid-80s or higher.'
        elif 'Faculty of Business' in department:
            average_score = '75'
        else:
            average_score = None
        item["ap"] = ap
        item["duration_per"] = 1
        item["school_name"] = school_name
        item["location"] = location
        item["campus"] = campus
        #item["degree_type"] = 1
        item["department"] = department
        item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        item["major_name_en"] = major_name_en
        item["overview_en"] = overview_en
        #item["teach_time"] = 1
        item["start_date"] = start_date
        item["modules_en"] = modules_en
        item["career_en"] = career_en
        item["deadline"] = deadline
        item["apply_pre"] = 'CAD$'
        item["apply_fee"] = apply_fee
        item["entry_requirements_en"] = entry_requirements_en
        item["tuition_fee_pre"] = 'CAD$'
        item["require_chinese_en"] = require_chinese_en
        item["ielts_desc"] = ielts_desc
        item["ielts"] = ielts
        item["ielts_l"] = ielts_l
        item["ielts_s"] = ielts_s
        item["ielts_r"] = ielts_r
        item["ielts_w"] = ielts_w
        item["toefl_code"] = toefl_code
        item["toefl_desc"] = toefl_desc
        item["toefl_l"] = toefl_l
        item["toefl"] = toefl
        item["toefl_s"] = toefl_s
        item["toefl_r"] = toefl_r
        item["toefl_w"] = toefl_w
        item["interview_desc_en"] = interview_desc_en
        item["portfolio_desc_en"] = portfolio_desc_en
        item["other"] = other
        item["url"] = response.url
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0
        #item["duration"] = duration
        item["tuition_fee"] = tuition_fee
        item["alevel"] = alevel
        item["ib"] = ib
        item["gaokao_zs"] = gaokao_zs
        item["gaokao_score_wk"] = gaokao_score_wk
        item["gaokao_score_lk"] = gaokao_score_lk
        item["specific_requirement_en"] = specific_requirement_en
        item["huikao_desc"] = huikao_desc
        item["huikao_zs"] = huikao_zs
        item["min_language_require"] = min_language_require
        item["sat_code"] = sat_code
        item["sat_code"] = sat1_desc
        item["sat_code"] = sat2_desc
        item["sat_code"] = act_code
        item["sat_code"] = act_desc
        item["gaokao_desc"] = gaokao_desc
        item["average_score"] = average_score
        aaa = response.xpath('//body').extract()
        #print(aaa)
        if 'Minor' not in degree_name and 'After Degree' not in degree_name and 'This program does not allow admission directly from high school. See requirements below for more details.' not in aaa and 'Faculté Saint-Jean' not in department:
            yield item

            #pass
        #print(degree_name)
        else:
            pass

Ejemplo n.º 29

Mostrar archivo

    def parses(self, response):
        item = getItem.get_item(ScrapyschoolCanadaBenItem)

        #1.学校名称
        degree = response.meta['degree']
        #degree = remove_tags(degree)
        depart = response.meta["depart"]
        depart = remove_tags(depart)
        # print(depart)
        # print(degree)
        school_name = 'University of Western Ontario'

        #2.地点
        try:
            location = 'London,Canada'
            location = re.findall('\((.*)\)', location)[0]
            #print(location)
        except:
            location = None
            #print(location)

#3. 校区
        try:
            campus = 'main campus'
            #campus = remove_tags(campus)
            #campus = campus.replace(', Online','')
            #campus = campus.replace(' ','')
            #campus = campus.split(',')
            #print(campus)
        except:
            campus = None
            #print(campus)

#4. 学院
        try:
            department = depart
            #department = department.replace(' class="fa fa-graduation-cap"','').replace(' class="fa fa-university"','')
            #department = department.replace(' <i> </i> ','<i></i>').replace('<i> </i> ','')
            #department = department.split('<i></i>')[-1]
            #print(len(department))
            #print(department)
            #print(response.url)
        except:
            department = None
            #print(department)

# 4.
#         try:
#             degree_name_list = response.xpath('//a[contains(text(),"Degrees")]/../following-sibling::div[1]').extract()
#             degree_name_list = ''.join(degree_name_list)
#             degree_name_list = remove_tags(degree_name_list)
#             degree_name_list = degree_name_list.replace('\xa0','')
#             degree_name_list = degree_name_list.replace('\n\n','')
#             degree_name_list = degree_name_list.split('\n')
#             #degree_name_list = filter(None, degree_name_list)
#             if '' in degree_name_list:
#                 degree_name_list = ['Bachelor of Science in Nursing (BScN)']

#degree_name = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',degree_name_list)
#degree_name = degree_name.replace('\t','').replace('\n','').replace('\xa0','').replace(' class="list-inline uofs-cta-list','')
# degree_name = degree_name.replace('<li>','').replace('</li>','---')
# degree_name = degree_name.replace('<span>','').replace('</span>','---')
#degree_name = degree_name.split('</li><li>')
# print(degree_name_list)
#    # print(response.url)
#     #print(response.url)
# except:
#
#     degree_name_list = None
#    # print(degree_name_list)

#5.学位描述
        try:
            degree_overview_en = degree
            #degree_overview_en = ''.join(degree_overview_en)
            # #degree_overview_en = remove_tags(degree_overview_en)
            degree_overview_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '',
                                        degree_overview_en)
            # degree_overview_en = re.sub('<p(https:.*)">?','',degree_overview_en)
            degree_overview_en = degree_overview_en.replace(
                '<br><a><img> Learn more</a></p>', '')
            # degree_overview_en = degree_overview_en.replace('https://www.ualberta.ca/campus-saint-jean"','')
            # degree_overview_en = degree_overview_en.replace('https://www.ualberta.ca/en/campus-saint-jean"','')
            # degree_overview_en = degree_overview_en.replace(' xmlns=""','')
            # #degree_overview_en = degree_overview_en.replace('https://www.ualberta.ca/arts"','')
            # #degree_overview_en = degree_overview_en.replace('\r\n','')
            # degree_overview_en = degree_overview_en.replace('\n','')
            # #degree_overview_en = degree_overview_en.replace('\n','')
            # #degree_overview_en = degree_overview_en.replace('  ',' ')
            # degree_overview_en = degree_overview_en.replace('                           ','')
            # degree_overview_en = degree_overview_en.replace('   ','')
            #print(degree_overview_en)
        except:
            degree_overview_en = None
            #print(degree_overview_en)
        #//*[@id="page-content"]/div[3]/div[1]/*

#6.专业英文
        try:
            major_name_en_list = response.xpath(
                '//div[4]/div/div[1]/ul').extract()
            major_name_en_list = ''.join(major_name_en_list)
            #major_name_en = major_name_en.replace('\r\n','').replace('\n','').replace('           ','').replace('\t','').replace('     ','')
            #major_name_en_list = remove_tags(major_name_en_list)
            major_name_en_list = major_name_en_list.replace('\n', '').replace(
                '<ul><li>',
                '').replace('</li></ul>',
                            '').replace('<ul class="squarelist"><li>', '')
            major_name_en_list = major_name_en_list.split('</li><li>')
            #print(major_name_en_list)
        except:
            major_name_en_list = None
            #print(major_name_en_list)

        try:
            degree_name_li = response.xpath('//div[4]/div/div[2]/ul').extract()
            degree_name_li = ''.join(degree_name_li)
            #major_name_en = major_name_en.replace('\r\n','').replace('\n','').replace('           ','').replace('\t','').replace('     ','')
            #degree_name_li = remove_tags(degree_name_li)
            degree_name_li = degree_name_li.replace('\n', '').replace(
                '<ul><li>',
                '').replace('</li></ul>',
                            '').replace('<ul class="squarelist"><li>', '')
            degree_name_li = degree_name_li.split('</li><li>')
            #print(degree_name_li)
        except:
            degree_name_li = None
            #print(degree_name_li)

#7.专业介绍
        try:
            overview_en = degree_overview_en
            # overview_en = response.xpath('').extract()
            # overview_en = ''.join(overview_en)
            # overview_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',overview_en)

            # print(overview_en)
        except:
            overview_en = degree_overview_en
            # print(overview_en)

#8.入学时间
        try:
            start_date = '2019-09'
            #start_date = ','.join(start_date)
            #start_date = remove_tags(start_date)
            #start_date = start_date.replace('Spring','').replace('Winter','').replace('Summer','').replace('Fall','')
            #start_date = start_date.replace('September 2019','2019-09').replace('May 2019','2019-05').replace('July 2019','2019-07').replace('January 2020','2020-01').replace('January 2019','2019-01')
            #print(start_date)
        except:
            start_date = None
            #print(start_date)

#9.课程长度
# try:
#     duration = response.xpath('').extract()[0]
#     duration = remove_tags(duration)
#     # print(duration)
# except:
#     duration = None
#     # print(duration)

#10.课程设置
        try:
            if 'medical_sciences.html' in response.url:
                modules_en = '<div><p><strong><a>There are 6 Specialization modules </a></strong>available in the BMSc Program and only students registered in Years 3 and 4 BMSc may register in these modules.</p><p><strong>NOTE:</strong> Very few students pursue the Specialization modules since these modules lead only to non-honors BMSc degrees. Since most students in the BMSc Program meet and/or exceed the marks/averages required to register in Honors degrees, students are strongly encouraged to pursue either Honors Specialization modules or Double Majors.</p><p>Enrollment in the Specialization modules is <strong>not</strong> limited as none of these modules contain a capstone course in Year 4</p><ul><li>the 4000-level capstone courses (<strong><a>Research Projects</a></strong> and <strong><a>Medical Sciences 4900F/G + 4930F/G</a></strong>) <strong>cannot be taken by students in the Specialization modules</strong></li><li>see <strong><a>Admission to Year 3 BMSc</a></strong> and <strong><a>Admission to Year 4 BMSc</a></strong> for information about admission to the Specialization modules in Years 3 and 4</li></ul><p>See the <strong><a>Academic Calendar</a></strong> for the complete listing of modules offered by the basic medical science departments</p></div>'
            else:
                modules_en = None
        except:
            modules_en = None
            #print(modules_en)

#11.就业方向
        try:
            if 'medical_sciences.html' in response.url:
                career_en = '<p>Students with a BMSc degree can contribute to society in a variety career opportunities including professional programs, academic and other research institutions, and industry. A large proportion of the graduates of this program choose a career in Medicine, Dentistry or Graduate Studies. Others enter professional programs such as Pharmacy, Optometry, Law, Education, Physiotherapy, Occupational Therapy and Nursing.</p><p>Each year BMSc graduates are surveyed to determine their career plans for the upcoming September.  The following chart summarizes the responses from 238 of the 344 (70%) BMSc students who graduated in June 2017.</p>'
            else:
                career_en = None
        except:
            career_en = None
            #print(career_en)

#12.截止日期
        try:
            deadline = '2019-02-15'
            #deadline = response.xpath('//h3[contains(text(),"Deadline")]/following-sibling::p[1]').extract()[0]
            #deadline = remove_tags(deadline)
            #deadline = deadline.replace('Documents due: ', '')
            #deadline = remove_tags(deadline)
            #print(deadline)
        except:
            deadline = None
            #print(deadline)
#13.学费
# try:
#     abaa = 'https://apps.admissions.ualberta.ca/costcalculator/faculties/%s/international/off-campus?pttool=true'
#     acc = response.xpath('//*[@id="get-program-costs"]/@href').extract()[0]
#     acc = re.findall('\d',acc)[0]
#     #print(acc)
#     url = abaa % acc
#     #print(url)
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
#     response1 = requests.get(url,headers=headers)
#     #print(response1.text)
#     tuition_fee = response1.text
#     tuition_fee = remove_tags(tuition_fee)
#     tuition_fee = tuition_fee.replace('\n','').replace('\r\n','').replace('                           ','')
#     tuition_fee = re.findall('\$(\d+\.\d+)',tuition_fee)[0]
#     #print(tuition_fee)
# except:
#     tuition_fee = None
#     #print(tuition_fee)
#14 申请费:
        apply_fee = '156'

        #15 申请要求
        try:

            entry_requirements_en = '<ol><li><p><strong>Academic transcript of Senior Secondary</strong> indicating all subjects taken and grades earned.</p><p>Applicants may be considered for conditional admission on the basis of mid-year/mid-term results.&nbsp; Mid-year results and a secondary school transcript must be submitted directly to Western from the institutions attended.&nbsp; If you are issued a conditional offer of admission, you will be required to have your final official academic transcript, Graduation Examinations, and University Entrance Examinations sent directly from the proper issuing authority along with word-for-word English translations, to World Education Services (WES) Canada for authentication and verification.**</p></li><li><p><strong>Senior Secondary Graduation Diploma</strong></p></li><li><p><strong>General Education Examination results (Graduation Exams/ Hui Kao/Xuéyé Shuiping Cèshi/Academic Proficiency Test).</strong>&nbsp; For provinces that do not administer or are exempt from taking the general education examinations, a letter from your Senior Secondary verifying the schools and/or provinces examination policy is required.</p></li><li><p><strong>Chinese University Entrance Examination (NCEE / Gao Kao)</strong></p></li><li><p><strong>Proof of English language proficiency.</strong> &nbsp;Test results must be issued directly to Western from the Examining Board. &nbsp;Western\'s institution code is 0984. Please review .</p></li><li><p><strong>Refer to the&nbsp;content below for program specific required and recommended courses. &nbsp;Course prerequisites should be presented at the senior level:</strong></p></li></ol>'
            #entry_requirements_en = ''.join(entry_requirements_en)
            #entry_requirements_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',entry_requirements_en)
            #entry_requirements_en = remove_tags(entry_requirements_en)
            #print(entry_requirements_en)
        except:
            entry_requirements_en = None
            #print(entry_requirements_en)
            #print(abc)

#16 中国学生申请要求
        try:
            require_chinese_en = '<ol><li><p><strong>Academic transcript of Senior Secondary</strong> indicating all subjects taken and grades earned.</p><p>Applicants may be considered for conditional admission on the basis of mid-year/mid-term results.&nbsp; Mid-year results and a secondary school transcript must be submitted directly to Western from the institutions attended.&nbsp; If you are issued a conditional offer of admission, you will be required to have your final official academic transcript, Graduation Examinations, and University Entrance Examinations sent directly from the proper issuing authority along with word-for-word English translations, to World Education Services (WES) Canada for authentication and verification.**</p></li><li><p><strong>Senior Secondary Graduation Diploma</strong></p></li><li><p><strong>General Education Examination results (Graduation Exams/ Hui Kao/Xuéyé Shuiping Cèshi/Academic Proficiency Test).</strong>&nbsp; For provinces that do not administer or are exempt from taking the general education examinations, a letter from your Senior Secondary verifying the schools and/or provinces examination policy is required.</p></li><li><p><strong>Chinese University Entrance Examination (NCEE / Gao Kao)</strong></p></li><li><p><strong>Proof of English language proficiency.</strong> &nbsp;Test results must be issued directly to Western from the Examining Board. &nbsp;Western\'s institution code is 0984. Please review .</p></li><li><p><strong>Refer to the&nbsp;content below for program specific required and recommended courses. &nbsp;Course prerequisites should be presented at the senior level:</strong></p></li></ol>'
            #require_chinese_en = remove_tags(require_chinese_en)
            # print(require_chinese_en)
        except:
            require_chinese_en = None
            # print(require_chinese_en)

#17 特殊专业要求
# try:
#     specific_requirement_en = response.xpath('//div[@class = "row-fluid required-courses"]').extract()
#     specific_requirement_en = ''.join(specific_requirement_en)
#     specific_requirement_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',specific_requirement_en)
#     #print(specific_requirement_en)
# except:
#     specific_requirement_en = None
#     #print(specific_requirement_en)

#18 高考(官网要求)
        try:
            gaokao_desc = 'General Education Examination results (Graduation Exams/ Hui Kao/Xuéyé Shuiping Cèshi/Academic Proficiency Test).  For provinces that do not administer or are exempt from taking the general education examinations, a letter from your Senior Secondary verifying the schools and/or provinces examination policy is required.'
            #gaokao_desc = remove_tags(gaokao_desc)
            # print(gaokao_desc)
        except:
            gaokao_desc = None
            # print(gaokao_desc)

#19 高考(展示以及判断字段)
        try:
            gaokao_zs = None
            #gaokao_zs = remove_tags(gaokao_zs)
            # print(gaokao_zs)
        except:
            gaokao_zs = None
            # print(gaokao_zs)

#20 高考分数(文科)
        try:
            gaokao_score_wk = response.xpath('').extract()[0]
            gaokao_score_wk = remove_tags(gaokao_score_wk)
            # print(gaokao_score_wk)
        except:
            gaokao_score_wk = None
            # print(gaokao_score_wk)

#21 高考分数(理科)
        try:
            gaokao_score_lk = response.xpath('').extract()[0]
            gaokao_score_lk = remove_tags(gaokao_score_lk)
            # print(gaokao_score_lk)
        except:
            gaokao_score_lk = None
            # print(gaokao_score_lk)

#22 会考描述
        try:
            huikao_desc = 'General Education Examination results (Graduation Exams/Hui Kao/Xuéyé Shuiping Cèshi/Academic Proficiency Test).  For provinces that do not administer or are exempt from taking the general education examinations, a letter from your Senior Secondary verifying the schools and/or provinces examination policy is required.'
            #huikao_desc = remove_tags(huikao_desc)
            # print(huikao_desc)
        except:
            huikao_desc = None
            # print(huikao_desc)

#23 huikao_zs
        try:
            huikao_zs = 'General Education Examination results (Graduation Exams/ Hui Kao/Xuéyé Shuiping Cèshi/Academic Proficiency Test).  For provinces that do not administer or are exempt from taking the general education examinations, a letter from your Senior Secondary verifying the schools and/or provinces examination policy is required.'
            huikao_zs = remove_tags(huikao_zs)
            # print(huikao_zs)
        except:
            huikao_zs = None
            # print(huikao_zs)

#24 最低语言要求
        try:
            min_language_require = 'IELTS Academic is required with a minimum overall band score of 6.5 with no part less than 6.0.'
            min_language_require = remove_tags(min_language_require)
            # print(min_language_require)
        except:
            min_language_require = None
            # print(min_language_require)

#25 雅思要求
        try:
            ielts_desc = 'At least 6.5 with no band less than 5.5'
            #ielts_desc = remove_tags(ielts_desc)
            # print(ielts_desc)
        except:
            ielts_desc = None
            # print(ielts_desc)

#26 ielts
        try:
            ielts = '6.5'
            #ielts = re.findall('\d\.\d',ielts)
            #ielts = remove_tags(ielts)
            #print(ielts)
        except:
            ielts = None
            #print(ielts)
#27 ielts_?

        ielts_l = 6.0
        ielts_s = 6.0
        ielts_r = 6.0
        ielts_w = 6.0

        #28 toefl_code
        try:
            toefl_code = '0984'
            #toefl_code = remove_tags(toefl_code)
            # print(toefl_code)
        except:
            toefl_code = None
            # print(toefl_code)

#29 toefl_desc
        try:
            toefl_desc = 'The minimum score required on the TOEFL is 550 on the paperbased with a 5 on the TWE, and 83 on the internet-based tests with no score below 20.'
            #toefl_desc = remove_tags(toefl_desc)
            # print(toefl_desc)
        except:
            toefl_desc = None
            # print(toefl_desc)

#30 toefl
        try:
            toefl = '83'
            #toefl = re.findall('\d\d',toefl)
            #toefl = remove_tags(toefl)
            #print(toefl)
        except:
            toefl = None
            #print(toefl)

#31 toefl_?
        toefl_l = 20
        toefl_s = 20
        toefl_r = 20
        toefl_w = 20

        # 32 alevel
        try:
            alevel = ''
            #alevel = remove_tags(alevel)
            # print(alevel)
        except:
            alevel = None
            # print(alevel)

#33 ib
        try:
            ib = 'If you are currently completing the International Baccalaureate program, you must complete the full International Baccalaureate Diploma including each of the following to be considered for admission:<br>Completion of the entire Diploma including the Theory of Knowledge and Extended Essay<br>Passes in a minimum of 6 subjects of which 3 must be at the Higher Level<br>A minimum grade total of 28 including points awarded for the Extended Essay and Theory of Knowledge<br>No mark less than 4 on any individual course<br>Prerequisites for your program as specified by Western <br>Please note the minimum grade total quoted is the minimum required for admission consideration. Competitive admission based on predicted results is usually in the low to mid 30\'s and can vary depending on the program.<br>Applications should be submitted in the Fall prior to the year in which you are seeking admission.'
            #print(ib)
        except:
            ib = None
            #print(ib)

#34 specific_requirement_en (left empty here; ap is assigned per faculty further down)
        try:
            specific_requirement_en = ''
            #ap = remove_tags(ap)
            # print(ap)
        except:
            specific_requirement_en = None
            # print(ap)

#35 面试描述
        try:
            interview_desc_en = response.xpath('').extract()[0]
            interview_desc_en = remove_tags(interview_desc_en)
            # print(interview_desc_en
        except:
            interview_desc_en = None
            # print(interview_desc_en)

#36 作品集描述
        try:
            portfolio_desc_en = response.xpath('').extract()[0]
            portfolio_desc_en = remove_tags(portfolio_desc_en)
            # print(portfolio_desc_en
        except:
            portfolio_desc_en = None
            # print(portfolio_desc_en)

#37 other
        try:
            other = '1.页面需要从列表页跳转,列表页有专业和学位介绍,2,详情页需要拆开专业和学位3.课程,就业,特殊专业要求字段,学费字段 需要跳转到各自页面根据列表页信息做匹配.4,后期需要手动补充课程确实字段.因其每个专业课程字段所在页面不同'
            #other = remove_tags(other)
            # print(other)
        except:
            other = None
            # print(other)

        # sat act 代码 介绍
        sat_code = '0984'
        sat1_desc = 'SAT Reasoning Test results submitted directly to Western by the College Board.  Western\'s institution number is 0984. A minimum SAT combined Evidence Based Reading and Writing + Math score of 1190 is required for admission consideration.'
        sat2_desc = None
        act_code = '4837'
        act_desc = ' ACT Test results submitted directly to Western by ACT Institutional Services.  Western\'s institution number is 4837. A minimum ACT composite score of 24 is required for admissions consideration'

        # item["ap"] = ap
        item["duration_per"] = 1
        item["school_name"] = school_name
        item["location"] = location
        item["campus"] = campus
        #item["degree_type"] = 1
        item["department"] = department
        #item["degree_name"] = degree_name
        item["degree_overview_en"] = degree_overview_en
        #item["major_name_en"] = major_name_en
        item["overview_en"] = overview_en
        #item["teach_time"] = 1
        item["start_date"] = start_date
        item["modules_en"] = modules_en
        item["career_en"] = career_en
        item["deadline"] = deadline
        item["apply_pre"] = 'CAD$'
        item["apply_fee"] = apply_fee
        item["entry_requirements_en"] = entry_requirements_en
        item["tuition_fee_pre"] = 'CAD$'
        item["require_chinese_en"] = require_chinese_en
        item["ielts_desc"] = ielts_desc
        item["ielts"] = ielts
        item["ielts_l"] = ielts_l
        item["ielts_s"] = ielts_s
        item["ielts_r"] = ielts_r
        item["ielts_w"] = ielts_w
        item["toefl_code"] = toefl_code
        item["toefl_desc"] = toefl_desc
        item["toefl_l"] = toefl_l
        item["toefl"] = toefl
        item["toefl_s"] = toefl_s
        item["toefl_r"] = toefl_r
        item["toefl_w"] = toefl_w
        item["interview_desc_en"] = interview_desc_en
        item["portfolio_desc_en"] = portfolio_desc_en
        item["other"] = other
        item["url"] = response.url
        item["gatherer"] = 'weihongbo'
        item["finishing"] = 0
        item["import_status"] = 0
        #item["duration"] = duration
        #item["tuition_fee"] = tuition_fee
        item["alevel"] = alevel
        item["ib"] = ib
        item["gaokao_zs"] = gaokao_zs
        item["gaokao_score_wk"] = gaokao_score_wk
        item["gaokao_score_lk"] = gaokao_score_lk
        item["specific_requirement_en"] = specific_requirement_en
        item["huikao_desc"] = huikao_desc
        item["huikao_zs"] = huikao_zs
        item["min_language_require"] = min_language_require
        item["sat_code"] = sat_code
        item["sat1_desc"] = sat1_desc
        item["sat2_desc"] = sat2_desc
        item["act_code"] = act_code
        item["act_desc"] = act_desc
        item["gaokao_desc"] = gaokao_desc
        #item["duration"] = '4'

        if '/science.html' in response.url:
            degree_name = 'BA/BSc'
            tuition_fee = '28,743.00'
            ap = '<div><p><strong>Recommended<span>&nbsp;</span>Prerequisites</strong></p><ul><li>Senior Level Calculus</li></ul><p><span>Notes:</span><br><span>▪&nbsp; First year Biology and Chemistry courses require Grade 12 Biology and Chemistry, respectively. First year Biology and Chemistry courses are required for ALL modules offered by the Department of Biology and for some modules offered by the Department of Chemistry and other Science departments.<br></span><span>▪&nbsp; It is strongly recommended that applicants complete a Grade 12 math course.</span><span>&nbsp;</span></p></div>'
        elif 'arts_and_humanities.html' in response.url:
            degree_name = 'BA'
            tuition_fee = '28,743.00'
            ap = 'No course prerequisites'
        elif 'engineering.html' in response.url:
            degree_name = 'BSc'
            tuition_fee = '36,886.00'
            ap = '<div><p><strong>Required Prerequisites</strong></p><ul><li>Senior Level Calculus</li><li>Senior Level Chemistry</li><li>Senior Level Physics</li></ul><p><strong>Secondary School course descriptions/syllabi</strong> may be requested from applicants to certain programs/faculties at Western for assessment of prerequisite course requirements. Applicants will be notified of this requirement at the point of application review and acknowledgement by the Undergraduate Admissions Office.</p></div>'
        elif 'health_studies.html' in response.url:
            degree_name = 'BHSc'
            tuition_fee = '28,743.00'
            ap = '<div><p><strong>Required<span> </span>Prerequisites</strong></p><ul><li>Senior Level Biology</li><li>Senior Level&nbsp;Math (Recommended)</li></ul><p>Note: Students considering the Honors Specialization in Health Sciences with Biology will need Grade 12 Chemistry in order to fulfill the first year Chemistry requirements of the module.</p><p><strong>Secondary School course descriptions/syllabi</strong> may be requested from applicants to certain programs/faculties at Western for assessment of prerequisite course requirements. Applicants will be notified of this requirement at the point of application review and acknowledgement by the Undergraduate Admissions Office.</p></div>'
        elif 'kinesiology.html' in response.url:
            degree_name = 'BA/BSc'
            tuition_fee = '28,743.00'
            ap = '<div><p><strong>Required Prerequisites</strong></p><ul><li>Senior Level&nbsp;Biology</li></ul><p>Notes:<br><span>▪&nbsp;A Grade 12 Math and Grade 11 or 12 Physics are recommended to prepare for senior Kinesiology subjects in biomechanics, research methods, and statistics.<br></span><span>▪&nbsp;</span>It is strongly recommended that students interested in the BSc program take Grade 12 Science courses such as: Chemistry, Calculus, Pre-Calculus Math, or Physics.<br><span>▪&nbsp;</span>Grade 12 Chemistry is a prerequisite for first year Chemistry courses.</p><p><strong>Secondary School course descriptions/syllabi</strong> may be requested from applicants to certain programs/faculties at Western for assessment of prerequisite course requirements. Applicants will be notified of this requirement at the point of application review and acknowledgement by the Undergraduate Admissions Office.</p></div>'
        elif 'nursing.html' in response.url:
            degree_name = 'BSc'
            tuition_fee = '36,886.00'
            ap = '<div><p><strong>Required Prerequisites</strong></p><ul><li>Senior Level&nbsp;Biology</li><li>Senior Level Chemistry</li><li>Senior Level English</li><li>Senior Level Math</li></ul><p><strong>Secondary School course descriptions/syllabi</strong> may be requested from applicants to certain programs/faculties at Western for assessment of prerequisite course requirements. Applicants will be notified of this requirement at the point of application review and acknowledgement by the Undergraduate Admissions Office.</p></div>'
        elif 'information_and_media_studies.html' in response.url:
            degree_name = 'BA'
            tuition_fee = '28,743.00'
            ap = 'No course prerequisites'
        elif 'music.html' in response.url:
            degree_name = 'BA'
            tuition_fee = '28,743.00'
            ap = '<div><ul><li>No course prerequisites</li></ul><p>Faculty recommendation required based on <a>Audition, Interview, Piano Proficiency and Theory Placement</a> results.</p><p>Note:&nbsp; A senior level math course is recommended for applicants to Music Administrative Studies.</p></div>'
        elif 'medical_sciences.html' in response.url:
            degree_name = 'BMSc'
            tuition_fee = '28,743.00'
            ap = '<div><p><strong>Recommended<span>&nbsp;</span>Prerequisites</strong></p><ul><li>Senior Level Biology</li><li>Senior Level Calculus</li><li>Senior Level Chemistry</li></ul><p><span>Notes:</span><br><span>▪ First year Biology and Chemistry courses require Grade 12 Biology and Chemistry, respectively. First year Biology and Chemistry courses are required for ALL modules offered in the Bachelor of Medical Sciences and Neurosciences programs.<br></span><span>▪ Although Western offers first year Physics courses that do not require Grade 12 Physics as a prerequisite, it is strongly recommended that you complete&nbsp;Grade 12 Physics.</span>&nbsp;</p></div>'
        elif 'social_science.html' in response.url:
            degree_name = 'BA/BSc'
            tuition_fee = '28,743.00'
            ap = '<div><ul><li>No course prerequisites</li></ul><p><span>Notes:</span><br><span>▪&nbsp;</span>All Specializations and Majors in Psychology requires a first-year university Math course, therefore any Grade 12 level academic Math is highly recommended for this program.<br><span>▪&nbsp;</span>Math is helpful as preparation for Sociology and Geography programs.<br><span>▪&nbsp;</span>A Grade 12 Pre-Calculus Math and a Grade 12 Calculus (equivalent to Ontario Grade 12 Advanced Functions and Grade 12 Calculus) are required for all Economics modules.<br><span>▪&nbsp;</span>Grade 12 Biology, Chemistry, and Physics are highly recommended for the BSc in Psychology.</p></div>'
        elif 'management_and_organizational_studies.html' in response.url:
            degree_name = 'BMOS'
            tuition_fee = '34,474.00'
            ap = '<div><p><strong>Recommended Prerequisites</strong></p><ul><li>Senior Level&nbsp;Math</li></ul><p>Note: For Management &amp; Organizational Studies&nbsp;a Grade 12 Calculus and/or a university Calculus course is required prior to taking mandatory upper-year Economics courses in Finance, and pursuing a Major or Honors Specialization in Economics.</p></div>'
        else:
            degree_name = None
            tuition_fee = None
            ap = None
            re.findall("/\\d\d\d\d\d")
        #item["degree_name"] = degree_name
        item["tuition_fee"] = tuition_fee
        item["ap"] = ap

        for acd in major_name_en_list:
            major_name_en = acd
            item["major_name_en"] = major_name_en
            #print(major_name_en)
            for bbc in degree_name_li:
                degree_name = bbc
                item["degree_name"] = degree_name
                #print(degree_name)
                if '3' in degree_name and '4' in degree_name:
                    item["duration"] = '3,4'
                elif '4' in degree_name:
                    item["duration"] = '4'
                elif '3' in degree_name:
                    item["duration"] = '3'
                else:
                    item["duration"] = '123'
                #print(item["duration"])
                if '123' not in item[
                        "duration"] and 'jointly' not in degree_name and 'Double' not in degree_name and 'Certificate' not in degree_name and 'minor' not in degree_name and 'After degree' not in degree_name:

                    yield item

Ejemplo n.º 30

Mostrar archivo

Archivo: DalhousieUniversity_U.py Proyecto: histudent/python_spider

    def parse(self, response):
        item = get_item(ScrapyschoolCanadaBenItem)

        #1.school_name
        school_name = 'Dalhousie University'
        # print(school_name)

        #2.url
        url = response.url
        # print(url)

        #3.major_name_en
        major_name_en = response.xpath(
            '//*[@id="skipContent"]/div/div/div[1]/div[1]/div/h2/a/text()'
        ).extract()
        major_name_en = ''.join(major_name_en)
        major_name_en = remove_tags(major_name_en).strip()
        # print(major_name_en)

        #4.degree_name
        degree_name = response.xpath(
            '//*[@id="skipContent"]/div/div/div[1]/div[1]/div/h2/a/span'
        ).extract()
        degree_name = ''.join(degree_name)
        degree_name = remove_tags(degree_name).strip().replace('amp;', '')
        # print(degree_name)

        #5.overview_en
        try:
            overview_en_url = response.xpath(
                "//a[contains(text(),'Program overview') or contains(text(),'Program Overview')]//@href"
            ).extract()[0]
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
            }
            data = requests.get(overview_en_url, headers=headers)
            response1 = etree.HTML(data.text)
            overview_en = response1.xpath(
                "//div[@class='topRichText text parbase']|//div[@class='text parbase section']"
            )
            doc = ""
            if len(overview_en) > 0:
                for a in overview_en:
                    doc += (etree.tostring(a,
                                           encoding='unicode',
                                           pretty_print=False,
                                           method='html'))
                    doc = remove_class(doc)
                    overview_en = doc
        except:
            overview_en = None
        # print(overview_en,url)

        #6.modules_en
        try:
            modules_en_url = response.xpath(
                "//a[contains(text(),'What will I learn?')]//@href").extract(
                )[0]
            # print(modules_en_url)
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
            }
            data2 = requests.get(modules_en_url, headers=headers)
            response2 = etree.HTML(data2.text)
            modules_en = response2.xpath("//div[@class='contentPar parsys']")
            doc2 = ""
            if len(modules_en) > 0:
                for a in modules_en:
                    doc2 += (etree.tostring(a,
                                            encoding='unicode',
                                            pretty_print=False,
                                            method='html'))
                    doc2 = remove_class(doc2)
                    modules_en = doc2
        except:
            modules_en = None
        # print(modules_en,url)

        #7.career_en
        try:
            career_en_url = response.xpath(
                "//a[contains(text(),'What can I do with this degree?') or contains(text(),'What can I do with Pre-Vet studies?')]//@href"
            ).extract()[0]
            # print(career_en_url)
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
            }
            data3 = requests.get(career_en_url, headers=headers)
            response3 = etree.HTML(data3.text)
            career_en = response3.xpath("//div[@class='text parbase section']")
            doc3 = ""
            if len(career_en) > 0:
                for a in career_en:
                    doc3 += (etree.tostring(a,
                                            encoding='unicode',
                                            pretty_print=False,
                                            method='html'))
                    doc3 = remove_class(doc3)
                    career_en = doc3
        except:
            career_en = None
        # print(career_en)

        #8.entry_requirements_en
        entry_requirements_en = '<p>You can apply to many Dalhousie programs directly from high school. The general admission requirements for degree programs are:Completion of secondary school (i.e. Grade 12) with a minimum overall average of 70% in five academic Grade 12 courses;A final grade of at least 70% in Grade 12 English and any other required subject</p>'

        #9.require_chinese_en
        require_chinese_en = "<p>Senior Middle School Graduation Certificate and Nation Matriculation Examination or Graduation Examination</p>"

        #10.alevel
        alevel = "As a GCE A Level.British System applicant you need a minimum of five subjects, including two A (Advanced) levels or four AS (Advanced Subsidiary) levels with grades of C or better, for admission. Exceptional candidates may be accepted on GCSE or O (Ordinary) levels."

        #11.ib
        ib = 'As an IB student, you must meet our general entrance requirements. If you are completing the IB Diploma you need at least 26 points (including bonus points) for admission.'

        #12.ap
        ap = 'Dalhousie awards university credit for selected AP courses completed with a national exam result of 4 or 5. '

        #13.toefl_desc 1415161718
        toefl_desc = '237 (computer-based) 90(iBT) and no lower than 20 in each band'
        toefl = 90
        toefl_r = 20
        toefl_w = 20
        toefl_s = 20
        toefl_l = 20

        #19.ielts_desc 2021222324
        ielts_desc = '6.5 overall and no lower than 6.0 in each band'
        ielts = 6.5
        ielts_r = 6.0
        ielts_w = 6.0
        ielts_s = 6.0
        ielts_l = 6.0

        #25.toefl_code #26.sat_code
        toefl_code = '0915'
        sat_code = toefl_code

        #27.apply_fee #28.apply_pre
        apply_fee = 70
        apply_pre = '$'

        #29.deadline
        if 'Health Sciences' in degree_name:
            deadline = '2019-02-15'
        elif 'Nursing' in degree_name:
            deadline = '2019-02-28'
        elif 'Music' in degree_name or 'Environmental Design Studie' in degree_name:
            deadline = '2019-03-01'
        elif 'Medical Sciences' in degree_name:
            deadline = '2019-03-15'
        elif 'International Food Business' in degree_name:
            deadline = '2019-07-01'
        else:
            deadline = '2019-04-01'

        #30.tuition_fee_pre
        tuition_fee_pre = '$'

        #31.act_code
        act_code = '5373'

        #32.sat_desc
        sat_desc = '<p>We also require strong academic standing with a final senior year minimum average of ‘B’ for consideration, and a minimum SAT score of 1100 (new SAT; Post March 2016). Applicants who took the SAT before March 2016 are required to have a minimum score of 1650. Dalhousie’s SAT code is 0915. SAT subject tests are not required for admission. You may present an ACT result in lieu of an SAT result. Dalhousie requires a minimum ACT composite score of 23, with no individual score less than 20. Dalhousie’s ACT code is 5373.For both the SAT & ACT, Dalhousie will super score an applicant with multiple test dates, where the highest category score is taken irrespective of test date. Admission to Dalhousie is purely quantitative for most direct-entry programs. As such, essays, references and/or interviews are not required for admission.Applicants applying for early admission between October 15 and January 31 are required to send in official transcripts of all results up until the end of junior year, in lieu of completing the self-reported grades section of the application.</p>'

        #33.act_desc
        act_desc = sat_desc

        #34.tuition_fee
        tuition_fee_dict = {
            'Agriculture': '16,669.14',
            'Architecture': '20,036.06',
            'Arts & Social Science': '19,153.06',
            'Music': '20,038.66',
            'Theatre': '19,345.06',
            'Commerce Co-op': '21,331.06',
            'Community Design': '19,712.06',
            'Costume Studies': '19,954.06',
            'Engineering': '21,206.06',
            'Health Sciences': '20,414.06',
            'Kinesiology': '20,410.06',
            'Nursing': '21,071.06',
            'Pharmacy': '21,740.06',
            'Recreation': '20,410.06',
            'Health': '20,410.06',
            'Social Work': '19,967.06',
        }
        tuition_fee = tuition_fee_dict.get(major_name_en)
        if tuition_fee == None:
            if 'Computer Science' in major_name_en:
                tuition_fee = '20,196.06'
            elif 'Management' in major_name_en:
                tuition_fee = '19,585.56'
            elif 'Health' in major_name_en:
                tuition_fee = '20,410.06'
            elif 'Science' in major_name_en:
                tuition_fee = '20,182.06'
            elif 'Sustainability' in major_name_en:
                tuition_fee = '20,363.56'
            else:
                tuition_fee = None
        # print(tuition_fee)

        #35.location
        location = 'Halifax'

        department = response.xpath(
            '//*[@id="skipContent"]/div/div/div[1]/div[1]/div/h2/span/a[1]'
        ).extract()
        department = ''.join(department)
        department = remove_tags(department)
        # print(department)

        item['department'] = department
        item['location'] = location
        item['school_name'] = school_name
        item['url'] = url
        item['major_name_en'] = major_name_en
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['career_en'] = career_en
        item['entry_requirements_en'] = entry_requirements_en
        item['require_chinese_en'] = require_chinese_en
        item['alevel'] = alevel
        item['ib'] = ib
        item['ap'] = ap
        item['toefl_desc'] = toefl_desc
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_l'] = toefl_l
        item['toefl_w'] = toefl_w
        item['ielts_desc'] = ielts_desc
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['toefl_code'] = toefl_code
        item['sat_code'] = sat_code
        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        item['deadline'] = deadline
        item['tuition_fee_pre'] = tuition_fee_pre
        item['act_code'] = act_code
        item['sat1_desc'] = sat_desc
        item['act_desc'] = act_desc
        item['tuition_fee'] = tuition_fee
        yield item