Ejemplos de get_ielts en Python

Lenguaje de programación: Python

Namespace/Package Name: scrapySchool_England_Ben.getIELTS

Método / Función: get_ielts

Ejemplos en hotexamples.com: 20

Python get_ielts - 20 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de scrapySchool_England_Ben.getIELTS.get_ielts extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bolton"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        print("subjectArea===: ", response.meta['subjectArea'])
        try:
            programmeDegreetype = response.xpath(
                "//div[@class='wpb_text_column wpb_content_element  vc_custom_1506499626241']/div[@class='wpb_wrapper']/h2//text()"
            ).extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype).strip()

            degree_type = response.xpath(
                "//li[@class='iconim award']//b[contains(text(),'Award:')]/..//text()"
            ).extract()
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type).replace(
                "Award:", "").replace("(Hons)", "").strip()
            # if item['degree_name'] == "":
            #     item['degree_name'] = "**"
            print("item['degree_name']: ", item['degree_name'])

            # if item['degree_name'].lower() == "phd":
            #     item['teach_type'] = 'phd'
            #     item['degree_type'] = 3
            # print("item['teach_type']: ", item['teach_type'])
            # print("item['degree_type']: ", item['degree_type'])

            programme = programmeDegreetypeStr.replace(
                item['degree_name'], '').replace("(Hons)",
                                                 "").replace("()", "").strip()
            item['programme_en'] = programme
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//li[@class='iconim date']//b[contains(text(),'Start date:')]/..//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            start_date_str = ''.join(start_date).replace("Start date:",
                                                         "").strip()
            # print("start_date_str: ", start_date_str)
            start_date_re = re.findall(r"\d+/\d+/\d+", start_date_str)
            # print("start_date_re: ", start_date_re)
            if len(start_date_re) > 0:
                for s in start_date_re:
                    start_date_sp = s.split('/')
                    item['start_date'] += start_date_sp[
                        -1] + "-" + start_date_sp[1] + "-" + start_date_sp[
                            0] + ", "
            if item['start_date'] != None:
                item['start_date'] = item['start_date'].strip().rstrip(
                    ',').strip()
            # print("item['start_date']: ", item['start_date'])

            location = response.xpath(
                "//li[@class='iconim location']//b[contains(text(),'Location:')]/..//text()"
            ).extract()
            item['location'] = ''.join(location).replace("Location:",
                                                         "").strip()
            # print("item['location']: ", item['location'])

            duration = response.xpath(
                "//li[@class='iconim duration']//b[contains(text(),'Duration:')]/..//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//div[@id='course-details']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='course-detail']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()"
            ).extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            # ielts_desc_re = re.findall(r'.{1,50}IELTS.{1,50}', ''.join(ielts_desc))
            # print("ielts_desc_re: ", ielts_desc_re)
            # if len(ielts_desc) > 0:
            item['ielts_desc'] = ''.join(ielts_desc).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            career_en = response.xpath(
                "//div[@id='careers-employment']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            # print("item['career_en']: ", item['career_en'])

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            modules = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__modules']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__teaching-methods']"
                "|//div[@class='tab_content modules_tab_content tab__teaching-assessment__assessment-methods']"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//h3[@class='table_header'][contains(text(),'International fees')]/following-sibling::div[1]/table//tr/th[contains(text(),'2018/')][1]/following-sibling::td[1]//text()"
            ).extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            department_dict = {
                "Art & Design and Fine Art":
                "Bolton School of the Arts",
                "Textiles & Fashion":
                "Bolton School of the Arts",
                "Media & Photography":
                "Bolton School of the Arts",
                "Theatre & Performance":
                "Bolton School of the Arts",
                "English & Creative Writing":
                "Bolton School of the Arts",
                "Graphic Design":
                "Bolton School of the Arts",
                "Animation & Illustration":
                "Bolton School of the Arts",
                "Accountancy":
                "Institute of Management Greater Manchester",
                "Business, Retail, Logistics & Supply Chain Management":
                "Institute of Management Greater Manchester",
                "Nursing":
                "Faculty of Health & Wellbeing",
                "Health & Social Care":
                "Faculty of Health & Wellbeing",
                "Dental Sciences":
                "Faculty of Health & Wellbeing",
                "Early Years & Childhood Studies":
                "Faculty of Health & Wellbeing",
                "Community Work & Youth":
                "Faculty of Health & Wellbeing",
                "School of Sport & Biological Sciences":
                "Faculty of Health & Wellbeing",
                "Automotive Design":
                "National Centre for Motorsport Engineering",
                "Chassis Dynamics & Aerodynamics":
                "National Centre for Motorsport Engineering",
                "General Engineering":
                "National Centre for Motorsport Engineering",
                "Motorsport & Trackside Technology":
                "National Centre for Motorsport Engineering",
                "Engines & Performance Modelling":
                "National Centre for Motorsport Engineering",
                "Our Partners":
                "National Centre for Motorsport Engineering",
                "Computing":
                "School of Creative Technologies",
                "Games":
                "School of Creative Technologies",
                "Special & Visual Effects":
                "School of Creative Technologies",
                "Education & Teacher Training":
                "School of Education & Psychology",
                "Psychology":
                "School of Education & Psychology",
                "Access courses":
                "School of Education & Psychology",
                "International Foundation programmes & English Pre-Sessional courses":
                "School of Education & Psychology",
                "Construction":
                "School of Engineering",
                "Civil Engineering":
                "School of Engineering",
                "Mechanical Engineering":
                "School of Engineering",
                "Motorsport & Automotive Performance Engineering":
                "School of Engineering",
                "Biomedical & Medical Engineering":
                "School of Engineering",
                "Electrical & Electronic Engineering":
                "School of Engineering",
                "Mathematics":
                "School of Engineering",
                "Law":
                "School of Law",
                "Centre for Contemporary Coronial Law":
                "School of Law",
                "Medical Biology":
                "School of Sport & Biological Sciences",
                "Sports & Sport Rehabilitation":
                "School of Sport & Biological Sciences",
            }
            item['department'] = department_dict.get(
                response.meta['subjectArea'])
            print("item['department']: ", item['department'])

            alevel = response.xpath(
                "//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/../span//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Undergraduate entry to year 1 </strong></p>
<p>The above qualifications and completion of a suitable foundation programme.</p>
<p>Alternatively, successful completion of one year at a recognised Chinese university in a relevant subject.</p>
<p><strong>Undergraduate entry to year 2 / 3</strong></p>
<p>2 Year Diploma in a suitable subject area.</p>
<p>University College Graduation Diploma or Graduation Diploma from recognised institutions.</p>
<p>EDEXCEL or SQA HND</p>
<p>Da Zhuan (3 Year Diploma)</p>
<p>(Year 2 &amp; 3 entry is subject to successful programme mapping)</p>"""
                ]))

            ucascode = response.xpath(
                "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
            ).extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            item['ucascode'] = ''.join(ucascode).replace("UCAS code:",
                                                         "").strip()
            print("item['ucascode'] = ", item['ucascode'])

            mode = response.xpath(
                "//b[contains(text(),'Course type:')]/..//text()").extract()
            clear_space(mode)
            teach_time = ''.join(mode)
            print("teach_time: ", teach_time)

            isup = response.xpath(
                "//a[contains(text(),'Click here for more information on')]//text()"
            ).extract()
            # print("isup: ", isup)
            isup_str = ''.join(isup)
            if len(isup) == 0:
                isup = response.xpath(
                    "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
                    "|//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/..//text()"
                ).extract()
            print("isup_str: ", isup_str)
            print("isup: ", isup)
            if "full" in teach_time.lower():
                if "https://courses.bolton.ac.uk/course" in item['url']:
                    if "undergraduate" in isup_str or len(
                            item['ucascode']) != 0:
                        print("******存到数据库*****")
                        yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 2

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Surrey"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item[
            'location'] = '01SE01, Senate House, University of Surrey, Guildford, Surrey GU2 7XH'
        # print("item['location'] = ", item['location'])
        print("===============================")
        print(response.url)
        try:
            overview = response.xpath(
                "//h3[contains(text(),'Course facts')]/../preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            career = response.xpath(
                "//h2[contains(text(),'Careers')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-3]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # if item['career_en'] == "":
            #     print("***career_en")
            # print("item['career_en'] = ", item['career_en'])

            modules = response.xpath(
                "//div[@class='module-list']/following-sibling::*[1]/preceding-sibling::*"
            ).extract()
            # modules1 = response.xpath("//div[@id='modules-ft']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # if item['modules_en'] == "":
            #     print("***modules_en")
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h2[contains(text(),'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<7]|"
                "//h2[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<3]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # if item['assessment_en'] == "":
            #     print("***assessment_en")
            # print("item['assessment_en'] = ", item['assessment_en'])

            # //a[contains(text(), 'Faculty of')]|//a[contains(text(), 'School of')]
            department = response.xpath(
                "//a[contains(text(), 'Faculty of')]//text()|"
                "//a[contains(text(), 'School of')]//text()").extract()
            item['department'] = remove_class(
                clear_lianxu_space(department)).replace(
                    "academic staff in the", "").strip()
            # if item['department'] == "":
            #     print("***department")
            # print("item['department'] = ", item['department'])

            entry_requirements = response.xpath(
                "//div[@id='entry-collapse']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            # print("item['apply_desc_en'] = ", item['apply_desc_en'])

            alevel = response.xpath(
                "//h3[contains(text(),'A-level')]/following-sibling::*[1]//text()"
            ).extract()
            alevel_str = ''.join(alevel).strip()
            if alevel_str == "Overall:" or alevel_str == "Overall":
                alevel = response.xpath(
                    "//h3[contains(text(),'A-level')]/following-sibling::*[position()<4]//text()"
                ).extract()
                alevel_str = ''.join(alevel).replace(
                    "Overall", "").strip().strip(":").strip()
                # print("***alevel")
            item['alevel'] = clear_space_str(alevel_str)
            # print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[1]//text()"
            ).extract()
            ib_str = ''.join(ib).strip()
            if ib_str == "Overall:":
                ib = response.xpath(
                    "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[2]//text()"
                ).extract()
                ib_str = ''.join(ib).strip()
                # print("***ib")
            item['ib'] = ib_str
            # print("item['ib'] = ", item['ib'])

            ielts_str = response.xpath(
                "//div[@id='entry-collapse']//h2[contains(text(),'English')]/following-sibling::p[position()<4]//text()"
            ).extract()
            ielts_re = re.findall(r"^IELTS.{1,80}", ''.join(ielts_str))
            item['ielts_desc'] = ''.join(ielts_re).strip()
            # print("item['ielts_desc'] = ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            application_open_date = response.xpath(
                "//div[@class='p-3 p-xl-4 text-center text-light']//text()"
            ).extract()
            clear_space(application_open_date)
            # print("application_open_date: ", ''.join(application_open_date))
            item['application_open_date'] = getStartDate(
                ''.join(application_open_date))
            # if item['application_open_date'] == "":
            #     print("***application_open_date")
            # print("item['application_open_date'] = ", item['application_open_date'])

            tuition_fee = response.xpath(
                "//div[@id='fees']//tbody//tr[1]/td[last()-1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))

            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """ <h2>Process</h2>
<ol><li>Choose the programmes you want to study. Still undecided? Search our <a href="/undergraduate">undergraduate degrees</a></li>
<li>Find out <a href="/apply/undergraduate/how-to-apply-through-ucas">how to apply through UCAS</a></li>
<li>Wait for universities to make their decisions, <a href="/apply/undergraduate/after-you-apply">learn what happens after you apply</a></li>
<li>Reply to your <a href="/apply/undergraduate/your-offer">university offers</a></li>
<li><a href="/apply/undergraduate/your-offer">Confirm your university place</a></li>
</ol>"""
                ]))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # https://www.surrey.ac.uk/china/entry-requirements
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Undergraduate</h2>
<p>We do not accept the Chinese National University Entrance Examination. However, you can apply to study for an <a href="http://isc.surrey.ac.uk/programmes/international-foundation-year?ch=uniweb&amp;cc=uniweb&amp;cid=uniweb&amp;utm_source=signposting&amp;utm_medium=signposting&amp;utm_campaign=uniweb&amp;_ga=2.246594701.825790074.1509959240-87246970.1500115796">International Foundation Year</a> at our <a href="http://isc.surrey.ac.uk/">International Study Centre</a>, which will prepare you for a full undergraduate degree course.</p>"""
                ]))

            # 专业、学位类型
            programme_en = response.xpath(
                "//h1[@class='text-center my-0']//text()").extract()
            programme_en_str = (''.join(programme_en).split("–"))[0].strip()
            print(programme_en_str)

            if "2019" in ''.join(programme_en):
                item['start_date'] = '2019'
            # print("item['start_date'] = ", item['start_date'])

            # 判断可以拆分几条数据，ucascode、duration、degree_name
            is_degree_name = response.xpath(
                "//tbody[@class='w-100']/tr").extract()
            # print("is_degree_name: ", is_degree_name)
            print(len(is_degree_name))
            for i in range(len(is_degree_name)):
                print("****************" + str(i + 1) + "***************")
                degree_name_re = re.findall(r"\w+\s\(Hons\).*|\w+$",
                                            programme_en_str)
                if len(degree_name_re) > 0:
                    item['degree_name'] = ''.join(degree_name_re).strip()
                    item['programme_en'] = programme_en_str.replace(
                        item['degree_name'], '').strip()
                else:
                    item['programme_en'] = programme_en_str
                print("item['programme_en'] = ", item['programme_en'])

                degree_name_xpath = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[1]//text()").extract()
                clear_space(degree_name_xpath)
                item['degree_name'] = ''.join(degree_name_xpath).strip()
                print("item['degree_name'] = ", item['degree_name'])

                duration = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[2]//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                if len(duration) != 0:
                    duration_list = getIntDuration(''.join(duration))
                    # print("duration_list: ", duration_list)
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                print("item['duration'] = ", item['duration'])
                print("item['duration_per'] = ", item['duration_per'])

                ucascode = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[4]//text()").extract()
                clear_space(ucascode)
                item['ucascode'] = ''.join(ucascode).strip()
                print("item['ucascode']: ", item['ucascode'])

                tick = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[3]//i[@class='icon icon-tick']").extract()
                clear_space(tick)
                print("tick: ", tick)
                print(len(tick))
                if len(tick) == 1:
                    item['other'] = 'Professional Training'
                print("item['other']: ", item['other'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 3

Mostrar archivo

Archivo: UniversityOfLincoln_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.lincoln.ac.uk/"
        item['university'] = "University of Lincoln"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = 'University of Lincoln, Brayford Pool, Lincoln, LN6 7TS'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            if "Foundation" not in item['major_type1']:
                # //table[@id='newTitle']/tbody[@id='newTitleBody']/tr/td/h1[1]/a
                programmeDegreetype = response.xpath("//div[@id='CourseTitleApms']/h1//text()").extract()
                clear_space(programmeDegreetype)
                # print("programmeDegreetype: ", programmeDegreetype)
                if len(programmeDegreetype) > 0:
                    programmeDegreetypeStr = programmeDegreetype[0].strip()

                degree_type = re.findall(r"^\w+\s\(Hons\)|^\(\w+\)|^\w+", programmeDegreetypeStr)
                # print("degree_type: ", degree_type)
                degree_type_str = ''.join(degree_type).strip()
                item['degree_name'] = ''.join(degree_type).replace("(Hons)", "").replace("(", "").replace(")", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                item['programme_en'] = programmeDegreetypeStr.replace(degree_type_str, '').strip()
                print("item['programme_en']: ", item['programme_en'])

                ucascode = response.xpath("//div[@class='nd_2019-20']//span[@class='blue'][contains(text(),'UCAS Code:')]/..//text()").extract()
                if len(ucascode) == 0:
                    ucascode = response.xpath("//span[@class='blue'][contains(text(),'UCAS Code:')]/..//text()").extract()
                clear_space(ucascode)
                # print("ucascode: ", ucascode)
                item['ucascode'] = ''.join(ucascode).replace("UCAS Code:", "").strip()
                # print("item['ucascode'] = ", item['ucascode'])

                # //span[@id='durationFT']
                duration = response.xpath("//div[@class='nd_2019-20']//span[contains(text(),'Full-time Duration')]/..//text()").extract()
                if len(duration) == 0:
                    duration = response.xpath("//span[contains(text(),'Full-time Duration')]/..//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                duration_str = ''.join(duration)

                duration_list = getIntDuration(duration_str)
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                department = response.xpath("//span[contains(text(),'School:')]/following-sibling::a//text()").extract()
                clear_space(department)
                if len(department) > 0:
                    item['department'] = department[0]
                # print("item['department']: ", item['department'])

                dep_dict = {"lincoln school of architecture and the built environment": "College of Arts",
    "lincoln school of design": "College of Arts",
    "lincoln school of film and media": "College of Arts",
    "school of english and journalism": "College of Arts",
    "school of fine and performing arts": "College of Arts",
    "school of history and heritage": "College of Arts",
    "school of chemistry": "College of Science",
    "school of computer science": "College of Science",
    "school of engineering": "College of Science",
    "school of geography": "College of Science",
    "school of life sciences": "College of Science",
    "school of mathematics and physics": "College of Science",
    "school of pharmacy": "College of Science",
    "national centre for food manufacturing": "College of Science",
    "lincoln institute for agri-tech": "College of Science",
    "school of education": "College of Social Science",
    "school of health and social care": "College of Social Science",
    "professional development centre": "College of Social Science",
    "lincoln law school": "College of Social Science",
    "school of psychology": "College of Social Science",
    "school of social and political sciences": "College of Social Science",
    "school of sport and exercise science": "College of Social Science",}
                if item['department'] != "Lincoln Business School":
                    item['department'] = dep_dict.get(item['department'].lower())
                # print("item['department']1: ", item['department'])

                if item['department'] == None:
                    item['department'] = ''.join(response.xpath("//div[@class='breadcrumb-list']//span//a[@href='/home/collegeofsocialscience/']//text()").extract()).strip()
                # print("item['department']2: ", item['department'])

                # //div[@id='feesTables']/table
                fee = response.xpath("//div[@class='nd_2019-20']//div[@class='panel-body']//table[2]//td[contains(text(),'Full-time')]/following-sibling::*[last()]//text()").extract()
                if len(fee) == 0:
                    fee = response.xpath(
                        "//div[@class='panel-body']//table[2]//td[contains(text(),'Full-time')]/following-sibling::*[last()]//text()").extract()
                clear_space(fee)
                # print("fee: ", fee)
                feeStr = ''.join(fee)
                tuitionfee = getTuition_fee(feeStr)
                item['tuition_fee'] = tuitionfee
                if item['tuition_fee'] == 0:
                    item['tuition_fee'] = None
                # print("item['tuition_fee']: ", item['tuition_fee'])

                # //h2[contains(text(),'The Course')]/..
                overview = response.xpath("//h2[contains(text(),'The Course')]/..").extract()
                # print("overview: ", overview)
                if len(overview) > 0:
                    item['overview_en'] = remove_class(clear_lianxu_space([overview[-1]]))
                # print("item['overview_en']: ", item['overview_en'])

                modules_en = response.xpath("//a[contains(text(),'Modules')]/../../..").extract()
                modules_en = response.xpath(
                    "//div[@id='collapse62019-20']//div[@class='tab-content clearfix']").extract()

                if len(modules_en) > 0:
                    item['modules_en'] = remove_class(clear_lianxu_space([modules_en[-1]]))
                if item['modules_en'] == "":
                    item['modules_en'] = None
                    print("*** modules_en")
                else:
                    print("===", item['modules_en'])
                    del_cont = re.findall(r"<br>Find out more</p><div><span>.*?</em></span>", item['modules_en'])
                    print("del_cont==", del_cont)
                    if len(del_cont) > 0:
                        for delc in del_cont:
                            item['modules_en'] = item['modules_en'].replace(delc, '<div>').strip()
                print("item['modules_en']: ", item['modules_en'])

                assessment_en = response.xpath(
                    "//a[contains(text(),'How You Are Assessed')]/../../..|//a[contains(text(),'How you are assessed')]/../../..").extract()
                if len(assessment_en) > 0:
                    item['assessment_en'] = remove_class(clear_lianxu_space([assessment_en[-1]]))
                # print("item['assessment_en']: ", item['assessment_en'])

                interview_desc_en = response.xpath(
                    "//a[contains(text(),'Interviews & Applicant Days')]/../../..").extract()
                if len(interview_desc_en) > 0:
                    item['interview_desc_en'] = remove_class(clear_lianxu_space([interview_desc_en[-1]]))
                # print("item['interview_desc_en']: ", item['interview_desc_en'])

                alevel = response.xpath(
                    "//*[contains(text(),'GCE Advanced Levels')]/text()|//*[contains(text(),'A Level')]/text()").extract()
                if len(alevel) > 0:
                    item['alevel'] = clear_lianxu_space([alevel[-1]])
                print("item['alevel']: ", item['alevel'])

                ib = response.xpath(
                    "//p[contains(text(),'International Baccalaureate')]").extract()
                if len(ib) > 0:
                    item['ib'] = remove_tags(clear_lianxu_space([ib[-1]]))
                # print("item['ib']: ", item['ib'])

                rntry_requirements = response.xpath(
                    "//a[contains(text(),'Entry Requirements')]/../../..|//a[contains(text(),'Entry requirements')]/../../..").extract()
                if len(rntry_requirements) > 0:
                    rntry_requirements = remove_tags(clear_lianxu_space([rntry_requirements[-1]]))
                # print("rntry_requirements: ", rntry_requirements)

                ielts = re.findall(r"IELTS.{1,80}", rntry_requirements)
                item['ielts_desc'] = ''.join(ielts).strip()
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                career = response.xpath("//div[@id='CourseCareersApms']").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']: ", item['career_en'])

                # if item['ielts_desc'] == "":
                #     item['ielts_desc'] = "Prospective students require IELTS 6.0 (with no less than 5.5 in each band score) or an equivalent qualification. Please note that some courses require a higher score."
                #     item['ielts'] = 6.0
                #     item['ielts_l'] = 5.5
                #     item['ielts_s'] = 5.5
                #     item['ielts_r'] = 5.5
                #     item['ielts_w'] = 5.5
                # print("******item['ielts_desc']: ", item['ielts_desc'])
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/englishlanguagerequirementsandsupport/englishlanguagerequirements/
                if item['ielts'] == "6.5":
                    item['toefl'] = 90
                    item['toefl_l'] = 20
                    item['toefl_s'] = 22
                    item['toefl_r'] = 21
                    item['toefl_w'] = 22
                elif item['ielts'] == "7.0":
                    item['toefl'] = 100
                    item['toefl_l'] = 22
                    item['toefl_s'] = 23
                    item['toefl_r'] = 23
                    item['toefl_w'] = 23
                # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

                # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/entryrequirementsandyourcountry/china/
                item["require_chinese_en"] = remove_class(clear_lianxu_space(["""<div class="panel">
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" href="#countryUndergraduateTab">
<em class="more-less glyphicon glyphicon-menu-down"></em>Undergraduate Entry
</a>
</h4>
</div>
<div id="countryUndergraduateTab" class="panel-collapse collapse">
<div class="panel-body">
<p>Prospective students require one of the following qualifications for entry into year one of an undergraduate degree:</p>
<ul>
<li>Successful completion of a Foundation programme with a minimum of 50% plus an average of 70% or above in High School. Please note that some programmes may require a higher foundation score e.g. 60%.</li>
<li>Successful completion of the first year of a Chinese degree / Diploma with an average grade of 70% or above.</li>
</ul>
<p><strong>&nbsp;</strong></p>
<p><strong>HND Students (BTEC and SQA)</strong></p>
<p>Students who have successfully completed a HND BTEC or SQA qualification may be accepted directly into year two or three of a University of Lincoln undergraduate course on a case by case basis.</p>
<p><strong>Chinese Degree / Diploma</strong></p>
<p>Students who have successfully completed the second or third year of a Chinese Degree or Diploma may be considered for direct entry into year two or three of a University of Lincoln undergraduate course on a case by case basis. For more information, please contact the International Admissions team:&nbsp;<a href="mailto:intadmissions&#64;lincoln&#46;ac&#46;uk">intadmissions&#64;lincoln&#46;ac&#46;uk</a>.</p>
<p>&nbsp;</p>	
<!-- START ADVANCED ENTRY (UNDERGRADUATE) -->
<p><strong>Advanced Entry (Undergraduate)</strong></p>
<p>Depending on your academic background and intended course of study, it may be possible to apply for advanced entry into year 2 or 3 of a University of Lincoln undergraduate course.</p>

<!-- START COUNTRY SPECIFIC ADVANCED ENTRY (UNDERGRADUATE) -->


<!-- END COUNTRY SPECIFIC ADVANCED ENTRY (UNDERGRADUATE) -->

<p id="advEntryUgEu">For more information, please contact the Student Administration Team: <a href="mailto:[email protected]">[email protected]</a>.</p>
<p id="advEntryUgInternational">For more information, please contact the International Admissions Team: <a href="mailto:[email protected]">[email protected]</a>.</p>
<!-- END ADVANCED ENTRY (UNDERGRADUATE) -->
</div>
</div>					
</div>
"""]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                item['apply_proces_en'] = "http://www.lincoln.ac.uk/home/studywithus/undergraduatestudy/howtoapply/"
                # print("item['apply_proces_en']: ", item['apply_proces_en'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 4

Mostrar archivo

    def parse_data(self, response):
        """Build one item from a University of Sheffield course page.

        Reads the title, department, UCAS code, duration, entry
        requirements (A level, IB, IELTS) and the module / assessment /
        career sections from ``response`` via XPath, then looks the tuition
        fee up by UCAS code in a hard-coded table.

        :param response: the downloaded course page; ``response.url`` and
            ``response.xpath(...)`` are the only members used here.
        :yields: the populated item (exactly one per page) on success.

        Any exception raised while parsing is appended, with the page URL,
        to a text file under ``scrapySchool_England_Ben/error/`` and echoed
        to stdout; in that case nothing is yielded.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Sheffield"
        item['url'] = response.url
        # Degree type: every item produced by this method is tagged 1.
        item['degree_type'] = 1
        item['location'] = "Western Bank, Sheffield, S10 2TN, UK"
        print("===========================")
        print(response.url)
        try:
            # Programme title, which ends with the degree name.
            programmeDegree_type = response.xpath(
                "//div[@class='titles']/h2//text()").extract()
            if len(programmeDegree_type) == 0:
                # Fallback location of the title.
                programmeDegree_type = response.xpath(
                    "//main[@class='main content']/h2[1]//text()").extract()
            programmeDegree_type = ''.join(programmeDegree_type).strip()
            print("programmeDegree_type: ", programmeDegree_type)
            # The pattern is anchored at the end of the string, so the first
            # match is the trailing run of letters, '/', '(' and ')' (the
            # degree token), or '' when the title ends with another character.
            degree_typeList = re.findall(r"[A-Za-z/\(\)]*$",
                                         programmeDegree_type)
            programme = programmeDegree_type
            if degree_typeList:
                degree_type = degree_typeList[0]
                item['degree_name'] = degree_type
                if degree_type:
                    # Cut off only the trailing degree token. The previous
                    # str.replace() also removed any earlier occurrence of
                    # the same text inside the programme name.
                    programme = programmeDegree_type[:-len(degree_type)]
            print("item['degree_name']: ", item['degree_name'])
            # Strip the whitespace that separated the name from the degree.
            item['programme_en'] = programme.strip()
            print("item['programme_en']: ", item['programme_en'])

            # Department / school.
            department = response.xpath(
                "//div[@class='titles']//h3//text()").extract()
            # NOTE(review): clear_space's return value is never used, so it
            # presumably cleans the list in place - confirm in its module.
            clear_space(department)
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            ucascode = response.xpath("//span[@id='adCode']//text()").extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).strip()
            print("item['ucascode'] = ", item['ucascode'])

            # Course length.
            durationContent = response.xpath(
                "//h3[contains(text(),'Course details')]/following-sibling::text()"
            ).extract()
            clear_space(durationContent)

            # Only a two-element result is used: first element is stored as
            # the duration, last element as its unit/period field.
            duration_list = getIntDuration(''.join(durationContent))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            # Course description.
            overview = response.xpath("//div[@class='descHold']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            alevel = response.xpath(
                "//html//div[@id='courseSummary']//tr/td[contains(text(), 'A Levels')]/following-sibling::td//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()

            ib = response.xpath(
                "//html//div[@id='courseSummary']//tr/td[contains(text(), 'International Baccalaureate')]/following-sibling::td//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()

            ielts_desc = response.xpath(
                "//*[contains(text(),'IELTS')]//text()").extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc)

            # get_ielts returns a dict; the overall and per-skill scores are
            # read from these five keys (missing keys give None).
            ieltDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltDict.get('IELTS')
            item["ielts_l"] = ieltDict.get('IELTS_L')
            item["ielts_s"] = ieltDict.get('IELTS_S')
            item["ielts_r"] = ieltDict.get('IELTS_R')
            item["ielts_w"] = ieltDict.get('IELTS_W')

            modules_en = response.xpath("//div[@id='modules']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))

            assessment_en = response.xpath("//div[@id='ltam']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            career_en = response.xpath("//div[@id='graduates']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))

            # Example course page left by the original author:
            # https://www.sheffield.ac.uk/prospectus/courseDetails.do?id=N1202019

            # The application guidance is identical for every course, so it
            # is embedded as static HTML rather than scraped.
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h1>How to apply: applying essentials</h1>
    <p><img class="imgRight" alt="Undergraduates in a tutorial"   src="/polopoly_fs/1.550384!/image/undergraduates320.jpg" />This page provides key information about applying to study on an undergraduate course at Sheffield, and contains links to all of our procedures and Admissions policies. Please take the time to read this information before completing your application.</p>
    <h3>Before you apply</h3>
    <p>We normally expect applicants to offer three full A Levels or an accepted equivalent qualification. You can check the University's general entry requirements, including which UK and International qualifications we accept and our English language and mathematics requirements, on our Admissions requirements webpage:</p>
    <p><a  href="/undergraduate/apply/requirements">Admissions requirements</a></p>
    <p>You can find details of the entry requirements for particular courses in our online prospectus. Please note that these represent our typical offer conditions only – we may make different offers in some cases.</p>
    <p><a href="http://www.sheffield.ac.uk/prospectus">Online prospectus</a></p>
    <p>A full list of our formal policies relating to Admissions is available on our Policies webpages. This includes our Student Admissions Policy as well as policies on A Level subject combinations, resits, and qualifications taken early.</p>
    <p><a  href="/undergraduate/policies">Our policies</a></p>
    <h3>Applying</h3>
    <p>You can apply for an undergraduate course at Sheffield via UCAS (the Universities and Colleges Admissions Service):</p>
    <p><a href="http://www.ucas.com/apply">UCAS website – Apply</a></p>
    <p>Applications for places on courses starting the following September (except Medicine and Dentistry) should be submitted to UCAS between:</p>
    <ul>
        <li>1 September and 15 January to be guaranteed equal consideration with other applicants</li>
        <li>16 January and 30 June for further consideration, although we may not be able to consider your application if all the places on the course you have applied for have been filled</li>
    </ul>
    <p>Applications for places on Medicine and Dentistry courses must be submitted between 1 September and 15 October.</p>
    <p>You can find more information about how and when to apply on our Applying webpage. This also contains information about deferred entry, direct entry to year/level 2 and our foundation year courses.</p>
    <p><a  href="/undergraduate/apply/applying">Applying</a></p>
    <p>Our Education For All webpage provides information on the support we provide for Care Leavers, estranged students, carers, mature students and students with a disability or learning difficulty. You can also find information about our outreach activities, our use of contextual data and our Disrupted Studies scheme.</p>
    <p><a  href="/undergraduate/apply/wp">Education for all: Widening Participation and Disrupted Studies</a></p>
    <h3>After you apply</h3>
    <p>You can find out what happens after you have submitted your application on our <a  href="/undergraduate/apply/after">After You Apply</a> webpages. If we make you a Conditional offer and you accept us as either your Firm or Insurance choice, we will also send you an email containing information about what happens when you get your exam results.</p>
    <p><a  href="/undergraduate/apply/after">After you apply</a></p>
    <p>If at any time you find that your studies are&#160;affected by personal, social or domestic issues, please let us know by using our Disrupted Studies form:</p>
    <p><a  href="/undergraduate/apply/applying/disrupted">Disrupted Studies</a></p>
    <h3>Further information</h3>
    <p>If you have any further questions about the University and applying to study with us, please <a href="http://ask.sheffield.ac.uk">Ask Sheffield</a>.</p>
    <p>If you still need help, our Applicant Information Desk (AiD) provides a first point of contact for people who have applied to the University. AiD can help with any questions you have about the process of applying to us and the current status of your application.</p>
    <p><a  href="/aid">Applicant Information Desk</a></p>
    <p>We wish you the best of luck with your application.</p>"""
                ]))
            item['require_chinese_en'] = ''

            # Tuition fee keyed by UCAS code. Entries that were listed twice
            # in the original literal (always with the same fee) appear once.
            tuition_feeDict = {
                "C180": "21450",
                "C200": "21450",
                "C300": "21450",
                "C100": "21450",
                "C109": "21450",
                "C189": "21450",
                "C209": "21450",
                "C309": "21450",
                "C1C9": "21450",
                "C1CX": "21450",
                "C1R9": "21450",
                "C101": "21450",
                "F400": "18900",
                "FV41": "18900",
                "VV46": "18900",
                "VR47": "18900",
                "VR41": "18900",
                "VR42": "18900",
                "F410": "18900",
                "VR44": "18900",
                "QV84": "18900",
                "F401": "18900",
                "KK13": "21450",
                "K100": "21450",
                "ARCU123": "21450",
                "ARCU124": "21450",
                "ARCU13": "21450",
                "ARCU129": "21450",
                "Y001": "16800",
                "H130": "21450",
                "G500": "21450",
                "H690": "21450",
                "H660": "21450",
                "H310": "21450",
                "H360": "21450",
                "H361": "21450",
                "H1NF": "21450",
                "HN62": "21450",
                "OG31": "21450",
                "8L16": "21450",
                "57": "21450",
                "2G36": "21450",
                "8M74": "21450",
                "2A47": "21450",
                "H653": "21450",
                "H659": "21450",
                "B900": "21450",
                "B909": "21450",
                "H810": "21450",
                "H800": "21450",
                "H840": "21450",
                "H8T9": "21450",
                "H8F1": "21450",
                "H8J7": "21450",
                "H801": "21450",
                "F100": "21450",
                "F105": "21450",
                "F107": "21450",
                "F106": "21450",
                "F335": "21450",
                "F109": "21450",
                "F108": "21450",
                "C720": "21450",
                "H210": "21450",
                "HK21": "21450",
                "H2T9": "21450",
                "H200": "21450",
                "H202": "21450",
                "HK2D": "21450",
                "H2N2": "21450",
                "2H26": "21450",
                "8T63": "21450",
                "8L55": "21450",
                "2G91": "21450",
                "H201": "21450",
                "A200": "21450",
                "G600": "21450",
                "G650": "21450",
                "G402": "21450",
                "G400": "21450",
                "GG41": "21450",
                "GG74": "21450",
                "G4G1": "21450",
                "G700": "21450",
                "G490": "21450",
                "G495": "21450",
                "G401": "21450",
                "G651": "21450",
                "GN52": "21450",
                "GN53": "21450",
                "X301": "16800",
                "Q305": "16800",
                "Q310": "16800",
                "F901": "18900",
                "L701": "18900",
                "V101": "16800",
                "Q307": "16800",
                "V501": "16800",
                "L301": "16800",
                "L401": "16800",
                "K401": "16800",
                "K441": "16800",
                "L790": "16800",
                "QC19": "21450",
                "B990": "21450",
                "C801": "21450",
                "V642": "16800",
                "L432": "16800",
                "T210": "16800",
                "T300": "18900",
                "TN42": "18900",
                "T110": "16800",
                "T415": "16800",
                "TN12": "18900",
                "T1T2": "16800",
                "T4T2": "16800",
                "T1R2": "16800",
                "T2R2": "16800",
                "T1R4": "16800",
                "T2R4": "16800",
                "T1R7": "16800",
                "T2R7": "16800",
                "T1R1": "16800",
                "TV11": "16800",
                "TV21": "16800",
                "L100": "16800",
                "LV15": "16800",
                "LL12": "16800",
                "L101": "16800",
                "LG11": "16800",
                "L1N3": "16800",
                "LIN3": "16800",
                "X300": "16800",
                "H620": "21450",
                "H621": "21450",
                "H610": "21450",
                "H613": "21450",
                "H614": "21450",
                "H651": "21450",
                "H647": "21450",
                "H645": "21450",
                "H6T9": "21450",
                "H623": "21450",
                "H615": "21450",
                "H616": "21450",
                "H652": "21450",
                "H649": "21450",
                "H622": "21450",
                "H611": "21450",
                "H648": "21450",
                "H629": "21450",
                "H628": "21450",
                "H602": "21450",
                "H603": "21450",
                "H100": "21450",
                "H104": "21450",
                "H675": "21450",
                "H673": "21450",
                "H67I": "21450",
                "H67H": "21450",
                "Q3Q1": "16800",
                "QL33": "16800",
                "QR14": "16800",
                "QR17": "16800",
                "QR32": "16800",
                "QR37": "16800",
                "QV15": "16800",
                "QT12": "16800",
                "Q304": "16800",
                "Q306": "16800",
                "QR31": "16800",
                "QV31": "16800",
                "QW33": "16800",
                "QV35": "16800",
                "QR34": "16800",
                "QW34": "16800",
                "F309": "21450",
                "G109": "21450",
                "QR11": "16800",
                "R120": "16800",
                "RL11": "16800",
                "RL12": "16800",
                "RN12": "16800",
                "RR12": "16800",
                "RR14": "16800",
                "RR17": "16800",
                "RV11": "16800",
                "RV15": "16800",
                "RW13": "16800",
                "R1R9": "16800",
                "R1T2": "16800",
                "R1R7": "16800",
                "R1RR": "16800",
                "R1RO": "16800",
                "L700": "18900",
                "F800": "18900",
                "F902": "18900",
                "F900": "18900",
                "QR12": "16800",
                "R220": "16800",
                "RL21": "16800",
                "RL22": "16800",
                "RN22": "16800",
                "RR24": "16800",
                "RR27": "16800",
                "RV21": "16800",
                "RV25": "16800",
                "RW23": "18900",
                "R2R9": "16800",
                "R2T2": "16800",
                "R2R7": "16800",
                "R2RR": "16800",
                "R2R3": "16800",
                "R410": "16800",
                "RL42": "16800",
                "RN42": "16800",
                "RL41": "16800",
                "R4T2": "16800",
                "R4R7": "16800",
                "R4RR": "16800",
                "V100": "16800",
                "RV71": "16800",
                "RV41": "16800",
                "VV15": "16800",
                "VL12": "16800",
                "VL13": "16800",
                "V1R9": "16800",
                "B620": "21450",
                "QC18": "21450",
                "P110": "18900",
                "P500": "18900",
                "K3K4": "18900",
                "KC39": "18900",
                "M100": "16800",
                "ML94": "16800",
                "M1R4": "16800",
                "M1R2": "16800",
                "M1R1": "16800",
                "M930": "16800",
                "M120": "16800",
                "N200": "16800",
                "N420": "16800",
                "NG21": "16800",
                "NG41": "16800",
                "NL21": "16800",
                "NL41": "16800",
                "NP21": "16800",
                "NP41": "16800",
                "NT22": "16800",
                "N120": "16800",
                "JH51": "21450",
                "J500": "21450",
                "J5R9": "21450",
                "FH21": "21450",
                "J200": "21450",
                "FHF1": "21450",
                "H403": "21450",
                "H401": "21450",
                "JH5P": "21450",
                "JH56": "21450",
                "J501": "21450",
                "G100": "18900",
                "G103": "18900",
                "GN13": "18900",
                "G102": "18900",
                "G1R4": "18900",
                "G1R1": "18900",
                "G1R2": "18900",
                "G106": "18900",
                "VG51": "18900",
                "A100": "21450",
                "T900": "16800",
                "C400": "21450",
                "C500": "21450",
                "C440": "21450",
                "C700": "21450",
                "C741": "21450",
                "CC45": "21450",
                "CC74": "21450",
                "CC75": "21450",
                "C709": "21450",
                "CC7C": "21450",
                "CC79": "21450",
                "C409": "21450",
                "CC4C": "21450",
                "C749": "21450",
                "C509": "21450",
                "C449": "21450",
                "C790": "21450",
                "C791": "21450",
                "CC47": "21450",
                "CC4R": "21450",
                "C431": "21450",
                "C433": "21450",
                "C521": "21450",
                "C523": "21450",
                "W302": "18900",
                "RW43": "18900",
                "VW53": "18900",
                "WT34": "18900",
                "WT31": "18900",
                "WTH4": "18900",
                "B991": "21450",
                "B740": "21450",
                "B520": "21450",
                "QV36": "16800",
                "RV26": "16800",
                "QV16": "16800",
                "VW63": "16800",
                "VV56": "16800",
                "VR61": "16800",
                "BIBU08": "16800",
                "V641": "16800",
                "V500": "16800",
                "RV45": "16800",
                "F300": "21450",
                "F301": "21450",
                "F344": "21450",
                "F350": "21450",
                "FF35": "21450",
                "F371": "21450",
                "F3F5": "21450",
                "FV35": "21450",
                "F321": "21450",
                "F3G4": "21450",
                "F3GK": "21450",
                "F305": "21450",
                "F304": "21450",
                "L210": "16800",
                "LL23": "16800",
                "LV25": "16800",
                "L201": "16800",
                "LL24": "16800",
                "C800": "21450",
                "C802": "21450",
                "R710": "16800",
                "RL71": "16800",
                "RL72": "16800",
                "RN72": "16800",
                "RR47": "16800",
                "R7R7": "16800",
                "R7RR": "16800",
                "RV75": "16800",
                "RW73": "18900",
                "R7T2": "16800",
                "L300": "16800",
                "LL43": "16800",
                "NL2K": "16800",
                "NL24": "16800",
                "L391": "16800",
                "L722": "16800",
                "TRPU105": "16800",
                "LK74": "18900",
            }
            # An unknown (or unset) UCAS code leaves tuition_fee untouched.
            tuition_fee = tuition_feeDict.get(item['ucascode'])
            print("tuition_fee: ", tuition_fee)
            if tuition_fee is not None:
                item['tuition_fee'] = int(tuition_fee)
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            yield item
        except Exception as e:
            # Best-effort scraping: record the failure and the offending URL
            # in a per-university error file instead of aborting the crawl.
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 5

Mostrar archivo

Archivo: NottinghamTrentUniversity_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.ntu.ac.uk/"
        item['university'] = "Nottingham Trent University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===============================")
        # print(response.url)
        print(item['url'])
        try:
            # 专业、学位类型
            programme = response.xpath(
                "//h1[@class='course-heading page-heading']//text()").extract(
                )
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            degree_type = response.xpath(
                "//h2[@class='js_qualification']/strong//text()").extract()
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name'] = ", item['degree_name'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-1 active-tab']/p[3]/span
            location = response.xpath(
                "//span[@class='location save']//text()").extract()
            item['location'] = ''.join(location).strip()
            # print("item['location'] = ", item['location'])

            start_date = response.xpath(
                "//strong[contains(text(),'Starting:')]/following-sibling::span//text()"
            ).extract()
            # print(start_date)
            item['start_date'] = ''.join(start_date)
            # print("item['start_date'] = ", item['start_date'])
            item['start_date'] = getStartDate(item['start_date'])
            # print("item['start_date']1 = ", item['start_date'])

            # //html//div[@class='content']/div[1]/div  专业描述
            overview = response.xpath(
                "//div[@id='what-you-will-study']/preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # modules   课程设置
            modules = response.xpath(
                "//div[@id='what-you-will-study']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            # teaching_assessment   评估方式
            teaching_assessment = response.xpath(
                "//div[@id='how-youre-taught']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en'] = ", item['assessment_en'])

            # career   评估方式
            career = response.xpath(
                "//div[@id='careers-and-employability']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en'] = ", item['career_en'])

            # //div[@id='entry-requirements-1']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-0']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            print("item['apply_desc_en'] = ", item['apply_desc_en'])

            # //div[@id='entry-requirements-1']
            how_to_apply = response.xpath(
                "//div[@id='how-to-apply-1']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # //div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]
            interview_desc_en = response.xpath(
                "//div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]"
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en'] = ", item['interview_desc_en'])

            # deadline
            deadline = response.xpath(
                "//div[@id='how-to-apply-1']//p//strong[contains(text(),'Application closing date')]/../following-sibling::p[1]//text()|//div[@id='how-to-apply-1']//h3[contains(text(),'Application deadline')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(deadline)
            # print("deadline: ", deadline)
            deadline_str = ''.join(deadline)
            item['deadline'] = getStartDate(deadline_str)
            # print("item['deadline'] = ", item['deadline'])

            alevel = response.xpath(
                "//div[@id='entry-requirements-0']//li[contains(text(),'A-levels')]//text()"
            ).extract()
            clear_space(alevel)
            # print(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])

            if len(item['alevel']) > 160:
                item['alevel'] = ''.join(item['alevel'][:161])
            # print("item['alevel']1 = ", item['alevel'])

            # https://www.ntu.ac.uk/international/scholarships-and-fees/tuition-fees
            tuition_fee = response.xpath(
                "//html//div[@id='fees-and-funding-1']//text()").extract()
            clear_space(tuition_fee)
            # print(tuition_fee)
            tuition_fee = getTuition_fee(''.join(tuition_fee))
            item['tuition_fee'] = tuition_fee
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            departmentDict = {
                "Economics with Business":
                "Nottingham Business School",
                "Animal Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Applied Anthrozoology":
                "School of Animal, Rural and Environmental Sciences",
                "Biodiversity Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "Endangered Species Recovery and Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Performance":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Performance, Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Global Food Security and Development":
                "School of Animal, Rural and Environmental Sciences",
                "Architecture":
                "School of Architecture, Design and the Built Environment",
                "Architecture (ARB/RIBA Part 2)rch":
                "School of Architecture, Design and the Built Environment",
                "Building Surveying":
                "School of Architecture, Design and the Built Environment",
                "Civil Engineering":
                "School of Architecture, Design and the Built Environment",
                "Construction Management":
                "School of Architecture, Design and the Built Environment",
                "Construction Project Management (Online)":
                "School of Architecture, Design and the Built Environment",
                "Interior Architecture and Design":
                "School of Architecture, Design and the Built Environment",
                "International Real Estate Investment and Finance":
                "School of Architecture, Design and the Built Environment",
                "Planning and Development":
                "School of Architecture, Design and the Built Environment",
                "Project Management (Construction)":
                "School of Architecture, Design and the Built Environment",
                "Quantity Surveying":
                "School of Architecture, Design and the Built Environment",
                "Real Estate":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Management":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Materials":
                "School of Architecture, Design and the Built Environment",
                "Animation":
                "School of Art & Design",
                "Commercial Photography":
                "School of Art & Design",
                "Culture, Style and Fashion":
                "School of Art & Design",
                "Branding and Identity":
                "School of Art & Design",
                "Fashion Communications":
                "School of Art & Design",
                "Fashion Design":
                "School of Art & Design",
                "Fashion Knitwear Design":
                "School of Art & Design",
                "Fashion Marketing":
                "School of Art & Design",
                "Fine Art":
                "School of Art & Design",
                "Graphic Design":
                "School of Art & Design",
                "Illustration":
                "School of Art & Design",
                "International Fashion Management":
                "School of Art & Design",
                "Luxury Fashion Brand Management":
                "School of Art & Design",
                "Photography":
                "School of Art & Design",
                "Textile Design Innovation":
                "School of Art & Design",
                "Culture, Style and Fashion":
                "School of Art & Design",
                "Fashion Communications":
                "School of Art & Design",
                "Fashion Marketing":
                "School of Art & Design",
                "Fashion and Textile Design":
                "School of Art & Design",
                "Fine Art":
                "School of Art & Design",
                "Graphic Design Theory and Practice":
                "School of Art & Design",
                "International Fashion Management":
                "School of Art & Design",
                "Luxury Fashion Brand Management":
                "School of Art & Design",
                "Photography":
                "School of Art & Design",
                "PG Cert Creative Pattern Cutting (15 weeks)":
                "School of Art & Design",
                "Art and Design Professional Doctorate":
                "School of Art & Design",
                "Art and Design":
                "School of Art & Design",
                "Broadcast Journalism":
                "School of Arts and Humanities",
                "Digital and Newspaper Journalism":
                "School of Arts and Humanities",
                "Magazine Journalism":
                "School of Arts and Humanities",
                "Documentary Journalism":
                "School of Arts and Humanities",
                "Media and Globalisation":
                "School of Arts and Humanities",
                "Creative Writing":
                "School of Arts and Humanities",
                "English Literary Research":
                "School of Arts and Humanities",
                "Linguistics":
                "School of Arts and Humanities",
                "Philosophy":
                "School of Arts and Humanities",
                "History":
                "School of Arts and Humanities",
                "PGCert Museum and Heritage Development":
                "School of Arts and Humanities",
                "Holocaust and Genocide":
                "School of Arts and Humanities",
                "International Development":
                "School of Arts and Humanities",
                "English Language Teaching":
                "School of Arts and Humanities",
                "TESOL (Teaching English to Speakers of Other Languages)":
                "School of Arts and Humanities",
                "Management":
                "Nottingham Business School",
                "Management and Finance":
                "Nottingham Business School",
                "Management and Global Supply Chain Management":
                "Nottingham Business School",
                "Management and Innovation and Enterprise":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Management and Marketing":
                "Nottingham Business School",
                "Marketing":
                "Nottingham Business School",
                "Branding and Advertising":
                "Nottingham Business School",
                "Digital Marketing":
                "Nottingham Business School",
                "Management and Marketing":
                "Nottingham Business School",
                "fees, funding and scholarships":
                "Nottingham Business School",
                "Return to all courses":
                "Nottingham Business School",
                "Human resource Management":
                "Nottingham Business School",
                "Economics":
                "Nottingham Business School",
                "Economics and Investment Banking":
                "Nottingham Business School",
                "International Business":
                "Nottingham Business School",
                "International Business (Dual Award) ":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Management and International Publishing":
                "Nottingham Business School",
                "Management and Global Supply Chain Management":
                "Nottingham Business School",
                "Finance":
                "Nottingham Business School",
                "Finance and Accounting":
                "Nottingham Business School",
                "Finance and Investment Banking":
                "Nottingham Business School",
                "Management and Finance":
                "Nottingham Business School",
                "Economics and Investment Banking":
                "Nottingham Business School",
                "Entrepreneurship":
                "Nottingham Business School",
                "Project Management":
                "Nottingham Business School",
                "Management":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Marketing":
                "Nottingham Business School",
                "Branding and Advertising":
                "Nottingham Business School",
                "Finance":
                "Nottingham Business School",
                "International Business":
                "Nottingham Business School",
                "Assessment Only Route to QTS (Primary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Assessment Only Route to QTS (Secondary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training (Assessment Only) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Education":
                "Nottingham Institute of Education",
                "English Language Teaching":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with English and Literacy)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Mathematics and Numeracy)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Science, Engineering and Technology)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Special and Inclusive Practice)":
                "Nottingham Institute of Education",
                "Primary Education":
                "Nottingham Institute of Education",
                "Primary: School-Centred Initial Teacher Training (SCITT)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary salaried)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary salaried)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary)":
                "Nottingham Institute of Education",
                "Secondary Biology":
                "Nottingham Institute of Education",
                "Secondary Business Education":
                "Nottingham Institute of Education",
                "Secondary Chemistry":
                "Nottingham Institute of Education",
                "Secondary Computer Science with ICT":
                "Nottingham Institute of Education",
                "Secondary Education (Design and Technology)":
                "Nottingham Institute of Education",
                "Secondary Education (Physics)":
                "Nottingham Institute of Education",
                "Secondary English":
                "Nottingham Institute of Education",
                "Secondary Mathematics":
                "Nottingham Institute of Education",
                "Secondary Music":
                "Nottingham Institute of Education",
                "Special Educational Needs Coordination - National Award":
                "Nottingham Institute of Education",
                "Teaching English to Speakers of Other Languages (TESOL)":
                "Nottingham Institute of Education",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "Dual LLM in Corporate and Insolvency Law / European and Insolvency Law":
                "Nottingham Law School",
                "General Law":
                "Nottingham Law School",
                "Health Law and Ethics":
                "Nottingham Law School",
                "Human Rights and Justice":
                "Nottingham Law School",
                "Intellectual Property Law":
                "Nottingham Law School",
                "International Financial Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "Sports Law":
                "Nottingham Law School",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Legal Practice":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "Biomedical Science":
                "School of Science and Technology",
                "Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "Neuropharmacology":
                "School of Science and Technology",
                "Pharmacology":
                "School of Science and Technology",
                "Molecular Microbiology":
                "School of Science and Technology",
                "Biotechnology":
                "School of Science and Technology",
                "Molecular Cell Biology":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Biotechnology":
                "School of Science and Technology",
                "Cancer Biology":
                "School of Science and Technology",
                "Cell Biology":
                "School of Science and Technology",
                "Molecular Biology":
                "School of Science and Technology",
                "Molecular Microbiology":
                "School of Science and Technology",
                "Neuropharmacology":
                "School of Science and Technology",
                "Pharmacology":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Chemistry / Chemistry (Professional Practice)":
                "School of Science and Technology",
                "Pharmaceutical and Medicinal Science":
                "School of Science and Technology",
                "Pharmaceutical Analysis":
                "School of Science and Technology",
                "Analytical Chemistry":
                "School of Science and Technology",
                "Chemistry":
                "School of Science and Technology",
                "Advanced Materials Engineering":
                "School of Science and Technology",
                "Forensic Science":
                "School of Science and Technology",
                "Computer Science":
                "School of Science and Technology",
                "Cloud and Enterprise Computing":
                "School of Science and Technology",
                "IT Security":
                "School of Science and Technology",
                "Engineering (Electronics)":
                "School of Science and Technology",
                "Engineering (Cybernetics and Communications)":
                "School of Science and Technology",
                "Engineering Management":
                "School of Science and Technology",
                "Computing Systems":
                "School of Science and Technology",
                "Data Analytics for Business":
                "School of Science and Technology",
                "Computer Science":
                "School of Science and Technology",
                "Electronic Systems":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "Mathematical Sciences":
                "School of Science and Technology",
                "Data Analytics for Business":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "Medical and Materials Imaging":
                "School of Science and Technology",
                "Medical Imaging":
                "School of Science and Technology",
                "Physics":
                "School of Science and Technology",
                "Physics":
                "School of Science and Technology",
                "Sport Science":
                "School of Science and Technology",
                "Exercise Physiology":
                "School of Science and Technology",
                "Performance Nutrition":
                "School of Science and Technology",
                "Performance Analysis":
                "School of Science and Technology",
                "Biomechanics":
                "School of Science and Technology",
                "Sport and Exercise Psychology":
                "School of Science and Technology",
                "Psychology":
                "School of Social Sciences",
                "Applied Child Psychology":
                "School of Social Sciences",
                "sychological Research Methods":
                "School of Social Sciences",
                "Forensic Mental Health":
                "School of Social Sciences",
                "Forensic Psychology (BPS accredited)":
                "School of Social Sciences",
                "Cyberpsychology":
                "School of Social Sciences",
                "Psychology in Clinical Practice":
                "School of Social Sciences",
                "Psychological Wellbeing and Mental Health":
                "School of Social Sciences",
                "Criminology":
                "School of Social Sciences",
                "Sociology":
                "School of Social Sciences",
                "Politics":
                "School of Social Sciences",
                "International Relations":
                "School of Social Sciences",
                "Online International Relations (Distance learning)":
                "School of Social Sciences",
                "Public Health":
                "School of Social Sciences",
                "Career Development":
                "School of Social Sciences",
                "Social Work (January 2019 entry)":
                "School of Social Sciences",
            }
            item['department'] = departmentDict.get(item['programme_en'])
            if item['department'] == None:
                item['department'] = departmentDict.get(item['programme_en'])
                if item['department'] == None:
                    item['department'] = departmentDict.get(
                        item['programme_en'])
                    if item['department'] == None:
                        item['department'] = departmentDict.get(
                            item['programme_en'].replace(" ", " "))
            print("item['department'] = ", item['department'])

            # School of Animal, Rural and Environmental Sciences
            # School of Architecture, Design and the Built Environment
            # School of Art &amp; Design
            # School of Arts and Humanities
            # Nottingham Business School
            # Nottingham Institute of Education
            # Nottingham Law School
            # School of Science and Technology
            # School of Social Sciences
            if item['department'] is None:
                if "/animal-rural-environmental-sciences" in item['url']:
                    item[
                        'department'] = "School of Animal, Rural and Environmental Sciences"
                elif "/architecture-design-built-environment" in item['url']:
                    item[
                        'department'] = "School of Architecture, Design and the Built Environment"
                elif "/art-design" in item['url']:
                    item['department'] = "School of Art & Design"
                elif "/arts-humanities" in item['url']:
                    item['department'] = "School of Arts and Humanities"
                elif "/business" in item['url']:
                    item['department'] = "Nottingham Business School"
                elif "/education" in item['url']:
                    item['department'] = "Nottingham Institute of Education"
                elif "/law" in item['url']:
                    item['department'] = "Nottingham Law School"
                elif "/science-technology" in item['url']:
                    item['department'] = "School of Science and Technology"
                elif "/social-sciences" in item['url']:
                    item['department'] = "School of Social Sciences"
            print("item['department']1 = ", item['department'])

            if item['degree_name'] == "BA (Hons)":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
            elif item['department'] == "School of Art & Design" or item[
                    'department'] == "School of Animal, Rural and Environmental Sciences" or item[
                        'department'] == "School of Science and Technology":
                item['ielts'] = 6.0
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            elif item['department'] == "Nottingham Business School":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            elif item['department'] == "School of Architecture, Design and the Built Environment" or item[
                    'department'] == "School of Arts and Humanities" or item[
                        'department'] == "Nottingham Institute of Education" or item[
                            'department'] == "Nottingham Law School" or item[
                                'department'] == "School of Social Sciences" or item[
                                    'department'] == "School of Art & Design":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            # print("item['IELTS'] = %s item['IELTS_L'] = %s item['IELTS_S'] = %s item['IELTS_R'] = %s item['IELTS_W'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-1']//text()").extract()
            entry_requirementsStr = ''.join(entry_requirements)
            ielts = re.findall(r"IELTS.{1,200}", entry_requirementsStr)
            item['ielts_desc'] = ''.join(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts'] == None:
                ieltsDict = get_ielts(''.join(ielts))
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            item["require_chinese_en"] = remove_class(
                clear_lianxu_space([
                    """<h2>Entry requirements</h2>
                        <table id="table76765" style="width: 100%;"><thead><tr><th id="table76765r1c1"> Your qualification</th><th id="table76765r1c2"> You could study</th></tr></thead><tbody><tr><td headers="table76765r1c1">         High School Year 2<br />Grades of 70% and above       </td><td headers="table76765r1c2">         Foundation courses at <a href="https://www.kaplanpathways.com/colleges/nottingham-trent-international-college/courses/">Nottingham Trent International College (NTIC) </a></td></tr><tr><td headers="table76765r1c1">         High School Year 3<br />Grades of 80% and above       </td><td headers="table76765r1c2">         International Year One courses at NTIC       </td></tr><tr><td headers="table76765r1c1">         Completion of first year of Chinese university degree       </td><td headers="table76765r1c2">         First year bachelors degrees       </td></tr><tr><td headers="table76765r1c1">         Three year diploma or higher national diploma       </td><td headers="table76765r1c2">         Considered for final year entry to selected bachelors degrees or for Pre-Masters courses at <a href="https://www.kaplanpathways.com/colleges/nottingham-trent-international-college/courses/">Nottingham Trent International College</a></td></tr><tr><td headers="table76765r1c1">         Bachelors degree (four years or six years in medicine / dentistry) from recognised institution in China. <br />Grades of 75% or above<br />Grades of 70% or above from 211 universities       </td><td headers="table76765r1c2">         Postgraduate (Masters) courses       </td></tr><tr><td headers="table76765r1c1">         Masters degree from a recognised institution in China.<br />Grades of 70% or above       </td><td headers="table76765r1c2">         Postgraduate research       </td></tr></tbody></table><p>If you have questions about your qualification and it is not listed here, please <a href="mailto:[email protected]">contact us</a> for advice.</p>
"""
                ]))

            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS code(s):')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).replace(" / ",
                                                                "/").strip()
            print("item['ucascode']: ", item['ucascode'])

            duration = response.xpath(
                "//strong[contains(text(),'Course duration:')]/following-sibling::span//text()"
            ).extract()
            print("duration: ", duration)
            duration_str = ''.join(duration).replace("/ sandwich", "").strip()
            duration_list = getIntDuration(''.join(duration))
            # print("duration_list: ", duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            duration_per = item['duration_per']
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            if "/" in item['ucascode']:
                ucascode_sp = item['ucascode'].split("/")
                if "/" in duration_str:
                    duration_sp = duration_str.split("/")
                elif " or" in duration_str:
                    duration_sp = duration_str.split(" or")
                elif "," in duration_str:
                    duration_sp = duration_str.split(" or")
                else:
                    duration_sp = [duration_str, duration_str]
                print("ucascode_sp: ", ucascode_sp)
                print("duration_sp: ", duration_sp)
                if len(ucascode_sp) == 2:
                    item['ucascode'] = ucascode_sp[0]
                    duration_list = getIntDuration(duration_sp[0])
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    if item['duration'] == None:
                        item['duration'] = int(duration_sp[0].strip())
                        item['duration_per'] = duration_per
                    print("item['ucascode']1: ", item['ucascode'])
                    print("item['duration']1 = ", item['duration'])
                    print("item['duration_per']1 = ", item['duration_per'])
                    yield item

                    item['ucascode'] = ucascode_sp[-1]
                    duration_list = getIntDuration(duration_sp[-1].strip())
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    if item['duration'] == None:
                        item['duration'] = int(duration_sp[-1].replace(
                            "year", "").replace("(s)", "").strip())
                        item['duration_per'] = 1
                    print("item['ucascode']2: ", item['ucascode'])
                    print("item['duration']2 = ", item['duration'])
                    print("item['duration_per']2 = ", item['duration_per'])
                    yield item
            else:
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 6

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Edinburgh"
        # item['country'] = 'England'
        # item['website'] = 'https://www.ed.ac.uk/'
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            # 专业
            programme = response.xpath(
                "//h1[@itemprop='headline']//text()").extract()
            clear_space(programme)
            programme_en = ''.join(programme).strip()

            degree_name = re.findall(r"^.*?\s", programme_en)
            if len(degree_name) > 0:
                item['degree_name'] = degree_name[0].strip()
            print("item['degree_name']: ", item['degree_name'])

            item['programme_en'] = programme_en.replace(
                item['degree_name'], '').strip()
            print("item['programme_en']: ", item['programme_en'])

            department = response.xpath(
                "//div[@id='proxy_rightSummary']//p//span[contains(text(),'College:')]/../text()"
            ).extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            ucascode = response.xpath(
                "//span[contains(text(),'UCAS code:')]/../text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            duration = response.xpath(
                "//span[contains(text(),'Duration:')]/../text()").extract()
            clear_space(duration)
            # print("duration: ", duration)

            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # //div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']
            # location = response.xpath(
            #     "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']/text()").extract()
            # clear_space(location)
            item[
                'location'] = '33 Buccleuch Place, City, Edinburgh, Post Code. EH8 9JS'
            # print("item['location']: ", item['location'])

            # # //option[@value='0010']
            # start_date = response.xpath(
            #     "//select[@name='code2']//option//text()").extract()
            # clear_space(start_date)
            # if len(start_date) > 1:
            #     item['start_date'] = start_date[0].strip()
            # # print("item['start_date']: ", item['start_date'])
            # item['start_date'] = getStartDate(item['start_date'])
            # print("item['start_date'] = ", item['start_date'])

            overview = response.xpath(
                "//div[@id='proxy_introduction']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='proxy_collapseprogramme']
            modules = response.xpath(
                "//div[@id='proxy_collapseWhatStudy']/..").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(
                list(modules)))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//div[@id='proxy_collapseLearning']/..").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            career = response.xpath(
                "//div[@id='proxy_collapseCareers']/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //div[@id='proxy_collapseentry_req']
            # entry_requirements = response.xpath(
            #     "//div[@id='proxy_collapseentry_req']/..//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath(
                "//li[contains(text(),'A Levels:')]//text()|//p[contains(text(),'A levels:')]//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[-1]).strip()
            print("item['alevel'] = ", item['alevel'])

            # ib = response.xpath(
            #     "//html//ul[1]/li[3]/abbr[contains(text(),'IB')]/..//text()|//p[contains(text(),'IB:')]//text()").extract()
            ib = response.xpath(
                "//html//ul[3]/li[3]/abbr[contains(text(),'IB')]/..//text()|//p[contains(text(),'IB:')]//text()"
            ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib).strip()
            print("item['ib'] = ", item['ib'])

            IELTS = response.xpath(
                "//abbr[contains(text(),'IELTS')]/..//text()").extract()
            item['ielts_desc'] = ''.join(IELTS)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            TOEFL = response.xpath(
                "//abbr[contains(text(),'TOEFL')]/..//text()").extract()
            if len(TOEFL) == 0:
                TOEFL = response.xpath(
                    "//*[contains(text(),'TOEFL')]//text()").extract()
            item['toefl_desc'] = ''.join(TOEFL)
            # print("item['toefl_desc']: ", item['toefl_desc'])

            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            tuition_feeDict = {}
            tuition_fee_url = response.xpath(
                "//html//div[@id='proxy_collapseFees']//p[1]/a/@href").extract(
                )
            print("tuition_fee_url: ", tuition_fee_url)
            if len(tuition_fee_url) > 0:
                tuition_fee_url_str = tuition_fee_url[0]
                fee = self.parse_tuition_fee(tuition_fee_url_str)
                clear_space(fee)
                fee_re = re.findall(r"£\d+,\d+", ''.join(fee))
                print("fee_re: ", fee_re)
                item['tuition_fee'] = getTuition_fee(''.join(fee_re))
                item['tuition_fee_pre'] = "£"
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
                item['tuition_fee_pre'] = ""
            print("item['tuition_fee']: ", item['tuition_fee'])

            # https://www.ed.ac.uk/studying/international/country/asia/east-asia/china
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p class="lead">Undergraduate entry requirements for students from China.</p>


  <h2>Senior High School Certificate</h2>

<p>Students who have completed the Chinese Senior High School Certificate are required to undertake further study for entry to most subjects as this qualification does not normally meet our minimum entry requirements.</p>

<p>We accept the following qualifications for direct entry to our undergraduate degree programmes:</p>

<ul>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/ruk/a-levels" title="A Levels"><abbr title="General Certificate of Education">GCE</abbr> <abbr title="Advanced Level">A Levels</abbr></a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/international/ib" >International Baccalaureate</a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/scottish-qualifications/highers" title="SQA Highers and Advanced Highers">Scottish qualifications</a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/international/country/americas/united-states-of-america" title="United States of America"><abbr title="United States">US</abbr> qualifications</a></li>
</ul>

<p>Applicants with qualifications other than those listed above will usually be required to complete a Foundation Year before entering the University.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/applying/foundation" title="International Foundation Programme">Foundation year</a></p>

<h2>Science and Engineering</h2>

<p>For degree programmes in Science and Engineering, applicants who have completed a year of study at a leading Chinese University may be eligible to apply.</p>

<p>The College of Science &amp; Engineering will also give consideration to applicants who have achieved excellent results in the Chinese National University Entrance Examination (Gaokao) on an individual basis.</p>

<h2>Further guidance on academic entry requirements</h2>

<p>Each course may have further specific entry requirements. All applicants must meet these requirements. Staff in the Admissions Offices will be able to provide further guidance.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/undergraduate/contacts" title="Contact us with an enquiry about undergraduate study">Undergraduate admissions contacts</a></p>

<h2>English Language requirements</h2>

<p>If your first language is not English, you will also have to meet English Language requirements to apply. These requirements are listed by programme.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/english" title="English language requirements">English Language advice</a></p>

<p><a class="uoe-node-link uoe-published" href="/studying/undergraduate/degrees" title="Degree finder">Specific English language requirement by programme</a></p>

<h2>Contact us</h2>

<p>Edinburgh Global's representative for China is Esther Sum.</p>

<p>Esther will help you with admissions advice and support.</p>

<p><a href="mailto:[email protected]">Contact us by email - [email protected]</a></p>

<h2>Support in your country</h2>

<p><a class="uoe-node-link uoe-published" href="/studying/international/application/our-visits-overseas" title="Our visits overseas">View a list of our overseas visits</a></p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/agents/list/china" title="China">Our agents in your country</a></p>

<h2>Chat to us</h2>

<p>Talk to a member of staff online and view a presentation about study in Edinburgh.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/application/chat-to-us-online" title="Online information sessions">Chat to us</a></p>

<h2>Join our mailing list</h2>

<p>We will send you further useful information about the University, admissions and entry.</p>

<p><a href="http://r1.dotmailer-surveys.com/0127judf-2e1gig1f">Join our mailing list </a></p>

<h2>About Edinburgh</h2>

<p><a class="uoe-node-link uoe-published" href="/about" title="About">More information about Edinburgh</a></p>

<p><a class="uoe-node-link uoe-published" href="/global/immigration/applying-for-visa/visa-requirements" >Do I need a visa?</a></p>

<h2>Student numbers</h2>

<p>There are almost 3,000 students students from China currently studying at the University of Edinburgh.</p>
"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            item[
                'apply_proces_en'] = "https://www.ed.ac.uk/studying/undergraduate/applying"

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 7

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.uwl.ac.uk/"
        item['university'] = "University of West London"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            programmeDegreetype = response.xpath(
                "//h1[@id='page-title']//text()").extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype)

            degree_type = re.findall(r"^\w+\s\(Hons\)|^\(\w+\)|^\w+", programmeDegreetypeStr)
            # print("degree_type: ", degree_type)
            degree_type_str = ''.join(degree_type).strip()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            item['programme_en'] = programmeDegreetypeStr.replace(degree_type_str, '').strip()
            print("item['programme_en']: ", item['programme_en'])

            mode = response.xpath("//dt[contains(text(), 'Study mode')]/following-sibling::dd[1]//text()").extract()
            clear_space(mode)
            # print("mode: ", mode)

            location = response.xpath("//dt[contains(text(), 'Location')]/following-sibling::dd[1]//text()").extract()
            item['location'] = ''.join(location).replace("See location information", "").strip()
            # print("item['location']: ", item['location'])

            start_date = response.xpath("//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()").extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date']: ", item['start_date'])

            duration = response.xpath("//dt[contains(text(), 'Duration')]/following-sibling::dd[1]//text()").extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            ucascode = response.xpath(
                "//dt[contains(text(), 'UCAS code')]/following-sibling::dd[1]//text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode'] = ", item['ucascode'])

            department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            tuition_fee = response.xpath("//h4[contains(text(),'Overseas students')]/following-sibling::dl[1]//dt[contains(text(), 'Main fee')]/following-sibling::dd[1]//text()").extract()
            # print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='course-detail']
            modules = response.xpath("//div[@id='course-detail']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            # //div[@id='course-detail']
            entry_requirements = response.xpath("//div[@id='entry-requirements']//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath("//*[contains(text(),'A Level')]//text()|//*[contains(text(),'A level')]//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel']: ", item['alevel'])

            ielts_desc = response.xpath("//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()").extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            ielts_desc_re = re.findall(r'.{1,50}IELTS.{1,50}', ''.join(ielts_desc))
            # print("ielts_desc_re: ", ielts_desc_re)
            if len(ielts_desc_re) > 0:
                item['ielts_desc'] = ielts_desc_re[-1]
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            how_to_apply = response.xpath("//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            assessment_en = response.xpath("//h3[contains(text(),'Teaching methods')]/preceding-sibling::*[1]/following-sibling::*[position()<5]|"
                                           "//*[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<5]|"
                                           "//html//div/strong[contains(text(),'How will I be taught?')]/..|"
                                           "//strong[contains(text(),'course be assessed?')]/..|//strong[contains(text(),'course be assessed?')]/../following-sibling::div").extract()
            item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))
            print("item['assessment_en']: ", item['assessment_en'])

            career_en = response.xpath(
                "//div[@id='career-progression-and-study']|"
                "//div[@id='jobs-and-placements']|"
                "//html//*[contains(text(),'Career and study progression')]/../following-sibling::*[position()<5]").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en)).replace("<div></div>","").strip()
            print("item['career_en']: ", item['career_en'])

            overview_en = response.xpath("//div[@id='course-summary']/*[position()<last()]").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            item['require_chinese_en'] = remove_class(clear_lianxu_space(["""<h3>Undergraduate entry&nbsp;</h3>
<p>Applicants with the following&nbsp;qualiﬁcations&nbsp;will be considered for entry on an undergraduate degree:</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Chinese university / college entrance examination (Gaokao) with an overall average 65% or higher</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Diploma from specialised college (zhongzhuan) with an overall average of 65% or higher</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Graduation Certificate awarded at &lsquo;Zhuanke&rdquo; / &lsquo;Dazhuan&rsquo; level from universities / colleges with an overall 65% or higher</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Graduation Certificate - Sub degree (Gaozhi) with an overall 65% or above</p>
<p>&bull;&nbsp;&nbsp;&nbsp;&nbsp; British/International A Levels (from a minimum 112&nbsp;UCAS&nbsp;tariff points depending on course)</p>
<p>&bull; &nbsp; &nbsp;&nbsp;IB&nbsp;Diploma (from a minimum 25 points depending on course)</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Second and final year entry on a bachelor&#39;s degree may be available for those with a Higher National Diploma (HND) from the UK with a merit profile</p>
<p>&bull;&nbsp;&nbsp;&nbsp;&nbsp; A recognised foundation course</p>"""]))
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 8

Mostrar archivo

Archivo: UniversityOfBradford_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bradford"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        # if "ug/" in response.url:
        print("===========================")
        print(response.url)
        # 重定向匹配不了
        item['major_type1'] = response.meta.get(response.url)
        # print("item['major_type1']: ", item['major_type1'])
        item['location'] = 'Bradford West Yorkshire BD7 1DP UK'
        # print("item['location']: ", item['location'])
        try:
            key_url = response.url.split("/")[-2].strip()

            programme = response.xpath(
                "//div[@id='course-key-info']//div[@class='col-xs-12']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath("//p[@id='cAward']//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            mode = response.xpath(
                "//option[@value='fulltime']//text()|//span[@id='cAttendance']//text()|//span[@id='displayYear']//text()"
            ).extract()
            clear_space(mode)
            # item['teach_time'] = getTeachTime(''.join(mode))
            print("mode: ", mode)

            if "full" in ''.join(
                    mode).lower() and "Foundation" not in item['programme_en']:
                overview_en = response.xpath(
                    "//div[@id='overviewStripe']").extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en))
                # print("item['overview_en']: ", item['overview_en'])

                modules = response.xpath(
                    "//div[@id='course-curriculum']").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))
                # print("item['modules_en']: ", item['modules_en'])

                assessment_en = response.xpath(
                    "//div[@class='row stripe background--green']").extract()
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(assessment_en))
                # print("item['assessment_en']: ", item['assessment_en'])

                career_en = response.xpath(
                    "//div[@id='nav-course-career']").extract()
                item['career_en'] = remove_class(
                    clear_lianxu_space(career_en)).replace("<div></div>",
                                                           "").strip()
                # print("item['career_en']: ", item['career_en'])

                item[
                    'apply_proces_en'] = 'https://www.bradford.ac.uk/undergraduate/apply/'
                item['require_chinese_en'] = remove_class(
                    clear_lianxu_space([
                        """<div class="entryReq __undergraduate"><h3>Undergraduate</h3><table><tbody><tr><th rowspan="2">Qualification</th><th colspan="3">UCAS tariff points</th></tr><tr><th>136+</th><th>120 - 135</th><th>96 - 119</th></tr><tr><td>Senior Secondary School Graduation Certificate / 高中毕业证书</td><td colspan="3">Foundation Programme required</td></tr></tbody></table></div> """
                    ]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                department_dict = {
                    "Animation":
                    "Faculty of Engineering and Informatics",
                    "Biomedical Engineering":
                    "Faculty of Engineering and Informatics",
                    "Business Computing":
                    "Faculty of Engineering and Informatics",
                    "Chemical Engineering":
                    "Faculty of Engineering and Informatics",
                    "Civil and Structural Engineering":
                    "Faculty of Engineering and Informatics",
                    "Clinical Technology":
                    "Faculty of Engineering and Informatics",
                    "Computer Science":
                    "Faculty of Engineering and Informatics",
                    "Computer Science for Cyber Security":
                    "Faculty of Engineering and Informatics",
                    "Computer Science for Games":
                    "Faculty of Engineering and Informatics",
                    "Film and Television Production":
                    "Faculty of Engineering and Informatics",
                    "Film and Visual Effects Technology":
                    "Faculty of Engineering and Informatics",
                    "Game Design and Development":
                    "Faculty of Engineering and Informatics",
                    "Graphics for Games":
                    "Faculty of Engineering and Informatics",
                    "Mechanical Engineering":
                    "Faculty of Engineering and Informatics",
                    "Software Engineering":
                    "Faculty of Engineering and Informatics",
                    "Virtual and Augmented Reality":
                    "Faculty of Engineering and Informatics",
                    "MPhysiotherapy - Sport and Exercise Medicine MPhysio":
                    "Faculty of Health Studies",
                    "Nursing (Adult)":
                    "Faculty of Health Studies",
                    "Nursing (Adult) – Harrogate and District NHS Trust":
                    "Faculty of Health Studies",
                    "Nursing (Mental Health)":
                    "Faculty of Health Studies",
                    "Occupational Therapy":
                    "Faculty of Health Studies",
                    "Physiotherapy":
                    "Faculty of Health Studies",
                    "Public Health and Community Wellbeing":
                    "Faculty of Health Studies",
                    "Archaeology":
                    "Faculty of Life Sciences",
                    "Biomedical Science":
                    "Faculty of Life Sciences",
                    "Certificate of International Foundation Studies":
                    "Faculty of Life Sciences",
                    "Chemistry":
                    "Faculty of Life Sciences",
                    "Chemistry - Analytical Chemistry":
                    "Faculty of Life Sciences",
                    "Chemistry - Industrial Experience":
                    "Faculty of Life Sciences",
                    "Chemistry - Materials Chemistry":
                    "Faculty of Life Sciences",
                    "Chemistry - Mathematical and Computational Chemistry":
                    "Faculty of Life Sciences",
                    "Chemistry - Medicinal Chemistry":
                    "Faculty of Life Sciences",
                    "Clinical Sciences":
                    "Faculty of Life Sciences",
                    "Forensic and Medical Sciences":
                    "Faculty of Life Sciences",
                    "Forensic Archaeology and Anthropology":
                    "Faculty of Life Sciences",
                    "Forensic Science":
                    "Faculty of Life Sciences",
                    "Foundation in Clinical Sciences/Medicine":
                    "Faculty of Life Sciences",
                    "Optometry":
                    "Faculty of Life Sciences",
                    "Pharmacy":
                    "Faculty of Life Sciences",
                    "Pharmacy 5 years (including pre-registration training)":
                    "Faculty of Life Sciences",
                    "Accounting and Finance":
                    "Faculty of Management, Law & Social Sciences",
                    "Business and Management":
                    "Faculty of Management, Law & Social Sciences",
                    "Business Studies and Law":
                    "Faculty of Management, Law & Social Sciences",
                    "Economics":
                    "Faculty of Management, Law & Social Sciences",
                    "Finance and Economics":
                    "Faculty of Management, Law & Social Sciences",
                    "Human Resource Management":
                    "Faculty of Management, Law & Social Sciences",
                    "International Business and Management":
                    "Faculty of Management, Law & Social Sciences",
                    "Law":
                    "Faculty of Management, Law & Social Sciences",
                    "Law (Commercial Law)":
                    "Faculty of Management, Law & Social Sciences",
                    "Law (Criminal Law)":
                    "Faculty of Management, Law & Social Sciences",
                    "Law (Social Justice)":
                    "Faculty of Management, Law & Social Sciences",
                    "Law with Business and Management":
                    "Faculty of Management, Law & Social Sciences",
                    "Marketing":
                    "Faculty of Management, Law & Social Sciences",
                    "Criminology and Criminal Behaviour":
                    "Faculty of Management, Law & Social Sciences",
                    "Psychology":
                    "Faculty of Management, Law & Social Sciences",
                    "Psychology with Counselling":
                    "Faculty of Management, Law & Social Sciences",
                    "Social Work":
                    "Faculty of Management, Law & Social Sciences",
                    "Sociology":
                    "Faculty of Management, Law & Social Sciences",
                    "Working with Children, Young People and Families":
                    "Faculty of Management, Law & Social Sciences",
                }
                item['department'] = department_dict.get(
                    item['programme_en'].strip())
                print("item['department']: ", item['department'])

                # 将Full-time的情况获取duration、ucascode、alevel、ib、tuition_fee字段
                # # 1.duration
                # # https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/biomedical-engineering-beng/&duration=duration&level=ug&year=y2019&attendance=fulltime
                # duration_url = "https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/" + key_url + "/&duration=duration&level=ug&year=y2019&attendance=fulltime"
                # # print("duration_url: ", duration_url)
                # duration = json.loads(requests.get(duration_url).text).get("data")
                # # print("duration: ", duration)
                # if duration != None:
                #     duration_list = getIntDuration(''.join(duration))
                #     if len(duration_list) == 2:
                #         item['duration'] = duration_list[0]
                #         item['duration_per'] = duration_list[-1]
                # # print("item['duration'] = ", item['duration'])
                # # print("item['duration_per'] = ", item['duration_per'])
                #
                # # 2.ucascode
                # # https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/biomedical-engineering-beng/&ucasCode=ucasCode&level=ug&year=y2019&attendance=fulltime
                # ucascode_url = "https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/" + key_url + "/&ucasCode=ucasCode&level=ug&year=y2019&attendance=fulltime"
                # # print("ucascode_url: ", ucascode_url)
                # ucascode = json.loads(requests.get(ucascode_url).text).get("data")
                # # print("ucascode: ", ucascode)
                # if ucascode is not None:
                #     item['ucascode'] = ''.join(ucascode).strip()
                # # print("item['ucascode']: ", item['ucascode'])
                #
                #
                # # 3.alevel、ib
                # # https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/biomedical-engineering-beng/&entry=entry&level=ug&year=y2019&attendance=filltime
                # entry_url = "https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/" + key_url + "/&entry=entry&level=ug&year=y2019&attendance=fulltime"
                # # print("entry_url: ", entry_url)
                # entry = json.loads(requests.get(entry_url).text).get("data")
                # # print("entry: ", entry)
                # if entry is not None:
                #     entry_response = etree.HTML(entry)
                #     alevel = entry_response.xpath("//strong[contains(text(),'A levels')]/../../following-sibling::div//text()")
                #     # print("alevel: ", alevel)
                #     item['alevel'] = clear_lianxu_space(alevel)
                #
                #     ib = entry_response.xpath(
                #         "//strong[contains(text(),'International Baccalaureate requirements')]/../../following-sibling::div//text()")
                #     # print("ib: ", ib)
                #     item['ib'] = clear_lianxu_space(ib)
                #
                #     ielts_desc = entry_response.xpath(
                #         "//strong[contains(text(),'English language requirements')]/../../following-sibling::div/p[1]//text()")
                #     item['ielts_desc'] = clear_lianxu_space(ielts_desc)
                #     print("item['ielts_desc']: ", item['ielts_desc'])
                #
                #     ielts_dict = get_ielts(item['ielts_desc'])
                #     item['ielts'] = ielts_dict.get('IELTS')
                #     item['ielts_l'] = ielts_dict.get('IELTS_L')
                #     item['ielts_s'] = ielts_dict.get('IELTS_S')
                #     item['ielts_r'] = ielts_dict.get('IELTS_R')
                #     item['ielts_w'] = ielts_dict.get('IELTS_W')
                #     print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
                #
                # # 4.tuition_fee
                # # https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/biomedical-engineering-beng/&fees=fees&level=ug&year=y2019&attendance=fulltime
                # tuition_fee_url = "https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/" + key_url + "/&fees=fees&level=ug&year=y2019&attendance=fulltime"
                # # print("tuition_fee_url: ", tuition_fee_url)
                # tuition_fee = json.loads(requests.get(tuition_fee_url).text).get("data")
                # # print("tuition_fee: ", tuition_fee)
                # if tuition_fee is not None:
                #     tuition_fee_response = etree.HTML(tuition_fee)
                #     tuition_fee_str = tuition_fee_response.xpath("//div[@id='tuitionFees']//p[contains(text(),'International:')]//text()")
                #     # print("tuition_fee_str: ", tuition_fee_str)
                #     tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee_str))
                #
                #     if len(tuition_fee_re) > 0:
                #         item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                # # print("item['tuition_fee']: ", item['tuition_fee'])

                is_full_sand = response.xpath(
                    "//select[@id='variant-attendance-mode']/option//text()"
                ).extract()
                clear_space(is_full_sand)
                item['other'] = ','.join(is_full_sand).strip().strip(
                    ',').strip()
                print("is_full_sand: ", is_full_sand)

                if len(is_full_sand) > 0:
                    for f in is_full_sand:
                        if f == "Full-time":
                            mode = 'fulltime'
                            detail_dict = self.parse_detail(key_url, mode)
                            item['duration'] = detail_dict.get('duration')
                            item['duration_per'] = detail_dict.get(
                                'duration_per')
                            item['ucascode'] = detail_dict.get('ucascode')
                            item['alevel'] = detail_dict.get('alevel')
                            item['ib'] = detail_dict.get('ib')
                            item['ielts_desc'] = detail_dict.get('ielts_desc')
                            item['tuition_fee'] = detail_dict.get(
                                'tuition_fee')
                            ielts_dict = get_ielts(item['ielts_desc'])
                            item['ielts'] = ielts_dict.get('IELTS')
                            item['ielts_l'] = ielts_dict.get('IELTS_L')
                            item['ielts_s'] = ielts_dict.get('IELTS_S')
                            item['ielts_r'] = ielts_dict.get('IELTS_R')
                            item['ielts_w'] = ielts_dict.get('IELTS_W')
                            print(
                                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                                % (item['ielts'], item['ielts_l'],
                                   item['ielts_s'], item['ielts_r'],
                                   item['ielts_w']))
                        elif f == "Full-time with Sandwich Year":
                            mode = 'sandwich'
                            detail_dict = self.parse_detail(key_url, mode)
                            item['duration'] = detail_dict.get('duration')
                            item['duration_per'] = detail_dict.get(
                                'duration_per')
                            item['ucascode'] = detail_dict.get('ucascode')
                            item['alevel'] = detail_dict.get('alevel')
                            item['ib'] = detail_dict.get('ib')
                            item['ielts_desc'] = detail_dict.get('ielts_desc')
                            item['tuition_fee'] = detail_dict.get(
                                'tuition_fee')
                            ielts_dict = get_ielts(item['ielts_desc'])
                            item['ielts'] = ielts_dict.get('IELTS')
                            item['ielts_l'] = ielts_dict.get('IELTS_L')
                            item['ielts_s'] = ielts_dict.get('IELTS_S')
                            item['ielts_r'] = ielts_dict.get('IELTS_R')
                            item['ielts_w'] = ielts_dict.get('IELTS_W')
                            print(
                                "===item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                                % (item['ielts'], item['ielts_l'],
                                   item['ielts_s'], item['ielts_r'],
                                   item['ielts_w']))
                        yield item
                else:
                    yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 9

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.plymouth.ac.uk/"
        item['university'] = "University of Plymouth"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===============================")
        print(response.url)
        try:
            # //span[@class='course-title']
            programme = response.xpath(
                "//span[@class='course-title']//text()").extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            degree_type = response.xpath(
                "//h1[@class='hero-heading']/text()").extract()
            clear_space(degree_type)
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name'] = ", item['degree_name'])

            ucascode = response.xpath("//td[contains(text(),'UCAS course code')]/following-sibling::td//text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            department = response.xpath(
                "//h2[@class='school-title']//text()").extract()
            clear_space(department)
            item['department'] = ''.join(department)
            # print("item['department'] = ", item['department'])

            # 课程长度
            duration = response.xpath("//td[contains(text(),'Duration')]/following-sibling::td//text()").extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            # location
            location = response.xpath("//td[contains(text(),'Location')]/following-sibling::td//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            # print("item['location'] = ", item['location'])

            # overview
            overview1 = response.xpath("//div[@class='overview']|//div[@id='key-features-accordion']").extract()
            # overview2 = response.xpath("//div[@id='key-features-accordion']").extract()
            # overview = remove_class(clear_lianxu_space(overview1)) + remove_class(clear_lianxu_space(overview2))
            item['overview_en'] = remove_class(clear_lianxu_space(overview1))
            # if item['overview_en'] == '':
            #     print("***overview")
            # print("item['overview_en'] = ", item['overview_en'])

            # modules
            modules = response.xpath("//div[@id='structure-accordion']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            career_en = response.xpath("//div[contains(@id, 'career')]").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            print("item['career_en'] = ", item['career_en'])

            # entry_requirements
            entry_requirements = response.xpath("//div[@id='entry-requirements-accordion']//text()").extract()
            clear_space(entry_requirements)
            entry_requirements_str = ''.join(entry_requirements)
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            # .{1,150}IELTS.{1,150}
            IELTS = re.findall(r"IELT.{1,80}|ILETS.{1,80}", entry_requirements_str)
            print("IELTS: ", IELTS)
            if len(IELTS) != 0:
                ielts = ''.join(list(IELTS[0])).strip()
                item['ielts_desc'] = ielts
            print("item['ielts_desc'] = ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            if item['ielts'] != None:
                item['ielts'] = item['ielts'].strip(".").strip()
            if item['ielts_l'] != None:
                item['ielts_l'] = item['ielts_l'].strip(".").strip()
            if item['ielts_s'] != None:
                item['ielts_s'] = item['ielts_s'].strip(".").strip()
            if item['ielts_r'] != None:
                item['ielts_r'] = item['ielts_r'].strip(".").strip()
            if item['ielts_w'] != None:
                item['ielts_w'] = item['ielts_w'].strip(".").strip()
            print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
                    item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # alevel = response.xpath("//div[@id='entry-requirements-accordion']//*[contains(text(),'A Level')]/..//text()|"
            #                         "//div[@id='entry-requirements-accordion']//*[contains(text(),'A level')]//text()").extract()
            alevel = re.findall(r"A.*Level.{1,100}|A.*level.{1,100}", entry_requirements_str)
            clear_space(alevel)
            item['alevel'] = ''.join(alevel).strip()
            if item['alevel'] == '':
                print("***alevel")
            print("item['alevel'] = ", item['alevel'])
            if len(item['alevel']) > 300:
                item['alevel'] = ''.join(item['alevel'][:301])
            print("item['alevel']1 = ", item['alevel'])


            # ib = response.xpath("//div[@id='entry-requirements-accordion']//b[contains(text(),'International Baccalaureate')]/..//text()|"
            #                     "//div[@id='entry-requirements-accordion']//b[contains(text(),'IB')]/..//text()|"
            #                     "//div[@id='entry-requirements-accordion']//b[contains(text(),'International baccalaureate')]/..//text()").extract()
            # if len(ib) == 0:
            #     ib = response.xpath("//div[@id='entry-requirements-accordion']//b[contains(text(),'IB')]/../following-sibling::*[1]//text()").extract()
            #     if len(ib) == 0:
            ib = re.findall(r"IB.{1,100}|International.*Baccalaureate.{1,100}|International.*baccalaureate.{1,100}", entry_requirements_str)
            clear_space(ib)
            item['ib'] = ''.join(ib).strip()
            if item['ib'] == '':
                print("***ib")
            print("item['ib'] = ", item['ib'])

            # how_to_apply
            how_to_apply = response.xpath("//div[@id='how-to-apply-accordion']").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # //html//div[@class='course-accordions']//tr[3]/td[3]
            # how_to_apply
            tuition_fee = response.xpath("//strong[contains(text(),'International')]/../following-sibling::*[2]//text()|"
                                         "//div[@id='fees-funding-accordion']//table[1]//td//text()").extract()
            clear_space(tuition_fee)
            print("tuition_fee: ", tuition_fee)
            tuition_fee_str = ''.join(tuition_fee)
            if len(tuition_fee)>0:
                item['tuition_fee'] = getTuition_fee(tuition_fee_str)
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']1 = ", item['tuition_fee'])
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
                item['tuition_fee_pre'] = ""
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            # https://www.plymouth.ac.uk/international/study/international-students-country-guides/asia/china
            item['require_chinese_en'] = remove_class(clear_lianxu_space(["""<p><b>Undergraduate</b></p><p>To apply for our undergraduate courses you'll need good grades in your&nbsp;
高中毕业证书
 Senior High School Graduation Examination and the&nbsp;
高考
 Chinese University Entrance Examination (Gaokao) for admission to year 1.&nbsp;</p><p>
Applicants who have completed the 专科毕业证书 &nbsp;Graduation Certificate - Specialist / Sub-degree (Zuanke) level (also known as the &nbsp;大专 Dazhuan) will be considered for final year (top-up) entry. We generally require an overall 70 per cent grade or above but this will vary depending on the institution.&nbsp;Contact us for more information: <a href="mailto:[email protected]">[email protected]</a>
</p><p>If you're a high school leaver, our partner college on campus, <a href="http://www.plymouth.ac.uk/international/plymouth-university-international-college">Plymouth University International College (PUIC)</a>, offers a wide variety of foundation courses. &nbsp;
</p><p><div class="table-responsive">
  <table class="table align-left">
      <tr>
            <td><b>A level</b> </td>
            <td> <b>UCAS tariff</b> </td>
            <td><b>Gaokao - percentage</b>&nbsp;</td>
            <td><b>Gaokao - overall grade</b>&nbsp;</td>
            <td><b>Gaokao - GPA</b>&nbsp;</td>
            <td> </td>
      </tr>
      <tr>
            <td> AAA </td>
            <td> 144 </td>
            <td> 90 – 100 </td>
            <td> A </td>
            <td> 4.0 </td>
            <td> </td>
      </tr>
      <tr>
            <td> BBB </td>
            <td> 120 </td>
            <td> 78 – 81 </td>
            <td> B </td>
            <td> 3.0 </td>
            <td> </td>
      </tr>
      <tr>
            <td> CCC </td>
            <td> 96 </td>
            <td> 70 – 71 </td>
            <td> C </td>
            <td> 2.0 </td>
            <td> </td>
      </tr>
</table>"""]))
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/"+item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 10

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.cardiffmet.ac.uk/"
        item['university'] = "Cardiff Metropolitan University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = 'Llandaff Campus, Western Avenue, Cardiff, CF5 2YB'
        # print("item['location'] = ", item['location'])
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            # 专业、学位类型
            programmeDegreetype = response.xpath("//div[@id='ordercontainer']/span[@id='DeltaPlaceHolderMain']/div[@class='coursefullwidth']/div/h1//text()").extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            if len(programmeDegreetype) == 0:
                programmeDegreetype = response.xpath(
                    "//div[@class='cstcoursetitle']/h1//text()").extract()

            clear_space(programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype).strip()
            if programmeDegreetypeStr == "":
                programmeDegreetypeStr = item['major_type1']
            # print("programmeDegreetypeStr: ", programmeDegreetypeStr)
            programmeDegreetypesplit = programmeDegreetypeStr.split("-")
            # print(programmeDegreetypesplit)
            if len(programmeDegreetypesplit) > 1:
                degreetype = programmeDegreetypesplit[-1]
                # print(degreetype)
                item['degree_name'] = degreetype
                programme = programmeDegreetypesplit[0]
                # print(programme)
                item['programme_en'] = ''.join(programme)
            else:
                programme = programmeDegreetypesplit[0]
                # print(programme)
                item['programme_en'] = ''.join(programme)
            item['degree_name'] = item['degree_name'].replace("(Hons)", "").replace("Degree", "").replace(" s", "").replace("(Joint Honours)", "").replace("(Franchised)", "").strip()
            print("item['degree_name']: ", item['degree_name'])
            if "(Top-Up)" in item['major_type1']:
                item['programme_en'] = item['programme_en'] + "-Up)"
            print("item['programme_en']: ", item['programme_en'])

            if "Foundation" not in item['major_type1']:
                department = response.xpath("//div[@class='crumbcontainer']/span/span[1]/a[1]//text()").extract()
                clear_space(department)
                item['department'] = ''.join(department)
                # print("item['department'] = ", item['department'])

                duration = response.xpath(
                    "//strong[contains(text(),'Course Length:')]/..//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                duration_list = getIntDuration(''.join(duration).replace("Course Length:", "").strip())
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                # //div[@id='ordercontainer']/span[@id='DeltaPlaceHolderMain']/div[@class='coursefullwidth']/div[@class='rightcontainer']/div[@class='coursefacts']/div/div//p
                overview = response.xpath(
                    "//div[@id='ordercontainer']/span[@id='DeltaPlaceHolderMain']/div[@class='coursefullwidth']/div[@class='coursecontentarea']/div[@class='courseoverview']").extract()
                item['overview_en'] = remove_class(clear_lianxu_space(overview))
                # print("item['overview_en']: ", item['overview_en'])
                # if item['overview_en'] == "":
                #     print("****111****")

                modules_en = response.xpath(
                    "//h3[contains(text(),'Course Content')]|//h3[contains(text(),'Course Content')]/following-sibling::div[1]").extract()
                if len(modules_en) == 0:
                    modules_en = response.xpath("//h3[contains(text(),'Course content')]/..").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
                # print("item['modules_en']: ", item['modules_en'])
                # if item['modules_en'] == "":
                #     print("****111****")

                assessment_en = response.xpath(
                    "//h3[contains(text(),'Learning & Teaching')]|//h3[contains(text(),'Learning & Teaching')]/following-sibling::div[1]|"
                    "//h3[contains(text(),'Assessment')]|//h3[contains(text(),'Assessment')]/following-sibling::div[1]").extract()
                item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))
                # print("item['assessment_en']: ", item['assessment_en'])
                # if item['assessment_en'] == "":
                #     print("****111****")

                career_en = response.xpath(
                    "//h3[contains(text(),'Employability & Careers')]|//h3[contains(text(),'Employability & Careers')]/following-sibling::div[1]").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career_en))
                # print("item['career_en']: ", item['career_en'])
                # if item['career_en'] == "":
                #     print("****111****")

                alevel = response.xpath(
                    "//*[contains(text(),'A levels')]//text()|"
                    "//*[contains(text(),'A Levels')]//text()").extract()
                item['alevel'] = clear_lianxu_space(alevel)
                # print("item['alevel']: ", item['alevel'])

                ib = response.xpath(
                    "//strong[contains(text(),'International Baccalaureate:')]/../text()").extract()
                item['ib'] = clear_lianxu_space(ib)
                # print("item['ib']: ", item['ib'])

                rntry_requirements = response.xpath(
                    "//h3[contains(text(),'Entry Requirements')]/following-sibling::div[1]//text()").extract()
                if len(rntry_requirements) == 0:
                    rntry_requirements = response.xpath(
                        "//h3[contains(text(),'Entry Requirements')]/following-sibling::div[1]//text()").extract()
                    if len(rntry_requirements) == 0:
                        rntry_requirements = response.xpath(
                            "//h3[contains(text(),'Entry Requirements & How to Apply')]/following-sibling::div[1]//text()").extract()
                rntry_requirements = clear_lianxu_space(rntry_requirements)
                # print("item['rntry_requirements']: ", item['rntry_requirements'])

                ielts = re.findall(r"IELTS.{1,80}", rntry_requirements)
                clear_space(ielts)
                # print("ielts: ", ielts)
                if len(ielts) > 0:
                    item['ielts_desc'] = ielts[0]
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'])
                # if len(ielts_list) == 1:
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))


                interview_desc_en = response.xpath("//*[contains(text(),'interview')]").extract()
                if len(interview_desc_en) == 0:
                    interview_desc_en = re.findall(r".{1,100}interview.{1,100}", rntry_requirements)
                item['interview_desc_en'] = remove_class(clear_lianxu_space(interview_desc_en)).strip()
                # print("item['interview_desc_en']: ", item['interview_desc_en'])

                # http://www.cardiffmet.ac.uk/international/study/applying/Pages/Fees-and-Money-Matters.aspx
                item['tuition_fee'] = '12000'
                item['apply_proces_en'] = 'http://www.cardiffmet.ac.uk/study/adviceforapplicants/undergraduate/Pages/default.aspx'
                # print("item['apply_proces_en']: ", item['apply_proces_en'])

                ucascode = response.xpath("//strong[contains(text(),'UCAS Code')]/following-sibling::*[1]//strong//text()|"
                                          "//strong[contains(text(),'UCAS Code')]/..//text()|"
                                          "//strong[contains(text(),'UCAS Code')]/../following-sibling::p//text()|"
                                          "//strong[contains(text(),'UCAS Code')]/../following-sibling::p//text()").extract()

                clear_space(ucascode)
                print("ucascode: ", ucascode)
                item['other'] = ' '.join(ucascode).strip()
                print("item['other'] = ", item['other'])

                ucascode_re = re.findall(r"UCAS\sCode:\w{4}|UCAS\sCodes:\w{4}|UCAS\sCodes:\s\w{4}", ''.join(ucascode).strip())
                print("ucascode_re: ", ucascode_re)
                if len(ucascode_re) > 0:
                    item['ucascode'] = ''.join(ucascode_re).replace("UCAS Code:", "").replace("UCAS Codes:", "").strip()
                print("item['ucascode'] = ", item['ucascode'])

                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 11

Mostrar archivo

Archivo: KingCollegeLondon_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.kcl.ac.uk/"
        item['university'] = "King's College London"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Strand, London. WC2R 2LS, United Kingdom"
        print("===============================")
        print(response.url)
        try:
            # //div[@id='container']/div[@class='hero clearfix']/div[@class='wrapper']/div[@class='inner']/h1
            # 专业、学位类型
            programmeDegree = response.xpath(
                "//div[@id='container']/div[@class='hero clearfix']/div[@class='wrapper']/div[@class='inner']/h1//text()"
            ).extract()
            clear_space(programmeDegree)
            programmeDegreeStr = ''.join(programmeDegree)
            print(programmeDegreeStr)
            degree_type = re.findall(
                r"(\s\w+)$|(\s\w+\s\(.*\))$|(\s\w+/\w+)$|(\s\w+/\w+/\w+)$",
                programmeDegreeStr)
            if len(degree_type) > 0:
                degree_type = list(degree_type[0])
            while '' in degree_type:
                degree_type.remove('')
            print("degree_type = ", degree_type)
            item['degree_name'] = ''.join(degree_type).strip()
            programme = programmeDegreeStr.replace(item['degree_name'],
                                                   '').strip()
            item['programme_en'] = programme
            print("item['degree_name'] = ", item['degree_name'])
            print("item['programme_en'] = ", item['programme_en'])

            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS code')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            if "," in item['ucascode']:
                item['ucascode'] = item['ucascode'].split(',')[0].strip()
            print("item['ucascode']: ", item['ucascode'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-1 active-tab']/p[2]/span
            duration = response.xpath(
                "//strong[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            durationStr = ''.join(duration)
            print(durationStr)
            # duration_re = re.findall(r"([a-zA-Z0-9]+\s)(year|month|week){1}", durationStr, re.I)
            duration_re = re.findall(
                r"([a-zA-Z0-9\.]+\s)(year|month|week|yr|yft){1}|([0-9\.]+)(yr|yft|\-month){1}",
                durationStr, re.I)
            # print(duration_re)
            d_dict = {
                "One": "1",
                "Two": "2",
                "Three": "3",
                "Four": "4",
                "Five": "5",
                "Six": "6",
                "Seven": "7",
                "Eight": "8",
                "Nine": "9",
                "Ten": "10",
                "one": "1",
                "two": "2",
                "three": "3",
                "four": "4",
                "five": "5",
                "six": "6",
                "seven": "7",
                "eight": "8",
                "nine": "9",
                "ten": "10",
            }
            if len(duration_re) > 0:
                d_int = re.findall(r"\d+", ''.join(duration_re[0]))
                if len(d_int) > 0:
                    item['duration'] = int(''.join(d_int))
                else:
                    d = re.findall(
                        r"(One)|(Two)|(Three)|(Four)|(Five)|(Six)|(Seven)|(Eight)|(Nine)|(Ten)|(one)|(two)|(three)|(four)|(five)|(six)|(seven)|(eight)|(nine)|(ten)",
                        ', '.join(duration_re[0]))
                    # print("d = ", d)
                    if len(d) > 0:
                        item['duration'] = int(
                            d_dict.get(''.join(d[0]).strip()))
                if "y" in ''.join(duration_re[0]) or "Y" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 1
                elif "m" in ''.join(duration_re[0]) or "M" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 3
                elif "w" in ''.join(duration_re[0]) or "W" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 4
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-2']
            includeDepartment = response.xpath(
                "//div[@class='tab tab-2']//p[contains(text(), 'Faculty')]/span//text()"
            ).extract()
            if len(includeDepartment) == 0:
                includeDepartment = response.xpath(
                    "//p[contains(text(), 'Department')]/span//text()"
                ).extract()
            clear_space(includeDepartment)
            item['department'] = ''.join(includeDepartment).strip()
            print("item['department'] = ", item['department'])

            # //div[@id='coursepage-overview']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate']
            overview = response.xpath(
                "//div[@id='coursepage-overview']/div[@class='wrapper clearfix']/div[1]"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            print("item['overview_en'] = ", item['overview_en'])

            # //div[@id='coursepage-course-detail']/div[@class='wrapper clearfix']/div
            # modules = response.xpath("//h3[contains(text(),'Course format and assessment')]/preceding-sibling::*").extract()
            modules = response.xpath(
                "//div[@id='coursepage-course-detail']/div[@class='wrapper clearfix']/div[@class='inner right lop-to-measure']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Course format and assessment')]/preceding-sibling::*[1]/following-sibling::*|"
                "//h3[contains(text(),'Course Structure & Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]|"
                "//h3[contains(text(),'Teaching style')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]|"
                "//*[contains(text(),'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-3]|"
                "//*[contains(text(),'TEACHING')]/../preceding-sibling::*[1]/following-sibling::*[position()<last()-3]|"
                "//*[contains(text(),'Teaching')]/../preceding-sibling::*[1]/following-sibling::*[position()<last()-3]|//strong[contains(text(),'Teaching')]/../preceding-sibling::*[1]/following-sibling::*|"
                "//b[contains(text(),'Teaching')]/../preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            print("item['assessment_en'] = ", item['assessment_en'])

            alevel = response.xpath(
                "//div[@class='requirements EntryReqs_UKALevel clearfix']//b[contains(text(),'Required grades')]/../following-sibling::p[1]//text()"
            ).extract()
            if len(alevel) == 0:
                alevel = response.xpath(
                    "//strong[contains(text(),'A-Level')]/../following-sibling::td[1]//text()"
                ).extract()
                if len(alevel) == 0:
                    alevel = response.xpath(
                        "//div[@class='requirements EntryReqs_UKALevel clearfix']//div[@class='required-grades']//text()//text()"
                    ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel).strip()[:160]
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                # "//div[@class='requirements EntryReqs_UKIB clearfix']//b[contains(text(),'Required grades')]/../following-sibling::p[1]//text()|"
                "//div[@class='requirements EntryReqs_UKIB clearfix']//div[@class='required-grades']//text()"
            ).extract()
            if len(ib) == 0:
                ib = response.xpath(
                    "//div[@class='requirements EntryReqs_UKIB clearfix']//b[contains(text(),'Required grades')]/../../text()"
                ).extract()
                if len(ib) == 0:
                    ib = response.xpath(
                        "//strong[contains(text(),'International Baccalaureate')]/../following-sibling::td[1]//text()"
                    ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib).strip()[:160]
            print("item['ib'] = ", item['ib'])

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']
            entry_requirements = response.xpath(
                "//div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[1]//text()"
            ).extract()
            # item['rntry_requirements'] =clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            item['require_chinese_en'] = '''<h4><b>Undergraduate entry</b></h4>
<p>The Senior High School Certificate and/or <i>Hui&nbsp;</i><i>Kao</i>&nbsp;are not considered suitable for direct entry to undergraduate study at King's. Applicants may wish to consider taking one of our International Foundation programmes (see below).</p>'''

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']
            IELTS = response.xpath(
                "//*[contains(text(),'English')]/../../following-sibling::td[1]//text()|"
                "//*[contains(text(),'English')]/../following-sibling::td[1]//text()|"
                "//strong[contains(text(),'TOEFL iBT')]/../../following-sibling::td[1]//text()|"
                "//*[contains(text(),'English')]//../following-sibling::td[1]//text()"
            ).extract()
            clear_space(IELTS)
            # print(IELTS)
            item['ielts_desc'] = ''.join(IELTS).strip()
            item['toefl_desc'] = item['ielts_desc']
            print("item['ielts_desc'] = ", item['ielts_desc'])

            if item['ielts_desc'] == "Band A":
                item["ielts"] = 7.5  # float
                item["ielts_l"] = 7.0  # float
                item["ielts_s"] = 7.0  # float
                item["ielts_r"] = 7.0  # float
                item["ielts_w"] = 7.0
                item["toefl"] = 109  # float
                item["toefl_l"] = 25  # float
                item["toefl_s"] = 25  # float
                item["toefl_r"] = 25  # float
                item["toefl_w"] = 27
            elif item['ielts_desc'] == "Band B" or item[
                    'department'] == "The Dickson Poon School of Law" or item[
                        'department'] == "Dental Institute" or "Medicine" in item[
                            'programme_en']:
                item["ielts"] = 7.0  # float
                item["ielts_l"] = 6.5  # float
                item["ielts_s"] = 6.5  # float
                item["ielts_r"] = 6.5  # float
                item["ielts_w"] = 6.5
                item["toefl"] = 100  # float
                item["toefl_l"] = 23  # float
                item["toefl_s"] = 23  # float
                item["toefl_r"] = 23  # float
                item["toefl_w"] = 25
            elif item['ielts_desc'] == "Band D" or "Biochemistry" in item[
                    'programme_en']:
                item["ielts"] = 6.5  # float
                item["ielts_l"] = 6.0  # float
                item["ielts_s"] = 6.0  # float
                item["ielts_r"] = 6.0  # float
                item["ielts_w"] = 6.0
                item["toefl"] = 92  # float
                item["toefl_l"] = 20  # float
                item["toefl_s"] = 20  # float
                item["toefl_r"] = 20  # float
                item["toefl_w"] = 23
            elif item['ielts_desc'] == "Band E":
                item["ielts"] = 6.0  # float
                item["ielts_l"] = 5.5  # float
                item["ielts_s"] = 5.5  # float
                item["ielts_r"] = 5.5  # float
                item["ielts_w"] = 5.5
                item["toefl"] = 80  # float
                item["toefl_l"] = 20  # float
                item["toefl_s"] = 20  # float
                item["toefl_r"] = 20  # float
                item["toefl_w"] = 20

            if item['ielts_desc'] == "":
                ielts_desc = response.xpath(
                    "//strong[contains(text(),'IELTS Academic')]/../../../following-sibling::td[1]//text()"
                ).extract()
                item['ielts_desc'] = ''.join(ielts_desc).strip()
            if item['toefl_desc'] == "":
                toefl_desc = response.xpath(
                    "//strong[contains(text(),'TOEFL iBT')]/../../following-sibling::td[1]//text()"
                ).extract()
                item['toefl_desc'] = ''.join(toefl_desc).strip()

            if item['ielts'] == None:
                ielts_dict = get_ielts(item['ielts_desc'])
                item["ielts"] = ielts_dict.get('IELTS')  # float
                item["ielts_l"] = ielts_dict.get('IELTS_L')  # float
                item["ielts_s"] = ielts_dict.get('IELTS_S')  # float
                item["ielts_r"] = ielts_dict.get('IELTS_R')  # float
                item["ielts_w"] = ielts_dict.get('IELTS_W')
            if item['toefl'] == None:
                toefl_dict = get_ielts(item['toefl_desc'])
                item["toefl"] = toefl_dict.get('TOEFL')  # float
                item["toefl_l"] = toefl_dict.get('TOEFL_L')  # float
                item["toefl_s"] = toefl_dict.get('TOEFL_S')  # float
                item["toefl_r"] = toefl_dict.get('TOEFL_R')  # float
                item["toefl_w"] = toefl_dict.get('TOEFL_W')
                # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']/div[@class='requirements uk clearfix']/div[@class='copy'][2]/p[1]
            application_fee = response.xpath(
                "//h3[contains(text(), 'Application procedure')]/following-sibling::div[1]//text()"
            ).extract()
            clear_space(application_fee)
            # print(''.join(application_fee))
            application_fee_re = re.findall(r"application\sfee.*£\d+",
                                            ''.join(application_fee))
            print("apply_fee: ", ''.join(application_fee_re))
            af = ''.join(application_fee_re).replace("application fee of",
                                                     "").replace("£",
                                                                 "").strip()
            if len(af) != 0:
                item['apply_fee'] = int(af)
                item['apply_pre'] = "£"
            print("item['apply_fee'] = ", item['apply_fee'])

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']/div[@class='requirements uk clearfix']/div[@class='copy'][2]/p[1]
            application_documents = response.xpath(
                "//h3[contains(text(), 'Personal statement and supporting information')]/following-sibling::div[1]"
            ).extract()
            item['apply_documents_en'] = remove_class(
                clear_lianxu_space(application_documents))
            print("item['apply_documents_en'] = ", item['apply_documents_en'])

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']/div[@class='requirements uk clearfix']/div[@class='copy'][2]/p[1]
            deadline = response.xpath(
                "//div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[1]/div[@class='requirements uk clearfix']/div[@class='copy'][4]//text()"
            ).extract()
            clear_space(deadline)
            print(deadline)
            deadline_str = ''.join(deadline).strip()
            item['deadline'] = getStartDate(deadline_str)
            print("item['deadline'] = ", item['deadline'])

            # //div[@id='coursepage-fees-and-funding']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off']/ul[1]/li[2]
            tuition_fee = response.xpath(
                "//p[contains(text(),'The International tuition fee for the 2018-2019 ac')]//text()"
            ).extract()
            print("tuition_fee = ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+|£\d+|\d+,\d+",
                                        ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) >= 1:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(tuition_fee_re[0].replace(
                    "£", "").replace(",", "").strip())
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])
            print("item['tuition_fee'] = ", item['tuition_fee'])

            # //div[@id='coursepage-career-prospect']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate']
            career = response.xpath(
                "//div[@id='coursepage-career-prospect']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en'] = ", item['career_en'])

            # //b[contains(text(),'The interview')]/..|//b[contains(text(),'The interview')]/../following-sibling::*[position()<3]
            interview_desc_en = response.xpath(
                "//b[contains(text(),'The interview')]/..|"
                "//b[contains(text(),'The interview')]/../following-sibling::*[position()<3]|"
                "//b[contains(text(),'Interviewing')]/../following-sibling::*[1]"
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            print("item['interview_desc_en'] = ", item['interview_desc_en'])

            # //b[contains(text(),'Application deadline:')]/..
            deadline = response.xpath(
                "//b[contains(text(),'Application deadline:')]/../text()"
            ).extract()
            item['deadline'] = remove_class(clear_lianxu_space(deadline))
            print("item['deadline'] = ", item['deadline'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """ <h1>Applying to King&#39;s College London</h1>
<img alt="King's College London lecture at the Strand campus" height="330" width="780" src="/ImportedImages/0Prospectus/undergraduate/apply/generic-page-images/lecturer-2014-strand-6.29.jpg" />
<p><br />We're delighted that you're considering applying to King's. Once you've checked the information and&nbsp;<a class="sys_16" href="https://www.kcl.ac.uk/study/undergraduate/apply/entry-requirements/index.aspx">entry requirements</a>&nbsp;for your chosen course, you will need to follow the correct application procedure, depending on the type of study you're interested in:</p>
<div class="contentpage-accordion clearfix">
<h3 class="accordion-toggle">Undergraduate degree courses (UCAS)</h3>
<div class="accordion-content">
<p class="p1">For all full-time undergraduate higher education courses at universities and colleges in the UK you must make an online application via the Universities and Colleges Admissions Service - more commonly known as&nbsp; <a class="sys_16" href="http://www.ucas.com/">UCAS</a>.<br /><br /></p>
<h4 class="p1"><b>UCAS has three key functions:</b></h4>
<ol>
<li class="p4"><a class="sys_16" href="https://digital.ucas.com/search">Course search</a>:&nbsp;allows you to search for courses throughout the UK. Remember to always check King's&nbsp;online prospectus&nbsp;for the most detailed information on King's courses.</li>
<li class="p4"><a class="sys_16" href="https://www.ucas.com/ucas/undergraduate/register">Apply</a>:&nbsp;the UCAS online application system. You should use this to make your application(s) to King's. 'Apply' will allow you to apply to several different universities and/or courses at once.</li>
<li class="p4"><a class="sys_16" href="https://www.ucas.com/ucas/undergraduate/login">Track</a>:&nbsp;a central tracking system for following the progress of your different applications. King's will also provide you with an account for our own supplementary tracking and messaging system (called King's Apply).</li>
</ol>
<p class="p1">Please read the following guidelines before making your application.</p>
<h4 class="p1"><b><br />Who should use UCAS?</b></h4>
<ul>
<li class="p2">All applicants for full-time undergraduate courses at King's should apply through UCAS (with the exception of applicants from&nbsp; <a class="sys_16" href="http://www.kcl.ac.uk/usa">North America</a>&nbsp;who may use Common App if preferred).</li>
<li class="p2">All applicants for Nursing with registration (graduate entry) PG Dip</li>
<li class="p2">All applicants for Midwifery with registration (graduate entry) PG Dip</li>
</ul>
<p>You can apply through your school or college, or as an individual.</p>
<h4 class="p1"><b><br />When should I apply?</b></h4>
<p class="p2">You can apply to UCAS from 1 September for entry the following autumn, but remember you can start&nbsp; <a title="Undergraduate study" class="sys_0 sys_t0" href="/study/undergraduate/index.aspx">doing your research</a>, attending&nbsp;open days, and preparing your personal statement earlier than this.</p>
<p class="p2">The normal closing date for receipt of applications is&nbsp;15&nbsp;January.&nbsp;However if you are including Oxford or Cambridge, or to Medicine or Dentistry, then the closing date is&nbsp;15 October&nbsp;in the year prior to entry.&nbsp;</p>
<p class="p2">The UCAS website states a more flexible deadline for international students, however, any application received by King's after the above dates is considered late.</p>
<h4 class="p1"><b><br />How do I use UCAS?</b></h4>
<p class="p2">UCAS allows you to apply to a maximum of five courses per year, but only four of those may be Medicine/Dentistry courses.</p>
<p class="p2">You will need to create an account in UCAS 'Apply' and complete an application form. Your application will then be forwarded by UCAS to all of the universities you have applied to for us to consider.</p>
<p class="p2">If you have participated in a King's widening participation scheme such as K+, please ensure you note this in your application as advised by the&nbsp;<a title="Widening Participation" class="sys_0 sys_t0" href="/study/widening-participation/index.aspx">Widening Participation team</a>.</p>
<p class="p2">UCAS has detailed instructions on the&nbsp;<a class="sys_16" href="http://www.ucas.com/how-it-all-works/undergraduate">UCAS website</a>.</p>
<p class="p2">You can also read our&nbsp;<a title="Before you apply" class="sys_0 sys_t7240628" href="/study/undergraduate/apply/faqs/index.aspx">frequently asked questions about applying</a>.<a title="UCAS website" class="sys_16" href="https://www.ucas.com/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Undergraduate degree courses (Common App)</h3>
<div class="accordion-content">

<p>King's College Lon</p>
<h4>UCAS or Common App?</h4>
<p>We will only consider applicants through The Common Application who have not also applied through UCAS.&nbsp;</p>
<h4>Who should use Common App?</h4>
<ul>
<li>Common App is an option to be used by students who will be classified as paying international fees. Therefore Home/EU fee students must use UCAS. Those who are unsure should use UCAS.</li>
</ul>
<ol>
<li>Common App is available for all programmes excluding Physiotherapy, Nursing, Medicine, Dentistry, and Nutrition and Dietetics. Students must use UCAS to apply to these programmes.</li>
<li>Students must apply to no more than a combined total of five courses (UCAS and Common App) within the UK.</li>
</ol>
<p>All Common App applicants will be expected to complete the supplement element of the Common Application. We strongly encourage students to submit their application by January 15, but the College may consider later applications up to May 1. Once a Common Application is submitted to King's, students will be registered on the College's MyApplication system through which they will be able to track the progress of their application.</p>
<h4>King&rsquo;s College London Common Application timeline</h4>
<ol>
<li>Applicant applies to King&rsquo;s using the&nbsp;<a class="sys_16" href="http://www.commonapp.org/">Common Application form</a>.</li>
<li>The application is transferred onto the KCL Admissions Portal by King&rsquo;s admissions staff. This process may take approximately 30 days, depending upon when you submitted your application.</li>
<li>Once the application has been successfully inputted into the system, the applicant receives login details for the &lsquo;<a class="sys_16" href="https://myapplication.kcl.ac.uk/">MyApplication</a>&rsquo; admissions portal. This is used to track the progress of an application and communicate with admissions staff</li>
<li>Applicants will be contacted through&nbsp;<a class="sys_16" href="https://myapplication.kcl.ac.uk/">myApplication</a>&nbsp;in the event that any further supporting documents are required before the application can be fully processed.</li>
<li>Decisions on completed applications submitted by January 15 will be made before March 31st. All notifications of decisions will be sent through&nbsp;<a class="sys_16" href="https://myapplication.kcl.ac.uk/">myApplication</a>.</li>
</ol>
<p>&nbsp;</p>

</div>



<h3 class="accordion-toggle">Post Qualification Nursing BSc Programmes &amp; Free Standing Courses</h3>
<div class="accordion-content">

<p>Applications for our post qualification nursing BSc programmes should be made direct to King&rsquo;s, through the King&rsquo;s Apply portal. <a title="Apply now" class="sys_16" onclick="void(window.open('https://apply.kcl.ac.uk/','','toolbar=yes,menubar=yes,location=yes,scrollbars=yes,status=yes,resizable=yes'));return false;" onkeypress="void(window.open('https://apply.kcl.ac.uk/','','toolbar=yes,menubar=yes,location=yes,scrollbars=yes,status=yes,resizable=yes'));return false;" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Intercalated BScs (iBScs)</h3>
<div class="accordion-content">

<p>Intercalated BScs (iBScs) programmes are for medical, dental and veterinary students. If you are from another university and you are interested in studying an iBSc at King&rsquo;s, then please see our detailed information on <a title="Intercalated BScs: How to apply" class="sys_0 sys_t7240628" href="/study/subject-areas/intercalated/how-to-apply.aspx">how to apply</a>. If you are a current King&rsquo;s student, please refer to application information on the <a class="sys_16" href="https://internal.kcl.ac.uk/lsm/students/ug/intercalated-bsc/how-to-apply.aspx">King's internal website</a>.<a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Transfers to undergraduate degree courses</h3>
<div class="accordion-content">

<p>Some of our academic departments may consider transfer applications from suitably qualified students currently attending other universities. Visit the&nbsp;<a class="sys_0 sys_t2452" href="https://www.kcl.ac.uk/study/undergraduate/apply/transferring-to-kings.aspx">transferring to King&rsquo;s web page</a>&nbsp;for more information. Transfer applications must be submitted through UCAS</p>
<p><a title="UCAS" class="sys_16" href="https://www.ucas.com/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">King's International Foundation Programme</h3>
<div class="accordion-content">

<p>Applications for our one year full-time International Foundation academic preparation course should be made direct to King's, through the <a class="sys_16" href="https://apply.kcl.ac.uk/">King&rsquo;s Apply portal</a>. We have detailed guidance on the supporting documentation needed for your application on the relevant International Foundation Programme course web page.<a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Study Abroad</h3>
<div class="accordion-content">

<p>King's welcomes students currently enrolled in universities outside the UK to <a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/">participate</a> on a study abroad programme.&nbsp;</p>
<p>You can study abroad at King's either as an exchange or a study abroad fee-paying student for the full academic year (starting in September) or for one semester only (September-December or January-June).&nbsp;</p>
<p>Visit our&nbsp;<a class="sys_16" title="Study abroad" href="/study/abroad/index.aspx">Study Abroad web pages</a>&nbsp;to find out more.<a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>


</div>

<h3><br />After you&rsquo;ve applied</h3>
<p>Your application can be tracked using our&nbsp;<a class="sys_16" title="King&#39;s application portal" onclick="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" onkeypress="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" href="https://myapplication.kcl.ac.uk/">online portal, King's Apply</a>, where you can:</p>
<ul>
<li>
<p>see offer details</p>
</li>
<li>
<p>check if you&rsquo;ve been invited to interview</p>
</li>
<li>
<p>apply for accommodation</p>
</li>
<li>
<p>learn more about the &lsquo;points-based&rsquo; visa system</p>
</li>
</ul>
<p>After you have applied to King's, we will send you a username and password so you can access these pages. To contact us about your application during the application year, please use our <a title="King&#39;s application portal" class="sys_16" onclick="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" onkeypress="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" href="https://myapplication.kcl.ac.uk/">online portal, King's Apply</a>&nbsp;.</p>
<p><a class="sys_0 sys_t2452" title="Tracking your application" href="https://www.kcl.ac.uk/study/undergraduate/apply/faqs/tracking-your-application.aspx">Read our FAQs on tracking your application</a></p>


<div class="contentpage-accordion clearfix">
<br />


<h3 class="accordion-toggle">Cancellation rights</h3>
<div class="accordion-content">

<p class="MRNoHead2">Please note, these terms and conditions apply to all levels of study.&nbsp; For applications to undergraduate study, we also advise applicants to contact UCAS directly for details of your cancellation rights.</p>
<p class="MRNoHead2">1.1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; You have the right to cancel your acceptance of a place at King&rsquo;s for any reason (including if you change your mind) during a fourteen (14) day cancellation period (the &ldquo;Cancellation Period&rdquo;), which will start on the day you accept an offer from King&rsquo;s.</p>
<p class="MRNoHead2">1.2 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; To cancel your acceptance, you must clearly inform us in writing of your decision to cancel before the Cancellation Period has expired. We ask that you do this by sending a message through &ldquo;King&rsquo;s Apply&rdquo;. Alternatively, you may contact the King&rsquo;s Admissions Office by letter or email. You may also use the <a title="Cancellation Form - Kings College London" class="sys_17" href="/study/assets/word/admissions/v.2-cancellation-form.docx">Cancellation Form </a>to notify us of your decision to cancel.</p>
<p class="MRNoHead2">1.3 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; To meet the cancellation deadline, it is sufficient for you to send your communication concerning your exercise of the right to cancel before the Cancellation Period has expired. We do not have to have received it before the expiry of the Cancellation Period.</p>
<p class="MRNoHead2">1.4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; If you cancel your acceptance within the 14 day Cancellation Period, we will reimburse any tuition fee payment including any deposit received from you as soon as we can, and no later than 14 days after the day on which you informed us of your decision to cancel your acceptance.</p>

</div>


</div>

"""
                ]))
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 12

Mostrar archivo

Archivo: AberystwythUniversity_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        """Build one course item from an Aberystwyth University course page.

        Fixed metadata (university, URL, degree type, location) is filled in
        first; every scraped field is then read via XPath inside a single
        ``try`` block. If anything in that block raises, the error text and
        the page URL are appended to a per-university text file under
        ``scrapySchool_England_Ben/error/`` and no item is yielded.

        Args:
            response: The downloaded page; must offer ``url``, ``meta`` and
                ``xpath(...).extract()``.

        Yields:
            The populated item created by ``get_item``.
        """
        def pull(query):
            # Run one XPath query against the page and return the extracted list.
            return response.xpath(query).extract()

        def tidy(fragments):
            # Project helpers applied in the same order for every rich-text
            # field (presumably whitespace clean-up, then class-attribute
            # removal -- confirm against their definitions).
            return remove_class(clear_lianxu_space(fragments))

        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Aberystwyth University"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item['location'] = "Aberystwyth University, Reception, Penglais, Aberystwyth, Ceredigion, SY23 3FL"
        print("===========================")
        print(response.url)
        # The request meta is keyed by the page URL itself.
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            banner_text = ''.join(pull("//div[@class='banner__caption banner__caption--below']//text()"))
            item['department'] = banner_text.replace("in the ", "").strip()

            # Degree name and programme title both live in the hero header.
            item['degree_name'] = ''.join(
                pull("//div[@class='hero-header']//header/span//text()")).strip()
            print("item['degree_name']: ", item['degree_name'])

            item['programme_en'] = ''.join(
                pull("//div[@class='hero-header']//header/h1//text()")).strip()
            print("item['programme_en']: ", item['programme_en'])

            # Course length: only stored when the parser returns exactly two values.
            length_parts = pull(
                "//h3[contains(text(),'Course Length')]/following-sibling::p//text()")
            clear_space(length_parts)
            parsed_length = getIntDuration(''.join(length_parts))
            if len(parsed_length) == 2:
                item['duration'], item['duration_per'] = parsed_length[0], parsed_length[-1]

            # Overview: try the known section layouts first, then fall back to
            # the parent of any "Overview" heading.
            overview_nodes = pull(
                "//div[@class='key-facts']/following-sibling::p|"
                "//h3[@id='course-overview']|//h3[@id='course-overview']/following-sibling::div[1]|"
                "//h3[@id='coursedetails']|//h3[@id='coursedetails']/following-sibling::div[1]")
            if not overview_nodes:
                overview_nodes = pull("//h2[contains(text(),'Overview')]/..")
            item['overview_en'] = tidy(overview_nodes)

            # Rich-text sections: heading node plus the div that follows it.
            for field, query in (
                    ('modules_en',
                     "//h3[@id='coursecontent']|//h3[@id='coursecontent']/following-sibling::div[1]"),
                    ('career_en',
                     "//h3[@id='employability']|//h3[@id='employability']/following-sibling::div[1]"),
                    ('assessment_en',
                     "//h3[contains(text(),'Teaching & Learning')]|//h3[contains(text(),'Teaching & Learning')]/following-sibling::div[1]"),
            ):
                item[field] = tidy(pull(query))

            # Entry requirements kept as cleaned text only.
            for field, query in (
                    ('alevel',
                     "//h3[contains(text(),'Typical A-level offer')]/following-sibling::p//text()"),
                    ('ib',
                     "//strong[contains(text(),'International Baccalaureate:')]/../text()"),
            ):
                item[field] = clear_lianxu_space(pull(query))

            # English language requirement text, then the scores parsed from it.
            item['ielts_desc'] = ''.join(
                pull("//strong[contains(text(),'English language requirements')]/..//text()")).strip()

            ielts_dict = get_ielts(item['ielts_desc'])
            for field, key in (
                    ('ielts', 'IELTS'),
                    ('ielts_l', 'IELTS_L'),
                    ('ielts_s', 'IELTS_S'),
                    ('ielts_r', 'IELTS_R'),
                    ('ielts_w', 'IELTS_W'),
            ):
                item[field] = ielts_dict.get(key)

            ucas_parts = pull("//span[contains(@title,'UCAS Code')]/em//text()")
            clear_space(ucas_parts)
            print("ucascode: ", ucas_parts)
            item['ucascode'] = ''.join(ucas_parts).strip()
            print("len: ", len(ucas_parts))
            print("item['ucascode'] = ", item['ucascode'])

            # Tuition fee link
            item['other'] = "https://www.aber.ac.uk/en/international/fees-scholarships/fees-money/int-under/"
            item['apply_proces_en'] = "https://www.aber.ac.uk/en/undergrad/apply/?course=W402"
            yield item
        except Exception as err:
            # Record the failure for this page; the item is dropped.
            error_log = "scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt"
            with open(error_log, 'a', encoding="utf-8") as log_file:
                log_file.write(str(err) + "\n" + response.url + "\n========================\n")
            print("异常：", str(err))
            print("报错url：", response.url)

Ejemplo n.º 13

Mostrar archivo

Archivo: Heriot-WattUniversity_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        """Build one course item from a Heriot-Watt University course page.

        Fixed metadata (university, URL, degree type) is filled in first;
        every scraped field is then read via XPath inside a single ``try``
        block. If anything in that block raises, the error text and the page
        URL are appended to a per-university text file under
        ``scrapySchool_England_Ben/error/`` and no item is yielded.

        When no overall IELTS score can be parsed from the page, hard-coded
        scores are used: 7.5 in every band for programmes whose name contains
        "Interpreting and Translating", 6 in every band otherwise.

        Args:
            response: The downloaded page; must offer ``url`` and
                ``xpath(...).extract()``.

        Yields:
            The populated item created by ``get_item``.
        """
        def pull(query):
            # Run one XPath query against the page and return the extracted list.
            return response.xpath(query).extract()

        ielts_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r', 'ielts_w')

        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Heriot-Watt University"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            department = pull("//div[@class='banner__caption banner__caption--below']//text()")
            item['department'] = ''.join(department).replace("in the ", "").strip()

            # Programme name and degree type share one heading, separated by a
            # comma: text before the first comma is the programme, text after
            # the last comma is the degree name.
            heading = clear_lianxu_space(pull("//div[@class='col-md-12']/h1//text()"))
            print("programmedegreenamestr: ", heading)

            # NOTE(review): a heading without a comma leaves both fields at
            # whatever get_item initialised them to -- confirm that is intended.
            if "," in heading:
                heading_parts = heading.split(",")
                item['degree_name'] = heading_parts[-1].replace("(Hons)", "").strip()
                item['programme_en'] = heading_parts[0].strip()
            print("item['degree_name']: ", item['degree_name'])
            print("item['programme_en']: ", item['programme_en'])

            location = pull("//dt[contains(text(),'Location')]/following-sibling::dd[1]//text()")
            clear_space(location)
            item['location'] = ''.join(location).strip()

            duration = pull("//dt[contains(text(),'Duration')]/following-sibling::dd[1]//text()")
            clear_space(duration)
            # The raw duration text is also kept, joined with "; ", in 'other'.
            item['other'] = '; '.join(duration).replace(",;", ":").replace("; ; ;", ";").strip()

            # Only stored when the parser returns exactly two values.
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            # Overview: everything before the "Accreditation" heading, falling
            # back to the parent of any "Overview" heading.
            overview = pull("//h3[contains(text(),'Accreditation')]/preceding-sibling::*")
            if not overview:
                overview = pull("//h2[contains(text(),'Overview')]/..")
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            modules = pull("//section[@id='course-content']")
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            career_en = pull("//section[@id='career']")
            item['career_en'] = remove_class(clear_lianxu_space(career_en))

            # Tuition fee: first "£<digits,commas>" amount in the last cell of
            # the fee row, stored as an int.
            fee_text = pull("//th[contains(text(),'Fee')]/following-sibling::*[last()]//text()")
            clear_space(fee_text)
            amounts = re.findall(r"£[\d,]+", ''.join(fee_text))
            if amounts:
                item['tuition_fee'] = int(amounts[0].replace('£', '').replace(',', '').strip())

            alevel = pull("//section[@id='entry-requirements']//ul[1]//strong[contains(text(), 'A-Levels')]/../text()")
            item['alevel'] = clear_lianxu_space(alevel)

            ib = pull("//section[@id='entry-requirements']//ul[1]//strong[contains(text(), 'Int. Baccalaureate')]/../text()")
            item['ib'] = clear_lianxu_space(ib)

            # English language requirement text; queried once (the earlier
            # duplicate query fed a local that was never used). Falls back to
            # any list item mentioning IELTS.
            ielts = pull("//strong[contains(text(),'English language requirements')]/..//text()")
            if not ielts:
                ielts = pull("//li[contains(text(),'IELTS')]//text()")
            item['ielts_desc'] = ''.join(ielts).strip()
            print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            for field in ielts_fields:
                # Result keys are the upper-cased field names (IELTS, IELTS_L, ...).
                item[field] = ielts_dict.get(field.upper())
            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                    item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # No overall score parsed: apply the hard-coded fallback to every band.
            if item['ielts'] is None:
                fallback = 7.5 if "Interpreting and Translating" in item['programme_en'] else 6
                for field in ielts_fields:
                    item[field] = fallback

            ucascode = pull("//h2[contains(text(),'The course')]/following-sibling::*//dt[contains(text(),'UCAS code')]/following-sibling::dd[1]//text()")
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            print("item['ucascode'] = ", item['ucascode'])

            item['apply_proces_en'] = "https://www.hw.ac.uk/study/apply/uk/undergraduate.htm"
            yield item
        except Exception as e:
            # Record the failure for this page; the item is dropped.
            with open("scrapySchool_England_Ben/error/" + item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 14

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.strath.ac.uk/"
        item["university"] = "University of Strathclyde"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "16 Richmond Street, Glasgow, G1 1XQ"
        print("===========================")
        print(response.url)
        try:
            # 学位类型
            degree_type = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/span/text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name'] = ", item['degree_name'])

            # 专业名
            programme = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/text()"
            ).extract()
            # print("programme = ", programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            if "Engineering" in item['programme_en']:
                item['department'] = "Faculty of Engineering"
            elif "Science" in item['programme_en']:
                item['department'] = "Faculty of Science"
            elif "Business" in item['programme_en'] or "Finance" in item[
                    'programme_en'] or "Marketing" in item['programme_en']:
                item['department'] = "Strathclyde Business School"
            print("item['department'] = ", item['department'])

            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS code:')]/../text()").extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).strip()
            print("item['ucascode']: ", item['ucascode'])

            # 课程长度、开学时间、截止日期
            durationTeachtime = response.xpath(
                "//b[contains(text(),'Study mode and duration')]/../text()"
            ).extract()
            clear_space(durationTeachtime)
            print("durationTeachtime: ", durationTeachtime)
            durationTeachtimeStr = ''.join(durationTeachtime)

            duration_list = getIntDuration(durationTeachtimeStr)
            # print(duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            start_date = response.xpath(
                "//b[contains(text(),'Start date')]/../text()").extract()
            start_date_str = ''.join(start_date).replace(":", "")
            # print("start_date_str = ", start_date_str)
            item['start_date'] = getStartDate(start_date_str)
            if item['start_date'] != "" and item[
                    'start_date'] > "06" and "2018" not in item[
                        'start_date'] and "2019" not in item['start_date']:
                item['start_date'] = "2018-" + item['start_date']
            elif item['start_date'] != "" and item[
                    'start_date'] <= "06" and "2018" not in item[
                        'start_date'] and "2019" not in item['start_date']:
                item['start_date'] = "2019-" + item['start_date']
            # print("item['start_date'] = ", item['start_date'])

            # 截止日期
            deadline = response.xpath(
                "//b[contains(text(),'Application deadline')]/../text()"
            ).extract()
            deadline = ''.join(start_date).replace(":", "").strip()
            # print("deadline = ", deadline)
            item['deadline'] = getStartDate(deadline)
            # print("item['deadline'] = ", item['deadline'])

            # 专业描述
            overview = response.xpath(
                "//article[@id='why-this-course']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # 课程设置、评估方式
            modules = response.xpath(
                "//h3[contains(text(),'Assessment')]/preceding-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*"
                ).extract()
                if len(modules) == 0:
                    modules = response.xpath(
                        "//article[@id='course-content']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            if len(assessment_en) == 0:
                assessment_en = response.xpath(
                    "//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*[1]/following-sibling::*"
                ).extract()
            item["assessment_en"] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en'] = ", item['assessment_en'])

            # 学术要求、英语要求
            rntry_requirements = response.xpath(
                "//article[@id='entry-requirements']//text()").extract()
            # item["rntry_requirements"] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            alevel = response.xpath(
                "//h4[contains(text(),'A Levels')]/following-sibling::p[1]//text()|"
                "//strong[contains(text(),'A Levels')]/..//following-sibling::p[1]//text()"
            ).extract()
            if len(alevel) == 0:
                alevel = response.xpath(
                    "//*[contains(text(),'A Levels')]/../following-sibling::*[1]//text()"
                ).extract()
            clear_space(alevel)
            # print(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])
            if len(item['alevel']) > 160:
                item['alevel'] = ''.join(item['alevel'][:161])
            # print("item['alevel']1 = ", item['alevel'])

            item['alevel'] = None
            from lxml import etree
            tmp_html = response.text
            key1 = "<h4>A Levels"
            if key1 not in tmp_html:
                key1 = "<h4><strong>A Levels"
                if key1 not in tmp_html:
                    key1 = "<p><strong>A Levels"
                    if key1 not in tmp_html:
                        key1 = '<h4><span style="font-size: 1em;">A Levels'
                        if key1 not in tmp_html:
                            key1 = '<h4 class="p1">A Levels'
                            if key1 not in tmp_html:
                                key1 = '<p>Year 1 entry<u1:p>'
            key2 = "<h4>International Baccalaureate"
            if key2 not in tmp_html:
                key2 = "<h4><strong>International Baccalaureate"
                if key2 not in tmp_html:
                    key2 = '<p><strong>International Baccalaureate'
                    if key2 not in tmp_html:
                        key2 = '<h4 class="p1">European Baccalaureate'
                        if key2 not in tmp_html:
                            key2 = '<h4 class="p1">International Baccalaureate'
                            if key2 not in tmp_html:
                                key2 = '<h4>IB'
                                if key2 not in tmp_html:
                                    key2 = '<h4><span style="font-size: 1em;">International Baccalaureate'
                                    if key2 not in tmp_html:
                                        key2 = '<h4>HND/HNC</h4>'
            if key1 in tmp_html and key2 in tmp_html:
                # 使用正则匹配需要的标签内容的首尾
                key1_re = re.findall(key1, tmp_html)
                print("key1_re: ", key1_re)
                key2_re = re.findall(key2, tmp_html)
                print("key2_re: ", key2_re)
                # 增加能准确获取的div
                end_html = tmp_html.replace(
                    ''.join(key1_re),
                    '<div id="container">' + ''.join(key1_re)).replace(
                        ''.join(key2_re), '</div>' + ''.join(key2_re))

                end_html_response = etree.HTML(end_html)
                # 可以使用xpath匹配需要的内容了
                end_content = end_html_response.xpath(
                    "//div[@id='container']//text()")
                # 转化成带标签的数据内容
                end_content_str = "".join(end_content).strip()
                # if len(end_content) > 0:
                #     end_content_str = etree.tostring(end_content[0], encoding='unicode', method='html')
                item['alevel'] = end_content_str
            print("item['alevel'] = ", item['alevel'])
            # print(len("36 points overall with 18 at Higher Level, including 6, 5 at Higher Level in two of the following subjects: Biology, Chemistry, Physics, Mathematics, Psychology"))

            ib = response.xpath(
                "//h4[contains(text(),'International Baccalaureate')]/following-sibling::p[1]//text()|"
                "//strong[contains(text(),'International Baccalaureate')]/../following-sibling::p[1]//text()"
            ).extract()
            if len(ib) == 0:
                ib = response.xpath(
                    "//*[contains(text(),'International Baccalaureate')]/../following-sibling::*[1]//text()"
                ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib[0]).strip()

            if len(item['ib']) > 160:
                item['ib'] = ''.join(item['ib'][:161])
            # print("item['ib'] = ", item['ib'])

            # ielts = response.xpath("//h3[contains(text(),'English language requirements')]/following-sibling::*[position()<4]//text()").extract()
            # print("ielts: ", ielts)
            ielts_re = re.findall(r"IELTS.{1,80}", ''.join(rntry_requirements))
            # print("ielts_re = ", ielts_re)
            item["ielts_desc"] = ''.join(ielts_re)
            # print("item['ielts_desc'] = ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            # print(ieltlsrw)
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            if item['ielts'] != None:
                item['ielts'] = item['ielts'].strip('.').strip()
            if item['ielts_l'] != None:
                item['ielts_l'] = item['ielts_l'].strip('.').strip()
            if item['ielts_s'] != None:
                item['ielts_s'] = item['ielts_s'].strip('.').strip()
            if item['ielts_r'] != None:
                item['ielts_r'] = item['ielts_r'].strip('.').strip()
            if item['ielts_w'] != None:
                item['ielts_w'] = item['ielts_w'].strip('.').strip()
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
            #       %(item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # 学费    //article[@id='fees-and-funding']/ul[3]/li
            tuition_fee = response.xpath(
                "//html//article[@id='fees-and-funding']/*[contains(text(),'International')]/following-sibling::*[1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£[\d,]+", ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = ''.join(tuition_fee_re[0]).replace(
                    "£", "").replace(",", "")
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])

            # 就业    //article[@id='careers']
            career = response.xpath("//article[@id='careers']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en'] = ", item['career_en'])

            # //article[@id='apply']
            apply_proces_en = response.xpath(
                "//article[@id='apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Entry requirements</h2>
              <h3>Undergraduate</h3>
<p><strong>Undergraduate first year entry (four-year degree programme)</strong>&nbsp;</p>
<p>Huikao 80% on average, and at least 80% in the required subjects.</p>
<p>Provinces with no Huikao: 80% on average over six subjects, including required subjects.</p>
<p><strong>Undergraduate second year entry (three-year degree programme)</strong>&nbsp;</p>
<p>Huikao 80% on average, and at least 80% in the required subjects.</p>
<p>Provinces with no Huikao: 80% on average over six subjects, including required subjects.</p>
<div>Gaokao with at least 525 out of 750, or 70% (If the total mark is not out of 750).</div>"""
                ]))
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 15

Mostrar archivo

Archivo: UniversityOfChester_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Chester"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//h1[@id='main-content']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//h1[@id='main-content']/div//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            start_date = response.xpath(
                "//select[@id='edit-date']/option//text()|//label[@for='edit-date']/following-sibling::span//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            start_date_str = ""
            if len(start_date) > 0:
                for s in start_date:
                    # start_date_str = getStartDate(s)
                    if getStartDate(s) != None:
                        start_date_str += getStartDate(s) + ", "
            item['start_date'] = start_date_str.strip().strip(',').strip()
            # print("item['start_date']: ", item['start_date'])

            mode = response.xpath(
                "//select[@id='edit-mode']//text()").extract()
            clear_space(mode)
            # item['teach_time'] = getTeachTime(''.join(mode))
            # print("mode: ", mode)

            location = response.xpath(
                "//label[@for='edit-compulsory']/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            ucascode = response.xpath(
                "//dt[contains(text(),'UCAS Code')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            print("item['ucascode'] = ", item['ucascode'])

            duration = response.xpath(
                "//dt[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//h3[contains(text(),'Course overview')]/../*[position()<last()]|"
                "//div[@class='m-body__margin-bottom t-course__overview']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry-international']//form[@id='courses-international-form']/preceding-sibling::*//text()"
            ).extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath(
                "//td[contains(text(),'GCE A Level')]/following-sibling::*//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//td[contains(text(),'International Baccalaureate')]/following-sibling::*//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            # print("item['ib']: ", item['ib'])

            ielts_desc = response.xpath(
                "//div[@id='entry-international']//li[contains(text(),'Undergraduate:')]//text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            assessment_en = response.xpath(
                "//h3[@class='field-label'][contains(text(),'How will I be taught?')]/..|"
                "//h3[@class='field-label'][contains(text(),'How will I be assessed?')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//div[@class='field-fees-international']/p//text()|"
                "//p[contains(text(),'The tuition fees for international students studyi')]//text()"
            ).extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            career_en = response.xpath(
                "//div[@id='careers-career-services']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            print("item['career_en']: ", item['career_en'])

            modules = re.findall(
                r"function\sinit_drupal_core_settings\(\)\s{jQuery\.extend\(Drupal\.settings,.*}",
                response.text)
            # print("modules: ", modules)
            modules_str = ''.join(modules).replace(
                "function init_drupal_core_settings() {jQuery.extend(Drupal.settings,",
                "").strip()
            modules_dict = json.loads(modules_str)
            # print("modules_dict: ", modules_dict)
            # groupCode     modulesNid
            # print(modules_dict.get("courses"))
            if modules_dict.get('courses').get('groupCode') is not False:
                modules_json = "https://www1.chester.ac.uk/courses/modules/ajax/" + modules_dict.get(
                    'courses').get('modulesNid') + "/" + modules_dict.get(
                        'courses').get('groupCode') + "/389"
                # print("modules_json: ", modules_json)
                mdict = json.loads(requests.get(modules_json).text)
                # print("mdict: ", len(mdict))
                m = mdict[-1].get('data')
                if m != None:
                    item['modules_en'] = remove_class(clear_lianxu_space([m]))
            # print("item['modules_en']: ", item['modules_en'])

            item[
                'apply_proces_en'] = "https://www1.chester.ac.uk/undergraduate/how-apply/applying-full-time-courses"
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="field-collection-view clearfix view-mode-full">
  <h3 class="field-course-type">
    Undergraduate Study  </h3>

  <ul><li>UK foundation/pathway course with a pass mark of 50% and above.  Engineering courses require an additional mark of at least 55% in a Maths module. </li>
<li>China 3 year National Senior High School Certificate with 80% or above</li>
<li>Gaokao (College Entry Exam) with good grades </li>
<li>Dazhuan considered for entry to 3rd year UG</li>
<li>BFSU Foundation Year at 60% or above</li>
<li>Dongfang International Centre for Education Exchange Top University Foundation Course 60% or above</li>
<li>East and West International Education (EWIE)/ Wiseway Global International Foundation Certificate at 60% or above</li>
<li>Graduation Certificate from a specialised College/School (Zhongzuhan) with 80% or above</li>
</ul></div>"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 16

Mostrar archivo

Archivo: Queen'sUniversityBelfast_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Queen's University Belfast"
        # item['country'] = 'England'
        # item['website'] = 'http://www.qub.ac.uk/'
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            degree_type = response.xpath(
                "//div[@class='columns aligned']//div[@class='column colspan-8']/h2//text()"
            ).extract()
            degree_type = ''.join(degree_type).split("|")
            print("degree_type: ", degree_type)
            if len(degree_type) != 0:
                item['degree_name'] = degree_type[0].strip()
            print("item['degree_name']: ", item['degree_name'])

            # 专业
            programme = response.xpath(
                "//div[@class='columns aligned']//div[@class='column colspan-8']/h1//text()"
            ).extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).replace(
                item['degree_name'], '').strip()
            print("item['programme_en']: ", item['programme_en'])

            # start_date
            start_date = response.xpath(
                "//span[@class='cf-key-details key-entry-year']//text()"
            ).extract()
            clear_space(start_date)
            item['start_date'] = ''.join(start_date).strip()
            print("item['start_date']: ", item['start_date'])

            # duration
            duration = response.xpath(
                "//p[@class='cf-key-details-duration']//span[@class='cf-key-details']//text()"
            ).extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration']-: ", item['duration'])
            print("item['duration_per']-: ", item['duration_per'])

            ucascode = response.xpath(
                "//span[@class='cf-key-details key-ucas-code']//text()"
            ).extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).strip()
            print("item['ucascode'] = ", item['ucascode'])

            # //div[@id='overview']
            overview = response.xpath("//div[@id='overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='overview']
            modules = response.xpath(
                "//h3[@class='alt'][contains(text(),'Course Structure')]/following-sibling::table"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            career = response.xpath(
                "//h3[@class='alt'][contains(text(),'Career Prospects')]/following-sibling::p[1]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //a[@id='teaching']/following-sibling::*[position()<6]
            teaching_assessment = response.xpath(
                "//a[@id='teaching']/following-sibling::*[position()<6]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment).replace("\n", ""))
            # print("item['assessment_en']: ", item['assessment_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry']//text()").extract()
            rntry_requirements = remove_class(
                clear_lianxu_space(entry_requirements))
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = re.findall(r"IELTS.{1,150}", rntry_requirements)
            item['ielts_desc'] = ''.join(ielts)
            print("item['ielts_desc']: ", item['ielts_desc'])
            ieltsDict = get_ielts(''.join(ielts))
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            alevel = response.xpath(
                "//b[contains(text(),'Entry requirements:')]/following-sibling::span//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            print("item['alevel'] = ", item['alevel'])

            # ib = response.xpath(
            #     "//html//div[@id='courseSummary']//tr/td[contains(text(), 'International Baccalaureate')]/following-sibling::td//text()").extract()
            # item['ib'] = ''.join(ib).strip()
            # print("item['ib'] = ", item['ib'])

            # //html//div[@id='fees']//tr[4]
            tuition_fee = response.xpath(
                "//html//div[@id='fees']//tr[4]//text()").extract()
            clear_space(tuition_fee)
            print("tuition_fee: ", tuition_fee)
            tuition_fee_str = ''.join(tuition_fee).strip().strip(
                "International")
            if "£" in tuition_fee_str:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(
                    tuition_fee_str.replace('£', '').replace(',', '').strip())
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            # //div[@class='panel bg--primary']//div[@class='inner']//p
            department = response.xpath(
                "//div[@class='panel bg--primary']//div[@class='inner']//p//text()"
            ).extract()
            clear_space(department)
            # print(department)
            for d in department:
                if "School" in d:
                    item['department'] = d.strip()
                elif "College" in d:
                    item['department'] = d.strip()
                elif "Campus" in d or d == "Biological Sciences" or d == "Marketing strategy" or d == "Management":
                    item['department'] = d.strip()
                elif len(d) == 4 or len(
                        d
                ) == 5 or d == "Arts, English and Languages" or d == "Global Food Security" or d == "Centre for Economic History":
                    item['department'] = d.strip()
            # print("item['department']: ", item['department'])

            department = response.xpath(
                "//html//div[@class='panel bg--grey-l']/div[@class='inner']//a//text()"
            ).extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            # //html//div[@class='panel bg--grey-l']/div[@class='inner']/p[1]
            location = response.xpath(
                "//html//div[@class='panel bg--grey-l']/div[@class='inner']/text()"
            ).extract()
            clear_space(location)
            item['location'] = '\t'.join(location).strip()
            print("item['location']: ", item['location'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h1 class="alt"><a name="UG"></a>Undergraduate entry requirements</h1>
<p>The following qualifications will be considered for direct entry to undergraduate programmes:</p>
<ul>
<li>Students who have completed 12 years of education in China and attained the Secondary School Leaving Certificate with good grades must complete an approved Foundation programme or GCE A Levels for progression to undergraduate degree programmes.</li>
<li>The 'Gaokao' Chinese University Entrance Examination will be considered, along with performance in the Senior High School examination for entry to Stage 1 of our undergraduate programmes.</li>
<li>Progression to Stage 1 of an undergraduate degree programme at Queen's&nbsp;(with the exception of Agricultural Technology, Medicine, Dentistry and Social Work) is guaranteed for students who successfully complete the <a title="University%20Preparation%20Courses" href="/home/International/International-students/Applying/University-Preparation-Courses/">INTO Queen's International Foundation Programme</a> at the required standard.</li>
<li>Students who have completed one or two years of university study in China may be eligible for admission to Bachelor degree programmes, if relevant subjects have been studied and strong grades have been achieved.</li>
<li>Applicants who have already completed A-Levels/a recognised Foundation programme or the first year of a relevant degree programme in China, but who do not meet the academic or English language requirements for entry, may wish to consider <a title="University%20Preparation%20Courses" href="/home/International/International-students/Applying/University-Preparation-Courses/">INTO Queen's International Year One</a>. Successful completion at the required standard offers direct entry to the second year of selected undergraduate degree programmes in Management, Economics, Finance and Engineering.</li>
<li>Between 30 and 36 points in the International Baccalaureate Diploma (IB). <a href="/home/International/International-students/Your-Country/InternationalBaccalaureateIBDiplomaEntryRequirements/">Information on required grades</a>.</li>
</ul>
<p><strong>Please note: </strong>Grades required vary depending on the programme of study. Further guidance on the entry requirements for each degree programme can be found in the Undergraduate Coursefinder.</p>"""
                ]))

            apply_proces_en = response.xpath("//div[@id='apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))
            print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'w',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 17

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.bathspa.ac.uk/"
        item['university'] = "Bath Spa University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            item['location'] = 'Bath'
            # 专业、学位类型//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1
            programme = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/p[1]//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # //dt[contains(text(),'School')]/following-sibling::dd[1]
            department = response.xpath(
                "//dt[contains(text(),'School')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            location = response.xpath(
                "//dt[contains(text(),'Campus or location')]/following-sibling::dd[1]//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            modules = response.xpath(
                "//h3[contains(text(),'Course structure')]/..|//h3[contains(text(),'Course modules')]/..|//h2[contains(text(),'Course modules')]/.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            career = response.xpath(
                "//h3[contains(text(),'Career')]/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            feeContent = response.xpath(
                "//h3[contains(text(),'International students full time')]/../div/table[1]//td[contains(text(), 'Year')]/following-sibling::td//text()"
            ).extract()
            clear_space(feeContent)
            # print(feeContent)
            if len(feeContent) > 0:
                item['tuition_fee'] = int(feeContent[0].replace(
                    "£", "").replace(",", "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            alevel = response.xpath(
                "//span[contains(text(),'A Level')]/..//text()|//li[contains(text(),'A Level')]//text()"
            ).extract()
            item['alevel'] = ''.join(alevel).strip()
            # print("item['alevel']: ", item['alevel'])
            # if item['alevel'] == "":
            #     print("****alevel")

            ib = response.xpath(
                "//span[contains(text(),'International Baccalaureate')]/..//text()|"
                "//li[contains(text(),'International Baccalaureate')]//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # print("item['ib']: ", item['ib'])
            # if item['ib'] == "":
            #     print("****ib")

            # //div[@class='content']/div[@class='collapsible-content highlighted']/div[2]/div[2]

            ieltsList = response.xpath(
                "//*[contains(text(),'IELTS')]//text()").extract()
            item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            interview_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/..|"
                "//h3[contains(text(),'Portfolio and interview')]/..").extract(
                )
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])
            # if item['interview_desc_en'] == "":
            #     print("****interview_desc_en")

            portfolio_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/..|"
                "//h3[contains(text(),'Portfolio')]/..").extract()
            item['portfolio_desc_en'] = remove_class(
                clear_lianxu_space(portfolio_desc_en))
            # print("item['portfolio_desc_en']: ", item['portfolio_desc_en'])
            # if item['portfolio_desc_en'] == "":
            #     print("****portfolio_desc_en")

            # https://www.bathspa.ac.uk/international/country-advice/china/

            item[
                'require_chinese_en'] = "<p><strong>Undergraduate</strong></p><ul><li>Senior Secondary School Graduation Certificate with a grade of 70% and a Foundation Certification from a recognised institution.</li></ul><p><strong>Undergraduate - Year 2 or 3 entry</strong></p><ul><li>Students with a Dazhuan Certificate will be considered for Year 3 entry on an individual basis.&nbsp;</li></ul>"

            # https://www.bathspa.ac.uk/applicants/how-to-apply/postgraduate/
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="intro-text">
	<p class="intro">We’re delighted you’re applying to study with us. The process is different based on your location and mode of study. Here’s what you need to do.</p>
</div><div class="rich-text" >
  <div data-hash-anchor='<a id="d.en.1281"></a>'></div>
    <div>
        <h2>UCAS applicants</h2>
<p>If you fit the following criteria, you’ll need to apply through the Universities and Colleges Admissions Service (UCAS):</p>
<ul>
<li>You’re applying directly out of sixth form or college;</li>
<li>You want to study full-time;</li>
<li>You don’t already hold an undergraduate qualification and are from the UK, EU or Channel Islands.</li>
</ul>
<p><strong>The official UCAS deadline for 2018/19 applications to any course: 15 January 2018.</strong></p>
<p>You’ll need some information from your course's webpage, including Bath Spa University’s institution code: BASPA B20.</p>
<p>Read more about <a href="/applicants/how-to-apply/undergraduate-and-foundation/how-to-apply-through-ucas/">how to </a><a href="/applicants/how-to-apply/undergraduate-and-foundation/how-to-apply-through-ucas/">apply through UCAS</a> or just get started. You’ll need to register or login to the UCAS site. &nbsp;</p>
<p><a href="https://www.ucas.com/ucas/undergraduate/ucas-undergraduate-apply-and-track">Apply via UCAS</a></p>
<h2>International applicants</h2>
<p>You can apply for one of our undergraduate courses online from the course’s webpage.&nbsp;You’ll be asked to create an online account.</p>
<p>Don’t have time to complete your whole application? Don’t worry, you can save your application and come back to it at anytime.</p>
<p>Alternatively, you can also <a href="https://www.ucas.com/ucas/undergraduate/ucas-undergraduate-apply-and-track">apply via UCAS</a>.</p>
<p>Entry requirements are listed on the course pages. As part of the process you will be required to provide evidence to support your application.&nbsp;Please see our <a href="/international/">international</a> webpages for more information for international students, including entry requirements and visa advice specific to your country.</p>
<p><a href="/courses/">Search for your course</a></p>
<h2>Applying for part-time study</h2>
<p>If you’d like to study part-time, you’ll need to apply online directly with us, rather than through UCAS. &nbsp;</p>
<p><strong>Click the 'apply now' button on the webpage for the course you’d like to study.</strong></p>
<h2>Already hold an undergraduate degree?</h2>
<p>If you already have a degree or higher qualification than that for which you are applying, your fee requirements may be different, due to the way government University funding is distributed. Please check the Equivalent or Lower Qualification (ELQ) policy&nbsp;for more details.<br><br>This also applies to students who progress to the third year of study, following completion of a Foundation Degree. Please note that Foundation Degrees are currently exempt from higher fees.</p>
    </div>
</div>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            ucascode = response.xpath(
                "//dd[contains(text(),'Course Code:')]//text()").extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            item['ucascode'] = ''.join(ucascode).replace("Course Code:",
                                                         "").strip()
            print("len: ", len(ucascode))
            print("item['ucascode'] = ", item['ucascode'])

            # duration
            durationMode = response.xpath(
                "//dt[contains(text(),'Course length')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(durationMode)
            print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            duration_list = getIntDuration(durationMode.strip())
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration']: ", item['duration'])
            print("item['duration_per']: ", item['duration_per'])
            item['other'] = durationMode
            print("item['other']: ", item['other'])

            if "or" in item['ucascode']:
                ucascode_list1 = item['ucascode'].split("or")
                print("ucascode_list1", ucascode_list1)

                # 拆分duration
                if ", or" in item['other']:
                    duration_list1 = item['other'].split(", or")
                else:
                    duration_list1 = [item['other'], item['other']]
                print("duration_list1: ", duration_list1)
                for u in range(len(ucascode_list1)):
                    item['ucascode'] = ucascode_list1[u].strip()
                    duration_list = getIntDuration(duration_list1[u].strip())
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    print("item['duration']: ", item['duration'])
                    print("item['duration_per']: ", item['duration_per'])
                    # 分为两种情况，第一种正常采集，第二种为带实习的专业
                    if u == 0:
                        overview = response.xpath(
                            """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                        ).extract()
                        if len(overview) == 0:
                            overview = response.xpath(
                                """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                            ).extract()
                        item['overview_en'] = remove_class(
                            clear_lianxu_space(overview))
                        # print("item['overview_en']1: ", item['overview_en'])

                        assessment_en = response.xpath(
                            """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|
                            //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|
                            //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Assessment')]/.."""
                        ).extract()
                        item['assessment_en'] = remove_class(
                            clear_lianxu_space(assessment_en))
                        # print("item['assessment_en']1: ", item['assessment_en'])
                    elif u == 1:
                        overview = response.xpath(
                            """//h2[contains(text(),"Professional placement year")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                        ).extract()
                        if len(overview) == 0:
                            overview = response.xpath(
                                """//h2[contains(text(),"Professional placement year")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                            ).extract()
                            if len(overview) == 0:
                                overview = response.xpath(
                                    """//h3[contains(text(),'Overview')]/..|//h3[contains(text(),'overview')]/.."""
                                ).extract()
                        item['overview_en'] = remove_class(
                            clear_lianxu_space(overview))
                        # print("item['overview_en']2: ", item['overview_en'])

                        assessment_en = response.xpath(
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|"
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|"
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'Assessment')]/.."
                        ).extract()
                        if len(assessment_en) == 0:
                            assessment_en = response.xpath(
                                "//h3[contains(text(),'How will I be assessed?')]/..|"
                                "//h3[contains(text(),'How will I be taught?')]/..|"
                                "//h3[contains(text(),'Assessment')]/.."
                            ).extract()
                        item['assessment_en'] = remove_class(
                            clear_lianxu_space(assessment_en))
                        # print("item['assessment_en']2: ", item['assessment_en'])
                    yield item
            else:
                overview = response.xpath(
                    """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                ).extract()
                if len(overview) == 0:
                    overview = response.xpath(
                        """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                    ).extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # print("item['overview_en']1: ", item['overview_en'])

                assessment_en = response.xpath(
                    """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|
                    //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|
                    //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Assessment')]/.."""
                ).extract()
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(assessment_en))
                # print("item['assessment_en']1: ", item['assessment_en'])
                yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)


#            department_dict = {"arts management":"Bath Business School","accounting and finance":"Bath Business School",
# "business and management":"Bath Business School",
# "business and management (accounting)":"Bath Business School",
# "business and management (entrepreneurship)":"Bath Business School",
# "business and management (international business)":"Bath Business School",
# "business and management (marketing)":"Bath Business School",
# "curatorial practice":"Bath School of Art and Design",
# "design (ceramics)":"Bath School of Art and Design",
# "design (fashion and textiles)":"Bath School of Art and Design",
# "fine art":"Bath School of Art and Design",
# "visual communication":"Bath School of Art and Design",
# "children's publishing":"College of Liberal Arts",
# "classical acting":"College of Liberal Arts",
# "composition":"College of Liberal Arts",
# "creative producing":"College of Liberal Arts",
# "creative writing":"College of Liberal Arts",
# "creative writing phd":"College of Liberal Arts",
# "crime and gothic fictions":"College of Liberal Arts",
# "dance":"College of Liberal Arts",
# "directing":"College of Liberal Arts",
# "directing circus":"College of Liberal Arts",
# "environmental humanities":"College of Liberal Arts",
# "environmental management":"College of Liberal Arts",
# "feature filmmaking":"College of Liberal Arts",
# "heritage management":"College of Liberal Arts",
# "intercultural musicology":"College of Liberal Arts",
# "liberal arts":"College of Liberal Arts",
# "literature, landscape and environment":"College of Liberal Arts",
# "music performance":"College of Liberal Arts",
# "performing shakespeare":"College of Liberal Arts",
# "principles of applied neuropsychology":"College of Liberal Arts",
# "scriptwriting":"College of Liberal Arts",
# "songwriting (campus based)":"College of Liberal Arts",
# "songwriting (distance learning)":"College of Liberal Arts",
# "sound (arts)":"College of Liberal Arts",
# "sound (design)":"College of Liberal Arts",
# "sound (production)":"College of Liberal Arts",
# "theatre for young audiences":"College of Liberal Arts",
# "transnational writing":"College of Liberal Arts",
# "travel and nature writing":"College of Liberal Arts",
# "writing for young people":"College of Liberal Arts",
# "counselling and psychotherapy practice":"Institute for Education",
# "education (education studies)":"Institute for Education",
# "education (early childhood studies)":"Institute for Education",
# "education (international education)":"Institute for Education",
# "education (leadership and management)":"Institute for Education",
# "inclusive education":"Institute for Education",
# "professional practice":"Institute for Education",
# "professional practice in higher education":"Institute for Education",
# "teaching english to speakers of other languages":"Institute for Education",
# "specific learning difficulties / dyslexia":"Institute for Education",
# "national award for special educational needs coordination":"Institute for Education",
# "professional doctorate in education":"Institute for Education",
# }

Ejemplo n.º 18

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Reading"
        # item['country'] = 'England'
        # item['website'] = 'http://www.reading.ac.uk/'
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Whiteknights,PO Box 217,Reading, Berkshire,RG6 6AH"
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型、ucas_code
            programmeDegree_typeUcascode = response.xpath(
                "//span[@class='text-bg-standout text-nice-wrap']/text() | //h1[@id='heading']//text() | //h1[@class='hero-heading']//text() | //h1[@class='block-heading block-heading-l5 block-heading-b5 block-heading-md-l-reset cell-md-t0']//text()"
            ).extract()
            clear_space(programmeDegree_typeUcascode)
            programmeDegree_typeUcascode = ''.join(
                programmeDegree_typeUcascode).strip()
            # print("programmeDegree_typeUcascode: ", programmeDegree_typeUcascode)

            degree_type = re.findall(r"^\w+/\w+", programmeDegree_typeUcascode)
            if len(degree_type) == 0:
                degree_type = re.findall(r"^\w+", programmeDegree_typeUcascode)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            ucascode = re.findall(r"\w{4}$", programmeDegree_typeUcascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            programme = programmeDegree_typeUcascode.replace(
                item['degree_name'], '').replace(item['ucascode'], "").strip()
            item['programme_en'] = programme.title()
            print("item['programme_en']: ", item['programme_en'])

            # duration
            durationMode = response.xpath(
                "//h2[@class='row-margin-small text-weight-medium text-size-25']/text() | //strong[contains(text(),'Duration')]/../text() | //h3[contains(text(),'Programme length:')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(durationMode)
            # print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            duration_list = getIntDuration(''.join(durationMode))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # start_date = response.xpath("//p[@class='headline'][contains(text(), 'Start date')]//text()").extract()
            # # print(start_date)
            # item['start_date'] = getStartDate(''.join(start_date))
            # # print("item['start_date']: ", item['start_date'])

            overview2 = response.xpath(
                "//div[@class='m-bg-white m-pad-around m-pull-left-normal m-pull-up']//div[@class='theme-editor'] | //div[@id='top-courseOverview'] | //html//div[@id='top-programmeOverview']/h2[1]/following-sibling::div[1] | //div[@id='tc1']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview2))
            # if item['overview_en'] == "":
            #     print("***overview_en")
            # print("item['overview_en']: ", item['overview_en'])

            # department
            item['department'] = response.meta['department']
            # print("item['department']: ", item['department'])

            if item['department'] == "":
                department = response.xpath(
                    "//aside[contains(@class,'pane base4 m-margin-bottom')]//div[contains(@class,'row-small')]//p[contains(text(), 'School')]/following-sibling::*//text()"
                ).extract()
                clear_space(department)
                item['department'] = ''.join(department).strip()
                item['department'] = item['department'].replace(
                    "How to apply",
                    "").replace("Visit the",
                                "").replace("website",
                                            "").strip().strip('.').strip()
                # print("item['department']1: ", item['department'])
            # if item['department'] == "":
            #     print("***department")

            # //h2[@id='Panel1Trigger']/../..
            entry_requirements = response.xpath(
                "//span[contains(text(),'entry requirements')]/../../.."
            ).extract()
            entry = ''.join(entry_requirements).strip()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            # if item['apply_desc_en'] == "":
            #     print("apply_desc_en 为空")
            # print("item['apply_desc_en']: ", item['apply_desc_en'])

            alevel = response.xpath(
                "//h4[contains(text(),'Typical')]/following-sibling::*[1]//text()|//h4[contains(text(),'A level')]/following-sibling::*[1]//text()"
            ).extract()
            item['alevel'] = ''.join(alevel).strip()
            # if item['alevel'] == "":
            #     print("alevel 为空")
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//h4[contains(text(),'International Baccalaureate')]/following-sibling::*[1]//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # if item['ib'] == "":
            #     print("ib 为空")
            # print("item['ib']: ", item['ib'])

            ielts = re.findall(r"IELT.{1,100}", entry)
            ielts = response.xpath(
                "//*[contains(text(),'IELT')]//text()").extract()
            if ''.join(ielts).strip() == "IELTS":
                ielts = response.xpath(
                    "//*[contains(text(),'IELT')]/following-sibling::*[1]//text()"
                ).extract()
            clear_space(ielts)
            item['ielts_desc'] = ''.join(ielts).strip()
            # if item['ielts_desc'] == "":
            #     print("ielts_desc 为空")
            # print("item['ielts_desc']: ", item['ielts_desc'])
            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # toefl = re.findall(r"TOEFL[\s\(\)\w:\.]{1,300}", entry)
            # if item['toefl_desc'] == "":
            #     item['toefl_desc'] = ''.join(toefl)
            # print("item['toefl_desc']: ", item['toefl_desc'])
            # toeflDict = get_toefl(item['toefl_desc'])
            # item['toefl'] = toeflDict.get("TOEFL")
            # item['toefl_l'] = toeflDict.get("TOEFL_L")
            # item['toefl_s'] = toeflDict.get("TOEFL_S")
            # item['toefl_r'] = toeflDict.get("TOEFL_R")
            # item['toefl_w'] = toeflDict.get("TOEFL_W")
            # # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            modules = response.xpath(
                "//h2[@id='Panel2Trigger']/../..|//div[@id='bottom-courseContent']/..|//div[@id='page_content_wrap']/following-sibling::div[position()<3]|//strong[contains(text(),'Programme structure')]/../following-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//h4[contains(text(),'Programme structure and content')]/preceding-sibling::*[1]/following-sibling::*[position()<11]"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            # //h2[@id='Panel1Trigger']/../..
            career = response.xpath(
                "//h2[@id='Panel4Trigger']/../following-sibling::div[1]|//div[@id='bottom-careers']/..|//div[@id='careers']|//h3[contains(text(),'Careers')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //h3[@class='row-margin-small text-weight-medium'][contains(text(),'How much will it cost?')]/following-sibling::p[2]
            tuition_fee = response.xpath(
                "//p[contains(text(),'New international students')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+|£\d+", ''.join(tuition_fee))
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = int(''.join(tuition_fee_re).replace(
                    "£", "").replace(",", "").strip())

            if item['tuition_fee'] == 0:
                item[tuition_fee] = None
            else:
                item['tuition_fee_pre'] = "£"
            # if item['tuition_fee'] is None:
            #     print("tuition_fee 为空")
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='top-howWeTeachYou']
            assessment_en = response.xpath(
                "//div[@id='top-howWeTeachYou']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="row row-margin-small row-margin-title-10">

                        <h1 class="text-transform-uppercase text-size-30 m-text-size-25 text-weight-medium display-inline text-bg-standout text-nice-wrap">
                            <span class="text-bg-standout">How to apply for undergraduate courses</span>
                        </h1>

                    </div>
                                    <div class="theme-editor theme-editor-break-word">
                        You can apply online for all of our courses via the national admissions service, <a href="http://www.ucas.com">UCAS</a>. You can choose to apply for up to five courses in total, including more than one course at the same institution. <br />
<h4>When to apply&nbsp;</h4>
<p>UK or EU students: You should aim to apply via UCAS between 1 September and 15 January for admission in September 2018. If you have missed the 15 January deadline, there is still the opportunity to apply (via UCAS), and we are happy to consider late applications until 30 June 2018 (all applications received after 30 June are entered into Clearing). Please be aware that some of our courses may be full after the UCAS deadline, so we do recommend early applications where possible.</p>
<p>
International students: You should aim to apply via UCAS between 1 September 2017 and 15 January 2018 for admission in September 2018, though applying before 15 January is encouraged in order to ensure you have time to prepare for studying in the UK. However, if you have missed the 15 January deadline, you are still welcome to apply (via UCAS), and we are happy to consider late applications until 30 June 2018 (all applications received after 30 June are entered into Clearing). Please be aware that some of our courses may be full after the UCAS deadline, so we do recommend early applications where possible.</p>
<h4>UCAS code</h4>
<p>Our UCAS code is R12. The University does not have a campus code.&nbsp;</p>
<h4>UCAS costs</h4>
<p>There is a small charge made by UCAS for applying to university. The application fee is &pound;13 if you&rsquo;re applying to just one course, or &pound;24 for multiple courses and for late applications sent after 30 June.</p>
<h4>Entry requirements</h4>
<p>Please read our <a href="/ready-to-study/study/how-to-apply/entry-requirements-ug.aspx">entry requirements page</a> for more information on accepted qualifications.</p>
<h4>English language requirements</h4>
<p>If English is not your first language, you can find out more information on our <a href="/ready-to-study/international-and-eu/english-language-requirements.aspx">English language requirements</a> page.</p>
                    </div>
                                    <div class="row-large paddingtop-small pad-sides border-top-light">
                        <div class="visuallyhidden" id="show-more-094422b2-b9da-4602-9594-80e05dba925c" aria-hidden="true">
                            <div class="theme-editor">
                                <h4>The application process&nbsp;
</h4>
<p>Once UCAS receives your application, it sends it to our Admissions Office, who assess it and decide whether to offer you a place. The way we assess your application will differ from course to course, but we will use the information supplied in your application form including your personal statement, predicted and achieved grades and the reference supplied by your school or college.&nbsp;</p>
<p>We carefully consider every application so please don’t worry if you don’t hear back from us straight away. We aim to make a decision on all applications within four weeks, and you will be able to track the progress of your application on <a href="https://www.ucas.com/ucas/undergraduate/login">UCAS Track</a>.&nbsp;</p>
<p>We will email you with the outcome of your application and confirm this with UCAS so that you can see the decision online using UCAS Track. If we offer you a place, we will explain any conditions attached to that offer (for example, the need to achieve certain grades in your examinations).&nbsp;</p>
<h4>Interviews</h4>
<p> For some courses, we invite prospective students for an interview before making an offer. These are:&nbsp;</p>
<ul><li>Accounting and Business (assessment centre run in conjunction with PwC)&nbsp;</li>
    <li>Archaeology&nbsp;</li>
    <li>Art</li>
    <li>Chemistry&nbsp;</li>
    <li>Film, Theatre &amp; Television&nbsp;</li>
    <li>Food and Nutritional Sciences&nbsp;</li>
    <li>Graphic Communication&nbsp;</li>
    <li>Pharmacy&nbsp;</li>
    <li>Primary Education&nbsp;</li>
    <li>Psychology (MSci courses)&nbsp;</li>
    <li>Meteorology and Climate (MMet course)&nbsp;</li>
    <li>Theatre Arts, Education and Deaf Studies (TAEDS)&nbsp;</li>
</ul>
<h4>Visit Days</h4>
<p> If you are offered a place to study at the University of Reading without an interview, we will invite you to attend a Visit Day in your department of choice. Visit Days take place between November and March and will usually include a tour of our campus and facilities, a visit to a hall of residence, and the chance to meet academic staff and current students.&nbsp;</p>
<h4>Choosing offers&nbsp;</h4>
<p>Once you have heard from all of the universities that you applied to, UCAS will ask you which offer you want to accept. Most people choose two: one as your ‘firm’ or first choice, the other as your ‘insurance’ or second choice. If you meet the conditions of your offer, you will automatically be accepted onto your firm choice course.&nbsp;</p>
<h4>Confirmation of your place&nbsp;</h4>
<p>Most offers are conditional on exam results. If you meet the conditions set out in our offer, your place is assured and you will see this on <a href="https://www.ucas.com/ucas/undergraduate/login">UCAS Track</a> . If you do not meet the conditions set out in your offer, you may still be able to get on the course. We will let you know as soon as possible after we have received your results.&nbsp;</p>
<h4>Gap year/deferred entry&nbsp;</h4>
<p>We welcome deferred entry applications. You need to apply at the same time as if you were planning to go straight to university, but you should state in your UCAS application that you wish to be considered for deferred admission.</p>
                            </div></div></div>"""
                ]))
            print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['interview_desc_en'] = remove_class(
                clear_lianxu_space([
                    """<h4>Interviews</h4>
<p> For some courses, we invite prospective students for an interview before making an offer. These are:&nbsp;</p>
<ul><li>Accounting and Business (assessment centre run in conjunction with PwC)&nbsp;</li>
    <li>Archaeology&nbsp;</li>
    <li>Art</li>
    <li>Chemistry&nbsp;</li>
    <li>Film, Theatre &amp; Television&nbsp;</li>
    <li>Food and Nutritional Sciences&nbsp;</li>
    <li>Graphic Communication&nbsp;</li>
    <li>Pharmacy&nbsp;</li>
    <li>Primary Education&nbsp;</li>
    <li>Psychology (MSci courses)&nbsp;</li>
    <li>Meteorology and Climate (MMet course)&nbsp;</li>
    <li>Theatre Arts, Education and Deaf Studies (TAEDS)&nbsp;</li>
</ul>"""
                ]))
            print("item['interview_desc_en']: ", item['interview_desc_en'])

            item["require_chinese_en"] = remove_class(
                clear_lianxu_space([
                    """<h2 class="trigger">Entry requirements</h2>
<table summary="A table outlining the basic entry requirements for courses at the University of Reading based on the qualifications offered in your country">
<tbody>
<tr><!-- HEADINGS-->
<td class="top-head"><strong>Your highest qualification</strong></td>
<td class="top-head"><strong>Likely entry level</strong></td></tr>
<tr><!-- FIRST ROW -->
<td>
<p><!-- EG FIRST ROW FIRST COLUMN INFO -->High School year 2 (Year 11) with leaving certificate: GPA 85%<br/>High School year 3 (Year 12) with graduation certificate: GPA 80%</p></td>
<td><a href="http://www.reading.ac.uk/foundation" name="ifp" >International Foundation Programme</a> </td></tr>
<tr class="even"><!-- SECOND ROW -->
<td>Gao Kao (Chinese University Entrance Exam) 80%</td>
<td><a href="http://www.reading.ac.uk/foundation" name="ifp" >International Foundation Programme</a> </td></tr>
<tr><!-- THIRD ROW -->
<td>Gau Cau (Chinese University Entrance Exam) combined with a successfully completed appropriate foundation/bridging programme. (Visit our <a href="http://www.reading.ac.uk/foundation" name="ifp" >International Foundation Programme</a>) </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr class="even"><!-- FOURTH ROW -->
<td>International Baccalaureate (IB) Diploma </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr><!-- FIFTH ROW -->
<td>British/International A Levels </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr class="even"><!-- SIXTH ROW -->
<td>Chinese-medium A Levels in Mathematics and Sciences (Cambridge Examinations Board) </td>
<td>Undergraduate Degree (Bachelors Degree) in a relevant subject </td></tr>
<tr>
<td>Ameson: High school results of 85% if 11 years completed, 80% if 12 years (with similar grades in relevant subjects), AST Maths: 165 and AST English: 150</td>
<td>Undergraduate Degree (Bachelors Degree)</td></tr>
<tr class="even"><!-- EIGHTH ROW -->
<td>Other international qualifications such as Australian HSC, US SAT or AP Certificates</td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr><!-- NINTH ROW -->
<td>Successfully completed first year of a Chinese University degree </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr class="even"><!-- TENTH ROW -->
<td>4-year Bachelor degree </td>
<td>Taught Postgraduate (Masters and Doctoral Degree) </td></tr>
<tr>
<td>&nbsp;Masters degree study </td>
<td>&nbsp;Research Postgraduate (Doctoral Degree) </td></tr></tbody></table>"""
                ]))
            print("item['require_chinese_en']: ", item['require_chinese_en'])
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 19

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.port.ac.uk/"
        item['university'] = "University of Portsmouth"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item[
            'location'] = 'University House, Winston Churchill Avenue, Portsmouth PO1 2UP'
        print("===========================")
        print(response.url)
        try:
            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            programme = response.xpath("//h1[@class='Title']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            degree_type = response.xpath(
                "//h1[@class='Title']/small//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            ucascode = response.xpath(
                "//nobr[contains(text(),'UCAS Code')]/../following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).replace("UCAS Code:",
                                                         "").strip()
            # print("item['ucascode'] = ", item['ucascode'])

            item['start_date'] = response.meta.get(response.url)
            # print("item['start_date'] = ", item['start_date'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            # department = response.xpath(
            #     "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department)
            # print("item['department']: ", item['department'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            duration = response.xpath(
                "//div[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)
            item['other'] = duration_str

            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            location = response.xpath(
                "//div[contains(text(),'Location')]/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location)
            # print("item['location']: ", item['location'])

            # //strong[contains(text(),'International students')]/../following-sibling::p[1]
            tuition_fee = response.xpath(
                "//h3[contains(text(),'Tuition fees')]/..//*[contains(text(),'International students')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = int(tuition_fee_re[0].replace(
                    ",", "").replace("£", "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            overview = response.xpath(
                """//h2[@id='overview']/..|//h3[contains(text(),"What you'll experience")]/..|
            //h3[contains(text(),'What you’ll experience')]/..|//*[contains(text(),"What you'll experience")]/../.."""
            ).extract()
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview)).replace(
                    "<h3>What you'll experience</h3>", "").strip()
            print("item['overview_en']: ", item['overview_en'])

            career = response.xpath(
                "//h3[contains(text(),'Careers and opportunities')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            rntry_requirements_content = response.xpath(
                "//div[contains(text(),'Entry Requirements')]/../../..//div[contains(text(),'2019 start')]/../../../..//text()"
            ).extract()
            rntry_requirements_str = clear_lianxu_space(
                rntry_requirements_content)

            ieltsList = response.xpath(
                "//*[contains(text(),'English language proficiency')]/text()|"
                "//*[contains(text(),'English Language proficiency')]/text()"
            ).extract()
            # print(ieltsList)
            if len(ieltsList) == 0:
                ieltsList = re.findall(r".{1,45}IELTS.{1,85}",
                                       rntry_requirements_str)
            clear_space(ieltsList)
            if len(ieltsList) > 0:
                item['ielts_desc'] = ''.join(ieltsList[1:]).strip()
                if item['ielts_desc'] == "":
                    item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            alevel = response.xpath(
                "//*[contains(text(),'A levels')]/text()").extract()
            # print(ieltsList)
            if len(alevel) == 0:
                alevel = re.findall(r".{1,45}A\slevels.{1,85}",
                                    rntry_requirements_str)
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[1:]).strip()
                if item['alevel'] == "":
                    item['alevel'] = ''.join(alevel).strip()
            print("item['alevel']: ", item['alevel'])

            modules = response.xpath(
                "//h2[@id='What youll study']/..|//h2[@id='What youll study']/../following-sibling::div[1]|//div[contains(text(),'Units currently being studied')]/../../.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            teaching_assessment = response.xpath(
                "//h2[@id='Teaching']/..|//h2[@id='Teaching']/../following-sibling::*[1]|//h2[@id='How youre assessed']/..|//h2[@id='How youre assessed']/../following-sibling::*[1]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en']: ", item['assessment_en'])

            apply_proces_en = response.xpath(
                "//h2[@id='Apply']/..|//h2[@id='Apply']/../following-sibling::*"
            ).extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['apply_documents_en'] = remove_class(
                clear_lianxu_space([
                    """<h2 style="color: #384047; margin: 33px 0px 0.7em; padding: 0px;">What you'll need to send us</h2>
<p style="color: #384047; margin: 0px 0px 25px; border: none;">When you apply to join us, we'll need to see the following documents:</p>
<ul style="color: #384047; margin: 0px 0px 25px; padding-left: 35px; border: none; list-style-image: initial;">
    <li style="margin-top: 0px;">A completed application form</li>
    <li>A Personal Statement or Statement of Purpose</li>
    <li>Officially certified and translated copies of your high school or college qualification and grades (for undergraduate courses)</li>
    <li>Officially certified and translated copies of your degree qualification and grades (for Postgraduate courses)</li>
    <li>Proof of your English language level (such as an IELTS Certificate)</li>
    <li style="margin-bottom: 0px;">One academic reference on official headed paper for undergraduate courses or two references for postgraduate courses</li>
</ul>"""
                ]))
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Undergraduate courses</h3>
<p>If you've completed the Chinese Senior High School Diploma plus one year at a recognised university in China, we'll consider you for admission onto an undergraduate course such as a Bachelor's degree. You must have studied relevant subjects and achieved strong grades.</p>
<p>If you don't have a Chinese Senior High School Diploma, you can apply with:</p>
<h4>A levels</h4>
<ul>
    <li>Most courses will require 120 UCAS points. Your A level grades should equal or exceed the total points required. You can use the&nbsp;<a rel="noopener noreferrer" rel="noopener noreferrer" href="https://www.ucas.com/ucas/tariff-calculator"></a><a rel="noopener noreferrer" href="https://www.ucas.com/ucas/tariff-calculator" target="_blank">UCAS Tariff Calculator</a>&nbsp;to work out your total points. Please check your specific course page to find the exact number of points.</li>
    <li>Some courses will require you to have studied specific subjects at A level. For example, to study a science course you will usually need to have achieved passing grades in scientific subjects at A level.</li>
    <li>A level points: A* = 56 A = 48 B = 40 C = 32 D = 24.</li>
</ul>
<h4>International Baccalaureate</h4>
<ul>
    <li>Most courses will require between 24 and 31 points in the International Baccalaureate (IB), depending on the degree you apply for.</li>
</ul>
<p>You may also be considered for advanced entry onto a relevant undergraduate degree programme if you have a College Graduation Diploma (Dazhuan) from a recognised university or college on completion of two to three years of study, or a BTEC HND or SQA HND Higher National Diploma in a relevant subject.</p>
<p>You may be able to join an undergraduate course with other qualifications. We do consider qualifications from a range of sources. Contact us to find out more.</p>"""
                ]))
            item[
                "ib"] = "Most courses will require between 24 and 31 points in the International Baccalaureate (IB), depending on the degree you apply for."
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 20

Mostrar archivo

Archivo: UniversityOfSalford_U.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.salford.ac.uk/"
        item['university'] = "University of Salford"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = 'The Crescent, Salford, M5 4WT, UK'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            # 专业、学位类型
            programme = response.xpath("//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h1//text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            if "Foundation" not in item['programme_en'] and item['degree_name'] != "Graduate Certificate":
                degree_type = response.xpath("//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h2//text()").extract()
                item['degree_name'] = ''.join(degree_type).replace("(Hons)", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                # //div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/p
                department = response.xpath("//strong[contains(text(), 'School -')]/../text()").extract()
                item['department'] = ''.join(department).strip()
                # print("item['department']: ", item['department'])

                start_date = response.xpath("//strong[contains(text(), 'Start Date(s):')]/../text()").extract()
                clear_space(start_date)
                # print("start_date: ", start_date)
                if ";" in ''.join(start_date):
                    start_date = ''.join(start_date).split(";")
                    for s in start_date:
                        item['start_date'] += getStartDate(s) + ","
                else:
                    item['start_date'] = getStartDate(''.join(start_date))
                item['start_date'] = item['start_date'].strip().strip(",").strip()
                # print("item['start_date']: ", item['start_date'])



                # //strong[contains(text(), 'Fees')]/../following-sibling::p[contains(text(), 'International -')]
                tuition_fee = response.xpath("//strong[contains(text(), 'Fees')]/../following-sibling::p[contains(text(), 'International')]//text()").extract()
                clear_space(tuition_fee)
                # print("tuition_fee: ", tuition_fee)
                tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
                # print("tuition_fee_re: ", tuition_fee_re)
                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                # print("item['tuition_fee']: ", item['tuition_fee'])

                # //div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1]
                overview = response.xpath(
                    "//div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1] | //div[@id='content']/div[@class='row']/div[1]").extract()
                item['overview_en'] = remove_class(clear_lianxu_space(overview))
                # print("item['overview_en']: ", item['overview_en'])

                # //section[@id='about']/div[@id='content']
                modules_en = response.xpath("//div[@id='courseaccordion']").extract()
                if len(modules_en) == 0:
                    # print("********")
                    modules_en = response.xpath("//h2[contains(text(),'Course Details')]/following-sibling::*").extract()
                    if len(modules_en) == 0:
                        modules_en = response.xpath("//h2[contains(text(),'Course Structure')]|//h2[contains(text(),'Course Structure')]/following-sibling::*").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules_en)) # .replace("&nbsp;", "")
                item['modules_en'] = item['modules_en'].encode('utf-8').decode("unicode-escape").replace("Â ", "")
                # print("item['modules_en']: ", item['modules_en'])

                alevel = response.xpath("//*[contains(text(),'A level')]/following-sibling::td//text()").extract()
                item['alevel'] = clear_lianxu_space(alevel)
                # print("item['alevel']: ", item['alevel'])

                ib = response.xpath("//*[contains(text(),'International Baccalaureate')]/following-sibling::td//text()").extract()
                item['ib'] = clear_lianxu_space(ib)
                # print("item['ib']: ", item['ib'])

                # //section[@id='requirements']/div
                entry_requirements = response.xpath("//section[@id='requirements']/div//text()").extract()
                # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
                # print("item['rntry_requirements']: ", item['rntry_requirements'])

                # 申请材料
                apply_documents_en = response.xpath("//h3[contains(text(),'Applicant profile')]/preceding-sibling::*[1]/following-sibling::*").extract()
                item['apply_documents_en'] = remove_class(clear_lianxu_space(apply_documents_en))
                # print("item['apply_documents_en']: ", item['apply_documents_en'])

                # //h3[contains(text(),'English Language Requirements')]/following-sibling::*[1]
                ielts_desc = response.xpath("//h3[contains(text(),'English Language Requirements')]/following-sibling::*[position()<3]//text()").extract()
                clear_space(ielts_desc)
                item['ielts_desc'] = ''.join(ielts_desc).replace("Suitable For", "").strip()
                # print("item['ielts_desc']: ",item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'].replace("level 4, 5, 6 ", "").strip())
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))


                # //section[@id='teaching']/div[@class='container main']/div[@class='col-md-12']/div[@id='teaching_0a19']
                assessment_en = response.xpath("//section[@id='teaching']/div").extract()
                item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))
                # print("item['assessment_en']: ", item['assessment_en'])

                # //section[@id='employability']/div[@class='container main']/div[@class='col-md-12']/div[@id='employ_0a19']
                career = response.xpath("//section[@id='employability']/div").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']: ", item['career_en'])

                item['apply_proces_en'] = remove_class(clear_lianxu_space(["""<div class="col-xs-12 col-sm-6 col-md-6 col-lg-6"><h1>How to apply?</h1>
</div>
<div class="clearfix"></div>
<div id="content_container_1135928">
<p class="lead">We try to make applying to Salford as flexible and straightforward as possible.</p><p>If you have any queries about the application process, please contact the Course Enquiries Team on 0161 295 4545 (choose option 1) or <a href="mailto:[email protected]">[email protected]</a></p><p>The majority of our undergraduate applications are via UCAS, however there are some exceptions. The normal closing date for UCAS applications for September entry is 15 January (although if you are at school or college, they may ask you to fill it in earlier to give them time to prepare your reference). Applications received after this date may still be considered if the course is not full. If you apply late you are advised to check the UCAS website for course availability first. To find out more about how to apply and the different options available, we have given some guidance below.</p><p><strong>If you're interested in applying through clearing, please see further details below.</strong></p>
</div>
<div class="accordion-inner" id="accordion_1421686" role="tablist" aria-multiselectable="true">
<div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_1">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_1" aria-expanded="false" aria-controls="collapse_sub_1" class="collapsed">
          Applying through Clearing?
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_1" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_1">
      <div class="panel-body">
<div id="content_container_1615870">
<p>Clearing is applicable to those who have not yet submitted an application or those who are not yet holding any offers. During this time, universities can take applications to fill any course vacancies they may have. UCAS will list course availability so you can take a look on <a href="http://www.ucas.com">www.ucas.com</a> for more information. Despite popular belief, clearing opens at 9.00am on 5 July 2018 so potential applicants needn&rsquo;t wait until Results Day. You can find more information about this process <a href="https://www.ucas.com/undergraduate/results-confirmation-and-clearing/what-clearing">here.</a></p><p>If you'd like to apply to Salford through clearing, or have any questions at all about the process, please give our hotline a call on 0300 555 5030 and one of our friendly team can help you. The lines are open 9.00am to 5.00pm Monday to Thursday and 10.00am to 4.00pm on Friday. Please ensure you have an idea of what course or subject area you&rsquo;re interested in and that you have details of all your current qualifications to hand when you call. This means we'll be able to advise you of the best options to match your interests and qualifications.</p><p>If you are successful in gaining a place through clearing, you will need to either attach to the University via UCAS Track or complete a direct application form, depending on your current circumstances. We can help advise you of this when you call.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_2">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_2" aria-expanded="false" aria-controls="collapse_sub_2" class="collapsed">
          Entry requirements
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_2" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_2">
      <div class="panel-body">
<div id="advice_and_guidance">
<p><script>(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){     (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),     m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)     })(window,document,'script','//www.google-analytics.com/analytics.js','ga');     ga('create', 'UA-2643712-2', 'salford.ac.uk');     ga('require', 'displayfeatures');     ga('require', 'linkid', 'linkid.js');     ga('send', 'pageview');</script></p><div id="page"><div><div><div id="content_div_44347"><p>From A level to Access to HE diplomas to International Baccalaureate &ndash; our entry requirements are wide and varied and we recognise a wide range of qualifications from around the world.</p><p>If you are a UK applicant leaving school or college we accept the following qualifications:</p><ul><li>A/AS level</li><li>Scottish Higher/Advanced Higher</li><li>14 to 19 Advanced Diplomas</li><li>the International Baccalaureate</li><li>Welsh Baccalaureate</li><li>BTEC Awards</li><li>AQA Baccalaureate</li><li>Cambridge Pre-U</li><li>Access to Higher Education Diploma</li></ul><p>We&rsquo;re committed to widening access to higher education for all sections of the community and so are as flexible as possible in our admissions policy.</p></div></div></div></div>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_3">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_3" aria-expanded="false" aria-controls="collapse_sub_3" class="collapsed">
          The UCAS application process
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_3" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_3">
      <div class="panel-body">
<div id="advice_and_guidance">
<div id="page"><div><div><div id="content_div_44326"></div><h4>APPLYING VIA UCAS</h4></div></div></div><p>You can apply online via the&nbsp;<a href="http://www.ucas.com/students/apply" target="_blank"><u>UCAS website</u></a>. Your school or college should help you to access this service, and will give you a buzzword which you will need to register. You can also use the service if you&rsquo;re applying independently in the UK and overseas.</p><p>Take some time to check your online application before you submit it. It&rsquo;s important that you take care when completing the 'choices' section and use the correct institution codes and programme codes.</p><p>You can apply for up to five programmes, so make sure you think through your options carefully before applying. Also check that the 'education' section displays your correct qualification details. You should then forward your application on to your referee, who will check your application, provide the reference and forward the application to UCAS. If you are applying independently, you will be responsible for obtaining and adding the reference yourself.</p><p>It costs £23 to apply for two-five choices. If, however, you wish to only apply to the University of Salford and you are only applying for one programme, you may use a single entry at a fee of £12. 
You may pay online using a credit or debit card, or alternatively if you are applying from your school or college they may choose to be invoiced.</p><p>You can follow the progress of your application 24 hours a day, seven days a week using UCAS&nbsp;<cite>Track</cite> by logging on to&nbsp;<a href="http://www.ucas.com" target="_blank"><u>www.ucas.com</u></a>.</p><p>If you experience any difficulties with Apply you should contact the UCAS Customer Services team:<br />T +44 (0)871 468 0468</p><p>If you would like further help from the University, please contact:</p><p>Central Admissions<br />T: 0161 295 4545<br /><a href="mailto:[email protected]"><u>[email protected]</u></a></p><p>All applications should be made via the UCAS website, except for:</p><ul><li>applications for part-time courses and BSc (Hons) Nursing Studies</li><li>applications for the International Foundation Year</li><li>applications for post qualifying Health and Social care courses and single modules</li><li>applications for International and the Graduate Certificate in Management (GCIM)</li></ul><p>If the above applies to you, please see the &lsquo;Alternative application process&rsquo; section on this page for more details of how to apply.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_4">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_4" aria-expanded="false" aria-controls="collapse_sub_4" class="collapsed">
          UCAS applications: key information
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_4" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_4">
      <div class="panel-body">
<div id="content_container_1136542">
<p>When applying, make sure you use the correct codes:</p><p>Salford institution code name:&nbsp;<strong>SALF</strong></p><p>Salford institution code:&nbsp;<strong>S03</strong></p><p>Campus code: there are no campus codes at Salford - leave this blank</p><p>We will start receiving applications for entry in autumn 2018 from early September 2017</p><p>The normal closing date for applications is&nbsp;<strong>15 January 2018</strong> (although if you are at school or college, they may ask you to fill it in earlier to give them time to prepare your reference). Applications received after this date may still be considered if the course is not full. If you apply late, please check the UCAS website for course availability first.</p><p><strong>UCAS Extra:</strong> 25 February &ndash; 4 July 2018</p><p><strong>Clearing starts:</strong> 5 July 2018</p><p><strong>A level results:</strong> 16 August 2018</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_5">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_5" aria-expanded="false" aria-controls="collapse_sub_5" class="collapsed">
          Alternative application process
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_5" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_5">
      <div class="panel-body">
<div id="advice_and_guidance">
<h3>INTERNATIONAL FOUNDATION YEAR APPLICATIONS</h3><p>If you would like to apply for our International Foundation Year you will need to download and complete the application form.</p><p>You should then email it to&nbsp;<a href="mailto:[email protected]">[email protected]</a> or post the completed form to the address on the front cover of the form.</p><p>NOTE: Together with the form, you will also need to send us a photocopy of the identity page of your passport and proof of your current level of English.</p><ul type="disc"><li><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0005/1190894/0518-International-foundation-year-salford-2017.pdf" target="_blank">International Foundation Year application form</a></li></ul><h3>POST QUALIFYING HEALTH AND SOCIAL CARE COURSES AND SINGLE MODULE APPLICATIONS</h3><p>If your Trust is part of the North West SHA-SLA agreement, contact your&nbsp;<strong>CPD Lead to agree funding</strong>. You must then apply through the NHS CPD Apply system: <a href="http://www.cpd-applynw.nhs.uk"><strong>www.cpd-applynw.nhs.uk</strong></a></p><p><strong> </strong></p><p>If you are self-funding or your workplace is funding your course or single module, please complete the appropriate application form below and send directly to the University. Full details on who to send the form to are included on the form. 
If workplace funded, you will also need to supply a letter&nbsp;&nbsp;from your Trust stating that they will be funding your study.</p><p><strong> </strong></p><h3>APPLICATION FORMS</h3><p><strong>LEVEL 5 AND LEVEL 6 (UNDERGRADUATE/SINGLE MODULE) APPLICATION FORM</strong></p><p><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/974168/RU1095-UGApplicationFormNov2016v3.pdf"><strong>Download Undergraduate Application Form (please note you will need to provide two references)</strong></a></p><p><strong> </strong></p><p><strong>LEVEL 7 (MASTERS/SINGLE MODULE) APPLICATION FORM</strong></p><p><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0007/79522/RU12483-PGApplicationNov2016.pdf"><strong>Download Postgraduate Application Form (please note you will need to provide two references)</strong></a></p><h3>INTERNATIONAL APPLICATIONS</h3><p>We welcome applications from international students. Applicants from outside the United Kingdom must apply through UCAS.</p><p>Students from the following countries should submit their applications to UCAS through the appropriate overseas office in London as listed in the UCAS Handbook: Cyprus, Guyana, India, Luxembourg, Thailand. Applicants from other countries should send their applications direct to UCAS. (Salford participates&nbsp;in the British Council Education Promotion Service and advisers are available in your local British Council Office).</p><p>Applications for the&nbsp;<strong>Graduate Certificate in Management</strong> should be made by completing the&nbsp;<a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/974168/RU1095-UGApplicationFormNov2016v3.pdf" target="_blank">undergraduate application form and returned to the address on the form</a>. 
For more information,&nbsp;see our section specifically for&nbsp;<a href="http://www.salford.ac.uk/international/how-to-apply">international students</a>.</p><h3>APPLYING FOR A TOP-UP COURSE</h3><p>If you are currently studying a Foundation Degree at one of our Partner Colleges and wish to top up your award to an Honours Degree, you should apply directly to the University by completing our&nbsp;<a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/974168/RU1095-UGApplicationFormNov2016v3.pdf">Undergraduate Studies direct application form</a>.</p><p>Information on the courses available to you to apply for are available from staff at your College.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_6">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_6" aria-expanded="false" aria-controls="collapse_sub_6" class="collapsed">
          Writing your personal statement
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_6" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_6">
      <div class="panel-body">
<div id="content_container_1136542">
<p>Writing your personal statement is a key part of the UCAS application process, but we know that it can sometimes feel like a daunting task. <br /><img width="1" height="15" alt="http://www.salford.ac.uk/__data/assets/image/0005/660326/spacer.gif" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAPCAMAAAASwVXLAAAAAXNSR0ICQMB9xQAAAANQTFRFAAAAp3o92gAAAAF0Uk5TAEDm2GYAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAAZdEVYdFNvZnR3YXJlAE1pY3Jvc29mdCBPZmZpY2V/7TVxAAAAC0lEQVQY02NgwAcAAB4AAcmYI4MAAAAASUVORK5CYII=" /><br /> Here are our top five tips for a winning personal statement&hellip;</p><h4>1. PREPARATION, PREPARATION, PREPARATION</h4><p>Start to prepare your personal statement early on. Without thinking about a structure, it will be harder for you to collect and organise your thoughts later on. Begin with a mind map, noting down anything you think might be relevant, such as your skills, qualities, likes, dislikes and experience. Formulate this into a plan, and use your plan to help structure your first draft.</p><h4>2. capture the attention of the reader</h4><p>Capture the attention of the reader with a sharp introduction; use positive language to show them you&rsquo;re enthusiastic about your chosen subject. The main section of your statement should demonstrate what you have learned from the experiences you are writing about, and how this is relevant to the course you want to study. Tell the reader about your skills and qualities, and how these contribute to your understanding of the subject.</p><h4>3. TELL THEM ABOUT YOURSELF</h4><p>Your work experience and interests are also important, as this can highlight why you&rsquo;re the perfect student for the course. Universities want to know that you&rsquo;ll become an active member of the student community, not just succeed academically. Are you part of a club, do you play any sports, or have an unusual hobby?&nbsp;&nbsp;Make sure you mention these here. 
Remember; entry requirements are transparent, you and your experiences are unique.</p><h4>4. SELL, SELL, SELL</h4><p>Conclude your personal statement by summarising your key strengths, and reiterate that you are ready (both socially and academically) for university life. Be confident, keep it positive, and really sell yourself.</p><h4>5. LOSE THE WAFFLE</h4><p>Remember that you have a limited number of words, so be clear and concise &ndash; don&rsquo;t waffle! Don&rsquo;t expect your first draft to be perfect; ask a teacher or careers advisor to check over it for you so that your spelling and grammar are spot on.<br /><img width="1" height="15" alt="http://www.salford.ac.uk/__data/assets/image/0005/660326/spacer.gif" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAPCAMAAAASwVXLAAAAAXNSR0ICQMB9xQAAAANQTFRFAAAAp3o92gAAAAF0Uk5TAEDm2GYAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAAZdEVYdFNvZnR3YXJlAE1pY3Jvc29mdCBPZmZpY2V/7TVxAAAAC0lEQVQY02NgwAcAAB4AAcmYI4MAAAAASUVORK5CYII=" /></p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_7">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_7" aria-expanded="false" aria-controls="collapse_sub_7" class="collapsed">
          Salford Alternative Entry Scheme
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_7" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_7">
      <div class="panel-body">
<div id="content_container_1136542">
<p>At the University of Salford we believe that everyone should be able to achieve their full potential.</p><p>If you&rsquo;ve been out of education for a while, want to further your career or simply want to study in a field that you&rsquo;re passionate about, the Salford Alternative Entry Scheme (SAES) could be for you.</p><p><strong>How can I apply?</strong></p><ul><li>Submit your course application as you would normally through UCAS.</li><li>We will then recommend you to SAES if you&rsquo;re eligible (you&rsquo;ll be notified of this recommendation via UCAS or by our admissions team getting in touch with you).&nbsp;&nbsp;</li></ul><p>Successful applicants will be contacted through SAES about an assessment.&nbsp;</p><p>There are two different entry routes depending on your course. If the course you&rsquo;re studying is related to Business and Law, Health Sciences, Environment &amp; Life Sciences or Arts &amp; Media, you will be assessed through Accreditation for Prior Learning.&nbsp;If the course you&rsquo;re studying is related to Nursing, Midwifery, Social Work &amp; Social Sciences, Computing, Science &amp; Engineering or the Built Environment, you will be assessed through a MSAP-UK test.</p><p>For further information about the Salford Alternative Entry Scheme, please see our dedicated <a href="http://www.salford.ac.uk/study/undergraduate/salford-alternative-entry-scheme/entry-routes" target="_blank">SAES webpage</a>.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_8">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_8" aria-expanded="false" aria-controls="collapse_sub_8" class="collapsed">
          Part-time and accelerated degree courses
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_8" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_8">
      <div class="panel-body">
<div id="content_container_1136542">
<p>Part-time applications are made directly to the University. If you want to apply to study a part-time undergraduate qualification please contact our&nbsp;<strong>Course Enquiries Service</strong> on&nbsp;<strong>0161 295 4545</strong> for more information.</p><p>You&rsquo;ll need to complete an <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/974168/RU1095-UGApplicationFormNov2016v3.pdf" target="_blank"><u>Undergraduate Studies direct application form</u></a>.</p><p>Please note you will need to provide two references.</p><p>If you&rsquo;re applying for one of our accelerated courses in the School of the Built Environment, these applications are made through<a href="http://www.salford.ac.uk/study/undergraduate/how-to-apply/#section1"> UCAS</a>. This applies to both the full-time and the day-release versions of the course, with durations of two or three years respectively. Only the standard part-time courses of five years&rsquo; duration require direct applications.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_9">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_9" aria-expanded="false" aria-controls="collapse_sub_9" class="collapsed">
          Applying for 2019 Entry?
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_9" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_9">
      <div class="panel-body">
<div id="content_container_1572153">
<p>If you&rsquo;re looking to apply for a course starting in 2019, you can start working on your application through UCAS from 22 May, for submission from 5 September onwards.</p><p>In the meantime, there are plenty of ways to get to know us better:</p><p>Visit us on our next Open Day on 23 June &ndash; book your place <a href="http://www.salford.ac.uk/study/visit/undergraduate-open-days" target="_blank">here</a></p><p>Follow us on social media for live updates:</p><p>Twitter.com/@SalfordUni</p><p>Snapchat: Salforduni</p><p>Instagram:@SalfordUni</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_10">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_10" aria-expanded="false" aria-controls="collapse_sub_10" class="collapsed">
          Higher and Degree Apprenticeship Programmes
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_10" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_10">
      <div class="panel-body">
<div id="content_container_1572153">
<p>If you're looking to apply for a Higher or Degree Apprenticeship programme offered by the University of Salford then you will need to visit our apprenticeship webpage below and contact us via the online form. Applications for apprenticeship programmes do not go through the traditional UCAS system - you will apply directly to the University via a separate application form.&nbsp;</p><p><a href="https://www.salford.ac.uk/higher-and-degree-apprenticeships">www.salford.ac.uk/degree-apprenticeships</a></p><p>In order to apply for a Higher or Degree Apprenticeship you must be employed full-time in a relevant role and your employer must be willing to support you through the programme.</p><p>All apprenticeship roles are listed on the GOV.UK site below:&nbsp;</p><p><a href="https://www.gov.uk/apply-apprenticeship" target="_blank">www.gov.uk/apply-apprenticeship</a></p>
</div>
      </div>
    </div>
</div>
</div>"""]))
                # print("item['apply_proces_en']: ", item['apply_proces_en'])

                item['require_chinese_en'] = remove_class(clear_lianxu_space(["""<h2>Undergraduate</h2><p>Students who have completed a recognised foundation year or the first year of a relevant degree programme at a Chinese university may be considered</p>"""]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                duration = response.xpath(
                    "//strong[contains(text(), 'Duration')]/../following-sibling::*[position()<3]//text()").extract()
                clear_space(duration)
                print("duration: ", duration)
                # duration_str = ''.join(duration)

                if len(duration) == 1:
                    duration_list = getIntDuration(duration[0])
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                else:
                    item['duration'] = ', '.join(duration).replace("Fees:", "").strip().strip(',').strip()
                print("item['duration'] = ", item['duration'])
                print("item['duration_per'] = ", item['duration_per'])

                ucascode = response.xpath(
                    "//strong[contains(text(),'UCAS Code:')]/..//text()").extract()
                clear_space(ucascode)
                print("ucascode: ", ucascode)
                if len(ucascode) > 0:
                    # if len(ucascode[-1]) == 4:
                    #     item['ucascode'] = ucascode[-1]
                    # else:
                    item['ucascode'] = ucascode[-1]
                print("item['ucascode'] = ", item['ucascode'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)