コード例 #1
0
    def parse(self, response):
        """Schedule one detail-page request per programme on the listing page.

        A ``{programme title: department}`` mapping is scraped from the degree
        list and passed as ``meta`` to every request; the callback of each
        request is ``self.parse_data``.

        Args:
            response: Listing-page response; only ``response.xpath`` is used.

        Yields:
            scrapy.Request: One request per distinct programme link, in the
            order the links first appear in the extracted list.
        """
        programme_titles = response.xpath(
            "//div[@id='content']//ul[@class='list list--degrees']/li//a/h3/span[@class='list__heading-title']//text()"
        ).extract()

        department_names = response.xpath(
            "//div[@id='content']//ul[@class='list list--degrees']/li//a/p[@class='list__content list__content--department']//span[@class='list__text']//text()"
        ).extract()
        # The return value is ignored, so clear_space presumably cleans the
        # list in place -- TODO confirm against its definition.
        clear_space(department_names)

        # Pair by position. zip() stops at the shorter list, so a programme
        # without a scraped department no longer raises IndexError; the
        # mismatch is reported instead because the pairing may be misaligned.
        if len(programme_titles) != len(department_names):
            print("programme/department count mismatch: %d programmes, %d departments"
                  % (len(programme_titles), len(department_names)))
        department_by_programme = dict(zip(programme_titles, department_names))

        links = response.xpath(
            "//div[@class='programmes']/ul[@class='list list--programmes']/li/h2[@class='list__heading heading']/a/@href|"
            "//div[@id='content']//ul[@class='list list--degrees']/li//a/@href"
        ).extract()

        # dict.fromkeys de-duplicates while keeping first-seen order, which
        # makes the crawl order reproducible (set() ordering is arbitrary).
        for link in dict.fromkeys(links):
            # Prefix site-relative hrefs with the host; an href that is
            # already absolute must be used unchanged.
            if link.startswith(("http://", "https://")):
                url = link
            else:
                url = "http://www.lboro.ac.uk" + link
            yield scrapy.Request(url,
                                 callback=self.parse_data,
                                 meta=department_by_programme)
コード例 #2
0
    def parse(self, response):
        """Schedule one detail-page request per course on the listing page.

        A ``{absolute course URL: start date}`` mapping is scraped from the
        listing and passed as ``meta`` to every request; the callback of each
        request is ``self.parse_data``.

        Args:
            response: Listing-page response; only ``response.xpath`` is used.

        Yields:
            scrapy.Request: One request per distinct course link, in the
            order the links first appear in the extracted list.
        """
        links = response.xpath(
            "//div[@class='Content']/div/div[@class='o-Grid o-Grid--full']/div/div[@class='Panel-body']/h2/a/@href"
        ).extract()
        start_dates = response.xpath(
            "//div[@class='Content']/div[@class='Panel Panel--imageright']//strong[contains(text(), 'Start Date')]/../text()"
        ).extract()
        # The return value is ignored, so clear_space presumably cleans the
        # list in place -- TODO confirm against its definition.
        clear_space(start_dates)

        base_url = "https://www.port.ac.uk"

        def to_absolute(link):
            # Prefix site-relative hrefs with the host; leave an href that is
            # already absolute unchanged.
            if link.startswith(("http://", "https://")):
                return link
            return base_url + link

        # Pair links and start dates by position. zip() stops at the shorter
        # list, so a course without a scraped start date no longer raises
        # IndexError; the mismatch is reported because the pairing may be off.
        if len(links) != len(start_dates):
            print("link/start-date count mismatch: %d links, %d start dates"
                  % (len(links), len(start_dates)))
        programme_dict = {
            to_absolute(link): start_date
            for link, start_date in zip(links, start_dates)
        }

        # NOTE (translated from the original author): programme descriptions
        # could not be matched completely.

        # Requests use the same https URL that keys programme_dict, so a
        # lookup by URL does not depend on an http -> https redirect.
        # dict.fromkeys de-duplicates while keeping first-seen order.
        for link in dict.fromkeys(links):
            yield scrapy.Request(to_absolute(link),
                                 callback=self.parse_data,
                                 meta=programme_dict)
コード例 #3
0
    def parse_data(self, response):
        """Scrape one University of Salford taught-course page into an item.

        Fills an item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
        fixed university metadata plus fields extracted from ``response`` via
        XPath (programme, degree name, department, start date, duration,
        tuition fee, overview, modules, entry requirements, application
        documents, IELTS, assessment, career), then adds two hard-coded HTML
        blocks (application process, requirements for Chinese applicants).

        The item is yielded only if the whole extraction succeeds. On any
        exception the error text and URL are appended to
        ``<university><degree_type>.txt`` and printed, and nothing is yielded.

        Args:
            response: Detail-page response; ``response.url`` and
                ``response.xpath`` are used.

        Yields:
            The populated item.
        """
        # NOTE(review): fields such as item['tuition_fee'] are printed below
        # even when never assigned here, so get_item1 presumably pre-populates
        # every field with a default -- confirm against its definition.
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.salford.ac.uk/"
        item['university'] = "University of Salford"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree type
        item['degree_type'] = 2
        item['location'] = 'The Crescent, Salford, M5 4WT, UK'
        print("===========================")
        print(response.url)
        try:
            # Programme name and degree name
            programme = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h2//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            # //div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/p
            department = response.xpath(
                "//strong[contains(text(), 'School -')]/../text()|"
                "//p[contains(text(),'This course is a collaboration between the followi')]/../following-sibling::*[1]//text()"
            ).extract()
            # Return value is ignored here and below, so clear_space
            # presumably cleans the list in place -- TODO confirm.
            clear_space(department)
            item['department'] = ', '.join(department).replace(
                ', , ', ', ').strip().strip(',').strip()
            # "***" is a console marker for pages where no department was found.
            if item['department'] == "":
                print("***")
            print("item['department']: ", item['department'])

            start_date = response.xpath(
                "//strong[contains(text(), 'Start Date(s):')]/../text()"
            ).extract()
            clear_space(start_date)
            print("start_date: ", start_date)
            start_date = ''.join(start_date)
            # Several start dates are separated by ";"; each one is converted
            # by getStartDate and the results are joined with commas.
            if ";" in start_date:
                start_date_list = start_date.split(";")
                print(start_date_list)
                for s in start_date_list:
                    # NOTE(review): "+=" relies on item['start_date'] already
                    # holding a string -- presumably set by get_item1; confirm.
                    item['start_date'] += getStartDate(s.strip().lower()) + ","
            else:
                item['start_date'] = getStartDate(''.join(start_date).lower())
            # Drop the trailing comma left by the loop above.
            item['start_date'] = item['start_date'].strip().strip(",").strip()
            print("item['start_date']: ", item['start_date'])

            duration = response.xpath(
                "//strong[contains(text(), 'Duration')]/../following-sibling::*[position()<3]//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            # Duration fields are only set when getIntDuration returns exactly
            # two entries; the first goes to 'duration', the last to
            # 'duration_per'.
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['teach_time'] = ", item['teach_time'])
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            # //strong[contains(text(), 'Fees')]/../following-sibling::p[contains(text(), 'International -')]
            tuition_fee = response.xpath(
                "//strong[contains(text(), 'Fees')]/../following-sibling::p[contains(text(), 'International')]//text()"
            ).extract()
            clear_space(tuition_fee)
            print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            # All matched amounts are concatenated into one string before
            # being handed to getTuition_fee.
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1]
            overview = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1] | //div[@id='content']/div[@class='row']/div[1]"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //section[@id='about']/div[@id='content']
            modules_en = response.xpath(
                "//div[@id='courseaccordion']").extract()
            # Fall back to everything after the "Course Details" heading when
            # the accordion container is absent.
            if len(modules_en) == 0:
                # print("********")
                modules_en = response.xpath(
                    "//h2[contains(text(),'Course Details')]/following-sibling::*"
                ).extract()
            item['modules_en'] = remove_class(
                clear_lianxu_space(modules_en))  # .replace("&nbsp;", "")
            # NOTE(review): the text is encoded to UTF-8 bytes and decoded
            # with the unicode-escape codec, which interprets bytes as Latin-1
            # and backslash sequences as escapes; non-ASCII characters may be
            # altered and a stray backslash can raise -- confirm this is
            # intended. The replaced character is whitespace as written in the
            # source; verify which whitespace character it is.
            item['modules_en'] = item['modules_en'].encode('utf-8').decode(
                "unicode-escape").replace(" ", "")
            # print("item['modules_en']: ", item['modules_en'])

            # //section[@id='requirements']/div
            entry_requirements = response.xpath(
                "//section[@id='requirements']/div//text()").extract()
            # NOTE(review): the key is spelled 'rntry_requirements' --
            # presumably it matches the item field declaration; do not
            # "correct" it here alone.
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            # Application documents
            apply_documents_en = response.xpath(
                "//h3[contains(text(),'Applicant profile')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['apply_documents_en'] = remove_class(
                clear_lianxu_space(apply_documents_en))
            # print("item['apply_documents_en']: ", item['apply_documents_en'])

            # //h3[contains(text(),'English Language Requirements')]/following-sibling::*[1]
            ielts_desc = response.xpath(
                "//h3[contains(text(),'English Language Requirements')]/following-sibling::*[position()<3]//text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).replace(
                "Suitable For", "").strip()
            # print("item['ielts_desc']: ",item['ielts_desc'])

            # get_ielts returns a mapping read with the keys IELTS, IELTS_L,
            # IELTS_S, IELTS_R and IELTS_W; missing keys yield None.
            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # Hard-coded override of the listening/speaking/reading scores for
            # this one course page.
            if item['url'] == "https://www.salford.ac.uk/pgt-courses/journalism-news-broadcast-sport":
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 6.0
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # //section[@id='teaching']/div[@class='container main']/div[@class='col-md-12']/div[@id='teaching_0a19']
            assessment_en = response.xpath(
                "//section[@id='teaching']/div").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            # //section[@id='employability']/div[@class='container main']/div[@class='col-md-12']/div[@id='employ_0a19']
            career = response.xpath(
                "//section[@id='employability']/div").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # The application-process text is not scraped from the page: a
            # fixed HTML snippet is run through the same cleaning helpers.
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div id="content_div_43743">
<h1>How to apply for a postgraduate taught degree</h1><p>You should complete your application online. Click the button below to get started. There is plenty of helpful information throughout the application process.</p><p>If you have all your supporting documents ready, it will only take about 20 minutes to complete the process. However, you can save your application at any stage and come back to it as many times as you like.</p>
</div>

<div id="new_content_container_1410668">
<div class="moneybox" id="new_div_48503">
<p><a href="http://webapps.ascentone.com/login.aspx?key=5D4B012A-BB6C-495B-B2E4-B5A56B3CCF00" class="btn btn-primary btn-large">Apply online here</a></p>
</div>
</div>

<div id="new_content_container_1410670">

</div>

<div id="new_div_48505">
<h2>What documents will I need?</h2><p>To complete the application process, you will need to upload scanned copies of your supporting documents. These documents vary from course to course, but usually include:</p><ul><li>One reference&nbsp;</li> <li>Transcripts or certificates demonstrating that you meet, or are likely to meet, the entry requirements for your course&nbsp;&nbsp;</li> <li>Evidence, <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0018/104841/18-02-23-Vouch-List-Equivalent-qualifications-to-English-GCSE-Grade-C.pdf">if English is not your first language, that your command of English meets the standards required for postgraduate study</a> (an IELTS score of 6.5, or the equivalent, is the norm)&nbsp;&nbsp;</li> <li>A copy of your passport, if you are coming to us from outside the EU and will <a href="http://www.advice.salford.ac.uk/page/visa">require a student visa</a>.&nbsp;&nbsp;</li> <li>If you are applying for Applied Social Work Practice (MSc, PgDip or PgCert)&nbsp;you will also need to complete the <a href="http://www.salford.ac.uk/__data/assets/word_doc/0010/448768/Agency-Agreement.docx">Agency Sponsorship Form</a> and send it to <a href="mailto:[email protected]">[email protected]</a></li> <li>For the MA courses in Media Production you will be required to submit a project proposal related to your chosen specialist field, to support your application.&nbsp;&nbsp;A brief written synopsis (max. 
500 words) of your ideas would also be required.&nbsp;&nbsp;Please note that this would be for discussion&nbsp;&nbsp;&nbsp;&nbsp;purposes at the interview only.&nbsp;&nbsp;</li></ul><p>You must ensure that you upload all the documents that are needed to support your application.&nbsp;&nbsp;If you do not provide us with the information we require to make a complete assessment your application this will delay our response to you.</p><h2>What if my documents aren't ready?</h2><p>If you have not yet finished a course, if you are currently studying towards a qualification and receive a conditional offer from us, once you have taken your exams, please ensure that you send copies of your transcripts and certificates to us as soon as possible to allow us to update your admission&nbsp;&nbsp;record.</p><p>Once you have completed your application form and submitted it, you will receive an email from us acknowledging receipt of your application. We aim to consider your application as soon as we can but this can vary depending on whether you are required to attend an interview.</p><h2>Deadlines</h2><p>Postgraduate courses may start at varying times throughout the year. 
You should&nbsp;&nbsp;submit your application at least one month prior to your chosen course starting date.</p><h2>Course application exceptions</h2><div><p>Applications that are&nbsp;<strong>an exception</strong> to our online application process are:&nbsp;&nbsp;</p> <ul><li><a href="http://www.ucas.com"><strong>MA Social Work full-time study via UCAS</strong></a></li> <li><strong><a href="http://www.salford.ac.uk/study/postgraduate/applying/applying-for-taught-courses/post-qualifying-applications-pg">Post qualifying Health and Social Care single modules</a></strong></li> <li><a href="http://www.unigis.org/uk-courses-introduction/uk-courses-how-apply"><strong>Geographical Information Systems are via our partners for this course Manchester Metropolitan University</strong></a></li> </ul></div><div><h2>Policy statement on equality and diversity&nbsp;&nbsp;</h2></div><p><a href="http://www.salford.ac.uk/study/postgraduate/applying/policy-statement-on-equality-and-diversity">Read our policy statement on equality and diversity</a></p>
</div>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            # Entry requirements for Chinese applicants are likewise a fixed
            # snippet rather than scraped content.
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Postgraduate</strong></p><p>(4 year) Bachelor degrees with a GPA 2.7/4.0 or 70% from a National University; or from a Project 211 University with a GPA 2.6/4.0 or 65%; or from a Private University with GPA 2.75/4.0 or 75%.</p>"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])
            yield item
        except Exception as e:
            # Best-effort error log: append the exception text and the failing
            # URL to a per-university text file, then echo both to stdout.
            # The item is not yielded in this case.
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #4
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.aston.ac.uk/"
        item['university'] = "Aston University"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item['teach_time'] = 'fulltime'
        item['location'] = "Aston University,Birmingham, B4 7ET"
        print("======================================")
        print(response.url)
        try:
            programmeDegreetype = response.xpath(
                "//h1[@id='skiplinks']//text()").extract()
            programmeDegreetypeStr = ''.join(programmeDegreetype)
            # print(programmeDegreetypeStr)
            degree_type = re.findall(r"^\w+\s", programmeDegreetypeStr)
            # print("degree_type = ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            programme = programmeDegreetypeStr.replace(''.join(degree_type),
                                                       "").strip()
            item['programme_en'] = ''.join(programme).strip().strip(
                "in").strip()
            print("item['degree_name']: ", item['degree_name'])
            print("item['programme_en']: ", item['programme_en'])

            overview = response.xpath(
                "//*[contains(text(), 'Course outline')]/../../../../../../div/following-sibling::div[1]|"
                "//*[contains(text(), 'Course Outline')]/../../../../../div/../following-sibling::div[1]//*[contains(text(), 'Modules')]/../preceding-sibling::*|"
                "//*[contains(text(),'Course Outline')]/../../../../../following-sibling::div[1]//*[contains(text(),'Sample module options')]/../preceding-sibling::*|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Sample module options')]/../preceding-sibling::*|"
                "//*[contains(text(), 'Course Outline')]/../../../../../div/../following-sibling::div[1]//*[contains(text(), 'Sample module options')]/../../preceding-sibling::*|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Sample module options')]/..|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Core modules:')]/../preceding-sibling::*|"
                "//strong[contains(text(),'Courses')]/../../following-sibling::div[1]|"
                "//*[contains(text(), 'Programme outline and modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Modules')]/..|"
                "//*[contains(text(),'Course Outline')]/../../../../../following-sibling::div[1]//*[contains(text(),'Sample Module Options')]/../preceding-sibling::*|"
                "//*[contains(text(),'Course Outline & Modules')]/../../../../../following-sibling::div[1]//*[contains(text(),'Modules')]/preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            modules_en = response.xpath(
                "//*[contains(text(),'modules:')]/../..").extract()
            if len(modules_en) == 0:
                modules_en = response.xpath(
                    "//*[contains(text(),'Modules')]/../../..").extract()
                if len(modules_en) == 0:
                    modules_en = response.xpath(
                        "//*[contains(text(),'Modules')]/../..").extract()
                    if len(modules_en) == 0:
                        modules_en = response.xpath(
                            "//*[contains(text(),'Modules')]/..").extract()
                        if len(modules_en) == 0:
                            modules_en = response.xpath(
                                "//*[contains(text(),'What you will study')]/../../../../../following-sibling::*"
                            ).extract()
                            if len(modules_en) == 0:
                                modules_en = response.xpath(
                                    "//*[contains(text(), 'Subject guide and modules')]/../../../../../../div/following-sibling::div[1]"
                                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            # print("item['modules_en'] = ", item['modules_en'])

            career_en = response.xpath(
                "//*[contains(text(),'Your future career prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Your future career opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional development programme')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career Prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional Development Programme')]/../../../../div/following-sibling::*|"
                # "//*[contains(text(),'Professional Development Programme')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career Opportunities')]/../../../../../following-sibling::*"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en'] = ", item['career_en'])

            assessment_en = response.xpath(
                "//*[contains(text(),'Learning, teaching & assessment')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, Teaching & Assessment')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, Teaching and Assessment')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, teaching and assessment')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, teaching and assessments')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Learning, teaching & assesment')]/../../../../../following-sibling::*"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en'] = ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//*[contains(text(),'Fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'fees')]/../../../../../following-sibling::*//text()"
            ).extract()
            if len(tuition_fee) == 0:
                tuition_fee = response.xpath(
                    "//strong[contains(text(),'Fees:')]/../following-sibling::*[1]//text()"
                ).extract()
            clear_space(tuition_fee)
            tuition_fee_str = ''.join(tuition_fee)
            # print("tuition_fee_str: ", tuition_fee_str)
            tuition_fee_re = re.findall(
                r"International.*?£\d+,\d+|non-EU.*?£\d+,\d+|MSc.*?£\d+,\d+|entry:£\d+,\d+|2018/2019:£\d+,\d+|£\d+,\d+\sfor\sOutside\sEU",
                tuition_fee_str, re.I)
            # print(tuition_fee_re)
            if len(tuition_fee_re) != 0:
                t = re.findall(r"\d+,\d+", ''.join(tuition_fee_re))
                # item['tuition_fee'] = int(''.join(t).replace(",", "").strip())
                # print("item['tuition_fee']1 = ", item['tuition_fee'])
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            rntry_requirements = response.xpath(
                "//*[contains(text(),'Entry requirements & fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry Requirements & Fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Key information for applicants & entry requirements')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry requirements')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry Requirements')]/../../../../../following-sibling::*//text()"
            ).extract()
            start_date = rntry_requirements
            item['rntry_requirements'] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            duration = response.xpath(
                "//*[contains(text(),'Duration')]/following-sibling::*//text()|"
                "//*[contains(text(),'Duration')]/..//text()").extract()
            if len(duration) == 0:
                duration = response.xpath(
                    "//*[contains(text(),'Duration of course')]/../following-sibling::*[1]//text()"
                ).extract()
            clear_space(duration)
            duration_str = ''.join(duration)
            # print("duration_str: ", duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # start_date = response.xpath(
            # "//*[contains(text(),'Start date')]/following-sibling::*//text()|"
            # "//*[contains(text(),'Start date')]/..//text()|"
            # "//*[contains(text(),'Start Date')]/following-sibling::*//text()|"
            # "//*[contains(text(),'Start Date')]/..//text()|"
            # "//*[contains(text(),'Start')]/../..//text()").extract()
            # if len(start_date) == 0:
            #     start_date = response.xpath(
            #         "//*[contains(text(),'Duration of course')]/../following-sibling::*[1]//text()").extract()
            clear_space(start_date)
            start_date_str = '; '.join(start_date)
            # print("start_date_str: ", start_date_str)
            start_date_re = re.findall(r'Start.{1,25}', start_date_str)
            # print("start_date_re", start_date_re)
            item['start_date'] = getStartDate(''.join(start_date_re))
            # print("item['start_date']: ", item['start_date'])

            # ielts_desc = ' '.join(start_date)
            # ielts_desc = re.findall(r'.{1,80}IELTS.{1,80}', ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            allcontent = response.xpath(
                "//div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-sigma']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-rho']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-delta']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-sigma'][2]//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-upsilon']//text()"
            ).extract()
            clear_space(allcontent)
            department_1 = response.xpath(
                "//a[@href='/study/postgraduate/taught-programmes/abs/']//text()"
            ).extract()
            # print(department_1)
            if len(department_1) > 0:
                item['department'] = ''.join(department_1[0]).strip()
            department_re = re.findall(
                r"Life\s&\sHealth\sSciences\s-\sOSPAP|Aston\sBusiness\sSchool|Engineering\s&\sApplied\sScience|Languages\s&\sSocial\sSciences|Life\s&\sHealth\sSciences",
                ''.join(allcontent))
            # print("department_re: ", department_re)
            if item['department'] == "":
                if len(department_re) > 0:
                    item['department'] = ''.join(department_re[0]).strip()
            print("item['department']: ", item['department'])

            # Aston Business School
            de_1 = [
                "full time mba",
                "executive mba - part time",
                "online mba",
                "the executive dba",
                "phd programme",
                "msc business analytics",
                "msc business & management",
                "msc business & management (online)",
                "msc information systems & business analysis",
                "msc supply chain management",
                "msc international business",
                "msc international accounting & finance",
                "msc international accounting & finance (online)",
                "msc strategy and international business",
                "msc entrepreneurship",
                "msc accounting & finance",
                "msc business economics & finance",
                "msc finance",
                "msc international accounting & finance",
                "msc international accounting & finance (online)",
                "msc investment analysis",
                "msc strategic marketing management ",
                "msc human resource management & business",
                "msc organisational behaviour",
                "msc work psychology & business",
                "international pre-masters",
            ]
            #Engineering & Applied Science
            de_2 = [
                "msc professional engineering",
                "msc computer science",
                "msc software engineering ",
                "msc software project management",
                "msc professional engineering",
                "msc electrical power engineering and systems ",
                "msc telecommunications systems",
                "msc wireless communications and networking",
                "msc smart telecom and sensing networks (smartnet)",
                "msc photonic integrated circuits, sensors and networks (pixnet)",
                "msc professional engineering",
                "msc engineering management",
                "msc supply chain management",
                "msc engineering leadership & management",
                "msc supply chain leadership and management",
                "msc professional engineering",
                "msc mechanical engineering ",
                "msc product design ",
                "msc professional engineering",
            ]
            #Languages & Social Sciences
            de_3 = [
                "ma in forensic linguistics",
                "ma in the european union & international relations",
                "joint ma in multilevel governance & international relations",
                "double ma in europe & the world",
                "double ma in governance and international politics",
                "ma in international relations and global governance",
                "ma in sociology and social research",
                "ma in policy and social research",
                "ma in teaching english to speakers of other languages (tesol)",
                "ma in tesol and translation studies",
                "ma in tesol and translation studies",
                "ma in translation in a european context",
            ]
            # Life & Health Sciences
            de_4 = [
                "advanced hearing therapy practice - msc",
                "clinical science (neurosensory sciences) - msc",
                "doctor of hearing therapy - professional doctorate",
                "biomedical science - msc",
                "biomedical sciences top modules - all standalone modules",
                "stem cells and regenerative medicine - msc",
                "clinical neurophysiology practice - msc",
                "clinical science (neurosensory sciences) - msc",
                "neurophysiology - pgcert",
                "clinical science (neurosensory sciences) - msc",
                "doctor of optometry / doctor of ophthalmic science - professional doctorate",
                "graduate diploma in optometry - graduate diploma",
                "independent prescribing for optometrists - professional accreditation",
                "optometry / ophthalmic science - msc",
                "overseas pharmacists course (ospap) - full time pgdip / msc",
                "pharmacist independent prescribing - pgcert",
                "pharmacy (includes: msc pharmaceutical sciences, msc drug delivery, and msc pharmacokinetics) – msc",
                "psychiatric pharmacy by distance learning and practice - pgdip",
                "psychiatric pharmacy practice - msc",
                "psychiatric therapeutics by distance learning - pgcert",
                "cognitive neuroscience - msc",
                "health psychology (online) - msc",
                "health psychology (on campus) - msc",
            ]
            if item['department'] == "":
                for de1 in de_1:
                    if item['programme_en'] == de1:
                        item['department'] = "Aston Business School"
                        break
            if item['department'] == "":
                for de2 in de_2:
                    if item['programme_en'] == de2:
                        item['department'] = "Engineering & Applied Science"
                        break
            if item['department'] == "":
                for de3 in de_3:
                    if item['programme_en'] == de3:
                        item['department'] = "Languages & Social Sciences"
                        break
            if item['department'] == "":
                for de4 in de_4:
                    if item['programme_en'] == de4:
                        item['department'] = " Life & Health Sciences"
                        break
            print("item['department']1: ", item['department'])
            if 'business' in item['programme_en'].lower():
                item['department'] = "Aston Business School"
            if 'electrical' in item['programme_en'].lower(
            ) or 'engineering' in item['programme_en'].lower():
                item['department'] = "Engineering & Applied Science"
            if item['department'] == "Life & Health Sciences - OSPAP":
                item['ielts'] = 7
                item['ielts_l'] = 7
                item['ielts_s'] = 7
                item['ielts_r'] = 7
                item['ielts_w'] = 7
            elif 'electrical' in item['programme_en'].lower(
            ) or 'engineering' in item['programme_en'].lower():
                item['ielts'] = 6
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            else:
                item['ielts'] = 6.5
                item['ielts_l'] = 6
                item['ielts_s'] = 6
                item['ielts_r'] = 6
                item['ielts_w'] = 6
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            if item['department'] == "Life & Health Sciences - OSPAP":
                item['toefl'] = 101
                item['toefl_l'] = 26
                item['toefl_r'] = 26
                item['toefl_s'] = 23
                item['toefl_w'] = 28
            else:
                item['toefl'] = 93
                item['toefl_l'] = 19
                item['toefl_r'] = 18
                item['toefl_s'] = 19
                item['toefl_w'] = 23
            print(
                "item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s "
                % (item['toefl'], item['toefl_l'], item['toefl_s'],
                   item['toefl_r'], item['toefl_w']))
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Postgraduate - Taught<span style="line-height: 1.4em; font-size: 16px"> </span></h3> <p>You should have a Bachelors degree from an Chinese university but the specific percentage requirement will vary depending on the course you are applying for at Aston and the Chinese university which you have graduated from. In general applicants should be scoring in the range of 75-85% average as a minimum.  <br />     <br />If you are applying for finance, engineering or science based subjects, you must have studied a similar field in your undergraduate degree.<span style="line-height: 1.4em"> </span></p> <p>There are a number of conversion courses in the Business School which will accept students from any subject background.  <span style="line-height: 1.4em"> </span></p>"""
                ]))
            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #5
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Chester"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath("//h1[@id='main-content']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath("//h1[@id='main-content']/div//text()").extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name']: ", item['degree_name'])

            if "doctor of" in item['programme_en'].lower() or item['degree_name'].lower() == "mres":
                item['teach_type'] = 'phd'
                item['degree_type'] = 3
            print("item['teach_type']: ", item['teach_type'])
            print("item['degree_type']: ", item['degree_type'])

            start_date = response.xpath("//span[@class='m-facts__fact']//text()|"
                                        "//select[@id='edit-date']//option[@selected='selected']//text()").extract()
            clear_space(start_date)
            print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            print("item['start_date']: ", item['start_date'])

            mode = response.xpath("//select[@id='edit-mode']//text()").extract()
            clear_space(mode)
            item['teach_time'] = getTeachTime(''.join(mode))
            print("item['teach_time']: ", item['teach_time'])

            location = response.xpath("//label[@for='edit-compulsory']/following-sibling::*//text()").extract()
            item['location'] = ''.join(location).strip()
            print("item['location']: ", item['location'])

            duration = response.xpath("//dt[@class='m-facts__label']//following-sibling::*//text()").extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//h3[@class='field-label'][contains(text(),'Course overview')]/../*[position()<last()]|"
                "//div[@class='m-body__margin-bottom t-course__overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            entry_requirements = response.xpath("//div[@id='entry-international']//form[@id='courses-international-form']/preceding-sibling::*//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath("//div[@id='entry-international']//li[contains(text(),'Postgraduate:')]//text()").extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            assessment_en = response.xpath("//h3[@class='field-label'][contains(text(),'How will I be taught?')]/..|"
                                           "//h3[@class='field-label'][contains(text(),'How will I be assessed?')]/..").extract()
            item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath("//div[@class='field-fees-international']/p//text()").extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            career_en = response.xpath("//div[@id='careers-job-prospects']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en)).replace("<div></div>", "").strip()
            # print("item['career_en']: ", item['career_en'])

            modules = re.findall(r"function\sinit_drupal_core_settings\(\)\s{jQuery\.extend\(Drupal\.settings,.*}", response.text)
            # print("modules: ", modules)
            modules_str = ''.join(modules).replace("function init_drupal_core_settings() {jQuery.extend(Drupal.settings,", "").strip()
            modules_dict = json.loads(modules_str)
            print("modules_dict: ", modules_dict)
            # groupCode     modulesNid
            print(modules_dict.get("courses"))
            # if modules_dict.get('courses').get('groupCode') is not False:
            if modules_dict.get('courses').get('groupCode') is not None:
                modules_json = "https://www1.chester.ac.uk/courses/modules/ajax/"+modules_dict.get('courses').get('modulesNid')+"/"+modules_dict.get('courses').get('groupCode')+"/389"
                # print("modules_json: ", modules_json)
                mdict = json.loads(requests.get(modules_json).text)
                # print("mdict: ", len(mdict))
                m = mdict[-1].get('data')
                if m != None:
                    item['modules_en'] = remove_class(clear_lianxu_space([m]))
            print("item['modules_en']: ", item['modules_en'])

            item['apply_proces_en'] = remove_class(clear_lianxu_space(["""<div class="content">
    
  <h2>Before You Apply</h2>
<p>Please read the relevant course information carefully. If you would like to know more about a programme or research area, we suggest that you contact the programme leader or centre director, in writing, by telephone or by&nbsp;<a href="mailto:[email protected]">email</a>&nbsp;via Postgraduate Admissions. They will be able to answer your questions in more detail and send you further information.</p>
<h2>How to Apply</h2>
<p>If you are a Home/EU student applying for a postgraduate taught course, you should apply directly via the online application system (AIMS) via the link below.&nbsp; If you are an International student applying for a postgraduate taught course, you should apply via the <a href="http://www1.chester.ac.uk/study/postgraduate/how-apply/applying-taught-courses-international-applicants">International Centre</a>. If you are applying for a PGCE Primary, Secondary or Early Years programmes, please note there is a separate admissions&nbsp;procedure.&nbsp; Please email <a href="mailto:[email protected]">[email protected]</a> and we will forward your details on to PGCE Admissions. All&nbsp;<a href="http://www.chester.ac.uk/research/degrees/application">research degree</a>&nbsp;applicants,&nbsp;whether Home, EU or International, should visit the relevant web pages or follow the links on the right hand side of this page.</p>
<p>Paper application forms are no longer issued, except in cases where an online application would impossible for the candidate. Please complete the relevant online application on our website.&nbsp;</p>
<p>&nbsp;Once you have submitted your application, the system will automatically contact your referees on your behalf. Your application may not be considered without two appropriate references and all additional documents required with your application, which include:&nbsp;</p>
<ul>
<li>Copies of certificates/transcripts</li>
<li>Copy of English language proficiency certificate (if required).&nbsp;<strong>Applicants whose first language is not English must provide evidence of proficiency to IELTS 6.5 with no less than 5.5 in each band or equivalent.</strong></li>
<li>Full curriculum vitae (if required)</li>
<li>You may also be asked to complete a fees assessment in order to determine the level of tuition fee payable.</li>
</ul>
<p>Specific programmes require additional documents to be submitted with your application, e.g. Nutrition and Dietetics, Fine Art.</p>
<p>Before doing so, please ensure that you inform your referees. In most cases the references shall come from independent academic referees, i.e. they are not normally provided by the programme leader of the course you are applying for. Once your application is submitted, we will then forward it to the relevant programme leader for consideration. If your application is successful, an offer of a place will be made in writing by Postgraduate Admissions. This will either be unconditional or conditional, depending on the completeness of your application.</p>
<h2>Entry Requirements</h2>
<p>Usually, postgraduate applicants should have an appropriate first degree, with a minimum of second class honours or equivalent. However, if you do not have appropriate academic qualifications, you may be admitted by virtue of prior work experience or by demonstrating relevant knowledge and skills in a specific field. If you are unsure whether your qualifications are acceptable for admission to your chosen programme of study, contact the programme leader or Postgraduate Admissions for further advice.&nbsp;</p>
<p>If your qualifications or experience are not suitable, we will be able to advise you about further options that might bring you up to the required level necessary to enter the course of your choice.</p>
<p>Each course has its own entry requirements, which are shown on each individual course web page under the 'Entry requirements' tab.</p>
<p>For entry requirements relating to our PGCE<a href="/postgraduate/pgce-in-education-primary" title="PG Primary">&nbsp;Primary</a>,&nbsp;<a href="/postgraduate/pgce-secondary-programme" title="PG Secondary">Secondary&nbsp;</a>and&nbsp;<a href="/postgraduate/pgce-early-years" title="PG Early Years">Early Years</a>&nbsp;courses please refer to the relevant pages.</p>
<p>If you have any queries concerning the applications process please contact us at:</p>
<h4>T: 01244 512456/512474<br />
E:&nbsp;<a href="mailto:[email protected]" title="Postgraduate Enquiries">[email protected]</a></h4>
<p>&nbsp;</p>
<h2>Accreditation of Prior Learning (APL/APEL)</h2>
<p>To be admitted to a postgraduate course, evidence of your prior learning should be equal to higher education Level 3, now referred to as level 6, which is the final year of an undergraduate degree course, or other equivalent, e.g. related professional qualifications. A subject tutor will help you to determine how much of your prior learning can be credited against the course. This may not have been undertaken in an educational environment, but its value may be the same, or more. Information about how this system works and how professional qualification equivalence is available can be obtained from the subject departments.</p>
<p>We may give credit for a course, or part of a course, that would exempt you from having to study that area again. The onus is on you to prove that your learning and experience matches the area for which exemption is claimed.</p>
<p>There may be subject areas for which course attendance is compulsory and credit exemption does not apply, but, equally, there may be areas of study for which credit may be gained purely on the basis of your prior academic achievements or experience.</p>
<p>It is possible to claim credit for up to 66.7% of any award. Please note that this does not apply to MPhil or PhD courses as they have their own process known as 'Advance Standing'. Please contact&nbsp;<a href="mailto:[email protected]">Postgraduate Research Admissions</a>&nbsp;for further details.</p>
<p>If you have any queries or would like to find out more about CATS or APL/APEL, please contact the APL Officer within the relevant faculty.</p>
<h2>When do the programmes start?</h2>
<p>The majority of postgraduate programmes commence in early October each year, although some allow students to enter in January/February or April/May. For specific start dates for your chosen programme, please consult the relevant section of the website, or contact Postgraduate Admissions, who will be able to help you.</p>
<h2>What is the deadline for applications?</h2>
<p>There are no specific deadlines for most applications made directly to us, although there are some exceptions (check your programme details). The University will accept applications throughout the year, but we would generally advise that you send in your application form by the end of July to ensure that you have time to make any funding and/or accommodation arrangements, and for documents such as transcripts and references to be obtained if not submitted with the application. This will also give you more time to meet any conditions we may potentially attach to an offer.&nbsp; Some courses have earlier application deadlines.&nbsp; Please check the deadline that applies to the programme you are interested in before you apply. There is a strict deadline for applications to Nutrition and Dietetics and Social Work. Please refer to the relevant course web pages.</p>
<p>The deadline for PGCE applications is set by the Graduate Teacher Training Register (GTTR).</p>
<h2>Students with Disabilities</h2>
<p>We are committed to a policy of equal opportunities for applicants with disabilities or specific needs. Although applications from all prospective students are considered according to the same entry criteria, those of you who declare a disability or specific need will also be considered on an individual basis. As some of our buildings are old and not purpose-built, they may not be suitable for those of you with restricted mobility.&nbsp;</p>
<p>However, we are continually working to improve access routes and other facilities on campus to assist physically disabled students during their programmes of study. Wherever possible, we try to make arrangements or adaptations as appropriate, within the existing restrictions placed upon us.</p>
<p><strong>Good luck with your application!</strong></p>
<p><a class="m-link m-link--primary" href="https://flow.chester.ac.uk/tkflow_U/Flow.aspx?f=appform1.kdt&amp;template=template5&amp;course=PGT&amp;theme=redmond">Apply Now</a></p>
<div class="m-callout">
<p>If you're interested in a course at University Centre Shrewsbury, <a href="http://ucshrewsbury.ac.uk/postgraduate/apply">find out more about the application process.</a></p>
</div>
  </div>
"""]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(clear_lianxu_space([""" <div class="field-collection-view clearfix view-mode-full">
  <h3 class="field-course-type">
    Postgraduate Study  </h3>

  <ul><li>Bachelor's degree with 68% or above</li>
<li>East and West International Education (EWIE)/ Wiseway Global International Pre-Masters Programme at 60% or above</li>
<li>Dongfang International Centre for Education Exchange Top University Pre-Masters Programme at 60% or above</li>
<li>Applicants for the MBA should have 2 years work experience, although well qualified and motivated individuals without this will be considered</li>
</ul></div>  <div class="field-collection-view clearfix view-mode-full field-collection-view-final">
  <p><strong>Academic Requirements:</strong></p>
<ul><li>Master's degree with a recognised institution</li>
</ul><p><strong>English Requirements:</strong></p>
<ul><li><strong>IELTS: 6.5 (no less than 5.5 in any band)</strong></li>
</ul></div>"""]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #6
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "Southampton Solent University"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            degree_name = response.xpath(
                "//div[@class='row column']/h1/abbr/text()").extract()
            item['degree_name'] = ''.join(degree_name).strip()
            print("item['degree_name']: ", item['degree_name'])

            programme = response.xpath(
                "//div[@class='row column']/h1/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # start_date = response.xpath("//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()").extract()
            # clear_space(start_date)
            # # print("start_date: ", start_date)
            # item['start_date'] = getStartDate(''.join(start_date))
            # # print("item['start_date']: ", item['start_date'])

            duration = response.xpath(
                "//div[@class='banner__stats']//text()").extract()
            clear_space(duration)
            # print("duration: ", ' '.join(duration))
            duration_list = getIntDuration(' '.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//section[@class='intro intro--courses section']").extract()
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview_en)).replace(
                    "Book an open day", "")
            # print("item['overview_en']: ", item['overview_en'])

            tuition_fee = response.xpath(
                "//html//div[@class='facts-figures__panel panel']/p[contains(text(),'The tuition fees for the 2018/19 academic year are:')]//text()|"
                "//html//div[@class='facts-figures__panel panel']/p[contains(text(),'The tuition fees for the 2018/19 academic year are:')]/following-sibling::p[1]//text()"
            ).extract()
            # if len(tuition_fee) == 0:
            #     tuition_fee = response.xpath(
            #         "//html//div[@class='facts-figures__panel panel']/p[contains(text(),'The tuition fees for the 2018/19 academic year are:')]/following-sibling::p[1]//text()").extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", ''.join(tuition_fee))
            tuition_fee_re = re.findall(
                r"International\sfull-time\sfees:£\d+,\d+|Internationalfull-timefees:£\d+,\d+|Internationaltotal\scoursefees:£\d+,\d+",
                ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            item['teach_time'] = getTeachTime(''.join(tuition_fee_re))
            # print("item['teach_time']: ", item['teach_time'])

            tuition_fee_re1 = re.findall(r"\d+,\d+", ''.join(tuition_fee_re))
            if len(tuition_fee_re1) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re1))
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            entry_requirements = response.xpath(
                "//h3[@class='facts-figures__header'][contains(text(),'Key entry requirements')]/../..//text()"
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//h3[contains(text(),'English language requirements')]/..//*[contains(text(), 'IELTS')]//text()"
            ).extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            if len(ielts_list) == 1:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[0]
                item['ielts_s'] = ielts_list[0]
                item['ielts_r'] = ielts_list[0]
                item['ielts_w'] = ielts_list[0]
            elif len(ielts_list) == 2:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[1]
                item['ielts_s'] = ielts_list[1]
                item['ielts_r'] = ielts_list[1]
                item['ielts_w'] = ielts_list[1]
            elif len(ielts_list) == 3:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[2]
                item['ielts_s'] = ielts_list[2]
                item['ielts_r'] = ielts_list[2]
                item['ielts_w'] = ielts_list[1]
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            toefl_desc = response.xpath(
                "//h3[contains(text(),'English language requirements')]/..//*[contains(text(), 'TOEFL')]//text()"
            ).extract()
            clear_space(toefl_desc)
            # print("ielts_desc: ", ielts_desc)
            item['toefl_desc'] = ''.join(toefl_desc)
            # print("item['toefl_desc']: ", item['toefl_desc'])

            toefl_list = re.findall(r"\d\d+", item['toefl_desc'])
            if len(toefl_list) == 1:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[0]
                item['toefl_r'] = toefl_list[0]
                item['toefl_s'] = toefl_list[0]
                item['toefl_w'] = toefl_list[0]
            elif len(toefl_list) == 2:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[1]
                item['toefl_s'] = toefl_list[1]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 3:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[2]
                item['toefl_r'] = toefl_list[2]
                item['toefl_s'] = toefl_list[2]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 5:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[2]
                item['toefl_s'] = toefl_list[3]
                item['toefl_w'] = toefl_list[4]
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #        item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            assessment_en = response.xpath(
                "//*[contains(text(),'Teaching')]/..|//*[contains(text(),'Assessment')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            if item['assessment_en'] == "":
                print("***")
            print("item['assessment_en']: ", item['assessment_en'])

            work_experience_desc_en = response.xpath(
                "//*[contains(text(),'Work experience')]/..").extract()
            item['work_experience_desc_en'] = remove_class(
                clear_lianxu_space(work_experience_desc_en))
            # print("item['work_experience_desc_en']: ", item['work_experience_desc_en'])

            how_to_apply = response.xpath(
                "//h3[@class='subheader']/..").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            location = response.xpath(
                "//h4[contains(text(),'Study location')]/following-sibling::*[1]//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            modules = response.xpath(
                "//a[contains(text(),'Programme specification document')]/../../preceding-sibling::*"
            ).extract()
            if len(modules) == 0:
                # //h2[contains(text(),'Support')]/../../preceding-sibling::*
                modules = response.xpath(
                    "//h2[contains(text(),'Support')]/../../preceding-sibling::*"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # if item['modules_en'] == "":
            #     print("***")
            # print("item['modules_en']: ", item['modules_en'])

            career_en = response.xpath(
                "//h2[@class='header'][contains(text(),'Industry links')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en']: ", item['career_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Entry requirements</h2>
<p>As a general guide, we look for qualifications that are equivalent to the British high school A-levels. A portfolio is also required for most of our art and design courses.</p>
<p>Students with a good Senior High School Diploma and an IELTS of minimum 5.5 may be eligible for a foundation year (level 0 of a bachelor's degree) or an HND programme.</p>
<p>For postgraduate courses, we look for qualifications that are equivalent to the British&nbsp;bachelor's degree.</p>
"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #7
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Reading"
        # item['country'] = 'England'
        # item['website'] = 'http://www.reading.ac.uk/'
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型、ucas_code
            programmeDegree_typeUcascode = response.xpath(
                "//span[@class='text-bg-standout text-nice-wrap']/text() | //h1[@id='heading']//text() | //h1[@class='hero-heading']//text() | //h1[@class='block-heading block-heading-l5 block-heading-b5 block-heading-md-l-reset cell-md-t0']//text()"
            ).extract()
            clear_space(programmeDegree_typeUcascode)
            programmeDegree_typeUcascode = ''.join(
                programmeDegree_typeUcascode).strip()
            # print("programmeDegree_typeUcascode: ", programmeDegree_typeUcascode)

            degree_type = re.findall(r"^\w+/\w+", programmeDegree_typeUcascode)
            if len(degree_type) == 0:
                degree_type = re.findall(r"^\w+", programmeDegree_typeUcascode)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            programme = programmeDegree_typeUcascode.replace(
                item['degree_name'], '').strip()
            item['programme_en'] = programme.title()
            # print("item['programme_en']: ", item['programme_en'])

            # duration
            durationMode = response.xpath(
                "//h2[@class='row-margin-small text-weight-medium text-size-25']/text() | //strong[contains(text(),'Duration')]/../text() | //h3[contains(text(),'Programme length:')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(durationMode)
            # print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            # if ":" in durationMode:
            #     duration = durationMode.split(":")[-1].strip()
            #     mode = durationMode.split(":")[0].strip()
            #     item['duration'] = duration
            duration_list = getIntDuration(''.join(durationMode))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            item['teach_time'] = getTeachTime(''.join(durationMode))
            # print("item['duration']: ", item['duration'])
            # print("item['teach_time']: ", item['teach_time'])
            # print("item['duration_per']: ", item['duration_per'])

            start_date = response.xpath(
                "//p[@class='headline'][contains(text(), 'Start date')]//text()"
            ).extract()
            # print(start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date']: ", item['start_date'])

            overview2 = response.xpath(
                "//div[@class='m-bg-white m-pad-around m-pull-left-normal m-pull-up']//div[@class='theme-editor'] | //div[@id='top-courseOverview'] | //html//div[@id='top-programmeOverview']/h2[1]/following-sibling::div[1] | //div[@id='tc1']"
            ).extract()
            overview = remove_class(clear_lianxu_space(overview2))
            item['overview_en'] = overview
            print("item['overview_en']: ", item['overview_en'])

            # department
            department = response.xpath(
                "//article[@class='pad-around bg-white']//div[@class='theme-editor']//a//text()|//p[@class='paddingtop22 nopaddingbottom']//a//text()|//a[@class='navbar-brand navbar-brand-hbs']//text()"
            ).extract()
            clear_space(department)
            if department == "":
                item['department'] = response.meta.get('department')
            else:
                item['department'] = ', '.join(department).strip()
            item['department'] = item['department'].replace("How to apply", "")
            # print("item['department']: ", item['department'])

            item[
                'location'] = "Whiteknights,PO Box 217,Reading, Berkshire,RG6 6AH"
            # //h2[@id='Panel1Trigger']/../..
            entry_requirements = response.xpath(
                "//h2[@id='Panel1Trigger']/../..//text()|//div[@id='bottom-entryRequirements']/..//text()|//div[@id='tc5']//text()"
            ).extract()
            if len(entry_requirements) == 0:
                entry_requirements = response.xpath(
                    "//h4[contains(text(),'Entry requirements:')]/preceding-sibling::*[1]/following-sibling::*[position()<4]//text()"
                ).extract()
            clear_space(entry_requirements)
            entry = ''.join(entry_requirements)
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            if item['rntry_requirements'] == "":
                print("rntry_requirements 为空")
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = re.findall(r"IELT.{1,100}", entry)
            # ielts = response.xpath(
            #     "//strong[contains(text(),'IELTS')]/..//text()").extract()
            # # if item['ielts_desc'] == "":
            clear_space(ielts)
            item['ielts_desc'] = ''.join(ielts).strip()
            # if item['ielts_desc'] == "":
            #     print("ielts_desc 为空")
            # print("item['ielts_desc']1: ", item['ielts_desc'])
            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            toefl = re.findall(r"TOEFL[\s\(\)\w:\.]{1,300}", entry)
            # print(ielts)
            if item['toefl_desc'] == "":
                item['toefl_desc'] = ''.join(toefl)
            # print("item['toefl_desc']: ", item['toefl_desc'])
            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # //h2[@id='Panel1Trigger']/../..
            modules = response.xpath(
                "//h2[@id='Panel2Trigger']/../..|//div[@id='bottom-courseContent']/..|//div[@id='page_content_wrap']/following-sibling::div[position()<3]|//strong[contains(text(),'Programme structure')]/../following-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//h4[contains(text(),'Programme structure and content')]/preceding-sibling::*[1]/following-sibling::*[position()<11]"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            # //h2[@id='Panel1Trigger']/../..
            career = response.xpath(
                "//h2[@id='Panel4Trigger']/../following-sibling::div[1]|//div[@id='bottom-careers']/..|//div[@id='careers']|//h3[contains(text(),'Careers')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en']: ", item['career_en'])

            # //h3[@class='row-margin-small text-weight-medium'][contains(text(),'How much will it cost?')]/following-sibling::p[2]
            tuition_fee = response.xpath(
                "//h3[@class='row-margin-small text-weight-medium'][contains(text(),'How much will it cost?')]/following-sibling::p[2]//text()|"
                "//html//div[@id='bottom-feesFunding']//tr[2]/td[3]//text()|"
                "//html//div[@id='bottom-feesFunding']//tr[2]/td[2]//text()|"
                "//html//div[@id='tc2']//h3[1]/following-sibling::p[1]//text()|"
                "//*[contains(text(),'Programme fee')]/following-sibling::*[1]//text()|"
                "//h2[contains(text(),'Fees')]/following-sibling::p[1]//h2[contains(text(),'Fees')]/following-sibling::p[1]|"
                "//h2[contains(text(),'Fees')]/following-sibling::p[position()<3]//text()|"
                "//p[contains(text(),'New international students:')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print(tuition_fee)
            # item['tuition_fee'] = ''.join(tuition_fee).strip()
            tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) == 1:
                item['tuition_fee'] = int(''.join(tuition_fee_re[0]).replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            if len(tuition_fee_re) >= 2:
                item['tuition_fee'] = int(''.join(tuition_fee_re[1]).replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            # if item['tuition_fee'] is None:
            #     print("tuition_fee 为空")
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='top-howWeTeachYou']
            assessment_en = response.xpath(
                "//div[@id='top-howWeTeachYou']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            item[
                'apply_proces_en'] = """<div><h1><span>How to apply for postgraduate courses</span></h1></div><div><h4>Postgraduate taught courses</h4><p>The quickest and easiest away to apply for postgraduate study at the University of Reading is through our <a>online application service</a>. The online service allows you to complete your application form and attach electronic copies of your academic transcripts, certificates and other supporting information. It also provides a tool for sending an email request to your referees, enabling them to send your supporting references directly to us.</p><p><span>If you are unable to apply online you can request a paper application form by telephoning </span><a>+44 (0) 118 378 5289</a><span> or writing to:</span></p><p>Admissions Office<br>University of Reading<br>Miller Building<br>Whiteknights<br>Reading, RG6 6AB<br>UK</p><h4>PGCE and School Direct</h4><p>Candidates for the PGCE and School Direct courses should submit an application via <a>UCAS Teacher Training</a>.</p><strong> Postgraduate research </strong><p>For more information on applying for postgraduate research opportunities, please visit our <a>graduate school website</a>.</p></div><div><div><div><h4>Entry requirements</h4><p>Please visit our <a>postgraduate entry requirements</a> page for information on academic qualifications and English language requirements.</p><h4>When to apply</h4><p>There is no specific deadline date for most courses and applications will be considered until the course is full. However, to allow time for us to process your application we recommend that you apply by the following dates for admission in September:</p><div><strong>UK applicants</strong> by 1 August</div><div><strong>International applicants</strong> by 1 June</div><div><br></div><p>Please note that the MSc Speech and Language Therapy has an earlier application deadline of 1 December. 
Applications for PGCE courses are made through UCAS (see above) and the deadline is 15 September of the year of entry though early applications are recommended.</p><p>Most of our taught courses start at the beginning of the autumn term (in September) but there are a number that also have a start at a different time of the year or have multiple starts throughout the year. Please see the individual subject pages for further details.</p><h4>After you apply</h4><p>As soon as you have submitted your&nbsp;completed application we will send&nbsp;you an email acknowledgement.&nbsp;We will also create an applicant&nbsp;account for you which will allow&nbsp;you to check on the progress of&nbsp;your application online and access&nbsp;other useful information about&nbsp;the University of Reading.</p><p> We aim to reach a decision on&nbsp;your application within 4 weeks.&nbsp;The length of time taken to reach&nbsp;a decision will vary as each&nbsp;application is considered on an individual basis according to your&nbsp;relevant strengths and merits. Once your application has been&nbsp;considered you will receive an&nbsp;email from the Admissions Office&nbsp;informing you of the decision. If&nbsp;your application has been successful,&nbsp;our email will explain the offer and&nbsp;any conditions attached to it and also&nbsp;give further details of the fees and&nbsp;other expenses associated with your&nbsp;course.&nbsp;</p><p>Our team of experienced&nbsp;admissions staff is here to help you&nbsp;throughout the application process&nbsp;so please do not hesitate to get in&nbsp;touch with us if you need any help&nbsp;with completing your application or&nbsp;have a question about the progress&nbsp;of your application. You can contact&nbsp;us at <a>[email protected]</a>.</p></div></div>"""

            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #8
0
    def parse_data(self, response):
        """Parse one Liverpool Hope University postgraduate course page.

        Builds an item via ``get_item1(ScrapyschoolEnglandItem1)``, fills in
        programme/degree name, duration, start date, the HTML sections
        (overview, modules, assessment, careers), entry requirements, tuition
        fee, faculty, IELTS/TOEFL scores and fixed application details, and
        yields it. If anything raises, the error text and page URL are
        appended to ``<university><degree_type>.txt`` and printed instead.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "Liverpool Hope University"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree type.
        item['degree_type'] = 2
        item['location'] = 'Hope Park, Liverpool, L16 9JD'

        def html_section(query):
            # Extract the matched nodes as HTML and run the shared clean-up.
            return remove_class(
                clear_lianxu_space(response.xpath(query).extract()))

        print("===========================")
        print(response.url)
        try:
            # Programme title; the bracketed part is the degree name.
            title_text = ''.join(
                response.xpath(
                    "//section[@id='pageContent']/div[@class='course_header']/h1//text()"
                ).extract())
            bracketed = re.findall(
                r"\(.{1,10}\)|\(Postgraduate\sCertificate\)",
                title_text.strip())
            degree_type = ''.join(bracketed)
            item['programme_en'] = title_text.replace(degree_type, "").title()
            # Drop both parenthesis characters in one pass.
            item['degree_name'] = degree_type.translate(
                str.maketrans("", "", "()")).strip()
            print("item['programme_en']: ", item['programme_en'])
            print("item['degree_name']: ", item['degree_name'])

            duration_parts = response.xpath(
                "//strong[contains(text(),'Duration')]//text()").extract()
            clear_space(duration_parts)
            duration_text = ''.join(duration_parts)

            item['teach_time'] = getTeachTime(duration_text)
            parsed_duration = getIntDuration(duration_text)
            if len(parsed_duration) == 2:
                item['duration'] = parsed_duration[0]
                item['duration_per'] = parsed_duration[-1]

            start_parts = response.xpath(
                "//strong[contains(text(),'Start month')]//text()").extract()
            clear_space(start_parts)
            print("start_date: ", start_parts)
            start_text = ''.join(start_parts)
            if '&' in start_text:
                # Several intakes: append each parsed date, comma-terminated,
                # to whatever the item already holds.
                intakes = start_text.split('&')
                print(intakes)
                for intake in intakes:
                    item['start_date'] += getStartDate(intake.strip()) + ","
            else:
                item['start_date'] = getStartDate(start_text)
            item['start_date'] = item['start_date'].strip().strip(",").strip()
            print("item['start_date'] = ", item['start_date'])

            item['overview_en'] = html_section("//div[@id='overview']")
            item['modules_en'] = html_section("//div[@id='curriculum']")

            entry_texts = response.xpath(
                "//div[@id='entry_reqs']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_texts)

            item['assessment_en'] = html_section(
                "//div[@id='teaching_research']")
            item['career_en'] = html_section("//div[@id='careers']")

            fee_parts = response.xpath(
                "//h2[contains(text(),'INTERNATIONAL TUITION FEES')]/following-sibling::p//text()"
            ).extract()
            clear_space(fee_parts)
            # Only the first pound amount is used.
            first_fee = re.search(r"£[\d,]+", ''.join(fee_parts))
            if first_fee is not None:
                item['tuition_fee'] = int(first_fee.group().replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"

            # Faculty -> departments, inverted below into department -> faculty.
            faculty_departments = {
                "Faculty of Arts and Humanities": (
                    "liverpool hope business school",
                    "creative and performing arts",
                    "english",
                    "fine and applied art",
                    "history and politics",
                    "law",
                    "media and communication",
                    "social sciences",
                    "theology, philosophy and religion",
                ),
                "Faculty of Education": (
                    "disability and education",
                    "early childhood",
                    "education studies",
                    "teacher education",
                ),
                "Faculty of Science": (
                    "geography and environmental science",
                    "mathematics and computer science",
                    "psychology",
                    "health sciences",
                ),
            }
            dep_dict = {
                dept: faculty
                for faculty, depts in faculty_departments.items()
                for dept in depts
            }
            department_parts = response.xpath(
                "//div[contains(text(),'Department of')]//text()|//div[contains(text(),'School')]//text()"
            ).extract()
            clear_space(department_parts)
            department_key = ''.join(department_parts)
            department_key = department_key.replace("Department of", "")
            department_key = department_key.replace("School of", "")
            department_key = department_key.lower().strip()
            item['department'] = dep_dict.get(department_key)

            item['ielts_desc'] = ''.join(
                re.findall(r".{1,20}IELTS.{1,40}",
                           item['rntry_requirements']))
            ielts_scores = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            # Number of scores found -> index used for
            # (overall, listening, speaking, reading, writing).
            ielts_layouts = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                3: (0, 0, 0, 1, 2),
            }
            ielts_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r',
                            'ielts_w')
            ielts_layout = ielts_layouts.get(len(ielts_scores))
            if ielts_layout is not None:
                for field, position in zip(ielts_fields, ielts_layout):
                    item[field] = ielts_scores[position]

            item['toefl_desc'] = ''.join(
                re.findall(r"TOEFL.{1,40}", item['rntry_requirements']))
            toefl_scores = re.findall(r"\d\d+", item['toefl_desc'])
            # A single score fills only the overall field (zip stops at the
            # shorter layout); two scores fill overall plus all sub-scores.
            toefl_layouts = {
                1: (0, ),
                2: (0, 1, 1, 1, 1),
            }
            toefl_fields = ('toefl', 'toefl_l', 'toefl_s', 'toefl_r',
                            'toefl_w')
            toefl_layout = toefl_layouts.get(len(toefl_scores))
            if toefl_layout is not None:
                for field, position in zip(toefl_fields, toefl_layout):
                    item[field] = toefl_scores[position]

            item['apply_proces_en'] = "http://www.hope.ac.uk/postgraduate/howtoapply/"
            item['require_chinese_en'] = """<h3>2018 Postgraduate Entry Requirements</h3><ul><li>A degree from a recognised institution equivalent to a UK Honours degree</li></ul>"""
            yield item
        except Exception as err:
            # Append the failure and the offending URL to the per-university log.
            log_name = item['university'] + str(item['degree_type']) + ".txt"
            with open(log_name, 'a', encoding="utf-8") as log_file:
                log_file.write(''.join([
                    str(err), "\n", response.url,
                    "\n========================\n"
                ]))
            print("异常:", str(err))
            print("报错url:", response.url)
コード例 #9
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.lincoln.ac.uk/"
        item['university'] = "University of Lincoln"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item[
            'location'] = 'University of Lincoln, Brayford Pool, Lincoln, LN6 7TS'
        print("===========================")
        print(response.url)
        try:
            # //table[@id='newTitle']/tbody[@id='newTitleBody']/tr/td/h1[1]/a
            programmeDegreetype = response.xpath(
                "//div[@id='CourseTitleApms']//h1[@class='nd_2019-20']//text()"
            ).extract()
            clear_space(programmeDegreetype)
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype)

            degree_type = re.findall(
                r"^(M\w+\sby\sResearch\s/[/\w]+\s|M\w+\sby\sResearch|PG\s\w+|\w+/\w+|\w+)",
                programmeDegreetypeStr)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            if "phd" in item['degree_name'].lower():
                item['teach_type'] = 'phd'
                item['degree_type'] = 3
            if "by research" in item['degree_name'].lower(
            ) or item['degree_name'] == "MRes":
                item['teach_type'] = 'research'
                item['degree_type'] = 3
            print("item['teach_type']: ", item['teach_type'])
            # print("item['degree_type']: ", item['degree_type'])

            programme = programmeDegreetypeStr.replace(''.join(degree_type),
                                                       '')
            # if len(programme) > 0:
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //span[@id='durationFT']
            duration = response.xpath(
                "//span[contains(text(),'Full-time Duration')]/..//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['teach_time'] = ", item['teach_time'])
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            department = response.xpath(
                "//span[contains(text(),'School:')]/following-sibling::a//text()"
            ).extract()
            clear_space(department)
            if len(department) > 0:
                item['department'] = department[0]
            print("item['department']: ", item['department'])

            dep_dict = {
                "lincoln school of architecture and the built environment":
                "College of Arts",
                "lincoln school of design":
                "College of Arts",
                "lincoln school of film and media":
                "College of Arts",
                "school of english and journalism":
                "College of Arts",
                "school of fine and performing arts":
                "College of Arts",
                "school of history and heritage":
                "College of Arts",
                "school of chemistry":
                "College of Science",
                "school of computer science":
                "College of Science",
                "school of engineering":
                "College of Science",
                "school of geography":
                "College of Science",
                "school of life sciences":
                "College of Science",
                "school of mathematics and physics":
                "College of Science",
                "school of pharmacy":
                "College of Science",
                "national centre for food manufacturing":
                "College of Science",
                "lincoln institute for agri-tech":
                "College of Science",
                "school of education":
                "College of Social Science",
                "school of health and social care":
                "College of Social Science",
                "professional development centre":
                "College of Social Science",
                "lincoln law school":
                "College of Social Science",
                "school of psychology":
                "College of Social Science",
                "school of social and political sciences":
                "College of Social Science",
                "school of sport and exercise science":
                "College of Social Science",
            }
            if item['department'] != "Lincoln Business School":
                item['department'] = dep_dict.get(item['department'].lower())
            print("item['department']1: ", item['department'])

            if item['department'] == None:
                item['department'] = ''.join(
                    response.xpath(
                        "//div[@class='breadcrumb-list']//span//a[@href='/home/collegeofsocialscience/']//text()"
                    ).extract()).strip()
                print("item['department']2: ", item['department'])

            # //div[@id='feesTables']/table
            fee = response.xpath(
                "//td[contains(text(),'International')]/following-sibling::td//text()"
            ).extract()
            clear_space(fee)
            # print("fee: ", fee)
            feeStr = ''.join(fee)
            tuitionfee = getTuition_fee(feeStr)
            item['tuition_fee'] = tuitionfee
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //h2[contains(text(),'The Course')]/..
            overview = response.xpath(
                "//h2[contains(text(),'The Course')]/..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            modules_en = response.xpath(
                """//body/section[@class='container basic-accordion']/div[@class='row']/div[@class='col-md-9 no-gutters']/div[@id='accordion']/div[@class="nd_2019-20"]//a[contains(text(),'How you study')]/../../..|
                                        //body/section[@class='container basic-accordion']/div[@class='row']/div[@class='col-md-9 no-gutters']/div[@id='accordion']/div[@class="nd_2019-20"]//a[contains(text(),'How You Study')]/../../..|
                                        //body/section[@class='container basic-accordion']/div[@class='row']/div[@class='col-md-9 no-gutters']/div[@id='accordion']/div[@class="nd_2019-20"]//a[contains(text(),'Modules')]/../../..|
                                        //body/section[@class='container basic-accordion']/div[@class='row']/div[@class='col-md-9 no-gutters']/div[@id='accordion']/div[@class="nd_2019-20"]//a[contains(text(),'Research Areas, Projects & Topics')]/../../.."""
            ).extract()
            if len(modules_en) == 0:
                modules_en = response.xpath(
                    """//a[contains(text(),'How you study')]/../../..|
                                        //a[contains(text(),'How You Study')]/../../..|
                                        //a[contains(text(),'Modules')]/../../..|
                                        //a[contains(text(),'Research Areas, Projects & Topics')]/../../.."""
                ).extract()
            # 需要去除的多余的内容
            del_modules_en = response.xpath(
                "//div[@id='collapse62019-20']//div[@id='modulePanelPrint']"
            ).extract()
            del_modules_en_str = remove_class(
                clear_lianxu_space(del_modules_en))
            print(modules_en)
            item['modules_en'] = remove_class(
                clear_lianxu_space(modules_en)).replace(
                    del_modules_en_str, '').strip()
            if item['modules_en'] == "":
                item['modules_en'] = None
                # print("*** modules_en")
            else:
                print("===", item['modules_en'])
                del_cont = re.findall(
                    r"<br>Find out more</p><div><span>.*?</em></span>",
                    item['modules_en'])
                print("del_cont==", del_cont)
                if len(del_cont) > 0:
                    for delc in del_cont:
                        item['modules_en'] = item['modules_en'].replace(
                            delc, '<div>').strip()
            print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//a[contains(text(),'How You Are Assessed')]/../../..|//a[contains(text(),'How you are assessed')]/../../.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            interview_desc_en = response.xpath(
                "//a[contains(text(),'Interviews & Applicant Days')]/../../.."
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])

            rntry_requirements = response.xpath(
                "//a[contains(text(),'Entry Requirements')]/../../..//text()|//a[contains(text(),'Entry requirements')]/../../..//text()"
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = re.findall(r"IELTS.{1,80}", item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ielts).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            career = response.xpath("//div[@id='CourseCareersApms']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/entryrequirementsandyourcountry/china/
            item["require_chinese_en"] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Master's</strong></p>
<p>Prospective students require one of the following qualifications:</p>
<ul>
<li>A Chinese degree from a recognised institution with a minimum average grade of 70% (GPA 2.5), some programmes may require 80% or a GPA 3.0</li>
<li>Successful completion of a UK Bachelor degree with a minimum grade of 2:2</li>
<li>Students with a three year Chinese Diploma who have gained at least 3 years full-time relevant work experience may be considered for our MBA programme on a case by case basis</li>
</ul>"""
                ]))
            if item['teach_type'] == "phd":
                item['require_chinese_en'] = remove_class(
                    clear_lianxu_space([
                        """<p><strong>PhD</strong></p>
<p><span>Successful completion of a Master's Degree from a recognised institution.</span></p>
"""
                    ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            if item['ielts_desc'] == "":
                item[
                    'ielts_desc'] = "Prospective students require IELTS 6.0 (with no less than 5.5 in each band score) or an equivalent qualification. Please note that some courses require a higher score."
                item['ielts'] = 6.0
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            # print("******item['ielts_desc']: ", item['ielts_desc'])
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/englishlanguagerequirementsandsupport/englishlanguagerequirements/
            if item['ielts'] == "6.5":
                item['toefl'] = 90
                item['toefl_l'] = 20
                item['toefl_s'] = 22
                item['toefl_r'] = 21
                item['toefl_w'] = 22
            elif item['ielts'] == "7.0":
                item['toefl'] = 100
                item['toefl_l'] = 22
                item['toefl_s'] = 23
                item['toefl_r'] = 23
                item['toefl_w'] = 23
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h4 class="h2">Follow these five simple steps to apply for a postgraduate course at Lincoln:</h4>
<p class="h2">1. Find your course</p>
<p>On this website you will find an overview of the <a href="/home/studywithus/postgraduatestudy/">postgraduate courses</a> available at the University of Lincoln.</p>
<p>Choose the course you wish to study, making sure you check the entry requirements.</p>
<p>We strongly recommend you attend a <a href="/home/studywithus/opendaysandvisits/postgraduatetasterdays/">Postgraduate Taster Day</a> to find out more.</p>
<p class="h2">2. Check for a closing date</p>
<p>Most of our postgraduate courses have no official closing date for applications. The majority of our taught courses start in September, although some courses have intakes in January or February. Please allow enough time for your application to be considered prior to the start date. If you are an international student you may need to factor in time for your visa application. We would advise you to apply as soon as possible.</p>
<p class="h2">3. Are you eligible for a postgraduate loan or scholarship?</p>
<p>The government has announced a new system of Postgraduate Loans where eligible full-time and part-time students could borrow up to &pound;10,609 towards the cost of a taught postgraduate Master&rsquo;s qualification. <a href="/home/studywithus/postgraduatestudy/feesandfunding/">Visit our Postgraduate Fees and Funding page</a> to find out more. The University of Lincoln also offers a range of postgraduate <a href="/home/studywithus/scholarshipsandbursaries/">scholarships</a>.</p>
<p class="h2">4. Research candidates only - compose your research proposal</p>
<p>If you are applying for a research programme, you will need to draft your research proposal. In your application you will be asked to give a description of the topic or theme you intend to research.</p>
<p class="h2">5. Apply online</p>
<p>When you have found the course you are interested in, go to the course page and click <a href="https://my.lincoln.ac.uk/welcome/pages/login.aspx" target="_blank">&lsquo;Apply Online&rsquo;</a>.</p>
<p>You will need to register with us first to proceed.</p>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #10
0
    def parse_data(self, response):
        """Build one University of Edinburgh research-degree item from a page.

        Extracts the programme title, award, study mode, department,
        location, start date, overview, duration, modules, careers, entry
        requirements, IELTS/TOEFL scores and tuition fee from ``response``
        via XPath (following the linked module and fee pages through
        ``self.get_modules2`` and ``self.parse_tuition_fee``), then yields
        the item.

        Any exception raised while scraping is appended, together with the
        page URL, to ``<university><degree_type>.txt`` and printed; in that
        case no item is yielded.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "The University of Edinburgh"
        item['url'] = response.url
        # Teaching mode
        item['teach_type'] = 'phd'
        # Degree type
        item['degree_type'] = 3
        print("===========================")
        print(response.url)
        try:
            # Programme title
            programme = response.xpath(
                "//h1[@itemprop='headline']//text()").extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            degree_name = response.xpath(
                "//span[contains(text(),'Awards:')]/../text()").extract()
            if len(degree_name) > 0:
                item['degree_name'] = degree_name[0]
            print("item['degree_name']: ", item['degree_name'])

            teach_time = response.xpath(
                "//span[contains(text(),'Study modes:')]/../text()").extract()
            teach_time = ''.join(teach_time)
            item['teach_time'] = getTeachTime(teach_time)

            # Prefer the "College:" entry; fall back to the first "School:"
            # link.
            # NOTE(review): the class here is 'col-xs 12' while the location
            # XPath below uses 'col-xs-12' - confirm against the page markup.
            department = response.xpath(
                "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs 12']//ul//li//span[contains(text(),'College:')]/following-sibling::*//text()").extract()
            if len(department) == 0:
                department = response.xpath(
                    "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs 12']//ul//li//span[contains(text(),'School:')]/following-sibling::a[1]/text()").extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()

            location = response.xpath(
                "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']/text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()

            # Only the first option of the start-date dropdown is used.
            start_date = response.xpath(
                "//select[@name='code2']//option//text()").extract()
            clear_space(start_date)
            if len(start_date) > 0:
                start_date = start_date[0].strip()
                item['start_date'] = getStartDate(start_date)

            overview = response.xpath(
                "//div[@id='proxy_collapseresearch_profile']/..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            # Duration text comes from the third cell of the first table row.
            duration = response.xpath(
                "//table[@class='table table-striped']//tbody//tr[1]/td[3]//text()").extract()
            clear_space(duration)
            duration = ''.join(duration).strip()
            # All digit runs in the text are concatenated into one integer.
            duration_int = re.findall(r"\d+", duration)
            if len(duration_int) != 0:
                item['duration'] = int(''.join(duration_int))

            # Unit code: 1 when the text mentions years, 3 for months
            # (months wins if both appear).
            if "year" in duration or "Year" in duration:
                item['duration_per'] = 1
            if "month" in duration or "Month" in duration:
                item['duration_per'] = 3

            # Modules: the on-page "how taught" section plus, when a link is
            # present in the table, the content fetched from that URL.
            modules1 = response.xpath(
                "//div[@id='proxy_collapsehow_taught']/div/*[position()<=last()]").extract()
            modules2url = response.xpath(
                "//html//tr[1]/td[5]/a/@href").extract()
            modules2 = ""
            if len(modules2url) != 0:
                modules2url = ''.join(modules2url)
                modules2 = self.get_modules2(modules2url)
            item['modules_en'] = remove_class(clear_lianxu_space(list(modules1)))
            if modules2 != "":
                item['modules_en'] += "\n" + modules2

            career = response.xpath(
                "//div[@id='proxy_collapsecareer_opp']/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            entry_requirements = response.xpath(
                "//div[@id='proxy_collapseentry_req']/..//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            IELTS = response.xpath("//abbr[contains(text(),'IELTS')]/..//text()").extract()
            item['ielts_desc'] = ''.join(IELTS)
            print("item['ielts_desc']: ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            TOEFL = response.xpath("//abbr[contains(text(),'TOEFL')]/..//text()").extract()
            item['toefl_desc'] = ''.join(TOEFL)
            print("item['toefl_desc']: ", item['toefl_desc'])

            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                    item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # The fee lives on a separate page linked from the fees section.
            tuition_fee_url = response.xpath("//div[@id='proxy_collapsefees']//ul/li/a[contains(text(),'Full')]/@href").extract()
            if len(tuition_fee_url) > 0:
                tuition_fee_url_str = tuition_fee_url[0]
                fee = self.parse_tuition_fee(tuition_fee_url_str)
                clear_space(fee)
                fee_re = re.findall(r"£\d+,\d+", ''.join(fee))
                item['tuition_fee'] = getTuition_fee(''.join(fee_re))
                item['tuition_fee_pre'] = "£"

            # These two fields store reference URLs rather than scraped text.
            item['require_chinese_en'] = "https://www.ed.ac.uk/studying/international/postgraduate-entry/asia/china"
            item['apply_proces_en'] = "https://www.ed.ac.uk/studying/postgraduate/applying"

            yield item
        except Exception as e:
            # Best-effort scraping: record the failure and the page URL,
            # then move on without yielding an item. The record ends with a
            # newline so that appended entries do not run together.
            with open(item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #11
0
    def parse_data(self, response):
        """Build one St George's, University of London course item from a page.

        Extracts the degree name, programme title, study mode, duration,
        international application deadline, tuition fee and overview from
        ``response`` via XPath, and follows the "Entry", "Module",
        "Studying", "Career" and "Apply" links (through the
        ``self.parse_*`` helpers) for the remaining sections. Yields the
        item.

        Any exception raised while scraping is appended, together with the
        page URL, to ``<university><degree_type>.txt`` and printed; in that
        case no item is yielded.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "St George's, University of London"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree type
        item['degree_type'] = 2
        item['location'] = "Cranmer Terrace, London SW17 0RE"
        print("===========================")
        print(response.url)
        try:
            # The page heading holds both the programme title and the award.
            programmeDegree_name = response.xpath("//div[@class='inner']/h1//text()").extract()
            programmeDegree_nameStr = ''.join(programmeDegree_name).strip()

            # Try "A/B/C", "<word> (" or "<word> -" first; otherwise take the
            # last word of the heading as the degree name.
            degree_name = re.findall(r"\w+/\w+/\w+|\w+\s\(|\w+\s-", programmeDegree_nameStr)
            if len(degree_name) == 0:
                degree_name = re.findall(r"\w+$", programmeDegree_nameStr)
            item['degree_name'] = ''.join(degree_name).replace("(", "").replace("-", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            programme = programmeDegree_nameStr.replace(item['degree_name'], "").strip()
            item['programme_en'] = programme
            print("item['programme_en']: ", item['programme_en'])

            duration = response.xpath("//table[1]/tbody/tr[1]/td[2]//text()").extract()
            clear_space(duration)
            duration_text = ''.join(duration)
            item['teach_time'] = getTeachTime(duration_text)

            duration_list = getIntDuration(duration_text)
            # Only a two-element result is used: first -> duration,
            # last -> duration_per.
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            deadline = response.xpath(
                "//p[contains(text(),'Non-UK/EU (International) application deadline')]//text()").extract()
            clear_space(deadline)
            # Strip the label and colon so only the date text is parsed.
            item['deadline'] = getStartDate(''.join(deadline).replace("Non-UK/EU (International) application deadline", "").replace(":", "").strip())

            tuition_fee = response.xpath("//p[contains(text(),'Non-EU (international): ')]//text()|"
                                         "//table//p[contains(text(),'2018 entry Non-EU')]//text()|"
                                         "//table[2]/tbody/tr[4]/td/p[contains(text(),'2018 Non-EU')]/following-sibling::*/*[1]//text()|"
                                         "//table//p[contains(text(),'2018 Non-EU')]/following-sibling::*[1]/*[1]//text()").extract()
            clear_space(tuition_fee)
            tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                item['tuition_fee_pre'] = "£"

            overview_en = response.xpath("//p[@class='first']|//table[1]/following-sibling::*[position()<last()-1]").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en)).replace("<p><button>Make an enquiry</button></p>", "").strip()

            # Entry requirements and the IELTS description come from the
            # linked "Entry" sub-page.
            entry_url = response.xpath("//a[contains(text(),'Entry')]/@href").extract()
            if len(entry_url) != 0:
                parse_entry_url = "https://www.sgul.ac.uk" + entry_url[0]
                entry_dict = self.parse_rntry_requirements(parse_entry_url)
                item['rntry_requirements'] = entry_dict.get('rntry_requirements')

                item['ielts_desc'] = entry_dict.get('ielts_desc')

            # The description may be missing (no "Entry" link, or the
            # sub-page had no IELTS text); search an empty string then so
            # the rest of the item is still scraped instead of failing here.
            ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'] or "")
            # Map the numbers found by count: one -> the same score for all
            # bands; two -> overall + one shared band score; five -> overall
            # followed by listening, reading, writing, speaking.
            if len(ielts_list) == 1:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[0]
                item['ielts_s'] = ielts_list[0]
                item['ielts_r'] = ielts_list[0]
                item['ielts_w'] = ielts_list[0]
            elif len(ielts_list) == 2:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[1]
                item['ielts_s'] = ielts_list[1]
                item['ielts_r'] = ielts_list[1]
                item['ielts_w'] = ielts_list[1]
            elif len(ielts_list) == 5:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[1]
                item['ielts_s'] = ielts_list[4]
                item['ielts_r'] = ielts_list[2]
                item['ielts_w'] = ielts_list[3]

            # The remaining sections each live on their own sub-page; the
            # first matching link is followed.
            # NOTE(review): hrefs are assumed to be site-relative paths,
            # since the host is prepended unconditionally - confirm.
            modules_url = response.xpath("//a[contains(text(),'Module')]/@href").extract()
            if len(modules_url) != 0:
                parse_modules_url = "https://www.sgul.ac.uk" + modules_url[0]
                item['modules_en'] = remove_class(clear_lianxu_space(self.parse_modules(parse_modules_url))).replace("<p><img></p>", "").strip()

            assessment_en_url = response.xpath("//a[contains(text(),'Studying')]/@href").extract()
            if len(assessment_en_url) != 0:
                parse_assessment_en_url = "https://www.sgul.ac.uk" + assessment_en_url[0]
                item['assessment_en'] = remove_class(clear_lianxu_space(self.parse_assessment_en(parse_assessment_en_url))).replace("<p><img></p>", "").strip()

            career_en_url = response.xpath("//a[contains(text(),'Career')]/@href").extract()
            if len(career_en_url) != 0:
                parse_career_en_url = "https://www.sgul.ac.uk" + career_en_url[0]
                item['career_en'] = remove_class(clear_lianxu_space(self.parse_career_en(parse_career_en_url))).replace("<p><img></p>", "").strip()

            apply_proces_en_url = response.xpath("//a[contains(text(),'Apply')]/@href|//a[contains(text(),'Application and interview')]/@href").extract()
            print("apply_proces_en_url: ", apply_proces_en_url)
            if len(apply_proces_en_url) != 0:
                parse_apply_proces_en_url = "https://www.sgul.ac.uk" + apply_proces_en_url[0]
                print("parse_apply_proces_en_url: ", parse_apply_proces_en_url)
                item['apply_proces_en'] = remove_class(clear_lianxu_space(self.parse_apply_proces_en(parse_apply_proces_en_url))).replace("<p><img></p>", "").strip()
            print("item['apply_proces_en']: ", item['apply_proces_en'])

            yield item
        except Exception as e:
            # Best-effort scraping: record the failure and the page URL,
            # then move on without yielding an item.
            with open(item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #12
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.plymouth.ac.uk/"
        item['university'] = "University of Plymouth"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===============================")
        print(response.url)
        try:
            # //span[@class='course-title']
            programme = response.xpath(
                "//span[@class='course-title']//text()").extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).strip()
            # print("item['programme_en'] = ", item['programme_en'])

            degree_type = response.xpath(
                "//h1[@class='hero-heading']/text()").extract()
            clear_space(degree_type)
            item['degree_name'] = ''.join(degree_type).strip()
            # print("item['degree_name'] = ", item['degree_name'])

            degree_name_lower = item['degree_name'].lower()
            # print("degree_name_lower: ", degree_name_lower)
            if "phd" in degree_name_lower:
                item['teach_type'] = 'phd'
                item['degree_type'] = 3
            elif "res" in degree_name_lower:
                item['teach_type'] = 'research'
                item['degree_type'] = 3
            # print("item['teach_type'] = ", item['teach_type'])
            # print("item['degree_type'] = ", item['degree_type'])

            department = response.xpath(
                "//h2[@class='school-title']//text()").extract()
            clear_space(department)
            item['department'] = ''.join(department)
            # print("item['department'] = ", item['department'])

            # 课程长度
            duration = response.xpath("//td[contains(text(),'Duration')]/following-sibling::td//text()").extract()
            clear_space(duration)
            # print(duration)
            duration_list = getIntDuration(''.join(duration))
            # print("duration_list: ", duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            # mode
            mode = response.xpath("//td[contains(text(),'Course type')]/following-sibling::td//text()").extract()
            clear_space(mode)
            # print("mode: ", mode)
            item['teach_time'] = getTeachTime(''.join(mode))
            # print("item['teach_time'] = ", item['teach_time'])

            # location
            location = response.xpath("//td[contains(text(),'Location')]/following-sibling::td//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            # print("item['location'] = ", item['location'])

            # overview
            overview1 = response.xpath("//div[@class='overview']").extract()
            overview2 = response.xpath("//div[@id='key-features-accordion']").extract()
            overview = remove_class(clear_lianxu_space(overview1)) + remove_class(clear_lianxu_space(overview2))
            item['overview_en'] = overview
            # print("item['overview_en'] = ", item['overview_en'])

            # modules
            modules = response.xpath("//div[@id='structure-accordion']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            # entry_requirements
            entry_requirements = response.xpath("//div[@id='entry-requirements-accordion']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            # .{1,150}IELTS.{1,150}
            IELTS = re.findall(r"(.{1,80}IELTS.{1,80})|(.{1,80}ILETS.{1,80})|(.{1,80}IELTs.{1,80})", item['rntry_requirements'])
            # print(IELTS)
            if len(IELTS) != 0:
                ielts = ''.join(list(IELTS[0])).strip()
                item['ielts_desc'] = ielts
            print("item['ielts_desc'] = ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            if item['ielts'] != None:
                item['ielts'] = item['ielts'].strip(".").strip()
            if item['ielts_l'] != None:
                item['ielts_l'] = item['ielts_l'].strip(".").strip()
            if item['ielts_s'] != None:
                item['ielts_s'] = item['ielts_s'].strip(".").strip()
            if item['ielts_r'] != None:
                item['ielts_r'] = item['ielts_r'].strip(".").strip()
            if item['ielts_w'] != None:
                item['ielts_w'] = item['ielts_w'].strip(".").strip()
            print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
                    item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # how_to_apply
            how_to_apply = response.xpath("//div[@id='how-to-apply-accordion']").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # //html//div[@class='course-accordions']//tr[3]/td[3]
            # how_to_apply
            tuition_fee = response.xpath("//strong[contains(text(),'International')]/../following-sibling::*[2]//text()").extract()
            clear_space(tuition_fee)
            # print(tuition_fee)
            tuition_fee_str = ''.join(tuition_fee)
            if tuition_fee_str == "To be confirmed" or tuition_fee_str == "":
                item['tuition_fee'] = None
            else:
                item['tuition_fee'] = int(tuition_fee_str.replace("£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            # https://www.plymouth.ac.uk/international/study/international-students-country-guides/asia/china
            item['require_chinese_en'] = """<p><b>Postgraduate</b></p><p>For postgraduate programmes, you'll need either a bachelor's degree (with high grades), a masters degree from a ranked Chinese university or a good honours degree from a British university.&nbsp;</p><p><div class="table-responsive">
<table>
<tr>
<td><b>Chinese degree classification - prestigious i</b><b>nstitution</b></td>
<td><b>Chinese degree classification - non-prestigious institution</b></td>
<td><b>Chinese degree classification - college institution</b></td>
<td><b>UK degree equivalent</b></td>
<td></td>
</tr>
<tr>
<td>80%</td>
<td>85%</td>
<td>90%</td>
<td>1st</td>
<td></td>
</tr>
<tr>
<td>75%</td>
<td>80%</td>
<td>85%</td>
<td>2:1</td>
<td></td>
</tr>
<tr>
<td>70%</td>
<td>75%</td>
<td>80%</td>
<td>2:2</td>
<td></td>
</tr>
</table></div>"""
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #13
0
    def parse_data(self, response):
        """Scrape one University of Sheffield course page into a single item.

        Reads the degree name, programme title, department, start date,
        overview, duration, modules, teaching/assessment text, entry
        requirements, IELTS scores and careers text from ``response``, looks
        up the overseas tuition fee with a separate blocking HTTP request
        (sent with ``self.headers``), and yields the populated
        ``ScrapyschoolEnglandItem1``.

        Any exception raised during extraction is caught: its message and the
        page URL are appended to
        ``scrapySchool_England/error/<university><degree_type>.txt`` and
        printed, and nothing is yielded for that page.

        :param response: response for the course page; ``response.url``,
            ``response.xpath`` and ``response.text`` are used.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "The University of Sheffield"
        item['url'] = response.url
        # Teaching mode.
        item['teach_type'] = 'taught'
        # Degree type code.
        item['degree_type'] = 2
        item['location'] = "Western Bank, Sheffield, S10 2TN, UK"
        print("===========================")
        print(response.url)

        def first_match(*queries):
            """Try each XPath in order; return the first non-empty extract().

            When nothing matches, the (empty) result of the last query is
            returned.
            """
            found = []
            for query in queries:
                found = response.xpath(query).extract()
                if found:
                    break
            return found

        try:
            # Programme title and degree name share one heading (h1, else the
            # first h2). Stripping first means leading whitespace cannot hide
            # the degree prefix from the anchored regex below.
            programmeDegree_type = ''.join(first_match(
                "//main[@class='main content']/h1//text()",
                "//main[@class='main content']/h2[1]//text()",
            )).strip()
            # The pattern can match the empty string, so findall always
            # returns exactly one (possibly empty) prefix.
            degree_typeList = re.findall(r"^[A-Za-z/\(\)]*",
                                         programmeDegree_type)
            programme = programmeDegree_type
            # Only split when a prefix was really found: str.split("") raises
            # ValueError, which used to discard the whole page.
            if degree_typeList and degree_typeList[0]:
                item['degree_name'] = degree_typeList[0]
                programme = programmeDegree_type.split(item['degree_name'])
            print("item['degree_name']: ", item['degree_name'])
            # NOTE(review): this removes every "in " occurrence, not just a
            # leading one (e.g. inside "Brain Imaging") - confirm intended.
            item['programme_en'] = ''.join(programme).strip().replace(
                "in ", "").strip()
            print("item['programme_en']: ", item['programme_en'])

            # Department.
            department = ''.join(response.xpath(
                "//html//main[@class='main content']/p[1]//text()").extract())
            # NOTE(review): the return value is discarded and str is
            # immutable, so this call cannot change `department` - confirm
            # whether the result was meant to be assigned.
            clear_space_str(department)
            item['department'] = department.strip()

            # Start date: first header cell of the last row of the tabs table.
            start_date = response.xpath(
                "//table[@class='cms-tabs']/tbody/tr[last()]/th[1]//text()"
            ).extract()
            clear_space(start_date)
            item['start_date'] = getStartDate(''.join(start_date)).replace(
                "--20", "").strip()

            # Programme description: progressively looser fallbacks.
            overview = first_match(
                "//div[@id='tab00']//div[@class='highlight neutral']|//h2[contains(text(),'Overview')]/..",
                "//div[@class='highlight neutral']",
                "//h3[contains(text(),'Core modules')]/preceding-sibling::*",
                "//h3[contains(text(),'Teaching')]/preceding-sibling::*",
                "//h3[contains(text(),'Course duration')]/preceding-sibling::*",
                "//*[contains(text(), 'Course descriptio')]/preceding-sibling::*[1]/following-sibling::*[position()<4]",
            )
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            # Course duration and study mode.
            durationContent = response.xpath(
                "//h3[contains(text(),'Course duration')]/following-sibling::p[1]//text() | //h3[contains(text(),'Course duration')]/following-sibling::ul/li[1]//text()"
            ).extract()
            clear_space(durationContent)
            if durationContent:
                duration = durationContent[0].strip()
                if "full" in duration:
                    item['teach_time'] = 'fulltime'
                elif "part" in duration or "Part" in duration:
                    item['teach_time'] = 'parttime'
                d_re = re.findall(r'\d+', duration)
                if d_re:
                    item['duration'] = d_re[0]
                if 'year' in duration:
                    item['duration_per'] = 1
                elif 'month' in duration:
                    item['duration_per'] = 3
            # When the helper returns exactly two elements they replace the
            # rough values set above.
            duration_list = getIntDuration(''.join(durationContent))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            # Modules and assessment are cut out of the page's text nodes by
            # locating section headings that appear as exact list elements.
            twoContent = response.xpath(
                "//main[@class='main content']//text()").extract()
            clear_space(twoContent)
            if "Core modules" in twoContent:
                modulesIndex = twoContent.index("Core modules")
                modules = ""
                # The first end heading present closes the modules section.
                for end_heading in ("Teaching and assessment", "Teaching",
                                    "Course duration"):
                    if end_heading in twoContent:
                        modules = twoContent[
                            modulesIndex:twoContent.index(end_heading)]
                        break
                item['modules_en'] = clear_lianxu_space(modules)
            if item['modules_en'] != "":
                item['modules_en'] = "<div>" + item['modules_en'] + "</div>"
            else:
                # Nothing found via headings: fall back to element-level XPaths.
                modules_bu = response.xpath(
                    "//h2[contains(text(),'Programme structure')]/..|"
                    "//h3[contains(text(),'odules')]|//h3[contains(text(),'odules')]/following-sibling::*[position()<5]|"
                    "//h3[contains(text(),'Subjects')]|//h3[contains(text(),'Subjects')]/following-sibling::*[position()<5]|"
                    "//h3[contains(text(),'odules')]|//h3[contains(text(),'odules')]/following-sibling::*[position()<5]|"
                    "//h3[contains(text(),'Course content')]|//h3[contains(text(),'Course content')]/following-sibling::*[position()<5]|"
                    "//h3[contains(text(),'odules')]|//h3[contains(text(),'odules')]/following-sibling::*[position()<5]|"
                    "//h3[contains(text(),'Subjects')]|//h3[contains(text(),'Subjects')]/following-sibling::*[position()<5]|"
                    "//h3[contains(text(),'odules')]|//h3[contains(text(),'odules')]/following-sibling::*[position()<5]|"
                    "//h3[contains(text(),'semester')]|//h3[contains(text(),'semester')]/following-sibling::*[position()<6]|"
                    "//h3[contains(text(),'Stage')]|//h3[contains(text(),'Stage')]/following-sibling::*[position()<4]|"
                    "//h3[contains(text(),'Semester')]|//h3[contains(text(),'Semester')]/following-sibling::*[position()<6]"
                ).extract()
                item['modules_en'] = remove_class(
                    clear_lianxu_space(modules_bu))
            if item['modules_en'] == "":
                print("***** modules_en")
            print("item['modules_en']: ", item['modules_en'])

            # Assessment: (start heading, candidate end headings). Only the
            # first start heading present is considered, and for it the first
            # end heading present closes the section.
            assessment_sections = (
                ("Teaching and assessment",
                 ("Course duration", "Entry requirements")),
                ("Teaching", ("Course duration",)),
                ("Assessment", ("Course duration",)),
            )
            for start_heading, end_headings in assessment_sections:
                if start_heading not in twoContent:
                    continue
                teachingIndex = twoContent.index(start_heading)
                for end_heading in end_headings:
                    if end_heading in twoContent:
                        teaching = twoContent[
                            teachingIndex:twoContent.index(end_heading)]
                        item['assessment_en'] = clear_lianxu_space(teaching)
                        break
                break
            if len(item['assessment_en']) != 0:
                item['assessment_en'] = "<div>" + item[
                    'assessment_en'] + "</div>"
            else:
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(
                        response.xpath(
                            "//h2[contains(text(),'Assessment')]/..|"
                            "//h3[contains(text(),'How we will teach and assess you')]|//h3[contains(text(),'How we will teach and assess you')]/following-sibling::*[position()<3]|"
                            "//h3[contains(text(),'teaching and assessment')]|//h3[contains(text(),'teaching and assessment')]/following-sibling::*[position()<5]"
                        ).extract()))
            print("item['assessment_en']: ", item['assessment_en'])

            # Entry requirements ('rntry_requirements' is the item's field
            # name as used throughout this file).
            entry_requirements = first_match(
                ("//div[@class='highlight complement']//h4[contains(text(),'English language')]/preceding-sibling::*//text()|"
                 "//div[@class='highlight complement']//h4[contains(text(),'English Language')]/preceding-sibling::*//text()"),
                "//h4[contains(text(),'English language')]/preceding-sibling::*[position()<3]//text()",
                "//*[contains(text(),'Entry')]/following-sibling::*[position()<4]//text()",
            )
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            # IELTS description: first surviving text node after cleaning.
            ielts_desc = first_match(
                ("//div[@class='highlight complement']//h4[contains(text(),'English language')]/following-sibling::p[1]//text()|"
                 "//div[@class='highlight complement']//h4[contains(text(),'English Language')]/following-sibling::p[1]//text()"),
                "//h4[contains(text(),'English language')]/following-sibling::*[position()<3]//text()",
                "//p[contains(text(),'IELTS')]//text()",
            )
            clear_space(ielts_desc)
            if len(ielts_desc) > 0:
                item['ielts_desc'] = ''.join(ielts_desc[0])

            ieltDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltDict.get('IELTS')
            item["ielts_l"] = ieltDict.get('IELTS_L')
            item["ielts_s"] = ieltDict.get('IELTS_S')
            item["ielts_r"] = ieltDict.get('IELTS_R')
            item["ielts_w"] = ieltDict.get('IELTS_W')

            item['apply_proces_en'] = """<h1>Applying</h1>
    <p>You can apply for postgraduate study using our Postgraduate Online Application Form. It is a quick and easy process. Use the following link to enter:</p>
    <p>Postgraduate online application form</p>
    <p>The form has comprehensive instructions about how to complete it and pop-up help is available on each page.</p>
    <table>
    <tbody>
    <tr>
    <td>
    <h4>MArch Applications</h4>
    <p>Direct-entry applicants to Part 2 of the  RIBA-accredited MArch Architecture (undergraduate masters course) should  apply using the form available on the School of Architecture's webpages:</p>
    <p>Applying for MArch courses (RIBA Part 2)</p>
    </td>
    </tr>
    </tbody>
    </table>
    <h3>Completing your application</h3>
    <p>The form is divided into two parts. Part 1 is for personal information, including English language ability, and previous education and employment. You have to complete all of the mandatory fields in this part (marked with a *) before you can go on to Part 2. Part 2 is where you select the course or courses you want to apply for. You can apply for a total of three different postgraduate courses.</p>
    <h3>Supporting Documents</h3>
    <p>You will need to include certain documents to support your application, for example evidence of your previous qualifications and a personal statement. You can supply these simply by uploading them to the relevant sections of your online application.</p>
    <p>You can find more information about the supporting documents you will need, and how to supply them, on our Supporting Documents webpage:</p>
    <p>Supporting documents</p>
    <h3>Submitting your application</h3>
    <p>Your application will only be submitted to us when you click the "Submit Application" button. If you have forgotten to fill in any sections, you will be prompted to go back and complete them at this stage. When you have successfully submitted the completed form we will confirm this on-screen. You will also then be sent an email confirmation.</p>
    <p>If you want to apply for more than one course, you do not need to submit them all at the same time. Each course choice has its own "Submit Application" button.</p>
    <p>If you have any problems completing your online application, please contact us:</p>
    <p>Problems using the Postgraduate Online Application Form</p>
    <h3>After you've applied</h3>
    <p>When we have created your applicant record, we will send you a second email to confirm this. This email will include your applicant and choice numbers, as well as information about what happens next.</p>
    <p>You can find more information about what happens after you submit your application, and about preparing to study at the University of Sheffield, on the After You Apply webpage:</p>
    <p>After you apply</p>
    <p>If you have any questions about the application process or about studying at the University,  please contact us.</p>"""
            item['require_chinese_en'] = """<div>Postgraduate Taught Programmes e.g. MA, MSc
    New! Mandarin web pages for postgraduate applicants
    Find out more about how to apply in Mandarin using our new web pages from Admissions, which give you guidelines about how to apply to our postgraduate taught courses.
    We have over 200 postgraduate taught courses - if you are considering further study, there's a very good chance we'll have the course to meet your needs.
    Search for a postgraduate course
    Chinese University Degree Holders
    Holders of a good bachelor degree from a recognised Chinese university will be considered for direct entry to postgraduate diploma or masters programmes.
    For Entry to MBA
    Holders of a good bachelor degree from a recognised Chinese university and at least 3 years´ post-graduation work experience will be considered for direct entry to the MBA. Applicants must be at least 25 years old by the time the programme starts.
    For further information on the MBA programme
    For Entry to Postgraduate Research Programmes e.g. MPhil, PhD, PhD with Integrated Masters
    With around 2,000 postgraduate research students from over 100 different countries, Sheffield is one of the foremost centres for research training in the UK.
    Search for a research area
    Chinese Masters Degree Holders
    Holders of a good bachelor degree and a good masters degree from a recognised Chinese university will be considered for direct admission to postgraduate research programmes.
    If you are in any doubt about whether you are eligible to study at Sheffield, please contact the China Team.</div>"""

            # Tuition fee: pull the course code out of the page source and
            # query the fee lookup endpoint with it.
            course_code = ''.join(
                re.findall(r'course=.+"', response.text)
            ).replace("course=", '').replace('"', '')
            tuition_fee_url = "https://ssd.dept.shef.ac.uk/fees/pgt/api/lookup.php?year=2018&status=Overseas&course=" + course_code
            # Blocking HTTP call: the timeout stops a stalled fee endpoint
            # from hanging the spider. A timeout error is handled by the
            # except block below like any other failure.
            r = requests.get(tuition_fee_url, headers=self.headers,
                             timeout=30)
            tuition_fee = re.findall(r"&pound;\d+", r.text)
            if tuition_fee:
                # Use the first amount only; joining several matches would
                # fuse their digits into one bogus number.
                item['tuition_fee'] = int(
                    tuition_fee[0].replace('&pound;', ''))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])

            career = response.xpath(
                "//h3[contains(text(),'Careers')]|//h3[contains(text(),'Careers')]/following-sibling::*[position()<2]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en']: ", item['career_en'])
            yield item
        except Exception as e:
            # Append the failure to a per-university/degree-type log.
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #14
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.uwl.ac.uk/"
        item['university'] = "University of West London"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            programmeDegreetype = response.xpath(
                "//h1[@id='page-title']//text()").extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype)

            degree_type = re.findall(r"^(\w+\s?/\s?\w+|\w+)\s",
                                     programmeDegreetypeStr)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name']: ", item['degree_name'])

            if item['degree_name'].lower() == "phd":
                item['teach_type'] = 'phd'
                item['degree_type'] = 3
            print("item['teach_type']: ", item['teach_type'])
            print("item['degree_type']: ", item['degree_type'])

            programme = programmeDegreetypeStr.strip(''.join(degree_type))
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            mode = response.xpath(
                "//dt[contains(text(), 'Study mode')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(mode)
            item['teach_time'] = getTeachTime(''.join(mode))
            # print("item['teach_time']: ", item['teach_time'])

            location = response.xpath(
                "//dt[contains(text(), 'Location')]/following-sibling::dd[1]//text()"
            ).extract()
            item['location'] = ''.join(location).replace(
                "See location information", "").strip()
            print("item['location']: ", item['location'])

            start_date = response.xpath(
                "//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date']: ", item['start_date'])

            duration = response.xpath(
                "//dt[contains(text(), 'Duration')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            department = response.xpath(
                "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            tuition_fee = response.xpath(
                "//h4[contains(text(),'Overseas students')]/following-sibling::dl[1]//dt[contains(text(), 'Main fee')]/following-sibling::dd[1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='course-detail']
            modules = response.xpath("//div[@id='course-detail']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            # //div[@id='course-detail']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()"
            ).extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            ielts_desc_re = re.findall(r'.{1,50}IELTS.{1,50}',
                                       ''.join(ielts_desc))
            # print("ielts_desc_re: ", ielts_desc_re)
            if len(ielts_desc_re) > 0:
                item['ielts_desc'] = ielts_desc_re[-1]
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Teaching methods')]/preceding-sibling::*[1]/following-sibling::*[position()<5]|"
                "//*[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<5]|"
                "//html//div/strong[contains(text(),'How will I be taught?')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            career_en = response.xpath(
                "//div[@id='career-progression-and-study']|"
                "//div[@id='jobs-and-placements']|"
                "//html//*[contains(text(),'Career and study progression')]/../following-sibling::*[position()<5]"
            ).extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            print("item['career_en']: ", item['career_en'])

            overview_en = response.xpath(
                "//div[@id='course-summary']/*[position()<last()]").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            print("item['overview_en']: ", item['overview_en'])

            item['require_chinese_en'] = """<h3>Postgraduate entry</h3>
<p>Applicants with the followingqualificationswill be considered for entry on a postgraduate course:</p>
<p>Bachelor's degree from a national university with a GPA 2.6 / 4.0 or an overall average of 65% or higher</p>
<p>Bachelor's degree from a high-ranking private college with an average of 85% or higher</p>
<p>Honours degree from any university in the UK or Republic of Ireland with a minimum of 2:2 or above</p>
<p>To study a PhD: a proposal is required in addition to a Masters qualification in a related subject area.</p>"""
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #15
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.strath.ac.uk/"
        item["university"] = "University of Strathclyde"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item['location'] = "16 Richmond Street, Glasgow, G1 1XQ"
        print("===========================")
        print(response.url)
        try:
            # 学位类型
            degree_type = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/span/text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name'] = ", item['degree_name'])

            # 专业名
            programme = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/text()"
            ).extract()
            # print("programme = ", programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            if "Engineering" in item['programme_en']:
                item['department'] = "Faculty of Engineering"
            elif "Science" in item['programme_en']:
                item['department'] = "Faculty of Science"
            elif "Business" in item['programme_en'] or "Finance" in item[
                    'programme_en'] or "Marketing" in item['programme_en']:
                item['department'] = "Strathclyde Business School"
            print("item['department'] = ", item['department'])

            # 课程长度、开学时间、截止日期
            durationTeachtime = response.xpath(
                "//b[contains(text(),'Study mode and duration')]/../text()"
            ).extract()
            clear_space(durationTeachtime)
            # print("durationTeachtime: ", durationTeachtime)
            durationTeachtimeStr = ''.join(durationTeachtime)

            item['teach_time'] = getTeachTime(durationTeachtimeStr)
            duration_list = getIntDuration(durationTeachtimeStr)
            # print(duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])
            # print("item['teach_time'] = ", item['teach_time'])

            start_date = response.xpath(
                "//b[contains(text(),'Start date')]/../text()").extract()
            start_date_str = ''.join(start_date).replace(":", "")
            print("start_date_str = ", start_date_str)
            item['start_date'] = getStartDate(start_date_str)
            if item['start_date'] != "" and item[
                    'start_date'] > "06" and "201" not in item['start_date']:
                item['start_date'] = "2018-" + item['start_date']
            elif item['start_date'] != "" and item[
                    'start_date'] <= "06" and "201" not in item['start_date']:
                item['start_date'] = "2019-" + item['start_date']
            print("item['start_date'] = ", item['start_date'])

            # 截止日期
            deadline = response.xpath(
                "//b[contains(text(),'Application deadline')]/../text()"
            ).extract()
            # print("deadline1 = ", deadline)
            deadline = ''.join(deadline).replace(":", "").strip()
            print("deadline = ", deadline)
            item['deadline'] = getStartDate(deadline)
            if item['deadline'] == '2':
                item['deadline'] = ""
            print("item['deadline'] = ", item['deadline'])

            # 专业描述
            overview = response.xpath(
                "//article[@id='why-this-course']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # 课程设置、评估方式
            modules = response.xpath(
                "//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//article[@id='course-content']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            if item['modules_en'] == "":
                print("modules_en 为空")
            # else:
            #     print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item["assessment_en"] = remove_class(
                clear_lianxu_space(assessment_en))
            if item['assessment_en'] == "":
                print("assessment_en 为空")
            # else:
            #     print("item['assessment_en'] = ", item['assessment_en'])

            # 学术要求、英语要求
            rntry_requirements = response.xpath(
                "//article[@id='entry-requirements']//text()").extract()
            item["rntry_requirements"] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            # ielts = response.xpath("//h3[contains(text(),'English language requirements')]/following-sibling::*[position()<4]//text()").extract()
            # print("ielts: ", ielts)
            ielts_re = re.findall(r"IELTS.{1,80}", ''.join(rntry_requirements))
            # print("ielts_re = ", ielts_re)
            item["ielts_desc"] = ''.join(ielts_re)
            print("item['ielts_desc'] = ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            # print(ieltlsrw)
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            if item['ielts'] != None:
                item['ielts'] = item['ielts'].strip('.').strip()
            if item['ielts_l'] != None:
                item['ielts_l'] = item['ielts_l'].strip('.').strip()
            if item['ielts_s'] != None:
                item['ielts_s'] = item['ielts_s'].strip('.').strip()
            if item['ielts_r'] != None:
                item['ielts_r'] = item['ielts_r'].strip('.').strip()
            if item['ielts_w'] != None:
                item['ielts_w'] = item['ielts_w'].strip('.').strip()
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            # 学费    //article[@id='fees-and-funding']/ul[3]/li
            tuition_fee = response.xpath(
                "//html//article[@id='fees-and-funding']/*[contains(text(),'International')]/following-sibling::*[1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£[\d,]+", ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = ''.join(tuition_fee_re[0]).replace(
                    "£", "").replace(",", "")
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])

            # 就业    //article[@id='careers']
            career = response.xpath("//article[@id='careers']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en'] = ", item['career_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Postgraduate</h3>
<div>GPA from a four-year undergraduate degree must be:</div>
<div>
<ul>
<li>over an average of 70% for 211/985 universities</li>
<li>over an average of 75% for the rest of Chinese universities</li>
</ul>
</div>
<div>Students interested in PhD must usually have a Masters and must include a proposal in their application.</div>
<div>For further information on entry requirements, you can contact our representative Lexy Docwra (<a href="mailto:[email protected]">[email protected]</a>).</div>"""
                ]))
            print("item['require_chinese_en'] = ", item['require_chinese_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Postgraduate application process</h2>
	<ul>
<li>choose the course you want to apply for &ndash; <a href="http://www.strath.ac.uk/courses/?level_ug=false&amp;level_pgt=true&amp;level_pgr=false">search our postgraduate taught courses</a></li>
<li>check the entry requirements for the course on the course page or in the prospectus</li>
<li>start your application online by clicking on the Apply button on the course page</li>
<li>submit your application along with all supporting documentation &ndash; see our document checklist below. Your application may be delayed if you fail to provide all the required documents</li>
<li>to help you fill in the application form please read our <a href="/media/ps/registry/Applicant_Guide_to_Postgraduate_Taught_Admissions.pdf.pagespeed.ce.p3pCAoLRJ3.pdf" title="" rel="external">Applicant Guide to Postgraduate Taught Admissions</a></li>
<li>once you&rsquo;ve submitted your personal information, you&rsquo;ll receive an email which contains your username and password. Please keep these in a safe place as you&rsquo;ll need them to progress with your application and view any decisions</li>
<li>your application will be considered by the relevant selection team. If they need any further information you&rsquo;ll be contacted</li>
<li>a decision will be made on your application &ndash; we try to make a decision on your application as quickly as possible. In most cases this will be within a minimum of 10 working days (two weeks)</li>
<li>you&rsquo;ll receive an email telling you that a decision has been made on your application. You&rsquo;ll be asked to log in to our online application system (PEGASUS) to view the outcome of your application</li>
</ul>"""
                ]))
            print("item['apply_proces_en'] = ", item['apply_proces_en'])

            item['apply_documents_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Document checklist</h2>
<p>Your application may be delayed if you fail to provide the following documents (where appropriate):</p>
<ul>
<li>certified copies of qualifications you&rsquo;ve gained, eg degree certificate and transcripts (showing the subjects taken and your grades). If you&rsquo;re still studying, provide a transcript of your results so far</li>
<li>if your qualifications are in a language other than English, please provide official translations in addition to the copies of the original documents</li>
<li>if English is not your first language, please provide a suitable English language test certificate (if appropriate), for example IELTS</li>
<li>a copy of your passport (if you are a non EU overseas applicant). Your passport is required in order to obtain your Certificate of Acceptance for Studies (CAS) statement which allows you to apply for your Tier 4 visa to study</li>
<li>a copy of your sponsor letter/scholarship award (if appropriate/available)&nbsp;</li>
<li>copies of any other documentation to support your application such as a CV, Personal Statement, Portfolio (for certain programmes)</li>
</ul>"""
                ]))
            print("item['apply_documents_en'] = ", item['apply_documents_en'])

            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #16
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Bradford"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        if "pg" in response.url:
            print("===========================")
            print(response.url)
            try:
                key_url = response.url.split("/")[-2].strip()

                programme = response.xpath(
                    "//div[@id='course-key-info']//div[@class='col-xs-12']/h1//text()"
                ).extract()
                item['programme_en'] = ''.join(programme).strip()
                print("item['programme_en']: ", item['programme_en'])

                degree_type = response.xpath(
                    "//p[@id='cAward']//text()").extract()
                item['degree_name'] = ''.join(degree_type).strip()
                print("item['degree_name']: ", item['degree_name'])

                if "phd" in item['programme_en'].lower(
                ) or item['degree_name'].lower() == "doctorate":
                    item['teach_type'] = 'phd'
                    item['degree_type'] = 3
                print("item['teach_type']: ", item['teach_type'])
                print("item['degree_type']: ", item['degree_type'])

                mode = response.xpath(
                    "//option[@value='fulltime']//text()|//span[@id='cAttendance']//text()"
                ).extract()
                clear_space(mode)
                item['teach_time'] = getTeachTime(''.join(mode))
                print("item['teach_time']: ", item['teach_time'])

                start_date_url = "https://www.bradford.ac.uk/courses/pg/pgapi.php?uri=/courses/pg/" + key_url + "/&startMonth=startMonth&level=pg&year=y2018&attendance=fulltime"
                print("start_date_url: ", start_date_url)
                start_date = json.loads(
                    requests.get(start_date_url).text).get("data")
                print("start_date: ", start_date)
                if start_date != None:
                    if "," in start_date:
                        start_date_list = start_date.split(",")
                        for s in start_date_list:
                            item['start_date'] += getStartDate(s.lower()) + ","
                    else:
                        item['start_date'] = getStartDate(
                            ''.join(start_date).lower())
                item['start_date'] = item['start_date'].strip().strip(
                    ",").strip()
                print("item['start_date']: ", item['start_date'])
                # start_date_year = response.xpath(
                #     "//div[@class='col-xs-5']//span[@id='displayYear']//text()").extract()
                # if len(start_date_year) != 0 and item['start_date'] != "":
                #     item['start_date'] = ''.join(start_date_year).strip() + "-" + item['start_date']
                # else:
                #     item['start_date'] = ''.join(start_date_year).strip()
                # print("item['start_date']: ", item['start_date'])

                item['location'] = 'Bradford West Yorkshire BD7 1DP UK'
                # print("item['location']: ", item['location'])

                duration_url = "https://www.bradford.ac.uk/courses/pg/pgapi.php?uri=/courses/pg/" + key_url + "/&duration=duration&level=pg&year=y2018&attendance=fulltime"
                # print("duration_url: ", duration_url)
                duration = json.loads(
                    requests.get(duration_url).text).get("data")
                # print("duration: ", duration)
                if duration != None:
                    duration_list = getIntDuration(''.join(duration))
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                overview_en = response.xpath(
                    "//div[@id='overviewStripe']").extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en))
                # print("item['overview_en']: ", item['overview_en'])

                entry_requirements = response.xpath(
                    "//div[@id='course-entry']//text()|//div[@id='nav-course-entry']//text()"
                ).extract()
                entry_requirements_str = ''.join(entry_requirements).strip()
                item['rntry_requirements'] = clear_lianxu_space(
                    entry_requirements)
                # print("item['rntry_requirements']: ", item['rntry_requirements'])

                ielts_desc = response.xpath(
                    "//div[@id='course-entry']//*[contains(text(),'IELTS')]//text()|"
                    "//div[@id='nav-course-entry']//*[contains(text(),'IELTS')]//text()"
                ).extract()

                # print("ielts_desc: ", ielts_desc)
                item['ielts_desc'] = ''.join(ielts_desc).strip()
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                # item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                if item['ielts'] is None:
                    ielts_desc = re.findall(r"IELTS.{1,100}",
                                            entry_requirements_str)
                    clear_space(ielts_desc)
                    item['ielts_desc'] = ''.join(ielts_desc).strip()
                print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                print(
                    "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                    % (item['ielts'], item['ielts_l'], item['ielts_s'],
                       item['ielts_r'], item['ielts_w']))

                toefl_desc = re.findall(r"TOEFL.{1,250}",
                                        entry_requirements_str)
                clear_space(toefl_desc)
                item['toefl_desc'] = ''.join(toefl_desc).strip()
                # print("item['toefl_desc']: ", item['toefl_desc'])

                toefl_list = re.findall(r"\d\d+", item['toefl_desc'])
                # print(toefl_list)
                if len(toefl_list) == 1:
                    item['toefl'] = toefl_list[0]
                    # item['toefl_l'] = toefl_list[0]
                    # item['toefl_s'] = toefl_list[0]
                    # item['toefl_r'] = toefl_list[0]
                    # item['toefl_w'] = toefl_list[0]
                elif len(toefl_list) == 2:
                    item['toefl'] = toefl_list[0]
                    item['toefl_l'] = toefl_list[1]
                    item['toefl_s'] = toefl_list[1]
                    item['toefl_r'] = toefl_list[1]
                    item['toefl_w'] = toefl_list[1]
                elif len(toefl_list) == 5:
                    item['toefl'] = toefl_list[0]
                    item['toefl_l'] = toefl_list[1]
                    item['toefl_s'] = toefl_list[3]
                    item['toefl_r'] = toefl_list[2]
                    item['toefl_w'] = toefl_list[4]
                # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                #                             item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

                modules = response.xpath(
                    "//div[@id='course-curriculum']").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))
                # print("item['modules_en']: ", item['modules_en'])

                assessment_en = response.xpath(
                    "//div[@class='row stripe background--green']").extract()
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(assessment_en))
                # print("item['assessment_en']: ", item['assessment_en'])

                tuition_fee = response.xpath(
                    "//div[@id='tuitionFees']//p[contains(text(),'International:')]//text()"
                ).extract()
                if len(tuition_fee) == 0:
                    tuition_fee = response.xpath(
                        "//div[@id='tuitionFees']//text()").extract()
                clear_space(tuition_fee)
                # print("tuition_fee: ", tuition_fee)
                tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))

                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                    item['tuition_fee_pre'] = "£"
                else:
                    print("***")
                print("item['tuition_fee']: ", item['tuition_fee'])
                print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

                career_en = response.xpath(
                    "//div[@id='nav-course-career']").extract()
                item['career_en'] = remove_class(
                    clear_lianxu_space(career_en)).replace("<div></div>",
                                                           "").strip()
                # print("item['career_en']: ", item['career_en'])

                # apply_url_key = response.url.split("/")
                # print(apply_url_key)
                apply_url = "https://www.bradford.ac.uk/courses/pg/pgapi.php?uri=/courses/pg/" + key_url + "/&applyCTAModal=applyCTAModal&level=pg&year=y2018&attendance=fulltime"
                # print("apply_url: ", apply_url)
                apply = json.loads(requests.get(apply_url).text).get("data")
                if apply != None:
                    item['apply_proces_en'] = remove_class(
                        clear_lianxu_space([apply]))
                # print("item['apply_proces_en']: ", item['apply_proces_en'])

                item['require_chinese_en'] = remove_class(
                    clear_lianxu_space([
                        """<div class="entryReq __postgraduate"><h3>Postgraduate</h3><p>The entry requirement for a postgraduate taught course is typically equivalent to a UK Second Class Honours Second Division (2:2). For individual course requirements, please see the course details in the <a href="/courses/pg/">postgraduate course listings</a>.</p>
<p>The table below shows how the University equates qualifications from your country to UK degree classifications:</p>
<table>
<tbody>
<tr><th>Qualification&nbsp;</th><th>UK 1st Class&nbsp;</th><th>UK 2:1&nbsp;</th><th>UK 2:2&nbsp;</th></tr>
<tr>
<td>Bachelor Degree 学士学位</td>
<td>85%</td>
<td>80%</td>
<td>70%</td>
</tr>
</tbody>
</table></div>
"""
                    ]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                department_dict = {
                    "Advanced Biomedical Engineering":
                    "Engineering & Informatics",
                    "Advanced Chemical and Petroleum Engineering":
                    "Engineering & Informatics",
                    "Advanced Civil and Structural Engineering":
                    "Engineering & Informatics",
                    "Advanced Mechanical Engineering":
                    "Engineering & Informatics",
                    "Big Data Science and Technology":
                    "Engineering & Informatics",
                    "Cyber Security": "Engineering & Informatics",
                    "Filmmaking": "Engineering & Informatics",
                    "Internet of Things (IoT)": "Engineering & Informatics",
                    "Nursing Studies (International)": "Health Studies",
                    "PhD (Faculty of Health Studies)": "Health Studies",
                    "Public Health": "Health Studies",
                    "Analytical Sciences": "Life Sciences",
                    "Analytical Sciences": "Life Sciences",
                    "Archaeological Sciences": "Life Sciences",
                    "Archaeological Sciences": "Life Sciences",
                    "Bioinformatics and Computational Biosciences":
                    "Life Sciences",
                    "Cancer Drug Discovery": "Life Sciences",
                    "Cancer Pharmacology": "Life Sciences",
                    "Doctorate in Medicine": "Life Sciences",
                    "Drug Toxicology and Safety Pharmacology": "Life Sciences",
                    "Forensic Archaeology and Crime Scene Investigation":
                    "Life Sciences",
                    "Forensic Archaeology and Crime Scene Investigation":
                    "Life Sciences",
                    "Human Osteology and Palaeopathology": "Life Sciences",
                    "Human Osteology and Palaeopathology": "Life Sciences",
                    "Materials Chemistry": "Life Sciences",
                    "Medical Bioscience": "Life Sciences",
                    "Optometry Progression to Pre-registration Period":
                    "Life Sciences",
                    "Pharmaceutical Technology and Medicines Control":
                    "Life Sciences",
                    "PhD (School of Pharmacy and Medical Sciences)":
                    "Life Sciences",
                    "Skin Sciences and Regenerative Medicine": "Life Sciences",
                    "Applied Management and Entrepreneurship":
                    "Management & Law",
                    "European and International Business Management":
                    "Management & Law",
                    "Finance and Investment": "Management & Law",
                    "Finance, Accounting and Management": "Management & Law",
                    "Financial Management": "Management & Law",
                    "MSc Human Resource Management (CIPD Accreditation)":
                    "Management & Law",
                    "International Business and Management":
                    "Management & Law",
                    "International Commercial Law": "Management & Law",
                    "International Human Rights and Development":
                    "Management & Law",
                    "International Legal Studies": "Management & Law",
                    "International Strategic Marketing": "Management & Law",
                    "Logistics, Data Analytics and Supply Chain Management":
                    "Management & Law",
                    "Management": "Management & Law",
                    "Marketing and Management": "Management & Law",
                    "Natural Resources and Environmental Law and Policy":
                    "Management & Law",
                    "PhD (School of Law)": "Management & Law",
                    "PhD (School of Management)": "Management & Law",
                    "Advanced Practice in Peacebuilding and Conflict Resolution":
                    "Social Sciences",
                    "Economics and Finance for Development": "Social Sciences",
                    "International Development Management": "Social Sciences",
                    "International Relations and Security Studies":
                    "Social Sciences",
                    "Peace, Conflict and Development": "Social Sciences",
                    "Peace, Resilience and Social Justice": "Social Sciences",
                    "PhD (Faculty of Social Sciences)": "Social Sciences",
                    "Project Planning and Management": "Social Sciences",
                    "Psychology": "Social Sciences",
                    "Psychology of Health and Wellbeing": "Social Sciences",
                    "Social Work": "Social Sciences",
                    "Sociology, Social Policy and Crime": "Social Sciences",
                    "Sustainable Development": "Social Sciences",
                }
                item['department'] = department_dict.get(
                    item['programme_en'].strip())
                # print("item['department']: ", item['department'])
                if item['teach_time'] == "fulltime":
                    yield item
            except Exception as e:
                with open(item['university'] + str(item['degree_type']) +
                          ".txt",
                          'a',
                          encoding="utf-8") as f:
                    f.write(
                        str(e) + "\n" + response.url +
                        "\n========================\n")
                print("异常:", str(e))
                print("报错url:", response.url)
コード例 #17
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Surrey"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===============================")
        print(response.url)
        try:

            # 专业、学位类型
            programme_en = response.xpath(
                "//h1[@class='text-center my-0']//text()").extract()
            programme_en_list = ''.join(programme_en).split("\n")
            # print(programme_en_list)
            if len(programme_en_list) > 1:
                item['programme_en'] = programme_en_list[0].strip()
                item['degree_name'] = ''.join(programme_en_list[1:]).strip()
            print("item['programme_en'] = ", item['programme_en'])
            print("item['degree_name'] = ", item['degree_name'])

            overview = response.xpath(
                "//h3[@class='px-3 pt-1 text-white'][contains(text(),'Course facts')]/../preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            teach_time = response.xpath(
                "//td[@headers='view-field-study-mode-table-column'][contains(text(),'Full-time')]//text()"
            ).extract()
            item['teach_time'] = getTeachTime(''.join(teach_time))
            # print("item['teach_time'] = ", item['teach_time'])

            duration = response.xpath(
                "//td[@headers='view-field-study-mode-table-column'][contains(text(),'Full-time')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(duration)
            # print(duration)
            if len(duration) != 0:
                duration_list = getIntDuration(''.join(duration))
                # print("duration_list: ", duration_list)
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            start_date = response.xpath(
                "//td[@headers='view-field-study-mode-table-column'][contains(text(),'Full-time')]/following-sibling::*[last()]//text()"
            ).extract()
            # print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date'] = ", item['start_date'])

            item[
                'location'] = '01SE01, Senate House, University of Surrey, Guildford, Surrey GU2 7XH'
            # print("item['location'] = ", item['location'])

            career = response.xpath(
                "//h2[contains(text(),'Professional development')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]|"
                "//h2[contains(text(),'Professional recognition')]|//h2[contains(text(),'Professional recognition')]/following-sibling::*[position()<3]|"
                "//h2[contains(text(),'Careers')]|//h2[contains(text(),'Careers')]/following-sibling::*[position()<3]|"
                "//h2[contains(text(),'Industrial placement')]|//h2[contains(text(),'Industrial placement')]/following-sibling::*[position()<4]"
            ).extract()
            if len(career) == 0:
                career = response.xpath(
                    "//h2[contains(text(),'Career prospects')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]"
                ).extract()
                if len(career) == 0:
                    career = response.xpath(
                        "//h2[contains(text(),'Graduate prospects')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]"
                    ).extract()
            # print(career)
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en'] = ", item['career_en'])

            modules = response.xpath(
                "//div[@class='module-list']/preceding-sibling::*").extract()
            modules1 = response.xpath("//div[@id='modules-ft']").extract()
            item['modules_en'] = remove_class(
                clear_lianxu_space(modules)) + remove_class(
                    clear_lianxu_space(modules1))
            if item['modules_en'] == "":
                item['modules_en'] = remove_class(
                    clear_lianxu_space(
                        response.xpath(
                            "//h2[contains(text(),'Modules')]/following-sibling::p[position()<3]"
                        ).extract()))
            # print("item['modules_en'] = ", item['modules_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry-collapse']/*//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            ielts_str = response.xpath(
                "//h2[contains(text(),'English language requirements')]/following-sibling::p[position()<4]//text()"
            ).extract()
            ielts_re = re.findall(r"^IELTS.{1,80}", ''.join(ielts_str))
            # print(ielts_re)
            item['ielts_desc'] = ''.join(ielts_re)

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            tuition_fee = response.xpath(
                "//div[@id='fees-collapse']//td[@headers='view-field-study-mode-table-column--2'][contains(text(),'Full-time')]/following-sibling::*[last()]//text()"
            ).extract()
            # print(tuition_fee)
            if len(tuition_fee) > 0 and "£" in "".join(tuition_fee):
                item['tuition_fee'] = int(''.join(tuition_fee[0]).replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            how_to_apply_url = response.xpath(
                "//span[@class='studymode'][contains(text(), 'Full-time')]/following-sibling::span[@class='applink']/a/@href"
            ).extract()
            # print(how_to_apply_url)
            if len(how_to_apply_url) > 0:
                how_to_apply_url = ''.join(how_to_apply_url[0])
                # print(how_to_apply_url)
                item['apply_proces_en'] = self.parse_apply_proces_en(
                    how_to_apply_url)
            print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # https://www.surrey.ac.uk/china/entry-requirements
            item['require_chinese_en'] = """<h2>Postgraduate</h2>
<p>To apply for one of our postgraduate courses that require a UK 2:1, you must achieve between 75-85% overall.</p>
<p>For courses that require a UK 2:2, you must achieve between 70-80% overall.</p>
<p>For courses that require a UK first-class degree to be eligible for a scholarship, you must achieve between 80-90% overall.</p>
"""

            department_dict = {}
            department1_list = [
                "Criminology",
                "Criminology and Sociology",
                "Law with Criminology",
                "Media, Culture and Society",
                "Media Studies with Film Studies",
                "Politics and Sociology",
                "Sociology",
                "Criminology and Social Research",
                "Criminology and Social Research (Corporate Crime and Corporate Responsibility)",
                "Criminology and Social Research (Cybercrime and Cybersecurity)",
                "Social Research Methods",
                "Sociology",
                "Economics",
                "Business Economics",
                "Economics and Finance",
                "Economics and thetics",
                "Economics",
                "Business Economics and Finance",
                "Economics",
                "Economics and Finance",
                "International Economics, Finance and Development",
                "Economics (Four Year)",
                "Law",
                "Law with Criminology",
                "Law with International Relations",
                "International Commercial Law",
                "Law",
                "Accounting and Finance",
                "Business and Retail nagement",
                "Business nagement",
                "Business nagement (Entrepreneurship)",
                "Business nagement (HRM)",
                "Business nagement (rketing)",
                "International Business nagement",
                "Accounting and Finance",
                "Business Administration",
                "Business Analytics",
                "Corporate Finance",
                "Entrepreneurship",
                "Hun Resources nagement",
                "International Business nagement",
                "International Financial nagement",
                "International rketing nagement",
                "International Retail rketing in the Digital Environment",
                "Investment nagement",
                "nagement Education",
                "rketing nagement",
                "Occupational and Organizational Psychology",
                "Operations and Supply Chain in the Digital Era",
                "nagement and Business",
                "Creative Music Technology",
                "Digital Media Arts",
                "Film and Video Production Technology",
                "Music",
                "Music and Sound Recording (Tonmeister)",
                "Music (Composition)",
                "Music (Conducting)",
                "Music (Creative Practice)",
                "Music (Musicology)",
                "Music (Perfornce)",
                "Digital Media Arts",
                "Music",
                "Sound Recording",
                "English Literature with Politics",
                "International Relations",
                "Politics",
                "Politics and Economics",
                "Politics and Sociology",
                "Public Affairs",
                "International Relations",
                "Public Affairs",
                "International Event nagement",
                "International Hospitality and Tourism nagement",
                "International Hospitality nagement",
                "International Tourism nagement",
                "Air Transport nagement",
                "International Events nagement",
                "International Events nagement (Eurosters)",
                "Eurosters",
                "International Hospitality nagement (Eurosters)",
                "International Hotel nagement",
                "International Tourism nagement",
                "International Tourism nagement (Eurosters)",
                "Eurosters",
                "Strategic Hotel nagement",
                "Strategic Tourism nagement and rketing",
                "Hospitality and Tourism nagement",
                "English Literature",
                "English Literature and French",
                "English Literature and Gern",
                "English Literature and Spanish",
                "English Literature with Creative Writing",
                "English Literature with Film Studies",
                "English Literature with Politics",
                "English Literature with Sociology",
                "Creative Writing",
                "Creative Writing",
                "English Literature",
                "Creative Writing",
                "English Literature",
                "Business nagement and French",
                "Business nagement and Gern",
                "Business nagement and Spanish",
                "English Literature and French",
                "English Literature and Gern",
                "English Literature and Spanish",
                "Modern Languages (French and Gern)",
                "Modern Languages (French and Spanish)",
                "Modern Languages (Gern and Spanish)",
                "Communication and International rketing",
                "Intercultural Communication with International Business",
                "Interpreting",
                "Interpreting (Chinese Pathway)",
                "Teaching English to Speakers of Other Languages (TESOL)",
                "Translation",
                "Translation and Interpreting",
                "Translation and Interpreting Studies",
                "Film Studies",
                "Linguistics",
                "Literary and Cultural Studies",
                "Translation and Interpreting",
                "Acting",
                "Actor-Musician",
                "Dance",
                "Musical Theatre",
                "Theatre",
                "Theatre and Perfornce",
                "Theatre Production",
                "Acting",
                "Musical Theatre",
                "Stage and Production nagement",
                "Theatre",
                "Acting",
                "Musical Theatre",
                "Dance",
                "Theatre",
            ]
            department1_list = list(set(department1_list))
            department1_value = "Faculty of Arts and Social Sciences"
            for d in department1_list:
                department_dict[d.lower()] = department1_value

            department2_list = [
                "Practitioner Doctorate in Sustainability",
                "Environment and Sustainability",
                "Corporate Environmental Management",
                "Environmental Strategy",
                "Sustainable Development",
                "Chemistry",
                "Chemistry",
                "Chemistry",
                "Chemistry with Forensic Investigation",
                "Medicinal Chemistry",
                "Mathematics",
                "Mathematics with Statistics",
                "Mathematics with Music",
                "Financial Mathematics",
                "Mathematics and Physics",
                "Economics and Mathematics",
                "Mathematics",
                "Mathematics and Physics",
                "Physics",
                "Physics with Astronomy",
                "Physics with Nuclear Astrophysics",
                "Physics with Quantum Technologies",
                "Medical Physics",
                "Nuclear Science and Applications",
                "Physics",
                "Radiation and Environmental Protection",
                "Physics",
                "Information Systems",
                "Information Security",
                "Advanced Materials",
                "Biomedical Engineering",
            ]
            department2_list = list(set(department2_list))
            department2_value = "Faculty of Engineering and Physical Sciences"
            for d in department2_list:
                department_dict[d.lower()] = department2_value

            department3_list = [
                "Nutrition",
                "Nutrition and Dietetics",
                "Nutrition and Food Science",
                "Human Nutrition",
                "Nutritional Medicine",
                "International English Language Testing System (IELTS)",
                "Developmental Psychology in Research and Practice",
                "Health Psychology",
                "Psychology (Conversion)",
                "Primary and Community Care (SPA Community Children's Nursing)",
                "Primary and Community Care (SPA District Nursing)",
                "Primary and Community Care (SPA General Practice Nursing)",
                "Public Health Practice (SCPHN Health Visiting)",
                "Public Health Practice (SCPHN School Nursing)",
                "Advanced Clinical Practice",
                "Advanced Practitioner (Primary and Community Care)",
                "Advanced Practitioner (Public Health Practice)",
                "Education for Health Professionals",
                "Education for Professional Practice",
                "Healthcare Practice",
                "Leadership and Healthcare",
                "Physician Associate",
                "Primary and Community Care (SPA Community Children's Nursing)",
                "Primary and Community Care (SPA District Nursing)",
                "Primary and Community Care (SPA General Practice Nursing)",
                "Public Health Practice (SCPHN Health Visiting)",
                "Public Health Practice (SCPHN School Nursing)",
            ]
            department3_list = list(set(department3_list))
            department3_value = "Faculty of Health and Medical Sciences"
            for d in department3_list:
                department_dict[d.lower()] = department3_value

            item['department'] = department_dict.get(
                item['programme_en'].lower())
            print("item['department: ", item['department'])
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #18
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.port.ac.uk/"
        item['university'] = "University of Portsmouth"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item[
            'location'] = 'University House, Winston Churchill Avenue, Portsmouth PO1 2UP'
        print("===========================")
        print(response.url)
        try:
            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            programme = response.xpath(
                "//div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1//text()|"
                "//h1[@class='Title']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            degree_type = response.xpath(
                "//div[@class='course_title']/span//text()|"
                "//h1[@class='Title']/small//text()").extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name']: ", item['degree_name'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            department = response.xpath(
                "//div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1//text()|"
                "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department)
            # print("item['department']: ", item['department'])

            item['start_date'] = response.meta.get(response.url)
            print("item['start_date']1 = ", item['start_date'])
            if item['start_date'] is not None:
                if "," in item['start_date']:
                    start_date_re = item['start_date'].split(',')
                    start_date_str = ""
                    for s in start_date_re:
                        start_date_str += getStartDate(s) + ","
                    item['start_date'] = start_date_str.strip().strip(
                        ',').strip()
            print("item['start_date'] = ", item['start_date'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            duration = response.xpath(
                "//div[contains(text(),'Duration')]/following-sibling::*//text()|"
                "//dt[contains(text(), 'Duration')]/following-sibling::dd[1]//text()|"
                "//dt[contains(text(), 'duration')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_str = ''.join(duration)
            item['other'] = duration_str
            item['teach_time'] = getTeachTime(duration_str)

            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['teach_time'] = ", item['teach_time'])
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            location = response.xpath(
                "//div[contains(text(),'Location')]/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location)
            # print("item['location']: ", item['location'])

            # //strong[contains(text(),'International students')]/../following-sibling::p[1]
            tuition_fee = response.xpath(
                " //strong[contains(text(),'International students')]/../following-sibling::p//text()"
            ).extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(
                r"Full\stime:\s£\d+,\d+|Full\stime\s£\d+,\d+",
                ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = int(tuition_fee_re[0].replace(
                    "Full time",
                    "").replace(":", "").replace(",", "").replace("£",
                                                                  "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            if item['tuition_fee'] == None:
                # //strong[contains(text(),'International students')]/../following-sibling::p[1]
                tuition_fee = response.xpath(
                    "//h3[contains(text(),'Tuition fees')]/..//*[contains(text(),'International students')]/following-sibling::*//*[contains(text(),'Full')]//text()|"
                    "//h3[contains(text(),'Tuition fees')]/..//*[contains(text(),'International students')]/../following-sibling::*//*[contains(text(),'Full')]//text()|"
                    "//h3[contains(text(),'Tuition fees')]/..//h4[contains(text(),'Full-time')]/following-sibling::*[position()<3]//*[contains(text(),'International students')]/../text()"
                ).extract()
                clear_space(tuition_fee)
                # print("tuition_fee: ", tuition_fee)
                tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
                # print("tuition_fee_re: ", tuition_fee_re)
                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = int(tuition_fee_re[0].replace(
                        ",", "").replace("£", "").strip())
                # print("item['tuition_fee']: ", item['tuition_fee'])

            rntry_requirements_content = response.xpath(
                "//h3[contains(text(),'Key Facts')]/..//text()").extract()
            clear_space(rntry_requirements_content)
            # print("rntry_requirements_content: ", rntry_requirements_content)
            if "2018 ENTRY REQUIREMENTS" in rntry_requirements_content:
                rntry_requirements_index = rntry_requirements_content.index(
                    "2018 ENTRY REQUIREMENTS")
                if "Fees" in rntry_requirements_content:
                    rntry_requirements_indexEnd = rntry_requirements_content.index(
                        "Fees")
                    item['rntry_requirements'] = clear_lianxu_space(
                        rntry_requirements_content[rntry_requirements_index:
                                                   rntry_requirements_indexEnd]
                    )
            if "2018 entry requirements" in rntry_requirements_content:
                rntry_requirements_index = rntry_requirements_content.index(
                    "2018 entry requirements")
                if "Fees" in rntry_requirements_content:
                    rntry_requirements_indexEnd = rntry_requirements_content.index(
                        "Fees")
                    item['rntry_requirements'] = clear_lianxu_space(
                        rntry_requirements_content[rntry_requirements_index:
                                                   rntry_requirements_indexEnd]
                    )
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            if item['rntry_requirements'] == "":
                rntry_requirements_content = response.xpath(
                    "//div[contains(text(),'Entry Requirements')]/../../..//div[contains(text(),'2018 start')]/../../../..//text()|"
                    "//div[contains(text(),'Entry requirements')]/../../..//div[contains(text(),'2018 start')]/../../../..//text()"
                ).extract()
                item['rntry_requirements'] = clear_lianxu_space(
                    rntry_requirements_content)
            print("item['rntry_requirements']: ", item['rntry_requirements'])

            ieltsList = re.findall(r".{1,45}IELTS.{1,45}",
                                   item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            overview = response.xpath(
                """//h2[@id='overview']/..|//h3[contains(text(),'What you’ll experience')]/..|//*[contains(text(),"What you'll experience")]/..|
                                    //h4[contains(text(),"On this course, you'll:")]/../..|//h3[contains(text(),"What you'll experience")]/../preceding-sibling::*[2]|
                                    //h3[contains(text(),'Why take this course?')]/../*[not(@class='blockquote-img')]"""
            ).extract()
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview)
            ).replace(
                "<div><div>Get a prospectus</div><div>Book an Open Evening</div><div>Apply Now</div></div>",
                "").strip()
            print("item['overview_en']: ", item['overview_en'])

            modules = response.xpath(
                "//h2[@id='What youll study']/..|//h2[@id='What youll study']/../following-sibling::div[1]|//div[contains(text(),'Units currently being studied')]/../../..|"
                "//h3[@id='structure']/../../following-sibling::div[1]"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            teaching_assessment = response.xpath(
                "//h2[@id='Teaching']/..|//h2[@id='Teaching']/../following-sibling::*[1]|"
                "//h2[@id='How youre assessed']/..|//h2[@id='How youre assessed']/../following-sibling::*[1]|"
                "//div[@class='pure-g purple content']/div[1]/div[@class='box']"
            ).extract()
            if len(teaching_assessment) == 0:
                teaching_assessment = response.xpath(
                    "//h3[contains(text(), 'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<3]"
                ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en']: ", item['assessment_en'])

            career = response.xpath(
                "//h3[contains(text(),'Careers and opportunities')]/..|"
                "//div[@class='box container content pure-g']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Postgraduate courses</h3>
<p>For entry to our postgraduate Master's programmes, you'll usually need to have one of the following from a recognised Higher Education institution:</p>
<ul>
    <li>a Bachelor's degree (normally from a four year undergraduate programme)</li>
    <li>a Bachelor's degree from Higher Education Self-Study Examinations (full time)</li>
    <li>a top-up degree or university-recognised Pre-Master&rsquo;s Foundation programme</li>
</ul>
<p>Typical minimum Grade Point Average (GPA) requirements:</p>
<ul>
    <li>From 2.8 on a scale of 1-4</li>
    <li>From 7 on a scale of 1-10</li>
</ul>
<p>If you don't meet the postgraduate entry requirements, you can do a pre-Master's programme at<a rel="noopener noreferrer" href="http://www.icp.navitas.com/"></a><a rel="noopener noreferrer" href="https://www.icp.navitas.com/" target="_blank">International College Portsmouth (ICP)</a>&nbsp;for many of our courses.</p>"""
                ]))
            item[
                'apply_proces_en'] = "https://www.port.ac.uk/study/international-students/how-to-apply"
            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #19
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.manchester.ac.uk/"
        item['university'] = "The University of Manchester"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'research'
        # 学位类型
        item['degree_type'] = 3
        item['location'] = "Oxford Rd, Manchester, M13 9PL, UK"
        print("===============================")
        print(response.url)
        try:
            # print(response.url)
            # 专业、学位类型
            programmeDegree = response.xpath(
                "//div[@id='course-profile']/div[@class='heading']/h1//text()"
            ).extract()
            clear_space(programmeDegree)
            programmeDegreeStr = ''.join(programmeDegree)
            # print(programmeDegreeStr)
            degree_type = list(
                re.findall(r"^(\w{0,6})|(\w{0,6}/\w{0,6})\s",
                           programmeDegreeStr)[0])
            # print("degree_type = ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            programme = programmeDegreeStr.split(''.join(degree_type))
            item['programme_en'] = programme[-1].strip()
            # print("item['degree_name'] = ", item['degree_name'])
            # print("item['programme_en'] = ", item['programme_en'])

            if item['degree_name'] == "PhD":
                item['teach_type'] = 'phd'

            start_date = response.xpath(
                "//*[contains(text(), 'Year of entry:')]//text()").extract()
            item['start_date'] = ''.join(start_date).replace(
                "Year of entry:", "").strip()
            # print("item['start_date'] = ", item['start_date'])

            duration = response.xpath(
                "//div[@id='course-profile']/div[@class='course-profile-content full-page']/div[@class='fact-file']/dl/dd[2]//text()"
            ).extract()
            durationStr = ''.join(duration)
            # print("durationStr = ", durationStr)
            if "full" in durationStr or "Full" in durationStr or "FT" in durationStr or "ft" in durationStr:
                item['teach_time'] = "fulltime"
            duration_re = re.findall(
                r"([a-zA-Z0-9\.]+\s)(year|month|week|yr|yft){1}|([0-9\.]+)(yr|yft|\-month){1}",
                durationStr, re.I)
            # print("duration_re = ", duration_re)
            d_dict = {
                "One": "1",
                "Two": "2",
                "Three": "3",
                "Four": "4",
                "Five": "5",
                "Six": "6",
                "Seven": "7",
                "Eight": "8",
                "Nine": "9",
                "Ten": "10",
                "one": "1",
                "two": "2",
                "three": "3",
                "four": "4",
                "five": "5",
                "six": "6",
                "seven": "7",
                "eight": "8",
                "nine": "9",
                "ten": "10",
            }
            if len(duration_re) > 0:
                d_int = re.findall(r"\d+", ''.join(duration_re[0]))
                if len(d_int) > 0:
                    item['duration'] = int(''.join(d_int))
                else:
                    d = re.findall(
                        r"(One)|(Two)|(Three)|(Four)|(Five)|(Six)|(Seven)|(Eight)|(Nine)|(Ten)|(one)|(two)|(three)|(four)|(five)|(six)|(seven)|(eight)|(nine)|(ten)",
                        ', '.join(duration_re[0]))
                    print("d = ", d)
                    item['duration'] = int(d_dict.get(''.join(d[0]).strip()))
                if "y" in ''.join(duration_re[0]) or "Y" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 1
                elif "m" in ''.join(duration_re[0]) or "M" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 3
                elif "w" in ''.join(duration_re[0]) or "W" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 4
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            department = response.xpath(
                "//*[contains(text(), 'Academic department')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(department)
            # print(department)
            if len(department) > 0:
                item['department'] = department[0]
            # print("item['department'] = ", item['department'])

            # 专业描述,雅思托福,就业方向, 学术要求,How To Apply
            overview = response.xpath(
                '//h3[@id="programme-description"]/following-sibling::div[1]|//h3[@id="course-description"]/following-sibling::div[1]'
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # Entry requirements
            rntry_requirements = response.xpath(
                '//h2[@id="entry-requirements"]/following-sibling::*[position()<9]//text()'
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            # english = clear_lianxu_space(response.xpath("//h3[contains(text(), 'English language')]/following-sibling::div[1]//text()").extract())
            # print(english)
            # ielts_desc = re.findall(r".{1,100}IELTS.{1,100}", english.replace("IELTS", " IELTS").replace("TOEFL", " TOEFL"))
            # toefl_desc = re.findall(r".{1,100}TOEFL.{1,100}", english.replace("IELTS", " IELTS").replace("TOEFL", " TOEFL"))
            # item['ielts_desc'] = ''.join(ielts_desc).strip()
            # item['toefl_desc'] = ''.join(toefl_desc).strip()
            # print("ielts_Desc: ", ielts_desc)
            # print("toefl_desc: ", toefl_desc)

            ielts_desc = response.xpath(
                "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'IELTS')]//text()"
            ).extract()
            clear_space(ielts_desc)
            if ''.join(ielts_desc).strip() == "IELTS":
                ielts_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'IELTS')]/..//text()"
                ).extract()
            toefl_desc = response.xpath(
                "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'TOEFL')]//text()"
            ).extract()
            clear_space(toefl_desc)
            if ''.join(toefl_desc).strip() == "IBT TOEFL:":
                toefl_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'TOEFL')]/..//text()"
                ).extract()
            item['ielts_desc'] = clear_lianxu_space(ielts_desc)
            item['toefl_desc'] = clear_lianxu_space(toefl_desc)
            print("item['ielts_desc']: ", item['ielts_desc'])
            print("item['toefl_desc']: ", item['toefl_desc'])

            ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            if len(ielts_list) == 1:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[0]
                item['ielts_s'] = ielts_list[0]
                item['ielts_r'] = ielts_list[0]
                item['ielts_w'] = ielts_list[0]
            elif len(ielts_list) == 2:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[1]
                item['ielts_s'] = ielts_list[1]
                item['ielts_r'] = ielts_list[1]
                item['ielts_w'] = ielts_list[1]
            elif len(ielts_list) == 3:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[2]
                item['ielts_s'] = ielts_list[2]
                item['ielts_r'] = ielts_list[2]
                item['ielts_w'] = ielts_list[1]
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            toefl_list = re.findall(r"1[0-1]\d|[12789]\d", item['toefl_desc'])
            print(toefl_list)
            if len(toefl_list) == 1:
                item['toefl'] = toefl_list[0]
                # item['toefl_l'] = toefl_list[0]
                # item['toefl_r'] = toefl_list[0]
                # item['toefl_s'] = toefl_list[0]
                # item['toefl_w'] = toefl_list[0]
            elif len(toefl_list) == 2:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[1]
                item['toefl_s'] = toefl_list[1]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 3:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[2]
                item['toefl_r'] = toefl_list[2]
                item['toefl_s'] = toefl_list[2]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 4:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[3]
                item['toefl_r'] = toefl_list[1]
                item['toefl_s'] = toefl_list[2]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 5:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[3]
                item['toefl_s'] = toefl_list[4]
                item['toefl_w'] = toefl_list[2]
            print(
                "item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s "
                % (item['toefl'], item['toefl_l'], item['toefl_s'],
                   item['toefl_r'], item['toefl_w']))

            apply_proces_en = response.xpath(
                '//h2[@id="application-and-selection"]/following-sibling::*[position()<15]'
            ).extract()
            apply_proces_en_str = remove_class(
                clear_lianxu_space(apply_proces_en))
            # print(apply_proces_en_str.index("<h2>Course details</h2>"))
            if apply_proces_en_str.find("<h2>Course details</h2>") == -1:
                apply_proces_en_s1 = apply_proces_en_str[
                    0:len(apply_proces_en_str)]
            else:
                apply_proces_en_s1 = apply_proces_en_str[:apply_proces_en_str.find(
                    "<h2>Course details</h2>") - 1]
            item['apply_proces_en'] = apply_proces_en_s1
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            interview_desc_en = response.xpath(
                '//h3[contains(text(), "Interview requirements")]/following-sibling::div[1]'
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en'] = ", item['interview_desc_en'])

            modules_en = response.xpath(
                "//h3[@id='programme-content-year-1']/following-sibling::div[1]"
            ).extract()
            if len(modules_en) == 0:
                modules_en = response.xpath(
                    "//*[contains(text(), 'Course unit list')]/following-sibling::*[position()<3]"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                '//*[@id="teaching-and-learning"]/following-sibling::*[position()<4]'
            ).extract()
            if len(assessment_en) == 0:
                assessment_en = response.xpath(
                    '//*[@id="coursework-and-assessment"]/following-sibling::*[position()<4]'
                ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en'] = ", item['assessment_en'])

            career_en = response.xpath(
                '//*[@id="careers"]/following-sibling::*').extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en'] = ", item['career_en'])

            fee1 = response.xpath(
                "//div[@id='course-profile']/div[@class='course-profile-content full-page']/ul[1]/li[1]//text()"
            ).extract()
            # print(fee1)
            fee = clear_lianxu_space(fee1)
            fee_re = re.findall(
                r"International\sstudents\s\(per\sannum\):[\sa-zA-Z\-]+£[\d,]+",
                fee)
            fee_re1 = re.findall(r"£[\d,]+", ''.join(fee_re))
            # print("fee_re1: ", fee_re1)
            f = ''.join(fee_re1).replace("£", "").replace(",", "").strip()
            if len(f) != 0:
                item['tuition_fee'] = int(f)
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])

            item[
                'require_chinese_en'] = """<h2>Postgraduate research entry requirements</h2>
<p>For entry onto our postgraduate research degrees we require an overall mark of 85% or higher in Bachelor's and Master's of Law or related degree from a well-ranked institution with a strong Very Good mark in Master's dissertation.</p>
<p>For PhD study a degree in Law or a relevant discipline will be considered.</p>"""
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #20
0
    def parse_data(self, response):
        """Parse one University of Strathclyde research course page into an item.

        Builds a fresh item via ``get_item1(ScrapyschoolEnglandItem1)``, fills
        it from XPath queries against ``response`` (degree name, programme
        title, department, study mode and duration, start date, application
        deadline, overview, modules, assessment, entry requirements,
        application process, IELTS text and scores, tuition fee, careers) and
        yields it.

        Any exception raised during extraction is caught: its message and the
        page URL are appended to ``<university><degree_type>.txt`` in the
        current working directory, both are printed, and no item is yielded.

        Args:
            response: page response object; only ``response.url`` and
                ``response.xpath(...)`` are used here.

        Yields:
            The populated item, once, when extraction finishes without error.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item["university"] = "University of Strathclyde"
        item['url'] = response.url
        # Teaching mode.
        item['teach_type'] = 'phd'
        # Degree type code written for every page handled by this callback.
        item['degree_type'] = 3
        item['location'] = "16 Richmond Street, Glasgow, G1 1XQ"
        print("===========================")
        print(response.url)
        try:
            # Degree name: the <span> inside the page heading.
            degree_type = response.xpath("//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/span/text()").extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name'] = ", item['degree_name'])

            # teach_type is already 'phd' at this point; kept for parity with
            # the original flow.
            if "PhD" in item['degree_name']:
                item['teach_type'] = 'phd'

            # Programme title: the heading's own text nodes (span excluded).
            programme = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            # Department is guessed from keywords in the programme title.
            title = item['programme_en']
            if "Engineering" in title:
                item['department'] = "Faculty of Engineering"
            elif "Science" in title:
                item['department'] = "Faculty of Science"
            elif "Business" in title or "Finance" in title or "Marketing" in title:
                item['department'] = "Strathclyde Business School"
            print("item['department'] = ", item['department'])

            # Study mode and duration.
            durationTeachtime = response.xpath("//b[contains(text(),'Study mode and duration')]/../text()").extract()
            # presumably clear_space cleans the list in place (its return
            # value is never used in this file) -- verify against the helper.
            clear_space(durationTeachtime)
            durationTeachtimeStr = ''.join(durationTeachtime)

            item['teach_time'] = getTeachTime(durationTeachtimeStr)
            duration_list = getIntDuration(durationTeachtimeStr)
            # Only a two-element result is used: [duration, duration_per].
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            # Start date.
            start_date = response.xpath("//b[contains(text(),'Start date')]/../text()").extract()
            start_date_str = ''.join(start_date).replace(":", "")
            print("start_date_str = ", start_date_str)
            item['start_date'] = getStartDate(start_date_str)
            # When the parsed date carries neither 2018 nor 2019, prefix a
            # year: values sorting after "06" get 2018, the rest get 2019.
            parsed_start = item['start_date']
            if parsed_start != "" and "2018" not in parsed_start and "2019" not in parsed_start:
                if parsed_start > "06":
                    item['start_date'] = "2018-" + parsed_start
                else:
                    item['start_date'] = "2019-" + parsed_start

            # Application deadline. The original joined `start_date` here, so
            # the deadline always mirrored the start date; it now uses the
            # deadline query result.
            deadline = response.xpath("//b[contains(text(),'Application deadline')]/../text()").extract()
            deadline = ''.join(deadline).replace(":", "").strip()
            print("deadline = ", deadline)
            item['deadline'] = getStartDate(deadline)
            print("item['deadline'] = ", item['deadline'])

            # Programme description.
            overview = response.xpath("//article[@id='research-opportunities']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            print("item['overview_en'] = ", item['overview_en'])

            # Modules: everything preceding the "Learning & teaching" heading.
            modules = response.xpath("//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en'] = ", item['modules_en'])

            # Assessment: the siblings following the element just before the
            # "Learning & teaching" heading.
            assessment_en = response.xpath("//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*[1]/following-sibling::*").extract()
            item["assessment_en"] = remove_class(clear_lianxu_space(assessment_en))

            # Entry requirements (text only).
            rntry_requirements = response.xpath("//article[@id='entry-requirements']//text()").extract()
            item["rntry_requirements"] = clear_lianxu_space(rntry_requirements)

            apply_proces_en = response.xpath("//article[@id='how-can-i-apply']").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(apply_proces_en))
            print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # IELTS text: up to 80 characters following each "IELTS" mention
            # in the application section.
            apply = response.xpath("//article[@id='how-can-i-apply']//text()").extract()
            clear_space(apply)
            ielts_re = re.findall(r"IELTS.{1,80}", ''.join(apply))
            item["ielts_desc"] = ''.join(ielts_re)
            print("item['ielts_desc'] = ", item['ielts_desc'])

            # Copy each score from the helper's result, trimming stray dots
            # and whitespace from values that are present.
            ielts_dict = get_ielts(item['ielts_desc'])
            for field, key in (('ielts', 'IELTS'),
                               ('ielts_l', 'IELTS_L'),
                               ('ielts_s', 'IELTS_S'),
                               ('ielts_r', 'IELTS_R'),
                               ('ielts_w', 'IELTS_W')):
                score = ielts_dict.get(key)
                if score is not None:
                    score = score.strip('.').strip()
                item[field] = score
            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                  %(item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # Tuition fee: first pound amount in the element following the
            # "International" label; stored as a digit string.
            tuition_fee = response.xpath("//html//article[@id='fees-and-funding']/*[contains(text(),'International')]/following-sibling::*[1]//text()").extract()
            tuition_fee_re = re.findall(r"£[\d,]+", ''.join(tuition_fee))
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = tuition_fee_re[0].replace("£", "").replace(",", "")
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])

            # Careers text is taken from the support-and-development article.
            career = response.xpath("//article[@id='support-and-development']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en'] = ", item['career_en'])

            yield item
        except Exception as e:
            # Best-effort scraping: record the failure and URL, skip the item.
            # The entry ends with a newline so successive entries do not run
            # together in the log file.
            with open(item['university'] + str(item['degree_type']) + ".txt", 'a+', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #21
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Bristol"
        # items['country'] = "England"
        # items["website"] = "https://www.bristol.ac.uk/"
        item['url'] = response.url
        # 授课方式
        # item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # 专业
            course = response.xpath("//h1[@id='pagetitle']/span//text()").extract()
            # print("course = ", course)
            item['programme_en'] = ''.join(course).replace("\n", " ").replace("\r", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

            # degreeaward
            degreeaward = response.xpath("//th[contains(text(),'Awards available')]/following-sibling::td[1]//text()").extract()
            # print("degreeaward = ", degreeaward)
            item['degree_name'] = clear_space_str(''.join(degreeaward))
            print("item['degree_name']: ", item['degree_name'])

            if "phd" in item['degree_name'].lower() or "md" in item['degree_name'].lower():
                item['teach_type'] = "phd"
                if "research" in item['degree_name'].lower():
                    item['teach_type'] += " " + "research"
                item['degree_type'] = 3
            elif "research" in item['degree_name'].lower():
                item['teach_type'] = "research"
                item['degree_type'] = 3
            else:
                item['teach_type'] = "taught"
                item['degree_type'] = 2
            # print("item['degree_type']: ", item['degree_type'])
            # print("item['teach_type']: ", item['teach_type'])

            # duration
            duration = response.xpath("//th[@scope='row'][contains(text(),'Programme length')]/following-sibling::td[1]//text()").extract()
            clear_space(duration)
            # print("duration: ", duration)
            item['teach_time'] = getTeachTime(''.join(duration))
            # print("item['teach_time']: ", item['teach_time'])

            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # location
            location = response.xpath("//th[@scope='row'][contains(text(),'Location of programme')]/following-sibling::td[1]//text()").extract()
            # print("location = ", location)
            item['location'] = clear_space_str(''.join(location))
            # print("item['location']: ", item['location'])

            # startdate
            startdate = response.xpath("//th[@scope='row'][contains(text(),'Start date')]/following-sibling::td[1]//text()").extract()
            clear_space(startdate)
            print("startdate = ", startdate)
            if len(startdate) > 0:
                # item['start_date'] = startdate[-1].strip()
                # print("item['start_date']: ", item['start_date'])
                item['start_date'] = getStartDate(''.join(startdate[-1]))
            print("item['start_date'] = ", item['start_date'])

            # deadline
            deadline = response.xpath("//div[@id='apply']/div[@class='apply-deadline']/p[1]//text()").extract()
            # print("deadline = ", deadline)
            item['deadline'] = getStartDate(''.join(deadline))
            # print("item['deadline']: ", item['deadline'])

            # department
            department = response.xpath("//div[@id='contact']/p[@class='pg-contact-address']/text()").extract()
            clear_space(department)
            # print("department1 = ", department)
            for d in department:
                if "School" in d or "Faculty" in d:
                    item['department'] = d
            # print("item['department']: ", item['department'])
            if item['department'] == "":
                allcontent = response.xpath("//main[@class='content']//text()").extract()
                clear_space(allcontent)
                department_re = re.findall(r"School\sof.{1,30}", ''.join(allcontent), re.I)
                # print("department_re: ", department_re)
                if len(department_re) > 0:
                    item['department'] = department_re[0].strip()
            # print("item['department']1: ", item['department'])

            # overview  //div[@id='programme-overview']//text()
            overview = response.xpath("//div[@id='programme-overview']|//div[@id='pgr-overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # tuitionFee   //div[@id='fees']
            tuitionFee = response.xpath("//dt[contains(text(),'Overseas: full-time')]/following-sibling::dd[1]//text()").extract()
            clear_space(tuitionFee)
            print("tuitionFee = ", tuitionFee)
            if len(tuitionFee) > 0:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(''.join(tuitionFee[0]).replace("£", "").replace(",", "").strip())

            if item['tuition_fee'] is None:
                tuitionFee1 = response.xpath(
                    "//dl//dt[contains(text(),'Overseas:')]/following-sibling::dd[1]//text()").extract()
                clear_space(tuitionFee1)
                print("tuitionFee1 = ", tuitionFee1)
                if len(tuitionFee1) > 0:
                    item['tuition_fee_pre'] = "£"
                    item['tuition_fee'] = getTuition_fee(''.join(tuitionFee1))
                if item['tuition_fee'] == 0:
                    item['tuition_fee_pre'] = ""
                    item['tuition_fee'] = None
            if item['tuition_fee'] is None:
                print("tuition_fee 为空")
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            # modules   //div[@id='programme-structure']
            modules = response.xpath("//div[@id='programme-structure']|//div[@id='pgr-research-groups']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            # 学术要求本科特殊专业要求、IELTS
            entryRequirements = response.xpath("//div[@id='entry-requirements']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entryRequirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = response.xpath("//*[contains(text(),'Profile')]//text()|//div[contains(text(),'IELTS')]//text()").extract()
            item['ielts_desc'] = clear_lianxu_space(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts_desc'] == "Profile A":
                item['ielts'] = 7.5
                item['ielts_l'] = 7.0
                item['ielts_s'] = 7.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 109
                item['toefl_l'] = 25
                item['toefl_r'] = 25
                item['toefl_s'] = 25
                item['toefl_w'] = 29
            elif item['ielts_desc'] == "Profile B":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 100
                item['toefl_l'] = 24
                item['toefl_r'] = 24
                item['toefl_s'] = 24
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile C":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 92
                item['toefl_l'] = 23
                item['toefl_r'] = 23
                item['toefl_s'] = 23
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile D":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 92
                item['toefl_l'] = 21
                item['toefl_r'] = 21
                item['toefl_s'] = 21
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile E":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 90
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 20
            elif item['ielts_desc'] == "Profile F":
                item['ielts'] = 6.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 86
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 23
            elif "Profile" not in item['ielts_desc']:
                ieltsDict = get_ielts(item['ielts_desc'])
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #       item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # 就业    //div[@id='careers']
            career = response.xpath("//div[@id='careers']").extract()
            # print("department = ", department)
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            require_chinese_en = """<h2 id="pgentryreqs">Entry requirements for postgraduate programmes</h2>
<p>You should&nbsp;<a href="/pg-howtoapply/">apply online</a>&nbsp;for all our postgraduate programmes.</p>
<p>To be considered for admission to postgraduate study at the University of Bristol, the minimum requirement for entry is an undergraduate (Bachelor&rsquo;s) degree that is equivalent to a UK Upper Second Class degree (also known as a 2:1). Please refer to the <a href="http://www.bristol.ac.uk/study/postgraduate/admissions-statements/%20%20%20" target="_blank">Postgraduate Admissions Statements</a> for each programme for individual entry requirements.</p>
<ul>
<li>Applicants who hold a 4-year Bachelor's (Honours) degree from a prestigious university with a minimum of 80% will be considered for admission to a Master's degree.</li>
<li>Applicants who hold a good Master's degree from a prestigious university will be considered for admission to PhD study.</li>
<li>Applicants will be required to meet the English language requirements for the programme. The profile level requirements can be found on the&nbsp;<a href="http://www.bristol.ac.uk/study/language-requirements/" target="_blank">English language requirements for study</a>&nbsp;page.</li>
</ul>"""
            item["require_chinese_en"] = remove_class(require_chinese_en)
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # http://www.bristol.ac.uk/study/postgraduate/apply/
            item['apply_proces_en'] = remove_class(clear_lianxu_space(["""<p>We offer an online application system for all of our programmes, except the Postgraduate Certificate in Education for which you should <a href="https://www.ucas.com/ucas/teacher-training/ucas-teacher-training-apply-and-track">apply through UCAS</a>.</p>
<p>You can use our online admissions system to:</p>
<ul>
<li>submit all your application details securely online and view your completed application form;</li>
<li>upload supporting documents;</li>
<li>request references electronically;</li>
<li>track the progress of your application;</li>
<li>receive a decision on your application online;</li>
<li>update your contact details (it is important you tell us if you change your home address or email);</li>
<li>receive useful information about the University and your application.</li>
</ul>
<p>If you are unable to make an online application, please contact the Enquiries team on <a href="mailto:[email protected]">[email protected]</a>.</p>"""]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            apply_documents_en = response.xpath("//h3[contains(text(),'English language requirements')]/preceding-sibling::*[position()<last()]").extract()
            item["apply_documents_en"] = remove_class(clear_lianxu_space(apply_documents_en))
            print("item['apply_documents_en']: ", item['apply_documents_en'])
            yield item
        except Exception as e:
            print("异常:", str(e))
            print("报错链接:", response.url)
            with open("scrapySchool_England/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a+', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
コード例 #22
0
    def parse_data(self, response):
        """Scrape one University of Central Lancashire taught-course page.

        Populates a ``ScrapyschoolEnglandItem1`` from ``response`` and yields
        it. If anything raises during extraction, the error text and the page
        URL are appended to ``<university><degree_type>.txt`` in the working
        directory, echoed to stdout, and no item is yielded.

        Args:
            response: the course page response; only ``url`` and ``xpath``
                are used here.

        Yields:
            The populated item, at most once.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Central Lancashire"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree type
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # Programme name; the second XPath is tried only when the first
            # one matches nothing.
            programme = response.xpath(
                "//div[@id='TopGraphic']/div[@class='twelvecol last']/h2/text()"
            ).extract()
            if not programme:
                programme = response.xpath(
                    "//div[@class='marketing-version']/div[@class='course-title']/h1/text()"
                ).extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            # Degree name, with the same two-XPath fallback as the programme.
            degree_type = response.xpath(
                "//div[@id='TopGraphic']/div[@class='twelvecol last']/h2/span/text()"
            ).extract()
            if not degree_type:
                degree_type = response.xpath(
                    "//div[@class='marketing-version']/div[@class='course-title']/h1/span/text()"
                ).extract()
            clear_space(degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            department = response.xpath(
                "//div[@id='TopGraphic']/div[@class='twelvecol last']/h4//text()"
            ).extract()
            item['department'] = ''.join(department)

            duration = response.xpath(
                "//h4[contains(text(), 'Duration:')]/..//text()").extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['teach_time'] = ", item['teach_time'])

            teach_time = response.xpath(
                "//strong[contains(text(),'Full-time:')]/..//text()").extract()
            clear_space(teach_time)
            print("teach_time: ", teach_time)
            item['other'] = ','.join(teach_time)

            # Text that follows the "Full-time:" label. ``list.index`` raises
            # ValueError when the label is missing and the "+ 1" lookup raises
            # IndexError when the label is the last entry; either error would
            # abort the whole item, so both cases are guarded here:
            #   * label absent          -> None (no full-time information)
            #   * label with no value   -> "" (treated like an empty value)
            full_time_value = None
            if "Full-time:" in teach_time:
                label_pos = teach_time.index("Full-time:")
                following = teach_time[label_pos + 1:label_pos + 2]
                full_time_value = following[0] if following else ""

            if ''.join(teach_time).strip() == "Full-time:" \
                    or full_time_value in ("N/A", "") \
                    or "part-time" in item['programme_en']:
                item['teach_time'] = "parttime"
            elif item['teach_time'] == "":
                item['teach_time'] = "fulltime"
            print("item['teach_time'] = ", item['teach_time'])

            location = response.xpath(
                "//h4[contains(text(), 'Campus')]/following-sibling::p[1]//text()"
            ).extract()
            item['location'] = ''.join(location)

            start_date = response.xpath(
                "//h4[contains(text(), 'Start Date:')]/following-sibling::p[1]//text()"
            ).extract()
            item['start_date'] = getStartDate(''.join(start_date))

            overview = response.xpath(
                "//div[@id='FullCourse']/div[@class='eightcol']/div[@class='sixcol last']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            # Entry requirements: the raw joined text is kept separately
            # because the IELTS sentence is searched for in it further down.
            entry_requirements = response.xpath(
                "//div[@id='EntryReq']//text()").extract()
            entry_requirements_str = ''.join(entry_requirements).strip()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            modules = response.xpath("//div[@id='caag']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            assessment_en = response.xpath(
                "//h3[contains(text(),'Learning Environment and Assessment')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            career_en = response.xpath(
                "//h3[contains(text(),'Graduate Careers')]/..|//h3[contains(text(),'Opportunities')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))

            apply_proces_en = response.xpath(
                "//h3[@id='applynow']/..").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))

            # Hard-coded fees; source page:
            # https://www.uclan.ac.uk/study_here/fees_and_finance/international_tuition_fees.php#international
            item['tuition_fee'] = '12950'
            if item['department'] in (
                    "School of Forensic and Applied Sciences",
                    "School of Physical Sciences and Computing",
                    "School of Pharmacy and Biomedical Sciences",
                    "School of Engineering"):
                item['tuition_fee'] = '13950'
            item['tuition_fee_pre'] = "£"

            # Sentence fragment(s) around each "IELTS" mention.
            ieltsList = re.findall(r'.{1,50}IELTS.{1,80}',
                                   entry_requirements_str)
            print("ieltslist: ", ieltsList)
            item['ielts_desc'] = ''.join(ieltsList)

            ielts_list = re.findall(
                r"[5-9]\.\d\s|[5-9]\.\d,|[5-9]\.\d\.|[5-9]\.\d$|[5-9]\s|[5-9]\.",
                item['ielts_desc'])
            # Strip the trailing whitespace / '.' / ',' the pattern captures.
            bands = [
                band.strip().strip('.').replace(',', '').strip()
                for band in ielts_list
            ]
            # Which entry of ``bands`` feeds each of
            # (ielts, ielts_l, ielts_s, ielts_r, ielts_w), keyed by how many
            # scores were found. Other counts leave the fields untouched.
            band_layout = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                3: (0, 0, 0, 1, 2),
            }
            layout = band_layout.get(len(bands))
            if layout is not None:
                ielts_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r',
                                'ielts_w')
                for field, pos in zip(ielts_fields, layout):
                    item[field] = bands[pos]

            item[
                'require_chinese_en'] = "<p>4-year Bachelors degree with grades of 70% or above</p>"
            yield item
        except Exception as e:
            # Record the failure next to the URL, then echo it to stdout.
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #23
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.salford.ac.uk/"
        item['university'] = "University of Salford"
        item['url'] = response.url
        item['teach_type'] = 'phd'
        # 学位类型
        item['degree_type'] = 3
        item['location'] = 'The Crescent, Salford, M5 4WT, UK'
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型
            programme = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h2//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            # //div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/p
            department = response.xpath(
                "//strong[contains(text(), 'School -')]/../text()").extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            start_date = response.xpath(
                "//strong[contains(text(), 'Start Date(s):')]/../text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date']: ", item['start_date'])

            duration = response.xpath(
                "//strong[contains(text(), 'Duration')]/../following-sibling::*[position()<3]//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['teach_time'] = ", item['teach_time'])
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            if item['department'] == "School of Environment & Life Sciences" or item[
                    'department'] == "School of Computing, Science & Engineering" or item[
                        'department'] == "School of the Built Environment" or item[
                            'department'] == "School of Health Sciences":
                item['tuition_fee'] = 13680
                item['tuition_fee_pre'] = "£"
            elif item['department'] == "School of Arts & Media":
                item['tuition_fee'] = 12490
                item['tuition_fee_pre'] = "£"
            elif item['department'] == "Salford Business School":
                item['tuition_fee'] = 12990
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1]
            overview = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1] | //div[@id='content']/div[@class='row']/div[1]"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //section[@id='about']/div[@id='content']
            # modules_en = response.xpath("//div[@id='courseaccordion']").extract()
            # if len(modules_en) == 0:
            #     # print("********")
            #     modules_en = response.xpath("//h2[contains(text(),'Course Details')]/following-sibling::*").extract()
            # item['modules_en'] = remove_class(clear_lianxu_space(modules_en)) # .replace("&nbsp;", "")
            # item['modules_en'] = item['modules_en'].encode('utf-8').decode("unicode-escape").replace(" ", "")
            # print("item['modules_en']: ", item['modules_en'])

            # //section[@id='requirements']/div
            entry_requirements = response.xpath(
                "//section[@id='requirements']/div//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            # 申请材料
            apply_documents_en = response.xpath(
                "//h3[contains(text(),'Applicant Profile')]/preceding-sibling::*[1]/following-sibling::*[position()<5]"
            ).extract()
            item['apply_documents_en'] = remove_class(
                clear_lianxu_space(apply_documents_en)
            ).replace(
                "<h3>International Students - Academic Technology Approval Scheme (ATAS)</h3>",
                "").strip()
            # print("item['apply_documents_en']: ", item['apply_documents_en'])

            # //h3[contains(text(),'English Language Requirements')]/following-sibling::*[1]
            ielts_desc = response.xpath(
                "//*[contains(text(),'IELTS')]//text()").extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).replace(
                "Suitable For", "").strip()
            # print("item['ielts_desc']: ",item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # //section[@id='teaching']/div[@class='container main']/div[@class='col-md-12']/div[@id='teaching_0a19']
            assessment_en = response.xpath(
                "//h3[contains(text(),'Assessment Links')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            # //section[@id='employability']/div[@class='container main']/div[@class='col-md-12']/div[@id='employ_0a19']
            career = response.xpath(
                "//section[@id='employability']/div").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div id="content_div_43747">
<h1>Applying for a research degree</h1><p>To apply for your postgraduate research place, you will need to complete our online application form. You will need to have at hand your supporting documents ready to upload when you start the online application. We have four entry points: October, January, April and July. From September 2018, this will change to three entry points, in January, May and September.&nbsp;For the Salford DBA, there are two entry points: April and September.</p><p>Please submit your application with a minimum of six weeks before the date you are aiming to register.</p><ul><li>Degree certificates</li><li>Transcripts</li><li><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0018/104841/18-02-23-Vouch-List-Equivalent-qualifications-to-English-GCSE-Grade-C.pdf">English language qualifications</a></li><li><a href="http://www.advice.salford.ac.uk/page/visa">Passport details (required for International applicants)</a></li><li><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0003/631686/Writing-a-Research-Proposal-Guidance.pdf" title="How to write a research proposal" target="_blank">Research proposal</a></li></ul><p>If you are applying for a PhD by published works, please go <a href="https://shop.salford.ac.uk/product-catalogue/university-goods-and-services/phd-by-published-works/phd-by-published-works-application-fee">to the online shop to make your payment</a> before completing your application.</p><p>For help preparing a research proposal for the PhD in Business, Management and Law, download our <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0009/1572147/HowtoWriteaResearchProposal2018.pdf" title="PhD Research proposal guidance" target="_blank">Research Proposal Guidance</a>. 
For the Salford DBA, download our <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/1559996/Guidance-on-Writing-a-DBA-Research-Proposal-PDF.pdf" title="Guidance on Writing a DBA Research Proposal" target="_blank">Guidance on Writing a DBA Research Proposal</a>.</p><h2>English Language Requirements</h2><p>If you have not yet taken an English Language test please note that availability of these and the time taken to receive certificates of results can vary depending on the time of year. For further information and to check timescales and availability please visit:</p><p><strong>IELTS</strong> - <a href="http://www.ielts.org/">http://www.ielts.org/</a><br /><strong>Pearson Test of English Academic</strong> - <a href="http://www.pearsonpte.com/testme">www.pearsonpte.com/testme</a></p><p>For details of other English Language tests accepted for the UKVI, please visit:<br /><a href="http://www.ukba.homeoffice.gov.uk/sitecontent/applicationforms/new-approved-english-tests.pdf">http://www.ukba.homeoffice.gov.uk/sitecontent/applicationforms/new-approved-english-tests.pdf</a></p><h2>Guide to submitting your application</h2><ol><li>When you first enter the online application you will be asked to create an account</li><li>You will then receive an email with your login PIN and password</li><li>You can re-enter and complete your application at times convenient to you</li><li>Fill in application details &ndash; using the guidance within the form</li><li>Upload your supporting documents</li><li>Once you have submitted your application you can print a copy of your application. 
However you cannot re-enter and make any changes at this stage</li></ol><h3>What happens next?</h3><ul><li>When you submit your online application you will receive and acknowledgement by email</li><li>You&rsquo;ll be notified of the outcome of your application in writing.</li><li>If you have any questions about the progress of your application please <a href="mailto:[email protected]">email admissions</a></li></ul><h3>Relevant work experience</h3><p><strong>We try to make applying to Salford as flexible and straightforward as possible.</strong></p><p>We&rsquo;re not just interested in exams you&rsquo;ve passed and certificates you&rsquo;ve collected. If you&rsquo;ve gained enough relevant work experience &ndash; paid or voluntary &ndash; we&rsquo;ll take that into account through our Accreditation of Prior Learning (APL) and Accreditation of PriorExperiential Learning (APEL) schemes.</p><h2>How to prepare a research proposal</h2><p>The research proposal is a crucial part of your application.</p><p>You should discuss your proposal with the <strong>Postgraduate Research Admissions Contact</strong> of the School to which you are applying, to make sure you understand what is expected in your subject area.</p><p>For help preparing a research proposal for the PhD in Business, Management and Law, download our <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0009/1572147/HowtoWriteaResearchProposal2018.pdf" title="PhD Research proposal guidance" target="_blank">Research Proposal Guidance</a>. 
For the Salford DBA, download our <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/1559996/Guidance-on-Writing-a-DBA-Research-Proposal-PDF.pdf" title="Guidance on Writing a DBA Research Proposal" target="_blank">Guidance on Writing a DBA Research Proposal</a>.&nbsp;</p><p>When submitting an application, make sure that the specialist area you wish to study is covered by a member of staff at the University:</p><ul><li>Check individual staff entries on the <a href="http://www.salford.ac.uk/research/research-centres">Research Centre sites</a> that relate to your area</li><li>Explore <a href="http://www.seek.salford.ac.uk/">staff profiles</a> and check current research interests</li><li>Take note of the relevant Research Administrator listed below you will need it when completing your online application</li></ul>
</div>
"""
                ]))
            print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Postgraduate</strong></p><p>(4 year) Bachelor degrees with a GPA 2.7/4.0 or 70% from a National University; or from a Project 211 University with a GPA 2.6/4.0 or 65%; or from a Private University with GPA 2.75/4.0 or 75%.</p>"""
                ]))
            print("item['require_chinese_en']: ", item['require_chinese_en'])
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #24
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "http://www.herts.ac.uk/"
        item['university'] = "University of Hertfordshire"
        item['url'] = response.url
        item['teach_type'] = 'phd'
        # 学位类型
        item['degree_type'] = 3
        print("===========================")
        print(response.url)
        try:
            # //div[@id='content']/main/div[@class='course-sub-head']/a
            department = response.xpath("//div[@id='content']/main/div[@class='course-sub-head']/a//text()").extract()
            department = ''.join(department).strip()
            item['department'] = department
            print("department: ", department)

            # 专业、学位类型 //div[@id='content']/main/h1
            programmeDegreetype = response.xpath("//div[@id='content']/main/h1//text()").extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype)
            # print("programmeDegreetypeStr: ", programmeDegreetypeStr)
            degreetype = re.findall(r"Masters\sby\sResearch|^\w+\s", programmeDegreetypeStr)
            # print(degreetype)
            if len(degreetype) != 0:
                degreetype = ''.join(list(degreetype[0]))
                # print(degreetype)
                item['degree_name'] = degreetype
            print("item['degree_name']: ", item['degree_name'])
            programme = programmeDegreetypeStr.replace(''.join(degreetype), '')
            # print(programme)
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            if item['degree_name'] == "Masters by Research":
                item['teach_type'] = 'research'

            duration = response.xpath(
                "//h3[contains(text(),'Key course information')]/following-sibling::ul//*[contains(text(), 'Full')]//text()").extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['teach_time'] = ", item['teach_time'])
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            location = response.xpath(
                "//h3[contains(text(),'Key course information')]/following-sibling::ul//*[contains(text(), 'Locations')]/../../following-sibling::*//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            print("item['location'] = ", item['location'])

            # //div[@id='overview']
            overview = response.xpath("//div[@id='overview']").extract()
            overview_en = remove_class(clear_lianxu_space(overview))
            item['overview_en'] = overview_en
            print("item['overview_en']: ", item['overview_en'])

            assessment_en = response.xpath("//h3[contains(text(),'Teaching methods')]/following-sibling::*").extract()
            if len(assessment_en) > 0:
                item['assessment_en'] = "<h3>Teaching methods</h3>" + remove_class(clear_lianxu_space(assessment_en))
            print("item['assessment_en']: ", item['assessment_en'])

            career_en = response.xpath("//h3[contains(text(),'Teaching methods')]/preceding-sibling::*").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            print("item['career_en']: ", item['career_en'])

            modules = response.xpath("//div[@id='modules']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            # //div[@id='fees']
            feeContent = response.xpath("//h4[contains(text(),'International Students')]/following-sibling::h5[contains(text(), 'Full')]/following-sibling::ul[1]//text()").extract()
            clear_space(feeContent)
            # print("feeContent: ", feeContent)
            feelist = re.findall(r"£[\d,]+", ''.join(feeContent))
            if len(feelist) > 0:
                item['tuition_fee'] = int(feelist[0].replace('£', '').replace(',', '').strip())
                item['tuition_fee_pre'] = '£'
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='how-to-apply']
            entry_requirements = response.xpath("//h2[contains(text(),'How to apply')]/preceding-sibling::*//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            print("item['rntry_requirements']: ", item['rntry_requirements'])

            # print("entry_requirementsStr: ", entry_requirementsStr)
            ielts = re.findall(r"IELTS[\sa-zA-Z]*\d\.?\d?[\sa-z\(\)]*\d\.?\d?[\sa-z\(\)]{1,100}", item['rntry_requirements'])
            # print("ielts: ", ielts)
            item['ielts_desc'] = ''.join(ielts).strip()
            print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                    item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))


            yield item
        except Exception as e:
            with open(item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #25
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "http://www.swansea.ac.uk/"
        item['university'] = "Swansea University"
        item['url'] = response.url
        item['teach_type'] = 'phd'
        item['degree_type'] = 3
        item['location'] = "Singleton Park, Swansea, SA2 8PP, Wales, UK"
        print("===============================")
        print(response.url)
        try:
            # 专业、学位类型
            courseDegreeaward = response.xpath(
                "//h1[@class='content-header-heading']//text()").extract()
            courseDegreeawardStr = ''.join(courseDegreeaward)
            if len(courseDegreeawardStr) != 0:
                d = re.findall(
                    r"^(\w+\s/\w+\s/\w+)|^(\w+/\w+/\w+)|^(\w+/\s\w+)|^(\w+)",
                    courseDegreeawardStr)
                if len(d) != 0:
                    degree_type = ''.join(list(d)[0])
                    # print(degree_type)
                    item['degree_name'] = degree_type
                    programme = courseDegreeawardStr.split(degree_type)
                    item['programme_en'] = ''.join(programme).strip()
            print("item['degree_name'] = ", item['degree_name'])
            print("item['programme_en'] = ", item['programme_en'])

            # print("courseDegreeawardStr = ", courseDegreeawardStr)
            departmentDict = {
                "Ancient Egyptian Culture":
                "College of Arts and Humanities",
                "Ancient History and Classical Culture":
                "College of Arts and Humanities",
                "Ancient Narrative Literature":
                "College of Arts and Humanities",
                "Classics":
                "College of Arts and Humanities",
                "Chinese-English Translation & Language Teaching":
                "College of Arts and Humanities",
                "Teaching English to Speakers of Other Languages  (TESOL)":
                "College of Arts and Humanities",
                "Creative Writing":
                "College of Arts and Humanities",
                "English Literature":
                "College of Arts and Humanities",
                "Gender and Culture":
                "College of Arts and Humanities",
                "Welsh Writing in English":
                "College of Arts and Humanities",
                "Early Modern History":
                "College of Arts and Humanities",
                "History":
                "College of Arts and Humanities",
                "Medieval Studies":
                "College of Arts and Humanities",
                "Modern History":
                "College of Arts and Humanities",
                "Public History and Heritage":
                "College of Arts and Humanities",
                "Public History and Heritage (extended)":
                "College of Arts and Humanities",
                "Professional Translation":
                "College of Arts and Humanities",
                "Professional Translation (Extended)":
                "College of Arts and Humanities",
                "Translation and Interpreting":
                "College of Arts and Humanities",
                "Translation and Interpreting (Extended)":
                "College of Arts and Humanities",
                "Postgraduate Certificate in Translation Technology":
                "College of Arts and Humanities",
                "Communication, Media Practice and PR":
                "College of Arts and Humanities",
                "International Journalism":
                "College of Arts and Humanities",
                "Digital Media":
                "College of Arts and Humanities",
                "Erasmus Mundus Journalism, Media and Globalisation":
                "College of Arts and Humanities",
                "Development and Human Rights":
                "College of Arts and Humanities",
                "Gender and Culture":
                "College of Arts and Humanities",
                "International Relations":
                "College of Arts and Humanities",
                "International Security & Development":
                "College of Arts and Humanities",
                "Politics":
                "College of Arts and Humanities",
                "Public Policy":
                "College of Arts and Humanities",
                "War and Society":
                "College of Arts and Humanities",
                "BEng Aerospace Engineering":
                "College of Engineering",
                "MEng Aerospace Engineering":
                "College of Engineering",
                "BEng Aerospace Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Aerospace Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Aerospace Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Aerospace Engineering (with a Year Abroad)":
                "College of Engineering",
                "Aerospace Engineering Foundation Year":
                "College of Engineering",
                "MSc Aerospace Engineering":
                "College of Engineering",
                "MSc by Research in Aerospace Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "PhD or MPhil Aerospace Engineering":
                "College of Engineering",
                "BEng Chemical Engineering":
                "College of Engineering",
                "MEng Chemical Engineering":
                "College of Engineering",
                "BEng Chemical Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Chemical Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Chemical Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Chemical Engineering (with a Year Abroad)":
                "College of Engineering",
                "Chemical Engineering Foundation Year":
                "College of Engineering",
                "MSc Chemical Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MSc by Research in Chemical Engineering":
                "College of Engineering",
                "MSc by Research in Bio-process Engineering":
                "College of Engineering",
                "MSc by Research in Desalination and Water Re-use":
                "College of Engineering",
                "MSc by Research in Fuel Technology":
                "College of Engineering",
                "MSc by Research in Membrane Technology":
                "College of Engineering",
                "PhD or MPhil Chemical Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "BEng Civil Engineering":
                "College of Engineering",
                "MEng Civil Engineering":
                "College of Engineering",
                "BEng Civil Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Civil Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Civil Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Civil Engineering (with a Year Abroad)":
                "College of Engineering",
                "MSc Civil Engineering":
                "College of Engineering",
                "Erasmus Mundus MSc in Computational Mechanics":
                "College of Engineering",
                "MSc Computer Modelling and Finite Elements in Engineering Mechanics":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MRes Computer Modelling in Engineering":
                "College of Engineering",
                "MSc by Research in Civil Engineering":
                "College of Engineering",
                "PhD Computational Mechanics":
                "College of Engineering",
                "PhD or MPhil Civil Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "BEng Electronic and Electrical Engineering":
                "College of Engineering",
                "MEng Electronic and Electrical Engineering":
                "College of Engineering",
                "BEng Electronic and Electrical Engineering (with a year in Europe, N. America, Australia or industry)":
                "College of Engineering",
                "MEng Electronic and Electrical Engineering (with a year in Europe, N. America, Australia or industry)":
                "College of Engineering",
                "Electronic and Electrical Engineering Foundation Year":
                "College of Engineering",
                "MSc Communications Engineering":
                "College of Engineering",
                "MSc Electronic and Electrical Engineering":
                "College of Engineering",
                "MSc Power Engineering and Sustainable Energy":
                "College of Engineering",
                "MSc Nanoscience to Nanotechnology":
                "College of Engineering",
                "MSc by Research in Electronic and Electrical Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "PhD or MPhil Electronic and Electrical Engineering":
                "College of Engineering",
                "Erasmus Mundus MSc in Computational Mechanics":
                "College of Engineering",
                "MSc Computer Modelling and Finite Elements in Engineering Mechanics":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MRes Computer Modelling in Engineering":
                "College of Engineering",
                "PhD Computational Mechanics":
                "College of Engineering",
                "PhD or MPhil Civil Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "BEng Materials Science and Engineering":
                "College of Engineering",
                "MEng Materials Science and Engineering":
                "College of Engineering",
                "BEng Materials Science and Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Materials Science and Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Materials Science and Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Materials Science and Engineering (with a Year Abroad)":
                "College of Engineering",
                "Materials Science and Engineering Foundation Year":
                "College of Engineering",
                "MSc Materials Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MSc by Research in Materials Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "PhD or MPhil Materials Engineering":
                "College of Engineering",
                "BEng Mechanical Engineering":
                "College of Engineering",
                "MEng Mechanical Engineering":
                "College of Engineering",
                "BEng Mechanical Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Mechanical Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Mechanical Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Mechanical Engineering (with a Year Abroad)":
                "College of Engineering",
                "Mechanical Engineering Foundation Year":
                "College of Engineering",
                "MSc Mechanical Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MSc by Research in Mechanical Engineering":
                "College of Engineering",
                "PhD or MPhil Mechanical Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "MSc Nanoscience to Nanotechnology":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MSc by Research Nanotechnology":
                "College of Engineering",
                "PhD or MPhil Nanotechnology":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "Zienkiewicz Centre for Computational Engineering (ZCCE)":
                "College of Engineering",
                "Materials Research Centre (MRC)":
                "College of Engineering",
                "Systems and Process Engineering Centre (SPEC)":
                "College of Engineering",
                "Applied Sports, Technology, Exercise and Medicine (A-STEM)":
                "College of Engineering",
                "MSc/PGCert/PGDip Gerontology and Ageing Studies":
                "College of Human and Health Sciences",
                "MSc International Gerontology and Ageing Studies":
                "College of Human and Health Sciences",
                "MA/PGDip/PGCert Childhood Studies":
                "College of Human and Health Sciences",
                "MA/PGDip/PGCert Developmental and Therapeutic Play":
                "College of Human and Health Sciences",
                "PGCert Enhanced Neonatal Care":
                "College of Human and Health Sciences",
                "MSc/PGDip/PGCert Child Public Health":
                "College of Human and Health Sciences",
                "MA/PGDip/PGCert Education for Health Professions":
                "College of Human and Health Sciences",
                "MSc/PGDip Advanced Critical Care Practice":
                "College of Human and Health Sciences",
                "MSc/PGDip Advanced Practice in Health Care":
                "College of Human and Health Sciences",
                "MSc/PGDip/PGCert Advanced Specialist Blood Transfusion Practice":
                "College of Human and Health Sciences",
                "PGCert Approved Mental Health Professional":
                "College of Human and Health Sciences",
                "PGCert Blood Component Transfusion":
                "College of Human and Health Sciences",
                "MSc/PgD/PgC Community and Primary Health Care Practice":
                "College of Human and Health Sciences",
                "MSc/PGDip/PGCert Enhanced Professional Practice":
                "College of Human and Health Sciences",
                "MSc/PGDip Enhanced Professional Midwifery Practice":
                "College of Human and Health Sciences",
                "MSc Long Term and Chronic Conditions Management":
                "College of Human and Health Sciences",
                "MA Medical Law and Ethics":
                "College of Human and Health Sciences",
                "PGCert Non-Medical Prescribing for Nurses and Midwives":
                "College of Human and Health Sciences",
                "PGCert Non-Medical Prescribing for Allied Health Professionals":
                "College of Human and Health Sciences",
                "PGCert Non-Medical Prescribing for Pharmacists":
                "College of Human and Health Sciences",
                "MSc Nursing Pre-Registration (Adult)":
                "College of Human and Health Sciences",
                "MSc Nursing Pre-Registration (Child)":
                "College of Human and Health Sciences",
                "MSc Nursing Pre-Registration (Mental Health)":
                "College of Human and Health Sciences",
                "MSc/PgD Public Health & Health Promotion":
                "College of Human and Health Sciences",
                "MSc Social Work":
                "College of Human and Health Sciences",
                "MSc Health Care Management":
                "College of Human and Health Sciences",
                "MSc Leadership, Management and Innovation in Health Care":
                "College of Human and Health Sciences",
                "MSc Abnormal and Clinical Psychology":
                "College of Human and Health Sciences",
                "MSc Cognitive Neuroscience":
                "College of Human and Health Sciences",
                "LLM in LegalTech":
                "Hillary Rodham Clinton School of Law",
                "LLM in Human Rights":
                "Hillary Rodham Clinton School of Law",
                "LLM Intellectual Property & Commercial Practice":
                "Hillary Rodham Clinton School of Law",
                "LLM in International Commercial Law":
                "Hillary Rodham Clinton School of Law",
                "LLM in International Commercial and Maritime Law":
                "Hillary Rodham Clinton School of Law",
                "LLM in International Maritime Law":
                "Hillary Rodham Clinton School of Law",
                "LLM in International Trade Law":
                "Hillary Rodham Clinton School of Law",
                "LLM in Legal Practice and Advanced Drafting":
                "Hillary Rodham Clinton School of Law",
                "LLM in Oil, Gas and Renewable Energy Law":
                "Hillary Rodham Clinton School of Law",
                "Law PhD/MPhil":
                "Hillary Rodham Clinton School of Law",
                "Graduate Diploma in Law":
                "Hillary Rodham Clinton School of Law",
                "Legal Practice Course":
                "Hillary Rodham Clinton School of Law",
                "LLM in Legal Practice and Advanced Drafting":
                "Hillary Rodham Clinton School of Law",
                "MSc Environmental Dynamics and Climate Change":
                "College of Science",
                "MSc Geographic Information and Climate Change":
                "College of Science",
                "MSc High Performance and Scientific Computing":
                "College of Science",
                "MSc by Research in Earth Observation":
                "College of Science",
                "MSc by Research in Environmental Dynamics":
                "College of Science",
                "MSc by Research in Glaciology":
                "College of Science",
                "MSc by Research in Global Environmental Modelling":
                "College of Science",
                "MSc by Research in Global Migration":
                "College of Science",
                "MSc by Research in Media Geographies":
                "College of Science",
                "MSc by Research in Social Theory and Space":
                "College of Science",
                "MSc by Research in Urban Studies":
                "College of Science",
                "PhD/MPhil Human Geography":
                "College of Science",
                "PhD/MPhil Physical Geography":
                "College of Science",
                "MSc Maths & Computing for Finance":
                "College of Science",
                "MSc Mathematics":
                "College of Science",
                "MRes Stochastic Processes: Theory and Application":
                "College of Science",
                "MSc by Research in Mathematics":
                "College of Science",
                "PhD/MPhil Mathematics":
                "College of Science",
                "MSc High Performance and Scientific Computing":
                "College of Science",
                "Antimatter Physics":
                "College of Science",
                "Cold Atoms and Quantum Optics":
                "College of Science",
                "Laser Physics":
                "College of Science",
                "Lattice Gauge Theory":
                "College of Science",
                "Nanotechnology":
                "College of Science",
                "Quantum Fields & Strings":
                "College of Science",
                "Theoretical Particle Physics":
                "College of Science",
                "PhD/MPhil Physics":
                "College of Science",
                "PhD / MSc by Research Chemistry":
                "College of Science",
                "MSc Computer Science":
                "College of Science",
                "MSc Advanced Computer Science":
                "College of Science",
                "MSc Advanced Software Technology":
                "College of Science",
                "MSc High Performance and Scientific Computing":
                "College of Science",
                "MSc Data Science":
                "College of Science",
                "MSc Computer Science: Informatique (Swansea route)":
                "College of Science",
                "MSc Computer Science: Informatique (Grenoble route)":
                "College of Science",
                "MSc by Research in Human Computer Interaction":
                "College of Science",
                "MSc by Research in Theoretical Computer Science":
                "College of Science",
                "MSc by Research in Visual and Interactive Computing":
                "College of Science",
                "MRes Computing and Future Interaction Technologies":
                "College of Science",
                "MRes Visual Computing":
                "College of Science",
                "MRes Logic and Computation":
                "College of Science",
                "PhD/MPhil/MSc by Research in Computer Science":
                "College of Science",
                "MSc Environmental Biology: Conservation and Resource Management":
                "College of Science",
                "MSc High Performance and Scientific Computing":
                "College of Science",
                "MRes Biosciences":
                "College of Science",
                "PhD/MPhil Biological Sciences":
                "College of Science",
                "MSc Accounting & Finance":
                "School of Management",
                "MSc Financial Management":
                "School of Management",
                "MSc Finance and Business Analytics":
                "School of Management",
                "MSc Finance":
                "School of Management",
                "MSc International Banking & Finance":
                "School of Management",
                "MSc Investment Management":
                "School of Management",
                "MSc Strategic Accounting":
                "School of Management",
                "Generalist MSc Management":
                "School of Management",
                "Marketing":
                "School of Management",
                "Finance ":
                "School of Management",
                "Human Resource Management":
                "School of Management",
                "Entrepreneurship ":
                "School of Management",
                "Operations & Supply Management":
                "School of Management",
                "International Management":
                "School of Management",
                "International Standards":
                "School of Management",
                "Business Analytics":
                "School of Management",
                "E-Business":
                "School of Management",
                "Tourism ":
                "School of Management",
                "MSc Economics":
                "School of Management",
                "MSc Economics & Finance":
                "School of Management",
                "Strategic Marketing":
                "School of Management",
                "MSc Management (Marketing)":
                "School of Management",
                "MSc Clinical Medicine":
                "Swansea University Medical School",
                "MSc Clinical Science (Medical Physics)":
                "Swansea University Medical School",
                "MSc Diabetes Practice (Distance Learning)":
                "Swansea University Medical School",
                "MSc Genomic Medicine":
                "Swansea University Medical School",
                "MSc Medical Radiation Physics":
                "Swansea University Medical School",
                "MSc Nanomedicine":
                "Swansea University Medical School",
                "PG Dip Physician Associate Studies":
                "Swansea University Medical School",
                "MSc Applied Analytical Science (LCMS)":
                "Swansea University Medical School",
                "MSc Autism and Related Conditions":
                "Swansea University Medical School",
                "MSc Health Data Science":
                "Swansea University Medical School",
                "MSc Health Informatics":
                "Swansea University Medical School",
                "MSc Leadership for the Health Professions (Distance Learning)":
                "Swansea University Medical School",
                "MRes Applied Analytical Science (LCMS)":
                "Swansea University Medical School",
                "MRes Health Informatics":
                "Swansea University Medical School",
                "MRes Research in Health Professions Education":
                "Swansea University Medical School",
                "MSc Research Methods in Psychology":
                "College of Human and Health Sciences",
                "MSc Social Research Methods":
                "College of Human and Health Sciences",
            }
            item['department'] = departmentDict.get(courseDegreeawardStr)
            if item['department'] == None:
                item['department'] = departmentDict.get(
                    courseDegreeawardStr.replace(" ", ""))
                if item['department'] == None:
                    item['department'] = departmentDict.get(
                        item['programme_en'])
            print("item['department'] = ", item['department'])

            # //ul[@style='width: 5000px;']/li[4]
            department = response.xpath(
                "//div[@class='breadCrumb module']//ul/li[4]//text()").extract(
                )
            clear_space(department)
            item['department'] = ''.join(department).strip()
            print("item['department'] = ", item['department'])

            # 课程长度
            duration = response.xpath(
                "//table[@class='top-button-course-variants-table']//tr[1]/td[2]//text()|//div[@class='top-button-duration']/div[@class='top-button-duration-value']/text()"
            ).extract()
            clear_space(duration)
            duration = ''.join(duration).strip()
            item['teach_time'] = getTeachTime(duration)

            p_l = ['Yr', 'yrs', 'yr', 'YR']
            for p in p_l:
                if p in duration:
                    item['duration'] = int(duration.replace(p, ""))
                    item['duration_per'] = 1
                    break

            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            # 专业描述
            overview1 = response.xpath(
                "//div[@id='content-items']/div[@class='layout-article-items']/div[@class='title-and-body-text']"
            ).extract()
            # print(overview1)
            overview2 = response.xpath("//div[@id='key-features']").extract()
            overview3 = response.xpath("//div[@id='description']").extract()
            clear_lianxu_space(overview1)
            clear_lianxu_space(overview2)
            clear_lianxu_space(overview3)
            overview = '\n'.join(overview1).strip() + '\n'.join(
                overview2).strip() + '\n'.join(overview3).strip()
            item['overview_en'] = remove_class(overview)
            print("item['overview_en'] = ", item['overview_en'])

            # 课程设置
            modules_1 = response.xpath(
                "//div[@class='ppsm-ms']//div[@class='variant']")
            # print("modules_1: ", modules_1)
            modules = []
            for m in modules_1:
                modules_year = m.xpath("./h3").extract()
                # print("modules_year: ", modules_year)
                modules.append(''.join(modules_year))
                modules_term = m.xpath("./h4").extract()
                # print("modules_term: ", modules_term)
                if len(modules_term) > 0:
                    for t in range(1, len(modules_term) + 1):
                        # print("modules_term: ", modules_term[t-1])
                        modules.append(modules_term[t - 1])
                        modules_name = m.xpath(
                            "./h4[" + str(t) +
                            "]/following-sibling::div[1]//table//tr/td[4]"
                        ).extract()
                        # print("modules_name: ", modules_name)
                        modules.append(''.join(modules_name))
            # print(modules)
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en'] = ", item['modules_en'])

            # IELTS
            entryRequirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            # clear_space(entryRequirements)
            item['rntry_requirements'] = clear_lianxu_space(entryRequirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])
            entryRequirementsStr = ''.join(entryRequirements)
            # .{0,100}(IELTS).{0,100}
            # ielts = re.findall(r"\.[a-zA-Z0-9\s.]{0,80}(IELTS)[a-zA-Z0-9\s.\(\))]{0,80}", entryRequirementsStr)
            pat = r"\..{0,100}IELTS.{0,100}"
            re_ielts = re.compile(pat)
            ielts = re_ielts.findall(entryRequirementsStr)
            item['ielts_desc'] = ''.join(ielts).lstrip('.').strip()
            print("item['ielts_desc'] = ", item['ielts_desc'])
            ielts = item['ielts_desc']
            ieltlsrw = re.findall(r"\d\.\d", ielts)
            # print(ieltlsrw)
            if len(ieltlsrw) >= 2:
                item['ielts'] = ieltlsrw[0]
                item['ielts_l'] = ieltlsrw[1]
                item['ielts_s'] = ieltlsrw[1]
                item['ielts_r'] = ieltlsrw[1]
                item['ielts_w'] = ieltlsrw[1]
            elif len(ieltlsrw) == 1:
                item['ielts'] = ieltlsrw[0]
                item['ielts_l'] = ieltlsrw[0]
                item['ielts_s'] = ieltlsrw[0]
                item['ielts_r'] = ieltlsrw[0]
                item['ielts_w'] = ieltlsrw[0]
            else:
                item["ielts"] = None  # float
                item["ielts_l"] = None  # float
                item["ielts_s"] = None  # float
                item["ielts_r"] = None  # float
                item["ielts_w"] = None
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            # 学费
            # fee = html.xpath("//div[@id='tuition-fees-contents']/div[@class='table-wrapper']/table[@class='expander-item-fees-table']/tbody/tr[@class='expander-item-fees-table-row odd']/td[@class='expander-item-fees-table-data odd'][2]//text()")
            tuition_fee = response.xpath(
                "//div[@id='tuition-fees-contents']//table[@class='expander-item-fees-table']/tbody/tr[1]/td[4]//text()"
            ).extract()
            clear_space(tuition_fee)
            tuition_fee = ''.join(tuition_fee)
            # print(tuition_fee)
            if "£" in tuition_fee:
                item['tuition_fee'] = int(
                    tuition_fee.replace('£', '').replace(',', ''))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])
            print("item['tuition_fee'] = ", item['tuition_fee'])

            # //div[@id='how-to-apply']
            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            print("item['apply_proces_en'] = ", item['apply_proces_en'])

            assessment_en = response.xpath("//div[@id='assessment']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            print("item['assessment_en'] = ", item['assessment_en'])

            career = response.xpath(
                "//div[@id='careers-and-employability']|//div[@id='careers-employability']|//div[@id='employabilitycareers']|//div[@id='employability-and-careers-']|//div[@id='careers-in-child-nursing-']|//div[@id='careers']|//div[@id='graduate-employability-and-careers']|//div[@id='careers-in-radiotherapy-physics']|//div[@id='careers-in-midwifery']|//div[@id='careers-in-neurophysiology-']|//div[@id='careers-in-psychology-']|//div[@id='careers-in-adult-nursing-']|//div[@id='careers-in-nursing']|//div[@id='career-prospects-']"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en'] = ", item['career_en'])

            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #26
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.bathspa.ac.uk/"
        item['university'] = "Bath Spa University"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            item['location'] = 'Bath'
            # 专业、学位类型//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1
            programme = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/p[1]//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type)
            # print("item['degree_name']: ", item['degree_name'])
            if item['degree_name'] == "" and "phd" in item[
                    'programme_en'].lower(
                    ) or item['degree_name'] == "" and "doctorate" in item[
                        'programme_en'].lower():
                item['degree_name'] = 'phd'
                item['teach_type'] = 'phd'
                # 学位类型
                item['degree_type'] = 3
            # print("item['teach_type']: ", item['teach_type'])
            # print("item['degree_type']: ", item['degree_type'])
            print("item['degree_name']: ", item['degree_name'])

            # //div[@class='content']/div[@class='collapsible-content'][1]/div[2]/div[1]
            overview = response.xpath(
                "//h3[contains(text(),'Overview')]/..").extract()
            if len(overview) == 0:
                overview = response.xpath(
                    "//h3[contains(text(),'overview')]/..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            modules = response.xpath(
                "//h3[contains(text(),'Course structure')]/..|//h3[contains(text(),'Course modules')]/.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'How will I be assessed?')]/..|//h3[contains(text(),'How will I be taught?')]/..|//h3[contains(text(),'Assessment')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            career = response.xpath(
                "//h3[contains(text(),'Career')]/..|//h3[contains(text(),'career')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            feeContent = response.xpath(
                "//h3[contains(text(),'International students full time')]/../div/table[1]//td[contains(text(), '2018/19 entry')]/following-sibling::td//text()"
            ).extract()
            clear_space(feeContent)
            # print(feeContent)
            if len(feeContent) > 0:
                item['tuition_fee'] = int(feeContent[0].replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@class='content']/div[@class='collapsible-content highlighted']/div[2]/div[2]
            entry_requirements = response.xpath(
                "//div[@class='content']/div[@class='collapsible-content highlighted']//text()"
            ).extract()
            clear_space(entry_requirements)
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ieltsList = re.findall(r".{1,60}IELTS.{1,60}",
                                   item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ieltsList)
            print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            interview_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/.."
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])

            # https://www.bathspa.ac.uk/international/country-advice/china/
            item[
                'require_chinese_en'] = "<p><strong>Postgraduate</strong></p><ul><li>Normally a Bachelor's degree with honours and a good passing grade from an internationally recognised university or Higher Education institution</li><li>Other international qualifications to an equivalent standard will also be considered.</li></ul> "

            # https://www.bathspa.ac.uk/applicants/how-to-apply/postgraduate/
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="content">
      <div data-hash-anchor='<a id="d.en.1289"></a>'></div>
<div class="intro-text">
	<p class="intro">You can apply for one of our taught postgraduate courses online from the webpage for the course you're interested in.</p>
</div><div class="rich-text" >
  <div data-hash-anchor='<a id="d.en.1291"></a>'></div>
    <div>
        <h2>How to apply</h2>
<p>To apply simply hit on the "Apply Now" on the course’s webpage. You'll need to create an online account.</p>
<p>Don’t have time to complete your whole application? Don’t worry, you can save your application and come back to it at anytime.</p>
<p>Entry requirements are listed on the course's webpage. If you don’t hold a first degree you may be required to provide additional evidence to support your application.</p>
<p><a href="/courses/">Search for your course</a></p>
<h3>What do I need?</h3>
<p>As part of the online application you’ll need to upload a variety of documents. This may include:</p>
<ul>
<li>Copy of passport</li>
<li>Qualifications</li>
<li>Portfolio</li>
<li>Previous UK visas (if applicable)</li>
<li>Reference.</li>
</ul>
<h3>Contact us</h3>
<p>Please contact us if you have any questions or concerns:&nbsp;<a href="mailto:[email protected]">[email protected]</a></p>
<p>Phone: +44 (0)1225 876180</p>
<h3>Interviews</h3>
<p>You may be required to attend an interview as part of the selection process for a postgraduate course. This is usually a 30 minute discussion of your experience and any work submitted with the application.</p>
<p>Telephone or Skype interviews can usually be arranged for applicants applying from outside of the UK.</p>
    </div>
</div>
"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            #             department_dict = {"arts management":"Bath Business School","accounting and finance":"Bath Business School",
            # "business and management":"Bath Business School",
            # "business and management (accounting)":"Bath Business School",
            # "business and management (entrepreneurship)":"Bath Business School",
            # "business and management (international business)":"Bath Business School",
            # "business and management (marketing)":"Bath Business School",
            # "curatorial practice":"Bath School of Art and Design",
            # "design (ceramics)":"Bath School of Art and Design",
            # "design (fashion and textiles)":"Bath School of Art and Design",
            # "fine art":"Bath School of Art and Design",
            # "visual communication":"Bath School of Art and Design",
            # "children's publishing":"College of Liberal Arts",
            # "classical acting":"College of Liberal Arts",
            # "composition":"College of Liberal Arts",
            # "creative producing":"College of Liberal Arts",
            # "creative writing":"College of Liberal Arts",
            # "creative writing phd":"College of Liberal Arts",
            # "crime and gothic fictions":"College of Liberal Arts",
            # "dance":"College of Liberal Arts",
            # "directing":"College of Liberal Arts",
            # "directing circus":"College of Liberal Arts",
            # "environmental humanities":"College of Liberal Arts",
            # "environmental management":"College of Liberal Arts",
            # "feature filmmaking":"College of Liberal Arts",
            # "heritage management":"College of Liberal Arts",
            # "intercultural musicology":"College of Liberal Arts",
            # "liberal arts":"College of Liberal Arts",
            # "literature, landscape and environment":"College of Liberal Arts",
            # "music performance":"College of Liberal Arts",
            # "performing shakespeare":"College of Liberal Arts",
            # "principles of applied neuropsychology":"College of Liberal Arts",
            # "scriptwriting":"College of Liberal Arts",
            # "songwriting (campus based)":"College of Liberal Arts",
            # "songwriting (distance learning)":"College of Liberal Arts",
            # "sound (arts)":"College of Liberal Arts",
            # "sound (design)":"College of Liberal Arts",
            # "sound (production)":"College of Liberal Arts",
            # "theatre for young audiences":"College of Liberal Arts",
            # "transnational writing":"College of Liberal Arts",
            # "travel and nature writing":"College of Liberal Arts",
            # "writing for young people":"College of Liberal Arts",
            # "counselling and psychotherapy practice":"Institute for Education",
            # "education (education studies)":"Institute for Education",
            # "education (early childhood studies)":"Institute for Education",
            # "education (international education)":"Institute for Education",
            # "education (leadership and management)":"Institute for Education",
            # "inclusive education":"Institute for Education",
            # "professional practice":"Institute for Education",
            # "professional practice in higher education":"Institute for Education",
            # "teaching english to speakers of other languages":"Institute for Education",
            # "specific learning difficulties / dyslexia":"Institute for Education",
            # "national award for special educational needs coordination":"Institute for Education",
            # "professional doctorate in education":"Institute for Education",
            # }
            #             item['department'] = department_dict.get(item['programme_en'].lower())
            #             print("item['department']: ", item['department'])
            department = response.xpath(
                "//dt[contains(text(),'School')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            location = response.xpath(
                "//dt[contains(text(),'Campus or location')]/following-sibling::dd[1]//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            print("item['location']: ", item['location'])

            # duration
            durationMode = response.xpath(
                "//dt[contains(text(),'Course length')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(durationMode)
            print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            duration_list = getIntDuration(''.join(durationMode))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            item['teach_time'] = getTeachTime(''.join(durationMode))
            print("item['duration']: ", item['duration'])
            print("item['teach_time']: ", item['teach_time'])
            print("item['duration_per']: ", item['duration_per'])
            item['other'] = durationMode
            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #27
0
    def parse_data(self, response):
        """Scrape one University of Bolton course page into an item.

        Builds a ``ScrapyschoolEnglandItem1`` via ``get_item1`` and fills
        it from XPath lookups on ``response``.  The department is looked up
        from ``response.meta['subjectArea']``, so the request that led here
        must carry that meta key.

        The item is yielded only when its URL contains
        ``https://courses.bolton.ac.uk/course`` and either the "more
        information" link text contains "postgraduate", or the page has
        neither that link nor any UCAS code/points entry.

        Any exception raised while scraping is appended, together with the
        page URL, to a per-university text file in the working directory;
        in that case nothing is yielded.

        NOTE(review): ``clear_space`` is called for its side effect only,
        so it is assumed to clean the list in place -- confirm against the
        helper's definition.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Bolton"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree type
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        print("subjectArea===: ", response.meta['subjectArea'])
        try:
            # The page heading holds the programme title together with the
            # award name; the award is removed from it below.
            programmeDegreetype = response.xpath(
                "//div[@class='wpb_text_column wpb_content_element  vc_custom_1506499626241']/div[@class='wpb_wrapper']/h2//text()"
            ).extract()
            programmeDegreetypeStr = ''.join(programmeDegreetype).strip()

            degree_type = response.xpath(
                "//li[@class='iconim award']//b[contains(text(),'Award:')]/..//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).replace("Award:",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            item['programme_en'] = programmeDegreetypeStr.replace(
                item['degree_name'], '').replace("()", "").strip()

            mode = response.xpath(
                "//b[contains(text(),'Course type:')]/..//text()").extract()
            clear_space(mode)
            item['teach_time'] = getTeachTime(''.join(mode))

            start_date = response.xpath(
                "//li[@class='iconim date']//b[contains(text(),'Start date:')]/..//text()"
            ).extract()
            clear_space(start_date)
            start_date_str = ''.join(start_date).replace("Start date:",
                                                         "").strip()
            # Reorder each a/b/c match to c-b-a (presumably dd/mm/yyyy on
            # the page becoming yyyy-mm-dd -- confirm against live data).
            reordered_dates = [
                "-".join(reversed(found.split('/')))
                for found in re.findall(r"\d+/\d+/\d+", start_date_str)
            ]
            if reordered_dates:
                # `or ""` covers a field that is still None; appending to it
                # with `+=` raised TypeError and lost the whole item.
                item['start_date'] = (item['start_date']
                                      or "") + ", ".join(reordered_dates)
            if item['start_date'] is not None:
                item['start_date'] = item['start_date'].strip().rstrip(
                    ',').strip()

            location = response.xpath(
                "//li[@class='iconim location']//b[contains(text(),'Location:')]/..//text()"
            ).extract()
            item['location'] = ''.join(location).replace("Location:",
                                                         "").strip()

            duration = response.xpath(
                "//li[@class='iconim duration']//b[contains(text(),'Duration:')]/..//text()"
            ).extract()
            clear_space(duration)
            duration_list = getIntDuration(''.join(duration))
            # Used only when the helper returns exactly two entries
            # (stored as duration first, duration_per last).
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            overview_en = response.xpath(
                "//div[@id='course-details']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))

            # 'rntry_requirements' is the item's actual field key, so the
            # spelling must stay as it is.
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            ielts_desc = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).strip()

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')

            career_en = response.xpath(
                "//div[@id='careers-employment']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))

            modules = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__modules']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            assessment_en = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__teaching-methods']"
                "|//div[@class='tab_content modules_tab_content tab__teaching-assessment__assessment-methods']"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            # Tuition fee: cell following the first "2018/" header in the
            # international fees table.
            tuition_fee = response.xpath(
                "//h3[@class='table_header'][contains(text(),'International fees')]/following-sibling::div[1]/table//tr/th[contains(text(),'2018/')][1]/following-sibling::td[1]//text()"
            ).extract()
            if tuition_fee:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"

            # Subject area (from the request meta) -> owning school/faculty.
            department_dict = {
                "Art & Design and Fine Art": "Bolton School of the Arts",
                "Textiles & Fashion": "Bolton School of the Arts",
                "Media & Photography": "Bolton School of the Arts",
                "Theatre & Performance": "Bolton School of the Arts",
                "English & Creative Writing": "Bolton School of the Arts",
                "Graphic Design": "Bolton School of the Arts",
                "Animation & Illustration": "Bolton School of the Arts",
                "Accountancy":
                "Institute of Management Greater Manchester",
                "Business, Retail, Logistics & Supply Chain Management":
                "Institute of Management Greater Manchester",
                "Nursing": "Faculty of Health & Wellbeing",
                "Health & Social Care": "Faculty of Health & Wellbeing",
                "Dental Sciences": "Faculty of Health & Wellbeing",
                "Early Years & Childhood Studies":
                "Faculty of Health & Wellbeing",
                "Community Work & Youth": "Faculty of Health & Wellbeing",
                "School of Sport & Biological Sciences":
                "Faculty of Health & Wellbeing",
                "Automotive Design":
                "National Centre for Motorsport Engineering",
                "Chassis Dynamics & Aerodynamics":
                "National Centre for Motorsport Engineering",
                "General Engineering":
                "National Centre for Motorsport Engineering",
                "Motorsport & Trackside Technology":
                "National Centre for Motorsport Engineering",
                "Engines & Performance Modelling":
                "National Centre for Motorsport Engineering",
                "Our Partners":
                "National Centre for Motorsport Engineering",
                "Computing": "School of Creative Technologies",
                "Games": "School of Creative Technologies",
                "Special & Visual Effects": "School of Creative Technologies",
                "Education & Teacher Training":
                "School of Education & Psychology",
                "Psychology": "School of Education & Psychology",
                "Access courses": "School of Education & Psychology",
                "International Foundation programmes & English Pre-Sessional courses":
                "School of Education & Psychology",
                "Construction": "School of Engineering",
                "Civil Engineering": "School of Engineering",
                "Mechanical Engineering": "School of Engineering",
                "Motorsport & Automotive Performance Engineering":
                "School of Engineering",
                "Biomedical & Medical Engineering": "School of Engineering",
                "Electrical & Electronic Engineering": "School of Engineering",
                "Mathematics": "School of Engineering",
                "Law": "School of Law",
                "Centre for Contemporary Coronial Law": "School of Law",
                "Medical Biology": "School of Sport & Biological Sciences",
                "Sports & Sport Rehabilitation":
                "School of Sport & Biological Sciences",
            }
            item['department'] = department_dict.get(
                response.meta['subjectArea'])
            print("item['department']: ", item['department'])

            item[
                'require_chinese_en'] = "<p><strong>Postgraduate</strong></p><p><em>Taught Postgraduate Programmes:</em></p><p>Bachelor degree from a recognised Chinese university.</p>"

            # Postgraduate filter.  isup_str is taken from the "more
            # information" link only; when that link is missing, isup is
            # re-read from the UCAS code/points entries instead.
            isup = response.xpath(
                "//a[contains(text(),'Click here for more information on')]//text()"
            ).extract()
            isup_str = ''.join(isup)
            if not isup:
                isup = response.xpath(
                    "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
                    "|//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/..//text()"
                ).extract()
            print("isup_str: ", isup_str)
            print("isup: ", isup)
            if "https://courses.bolton.ac.uk/course" in item['url']:
                if "postgraduate" in isup_str or not isup:
                    # The message below reads "save to database".
                    print("******存到数据库*****")
                    yield item

        except Exception as e:
            # Best-effort error log: one append-only file per
            # university/degree type, then echo to stdout.
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #28
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "Loughborough University"
        # item['country'] = 'England'
        # item['website'] = 'http://www.lboro.ac.uk/'
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # 学位名称
            degree_name = response.xpath(
                "//span[@class='emphasised']//text()|"
                "//h1[@class='degree-info__heading']/text()").extract()
            # print("degree_name: ", degree_name)
            item['degree_name'] = ''.join(degree_name).replace(
                ', PG certificate', '').strip()
            print("item['degree_name']: ", item['degree_name'])

            # 专业
            programme_en = response.xpath(
                "//h1[@id='top']/text()|"
                "//h1[@class='degree-info__heading']/span//text()").extract()
            clear_space(programme_en)
            item['programme_en'] = ''.join(programme_en).strip()
            print("item['programme_en']: ", item['programme_en'])

            # 学院
            item['department'] = response.meta.get(item['programme_en'])
            print("item['department']: ", item['department'])

            # 授课类型
            mode = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Full-time:')]//text()"
            ).extract()
            clear_space(mode)
            if len(mode) != 0:
                item['teach_time'] = 'fulltime'
            # print("item['teach_time']: ", item['teach_time'])

            duration = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Full-time:')]/following-sibling::dd//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            # if 'year' in ''.join(duration):
            #     item['duration'] = int(''.join(duration).replace('year', '').strip())
            #     item['duration_per'] = 1
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            start_date = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Start date:')]/following-sibling::dd//text()"
            ).extract()
            clear_space(start_date)
            # print(start_date)
            item['start_date'] = ''.join(start_date).replace(
                '(module restrictions apply)', '').strip()
            # print("item['start_date']: ", item['start_date'])

            # tuition_fee = response.xpath(
            #     "//dt[@class='list__item list__item--term'][contains(text(),'International fees:')]/following-sibling::dd//text()").extract()
            tuition_fee = response.xpath(
                "//span[contains(text(),'International fee')]/../following-sibling::dd//text()"
            ).extract()
            clear_space(tuition_fee)
            if "£" in ''.join(tuition_fee):
                item['tuition_fee_pre'] = '£'
                item['tuition_fee'] = ''.join(tuition_fee).replace(
                    '£', '').replace(',', '').strip()
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            location = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Location:')]/following-sibling::dd//text()"
            ).extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            allcontent = response.xpath(
                "//nav[@class='programme-nav nav']/following-sibling::*//text()"
            ).extract()
            clear_space(allcontent)
            # print("allcontent: ", allcontent)

            # 专业描述
            if "Overview" in allcontent:
                overviewIndex = allcontent.index("Overview")
                if "Entry requirements" in allcontent:
                    overviewIndexEnd = allcontent.index("Entry requirements")
                    overview = allcontent[overviewIndex + 1:overviewIndexEnd]
                    # clear_space(overview)
                    item['overview_en'] = clear_lianxu_space(overview).strip(
                        "Entry requirements").strip()
            if item['overview_en'] != "":
                item['overview_en'] = "<h2>Overview</h2><div>" + item[
                    'overview_en'] + "</div>"
            else:
                overview = response.xpath(
                    "//span[contains(text(),'Entry')]/../../../../preceding-sibling::div"
                ).extract()
                overview_en = remove_class(clear_lianxu_space(overview))
                item['overview_en'] = overview_en
            print("item['overview_en']: ", item['overview_en'])

            entry = response.xpath(
                "//h2[contains(text(),'Entry requirements')]/..//text()|"
                "//h2[contains(text(),'Entry Requirements')]/..//text()"
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(entry)
            # if item['rntry_requirements'] == "":
            #     print("entry_requ 为空")
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            # 学术要求
            # if "Entry requirements" in allcontent:
            #     entry_requirementsIndex = allcontent.index("Entry requirements")
            #     if "English Language requirements" in allcontent:
            #         entry_requirementsIndexEnd = allcontent.index("English Language requirements")
            #         entry_requirements = allcontent[entry_requirementsIndex:entry_requirementsIndexEnd]
            #         # clear_space(entry_requirements)
            #         item['rntry_requirements'] = clear_lianxu_space(entry_requirements).replace(
            #             "English Language requirements", "").strip()
            #     elif "English language requirements" in allcontent:
            #         entry_requirementsIndexEnd = allcontent.index("English language requirements")
            #         entry_requirements = allcontent[entry_requirementsIndex:entry_requirementsIndexEnd]
            #         # clear_space(entry_requirements)
            #         item['rntry_requirements'] = clear_lianxu_space(entry_requirements).strip().replace(
            #             "English language requirements", '').strip()
            #     elif "English Language Requirements" in allcontent:
            #         entry_requirementsIndexEnd = allcontent.index("English Language Requirements")
            #         entry_requirements = allcontent[entry_requirementsIndex:entry_requirementsIndexEnd]
            #         # clear_space(entry_requirements)
            #         item['rntry_requirements'] = clear_lianxu_space(entry_requirements).replace(
            #             "English Language Requirements", '').strip()
            # item['rntry_requirements'] = "Entry requirements " + item['rntry_requirements'].replace(
            #     "Entry requirements", "").strip()
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            # IELTS
            ielts_toefl = response.xpath(
                "//h2[contains(text(),'English')]/..//text()").extract()
            clear_space(ielts_toefl)
            if len(ielts_toefl) == 0:
                print("ielts_toefl 为空")
            ielts = "".join(ielts_toefl)
            # if "English Language requirements" in allcontent:
            #     ieltsIndex = allcontent.index("English Language requirements")
            #     if "What you'll study" in allcontent:
            #         ieltsIndexIndexEnd = allcontent.index("What you'll study")
            #         ielts = allcontent[ieltsIndex:ieltsIndexIndexEnd]
            #         clear_space(ielts)
            #         ielts = ''.join(ielts).strip()
            # elif "English language requirements" in allcontent:
            #     ieltsIndex = allcontent.index("English language requirements")
            #     if "What you'll study" in allcontent:
            #         ieltsIndexIndexEnd = allcontent.index("What you'll study")
            #         ielts = allcontent[ieltsIndex:ieltsIndexIndexEnd]
            #         clear_space(ielts)
            #         ielts = ''.join(ielts).strip()
            # elif "English Language Requirements" in allcontent:
            #     ieltsIndex = allcontent.index("English Language Requirements")
            #     if "What you'll study" in allcontent:
            #         ieltsIndexIndexEnd = allcontent.index("What you'll study")
            #         ielts = allcontent[ieltsIndex:ieltsIndexIndexEnd]
            #         clear_space(ielts)
            #         ielts = ''.join(ielts).strip()
            # elif "English Language Entry Requirements" in allcontent:
            #     ieltsIndex = allcontent.index("English Language Entry Requirements")
            #     if "What you'll study" in allcontent:
            #         ieltsIndexIndexEnd = allcontent.index("What you'll study")
            #         ielts = allcontent[ieltsIndex:ieltsIndexIndexEnd]
            #         clear_space(ielts)
            #         ielts = ''.join(ielts).strip()
            ielts_re = re.findall(r"IELTS.{1,80}", ielts)
            # print("ielts_re = ", ielts_re)
            toefl_re = re.findall(r"TOEFL.{1,80}", ielts)
            # print("toefl_re = ", toefl_re)

            item['ielts_desc'] = ''.join(ielts_re)
            print("item['ielts_desc']: ", item['ielts_desc'])
            item['toefl_desc'] = ''.join(toefl_re)
            print("item['toefl_desc']: ", item['toefl_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            toeflDict = get_toefl(item['toefl_desc'])
            item["toefl"] = toeflDict.get("TOEFL")  # float
            item["toefl_l"] = toeflDict.get("TOEFL_L")  # float
            item["toefl_s"] = toeflDict.get("TOEFL_S")  # float
            item["toefl_r"] = toeflDict.get("TOEFL_R")  # float
            item["toefl_w"] = toeflDict.get("TOEFL_W")
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # modules
            if "What you'll study" in allcontent:
                modulesIndex = allcontent.index("What you'll study")
                if "How you'll be assessed" in allcontent:
                    modulesIndexEnd = allcontent.index(
                        "How you'll be assessed")
                    modules = allcontent[modulesIndex:modulesIndexEnd]
                    # clear_space(modules)
                    item['modules_en'] = clear_lianxu_space(modules)
                elif "How you'll study" in allcontent:
                    modulesIndexEnd = allcontent.index("How you'll study")
                    modules = allcontent[modulesIndex:modulesIndexEnd]
                    # clear_space(modules)
                    item['modules_en'] = clear_lianxu_space(modules)
            item['modules_en'] = "<div>" + item['modules_en'] + "</div>"
            # module = response.xpath(r"//h3[@class='subheading'][contains(text(),'Modules')]/../../preceding-sibling::div[1]/following-sibling::div[@class='content-type content-type--toggle']").extract()
            # print("module: ", module)
            # print(len(module))
            # print("item['modules_en']: ", item['modules_en'])

            # teaching_assessment
            if "How you'll be assessed" in allcontent:
                teaching_assessmentIndex = allcontent.index(
                    "How you'll be assessed")
                if "Your personal and professional development" in allcontent:
                    teaching_assessmentIndexEnd = allcontent.index(
                        "Your personal and professional development")
                    teaching_assessment = allcontent[
                        teaching_assessmentIndex +
                        1:teaching_assessmentIndexEnd]
                    item[
                        'assessment_en'] = "<h2>How you'll be assessed</h2><div>" + clear_lianxu_space(
                            teaching_assessment) + "</div>"
            elif "How you'll study" in allcontent:
                teaching_assessmentIndex = allcontent.index("How you'll study")
                if "Your personal and professional development" in allcontent:
                    teaching_assessmentIndexEnd = allcontent.index(
                        "Your personal and professional development")
                    teaching_assessment = allcontent[
                        teaching_assessmentIndex +
                        1:teaching_assessmentIndexEnd]
                    item[
                        'assessment_en'] = "<h2>How you'll study</h2><div>" + clear_lianxu_space(
                            teaching_assessment) + "</div>"
            # print("item['assessment_en']: ", item['assessment_en'])

            # career
            if "Your personal and professional development" in allcontent:
                careerIndex = allcontent.index(
                    "Your personal and professional development")
                if "Fees and funding" in allcontent:
                    careerIndexEnd = allcontent.index("Fees and funding")
                    career = allcontent[careerIndex + 1:careerIndexEnd]
                    item['career_en'] = clear_lianxu_space(career)
            item[
                'career_en'] = "<h2>Your personal and professional development</h2><div>" + item[
                    'career_en'] + "</div>"
            # print("item['career_en']: ", item['career_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<div id="content-wrapper-wide" class="standard ">
<div class="content-wrapper">
<a name="d.en.1074686"></a>
<h3>Postgraduate</h3>
<p>Students are required to have a bachelor degree (4 years) for entry to a postgraduate programme. The University uses the <a href="http://rank2013.netbig.com/">Netbig 2013</a> university ranking to identify the required final mark, as outlined on the table below:&nbsp;</p>
<table border="1" cellpadding="0" cellspacing="0" style="width: 650px;">
<tbody>
<tr>
<td valign="top" width="121">
<p><strong>NETBIG rank 2013 </strong></p>
</td>
<td valign="top" width="130">
<p align="center"><strong>First</strong></p>
</td>
<td valign="top" width="130">
<p align="center"><strong>High 2:1 <br /> (65%)</strong></p>
</td>
<td valign="top" width="134">
<p align="center"><strong>2:1</strong></p>
</td>
<td valign="top" width="132">
<p align="center"><strong>High 2:2 <br /> (55-57%)</strong></p>
</td>
<td valign="top" width="132">
<p align="center"><strong>2:2</strong></p>
</td>
</tr>
<tr>
<td valign="top" width="121">
<p><strong>Top 150</strong></p>
</td>
<td valign="top" width="130">
<p align="center">84</p>
</td>
<td valign="top" width="130">
<p align="center">81</p>
</td>
<td valign="top" width="134">
<p align="center">80</p>
</td>
<td valign="top" width="132">
<p align="center">78</p>
</td>
<td valign="top" width="132">
<p align="center">77</p>
</td>
</tr>
<tr>
<td valign="top" width="121">
<p><strong>151-250</strong></p>
</td>
<td valign="top" width="130">
<p align="center">87</p>
</td>
<td valign="top" width="130">
<p align="center">83</p>
</td>
<td valign="top" width="134">
<p align="center">82</p>
</td>
<td valign="top" width="132">
<p align="center">80</p>
</td>
<td valign="top" width="132">
<p align="center">79</p>
</td>
</tr>
<tr>
<td valign="top" width="121">
<p><strong>251-500</strong></p>
</td>
<td valign="top" width="130">
<p align="center">89</p>
</td>
<td valign="top" width="130">
<p align="center">85</p>
</td>
<td valign="top" width="134">
<p align="center">84</p>
</td>
<td valign="top" width="132">
<p align="center">82</p>
</td>
<td valign="top" width="132">
<p align="center">80</p>
</td>
</tr>
<tr>
<td valign="top" width="121">
<p><strong>501+</strong></p>
</td>
<td valign="top" width="130">
<p align="center">92</p>
<p align="center"><span>(SBE: No Offer)</span></p>
</td>
<td valign="top" width="130">
<p align="center">87</p>
<p align="center">(SBE: No Offer)</p>
</td>
<td valign="top" width="134">
<p align="center">86</p>
<p align="center"><span>(SBE: No Offer)</span></p>
</td>
<td valign="top" width="132">
<p align="center">85</p>
<p align="center"><span>(SBE: No Offer)</span></p>
</td>
<td valign="top" width="132">
<p align="center">82</p>
<p align="center"><span>(SBE: No Offer)</span></p>
</td>
</tr>
</tbody>
</table>
<p>&nbsp;</p>
<div class="clear"></div>
</div><!-- #content-wrapper -->
</div><!-- #content-wrapper-wide -->
<div class="clear"></div>    <!-- ACCORDION - STANDARD - SINGLE -->
<div class="content-wrapper">
<h3 class="trigger fcbg1"><a href="#">Affiliated Colleges</a></h3>
<div class="toggle_container">
<p>The University will consider students from Colleges affiliated to 211 and 985 universities and universities in the top 150 Netbig 2013 rankings. &nbsp;Applicants from these Colleges will be considered as follows:</p>
<ul>
<li>School of Business and Economics with 82% &ndash; 85%</li>
<li>All other programmes with 79% &ndash; 83%.&nbsp;</li>
</ul>
<p>Students from Colleges affiliated to universities with a Netbig 2013 rank of 151 &ndash; 250 will be considered as follows:</p>
<ul>
<li>School of Business and Economics with 85% &ndash; 86%</li>
<li>All other programmes with 80% &ndash; 85%.</li>
</ul>
<p>Students from Colleges affiliated to universities with a Netbig 2013 rank of 251 &ndash; 500 will be considered as follows:</p>
<ul>
<li>School of Business and Economics: not considered</li>
<li>All other programmes with 82% &ndash; 86%.</li>
</ul>
</div>
</div><!-- #content-wrapper -->    <!-- ACCORDION - STANDARD - SINGLE -->
<div class="content-wrapper">
<h3 class="trigger fcbg1"><a href="#">Business and Economics</a></h3>
<div class="toggle_container">
<p>The School of Business and Economics will give special consideration to students who have studied at a university which specialises in business or has expertise in another area.&nbsp; A list of these universities and the grades required can be found here:&nbsp;<a href="/terminalfour/SiteManager?ctfn=download&amp;fnno=60&amp;ceid=273195225">SBE Chinese Universities</a>&zwnj;.&nbsp; Applicants from these universities will be considered with 77% - 84% (depending on programme applied to).</p>
<p>Students who do not meet the above requirements may be considered if they have a relevant degree, can show good grades in relevant subjects, and/or have substantial relevant work experience.</p>
</div>
</div>"""
                ]))
            print("item['require_chinese_en']: ", item['require_chinese_en'])

            item[
                'apply_proces_en'] = "http://www.lboro.ac.uk/study/postgraduate/apply/taught-applications/"
            print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item

        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #29
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.manchester.ac.uk/"
        item['university'] = "The University of Manchester"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item['location'] = "Oxford Rd, Manchester, M13 9PL, UK"
        print("===============================")
        print(response.url)
        try:
            # print(response.url)
            # 专业、学位类型
            programmeDegree = response.xpath(
                "//div[@id='course-profile']/div[@class='heading']/h1//text()"
            ).extract()
            clear_space(programmeDegree)
            programmeDegreeStr = ''.join(programmeDegree).strip()
            print(programmeDegreeStr)
            # degree_type = list(re.findall(r"^(\w{0,6})|(\w{0,6}/\w{0,6})\s", programmeDegreeStr)[0])
            degree_type = re.findall(
                r"^(Postgraduate\sCertificate)|(MBA)|^(\w{0,6}/\w{0,6}/\w{0,6})|^(\w{0,6}/\w{0,6})|^(\w{0,6})\s",
                programmeDegreeStr)
            if len(degree_type) > 0:
                degree_type = list(degree_type[0])
                print("degree_type = ", degree_type)
                item['degree_name'] = ''.join(degree_type).strip()
                if item['degree_name'] == "MBA":
                    item['programme_en'] = programmeDegreeStr
                else:
                    item['programme_en'] = programmeDegreeStr.replace(
                        item['degree_name'], "").strip("in").strip()
                # item['programme_en'] = programme[-1].strip()
            else:
                item['programme_en'] = programmeDegreeStr
            print("item['degree_name'] = ", item['degree_name'])
            print("item['programme_en'] = ", item['programme_en'])

            start_date = response.xpath(
                "//*[contains(text(), 'Year of entry:')]//text()").extract()
            item['start_date'] = ''.join(start_date).replace(
                "Year of entry:", "").strip()
            # print("item['start_date'] = ", item['start_date'])

            duration = response.xpath(
                "//div[@id='course-profile']/div[@class='course-profile-content full-page']/div[@class='fact-file']/dl/dd[2]//text()"
            ).extract()
            durationStr = ''.join(duration)
            # print("durationStr = ", durationStr)
            if "full" in durationStr or "Full" in durationStr or "FT" in durationStr or "ft" in durationStr:
                item['teach_time'] = "fulltime"
            duration_re = re.findall(
                r"([a-zA-Z0-9\.]+\s)(year|month|week|yr|yft){1}|([0-9\.]+)(yr|yft|\-month){1}",
                durationStr, re.I)
            # print("duration_re = ", duration_re)
            d_dict = {
                "One": "1",
                "Two": "2",
                "Three": "3",
                "Four": "4",
                "Five": "5",
                "Six": "6",
                "Seven": "7",
                "Eight": "8",
                "Nine": "9",
                "Ten": "10",
                "one": "1",
                "two": "2",
                "three": "3",
                "four": "4",
                "five": "5",
                "six": "6",
                "seven": "7",
                "eight": "8",
                "nine": "9",
                "ten": "10",
            }
            if len(duration_re) > 0:
                d_int = re.findall(r"\d+", ''.join(duration_re[0]))
                if len(d_int) > 0:
                    item['duration'] = int(''.join(d_int))
                else:
                    d = re.findall(
                        r"(One)|(Two)|(Three)|(Four)|(Five)|(Six)|(Seven)|(Eight)|(Nine)|(Ten)|(one)|(two)|(three)|(four)|(five)|(six)|(seven)|(eight)|(nine)|(ten)",
                        ', '.join(duration_re[0]))
                    print("d = ", d)
                    item['duration'] = int(d_dict.get(''.join(d[0]).strip()))
                if "y" in ''.join(duration_re[0]) or "Y" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 1
                elif "m" in ''.join(duration_re[0]) or "M" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 3
                elif "w" in ''.join(duration_re[0]) or "W" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 4
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            department = response.xpath(
                "//*[contains(text(), 'Academic department')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(department)
            # print(department)
            if len(department) > 0:
                item['department'] = department[0]
            # print("item['department'] = ", item['department'])

            # 专业描述,雅思托福,就业方向, 学术要求,How To Apply
            overview = response.xpath(
                '//h3[@id="course-overview"]/following-sibling::div[1]'
            ).extract()
            overview1 = response.xpath(
                '//h3[@id="course-description"]/following-sibling::div[1]'
            ).extract()
            print('===', len(overview1))
            if len(overview1) == 2:
                overview1 = [overview1[0]]
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview)) + remove_class(
                    clear_lianxu_space((overview1)))
            print("item['overview_en'] = ", item['overview_en'])

            # Entry requirements
            rntry_requirements = response.xpath(
                '//h2[@id="entry-requirements"]/following-sibling::*[position()<9]//text()'
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'IELTS')]//text()"
            ).extract()
            if len(ielts_desc) == 0:
                ielts_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1][contains(text(), 'IELTS')]//text()"
                ).extract()
            if ''.join(ielts_desc).strip() == "IELTS":
                ielts_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'IELTS')]/..//text()"
                ).extract()
            clear_space(ielts_desc)
            # if len(ielts_desc) > 0:
            item['ielts_desc'] = clear_lianxu_space(ielts_desc)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            toefl_desc = response.xpath(
                "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'TOEFL')]//text()"
            ).extract()
            if len(toefl_desc) == 0:
                toefl_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1][contains(text(), 'TOEFL')]//text()"
                ).extract()
            if ''.join(toefl_desc).strip() == "IBT TOEFL:":
                toefl_desc = response.xpath(
                    "//h3[contains(text(), 'English language')]/following-sibling::div[1]//*[contains(text(), 'TOEFL')]/..//text()"
                ).extract()
            clear_space(toefl_desc)
            item['toefl_desc'] = clear_lianxu_space(toefl_desc).replace(
                "\nTOEFL code for Manchester is 0757", "").strip()
            # print("item['toefl_desc']: ", item['toefl_desc'])

            # ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            ielts_list = re.findall(r"[567]\.\d|[678]", item['ielts_desc'])
            # print("ielts_list: ", ielts_list)
            if len(ielts_list) == 1:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[0]
                item['ielts_s'] = ielts_list[0]
                item['ielts_r'] = ielts_list[0]
                item['ielts_w'] = ielts_list[0]
            elif len(ielts_list) == 2:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[1]
                item['ielts_s'] = ielts_list[1]
                item['ielts_r'] = ielts_list[1]
                item['ielts_w'] = ielts_list[1]
            elif len(ielts_list) == 3 or len(ielts_list) > 3:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[2]
                item['ielts_s'] = ielts_list[2]
                item['ielts_r'] = ielts_list[2]
                item['ielts_w'] = ielts_list[1]
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            toefl_list = re.findall(r"1[0-1]\d|[12789]\d", item['toefl_desc'])
            # print(toefl_list)
            if len(toefl_list) == 1:
                item['toefl'] = toefl_list[0]
                # item['toefl_l'] = toefl_list[0]
                # item['toefl_r'] = toefl_list[0]
                # item['toefl_s'] = toefl_list[0]
                # item['toefl_w'] = toefl_list[0]
            elif len(toefl_list) == 2:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[1]
                item['toefl_s'] = toefl_list[1]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 3:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[2]
                item['toefl_r'] = toefl_list[2]
                item['toefl_s'] = toefl_list[2]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 4:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[3]
                item['toefl_r'] = toefl_list[1]
                item['toefl_s'] = toefl_list[2]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 5:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[3]
                item['toefl_s'] = toefl_list[4]
                item['toefl_w'] = toefl_list[2]
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #                             item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            apply_proces_en = response.xpath(
                '//h2[@id="application-and-selection"]/following-sibling::*[position()<15]'
            ).extract()
            apply_proces_en_str = remove_class(
                clear_lianxu_space(apply_proces_en))
            # print(apply_proces_en_str.index("<h2>Course details</h2>"))
            if apply_proces_en_str.find("<h2>Course details</h2>") == -1:
                apply_proces_en_s1 = apply_proces_en_str[
                    0:len(apply_proces_en_str)]
            else:
                apply_proces_en_s1 = apply_proces_en_str[:apply_proces_en_str.find(
                    "<h2>Course details</h2>") - 1]
            item['apply_proces_en'] = apply_proces_en_s1
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            interview_desc_en = response.xpath(
                '//h3[contains(text(), "Interview requirements")]/following-sibling::div[1]'
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en'] = ", item['interview_desc_en'])

            modules_en = response.xpath(
                "//*[contains(text(), 'Course unit details')]/following-sibling::*[position()<5]"
            ).extract()
            if len(modules_en) == 0:
                modules_en = response.xpath(
                    "//*[contains(text(), 'Course unit list')]/following-sibling::*[position()<3]"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                '//*[@id="teaching-and-learning"]/following-sibling::*[position()<4]'
            ).extract()
            if len(assessment_en) == 0:
                assessment_en = response.xpath(
                    '//*[@id="coursework-and-assessment"]/following-sibling::*[position()<4]'
                ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en'] = ", item['assessment_en'])

            career_en = response.xpath(
                '//*[@id="careers"]/following-sibling::*').extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en'] = ", item['career_en'])

            fee1 = response.xpath(
                "//div[@id='course-profile']/div[@class='course-profile-content full-page']/ul[1]/li[1]//text()"
            ).extract()
            # print(fee1)
            fee = clear_lianxu_space(fee1)
            fee_re = re.findall(
                r"International\sstudents\s\(per\sannum\):\s£[\d,]+", fee)
            fee_re1 = re.findall(r"£[\d,]+", ''.join(fee_re))
            # print("fee_re1: ", fee_re1)
            f = ''.join(fee_re1).replace("£", "").replace(",", "").strip()
            if len(f) != 0:
                item['tuition_fee'] = int(f)
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])

            item[
                'require_chinese_en'] = """<h2>Master's entry requirements</h2>
<p><span>For entry onto our master&rsquo;s degrees we require&nbsp;a minimum overall mark of 80% or CGPA of 3.0/4.0 in a Law degree with an average of 80% or higher in law units from a well ranked institution. We will accept relevant degrees for the MA study.</span></p>
<p>For all our LLM courses (except the LLM Healthcare Ethics and Law) we require an undergraduate Law degree. For MA courses we consider degrees in relevant disciplines.</p>"""
            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
コード例 #30
0
    def parse_data(self, response):
        """Build one item from a University of Portsmouth course page.

        The item is created by ``get_item1(ScrapyschoolEnglandItem1)``, filled
        with constant fields (university, URL, teach type, degree type,
        location) and then with values pulled from ``response`` via XPath:
        programme title, degree name, department, duration, tuition fee,
        entry requirements, IELTS figures, overview, modules, assessment and
        career sections.  The finished item is yielded.

        Any exception raised while extracting is appended, together with the
        page URL, to a text file named after the university and degree type
        (relative to the current working directory) and echoed to stdout;
        nothing is yielded in that case.

        NOTE(review): ``clear_space`` is called for its effect on the list
        passed in (its return value is never used) -- presumably it strips
        blank entries in place; confirm against its definition.

        :param response: page response exposing ``.xpath(...)`` and ``.url``.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        constant_fields = (
            ('university', "University of Portsmouth"),
            ('url', response.url),
            ('teach_type', 'phd'),
            # Degree type code.
            ('degree_type', 3),
            ('location',
             'University House, Winston Churchill Avenue, Portsmouth PO1 2UP'),
        )
        for field_name, field_value in constant_fields:
            item[field_name] = field_value
        print("===========================")
        print(response.url)
        try:
            # Programme title: two alternative page layouts are unioned.
            title_xpath = (
                "//div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1//text()|"
                "//div[@class='onscreen-area']/div/div[@class='section'][1]/div[@class='page-title above-page-nav course-page-title']/div[@class='wrap']/h1//text()"
            )
            title_parts = response.xpath(title_xpath).extract()
            item['programme_en'] = ''.join(title_parts).strip()
            print("item['programme_en']: ", item['programme_en'])

            # Degree name sits next to the title in both layouts.
            degree_xpath = (
                "//div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/span//text()|"
                "//div[@class='onscreen-area']/div/div[@class='section'][1]/div[@class='page-title above-page-nav course-page-title']/div[@class='wrap']/p//text()"
            )
            degree_parts = response.xpath(degree_xpath).extract()
            item['degree_name'] = ''.join(degree_parts).strip()
            print("item['degree_name']: ", item['degree_name'])

            # Department: text following any "Department" label.
            department_xpath = (
                "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()|//strong[contains(text(), 'Department')]/following-sibling::a//text()|"
                "//strong[contains(text(), 'Department')]/../following-sibling::p//text()|"
                "//span[contains(text(), 'Department')]/../following-sibling::*[1]//text()"
            )
            department_texts = response.xpath(department_xpath).extract()
            clear_space(department_texts)
            if department_texts:
                department_name = department_texts[0].strip()
                # When the first hit is this unrelated sentence fragment,
                # the last extracted text is used instead.
                if department_name == "This course is eligible for the":
                    department_name = department_texts[-1].strip()
                item['department'] = department_name

            # Duration and teaching mode share the same source text.
            duration_xpath = (
                "//dt[contains(text(), 'Duration')]/following-sibling::dd[1]//text()|//dt[contains(text(), 'duration')]/following-sibling::dd[1]//text()"
            )
            duration_parts = response.xpath(duration_xpath).extract()
            clear_space(duration_parts)
            duration_text = ''.join(duration_parts)

            item['teach_time'] = getTeachTime(duration_text)
            parsed_duration = getIntDuration(duration_text)
            # Only a two-element result is used: first and last element.
            if len(parsed_duration) == 2:
                item['duration'] = parsed_duration[0]
                item['duration_per'] = parsed_duration[1]

            # Tuition fee: find a labelled "£N,NNN" amount, then keep digits.
            fee_xpath = (
                "//strong[contains(text(),'International students')]/../following-sibling::p//text()|"
                "//strong[contains(text(),'2018/19 entry')]/../following-sibling::p[1]//text()|"
                "//dt[contains(text(),'Fees')]/following-sibling::dd[1]//text()"
            )
            fee_parts = response.xpath(fee_xpath).extract()
            clear_space(fee_parts)
            labelled_fees = re.findall(
                r"Full\stime:\s£\d+,\d+|Full\stime\s£\d+,\d+|International\sfull-time\sstudents:\s£\d+,\d+",
                ''.join(fee_parts))
            fee_amounts = re.findall(r"\d+,\d+", ''.join(labelled_fees))
            if fee_amounts:
                first_amount = fee_amounts[0]
                item['tuition_fee'] = int(
                    first_amount.replace(",", "").replace("£", "").strip())
                item['tuition_fee_pre'] = "£"

            # Entry requirements: the slice of the "Key Facts" text nodes
            # between the requirements heading and the "Fees" entry.  Both
            # heading spellings are tried in this order; a later match
            # overwrites an earlier one.
            key_facts = response.xpath(
                "//h3[contains(text(),'Key Facts')]/..//text()").extract()
            clear_space(key_facts)
            for heading in ("2018 ENTRY REQUIREMENTS",
                            "2018 entry requirements"):
                if heading not in key_facts:
                    continue
                section_start = key_facts.index(heading)
                if "Fees" not in key_facts:
                    continue
                section_end = key_facts.index("Fees")
                item['rntry_requirements'] = clear_lianxu_space(
                    key_facts[section_start:section_end])

            # Fallback when the field still holds an empty string.
            if item['rntry_requirements'] == "":
                entry_parts = response.xpath(
                    "//dt[contains(text(),'Entry')]/following-sibling::dd[1]//text()"
                ).extract()
                item['rntry_requirements'] = clear_lianxu_space(entry_parts)

            # IELTS: keep up to 45 characters of context on each side of
            # every "IELTS" mention, then let get_ielts parse the figures.
            ielts_mentions = re.findall(r".{1,45}IELTS.{1,45}",
                                        item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ielts_mentions).strip()

            ielts_scores = get_ielts(item['ielts_desc'])
            ielts_fields = (
                ('ielts', 'IELTS'),
                ('ielts_l', 'IELTS_L'),
                ('ielts_s', 'IELTS_S'),
                ('ielts_r', 'IELTS_R'),
                ('ielts_w', 'IELTS_W'),
            )
            for field_name, score_key in ielts_fields:
                item[field_name] = ielts_scores.get(score_key)

            # Overview section.
            overview_xpath = (
                "//h3[contains(text(),'Why take this course?')]/../*[not(@class='blockquote-img')]|"
                "//div[@class='onscreen-area']/div/div[@class='section'][1]/div[@class='wrap']/div[@class='group third']/div[@class='column twothirds']"
            )
            overview_nodes = response.xpath(overview_xpath).extract()
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview_nodes))

            # Modules / course structure section.
            modules_xpath = (
                "//h3[@id='structure']/../../following-sibling::div[1]|"
                "//div[@class='onscreen-area']/div/div[@class='section slate dark']/div[@class='wrap']"
            )
            module_nodes = response.xpath(modules_xpath).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(module_nodes))

            # Teaching and assessment, with a second XPath tried only when
            # the first one matches nothing.
            assessment_nodes = response.xpath(
                "//div[@class='pure-g purple content']/div[1]/div[@class='box']"
            ).extract()
            if not assessment_nodes:
                assessment_nodes = response.xpath(
                    "//h3[contains(text(), 'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<3]"
                ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_nodes))

            # Careers section.
            career_xpath = (
                "//div[@class='box container content pure-g']|//div[@class='onscreen-area']/div/div[@class='section teal dark']/div[@class='wrap']"
            )
            career_nodes = response.xpath(career_xpath).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_nodes))
            print("item['career_en']: ", item['career_en'])

            # The application process is a fixed link for every course.
            item['apply_proces_en'] = (
                "http://www.port.ac.uk/application-fees-and-funding/applying-postgraduate/#mastersCourses"
            )
            yield item
        except Exception as err:
            # Append the failure and the offending URL to a per-university
            # log file, then echo both to stdout.
            log_path = item['university'] + str(item['degree_type']) + ".txt"
            with open(log_path, 'a', encoding="utf-8") as log_file:
                log_file.write("\n".join(
                    [str(err), response.url, "========================\n"]))
            print("异常:", str(err))
            print("报错url:", response.url)