Esempio n. 1
0
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Reading"
        # item['country'] = 'England'
        # item['website'] = 'http://www.reading.ac.uk/'
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型、ucas_code
            programmeDegree_typeUcascode = response.xpath(
                "//span[@class='text-bg-standout text-nice-wrap']/text() | //h1[@id='heading']//text() | //h1[@class='hero-heading']//text() | //h1[@class='block-heading block-heading-l5 block-heading-b5 block-heading-md-l-reset cell-md-t0']//text()"
            ).extract()
            clear_space(programmeDegree_typeUcascode)
            programmeDegree_typeUcascode = ''.join(
                programmeDegree_typeUcascode).strip()
            # print("programmeDegree_typeUcascode: ", programmeDegree_typeUcascode)

            degree_type = re.findall(r"^\w+/\w+", programmeDegree_typeUcascode)
            if len(degree_type) == 0:
                degree_type = re.findall(r"^\w+", programmeDegree_typeUcascode)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            programme = programmeDegree_typeUcascode.replace(
                item['degree_name'], '').strip()
            item['programme_en'] = programme.title()
            # print("item['programme_en']: ", item['programme_en'])

            # duration
            durationMode = response.xpath(
                "//h2[@class='row-margin-small text-weight-medium text-size-25']/text() | //strong[contains(text(),'Duration')]/../text() | //h3[contains(text(),'Programme length:')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(durationMode)
            # print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            # if ":" in durationMode:
            #     duration = durationMode.split(":")[-1].strip()
            #     mode = durationMode.split(":")[0].strip()
            #     item['duration'] = duration
            duration_list = getIntDuration(''.join(durationMode))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            item['teach_time'] = getTeachTime(''.join(durationMode))
            # print("item['duration']: ", item['duration'])
            # print("item['teach_time']: ", item['teach_time'])
            # print("item['duration_per']: ", item['duration_per'])

            start_date = response.xpath(
                "//p[@class='headline'][contains(text(), 'Start date')]//text()"
            ).extract()
            # print(start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date']: ", item['start_date'])

            overview2 = response.xpath(
                "//div[@class='m-bg-white m-pad-around m-pull-left-normal m-pull-up']//div[@class='theme-editor'] | //div[@id='top-courseOverview'] | //html//div[@id='top-programmeOverview']/h2[1]/following-sibling::div[1] | //div[@id='tc1']"
            ).extract()
            overview = remove_class(clear_lianxu_space(overview2))
            item['overview_en'] = overview
            print("item['overview_en']: ", item['overview_en'])

            # department
            department = response.xpath(
                "//article[@class='pad-around bg-white']//div[@class='theme-editor']//a//text()|//p[@class='paddingtop22 nopaddingbottom']//a//text()|//a[@class='navbar-brand navbar-brand-hbs']//text()"
            ).extract()
            clear_space(department)
            if department == "":
                item['department'] = response.meta.get('department')
            else:
                item['department'] = ', '.join(department).strip()
            item['department'] = item['department'].replace("How to apply", "")
            # print("item['department']: ", item['department'])

            item[
                'location'] = "Whiteknights,PO Box 217,Reading, Berkshire,RG6 6AH"
            # //h2[@id='Panel1Trigger']/../..
            entry_requirements = response.xpath(
                "//h2[@id='Panel1Trigger']/../..//text()|//div[@id='bottom-entryRequirements']/..//text()|//div[@id='tc5']//text()"
            ).extract()
            if len(entry_requirements) == 0:
                entry_requirements = response.xpath(
                    "//h4[contains(text(),'Entry requirements:')]/preceding-sibling::*[1]/following-sibling::*[position()<4]//text()"
                ).extract()
            clear_space(entry_requirements)
            entry = ''.join(entry_requirements)
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            if item['rntry_requirements'] == "":
                print("rntry_requirements 为空")
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = re.findall(r"IELT.{1,100}", entry)
            # ielts = response.xpath(
            #     "//strong[contains(text(),'IELTS')]/..//text()").extract()
            # # if item['ielts_desc'] == "":
            clear_space(ielts)
            item['ielts_desc'] = ''.join(ielts).strip()
            # if item['ielts_desc'] == "":
            #     print("ielts_desc 为空")
            # print("item['ielts_desc']1: ", item['ielts_desc'])
            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            toefl = re.findall(r"TOEFL[\s\(\)\w:\.]{1,300}", entry)
            # print(ielts)
            if item['toefl_desc'] == "":
                item['toefl_desc'] = ''.join(toefl)
            # print("item['toefl_desc']: ", item['toefl_desc'])
            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # //h2[@id='Panel1Trigger']/../..
            modules = response.xpath(
                "//h2[@id='Panel2Trigger']/../..|//div[@id='bottom-courseContent']/..|//div[@id='page_content_wrap']/following-sibling::div[position()<3]|//strong[contains(text(),'Programme structure')]/../following-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//h4[contains(text(),'Programme structure and content')]/preceding-sibling::*[1]/following-sibling::*[position()<11]"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            # //h2[@id='Panel1Trigger']/../..
            career = response.xpath(
                "//h2[@id='Panel4Trigger']/../following-sibling::div[1]|//div[@id='bottom-careers']/..|//div[@id='careers']|//h3[contains(text(),'Careers')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en']: ", item['career_en'])

            # //h3[@class='row-margin-small text-weight-medium'][contains(text(),'How much will it cost?')]/following-sibling::p[2]
            tuition_fee = response.xpath(
                "//h3[@class='row-margin-small text-weight-medium'][contains(text(),'How much will it cost?')]/following-sibling::p[2]//text()|"
                "//html//div[@id='bottom-feesFunding']//tr[2]/td[3]//text()|"
                "//html//div[@id='bottom-feesFunding']//tr[2]/td[2]//text()|"
                "//html//div[@id='tc2']//h3[1]/following-sibling::p[1]//text()|"
                "//*[contains(text(),'Programme fee')]/following-sibling::*[1]//text()|"
                "//h2[contains(text(),'Fees')]/following-sibling::p[1]//h2[contains(text(),'Fees')]/following-sibling::p[1]|"
                "//h2[contains(text(),'Fees')]/following-sibling::p[position()<3]//text()|"
                "//p[contains(text(),'New international students:')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print(tuition_fee)
            # item['tuition_fee'] = ''.join(tuition_fee).strip()
            tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) == 1:
                item['tuition_fee'] = int(''.join(tuition_fee_re[0]).replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            if len(tuition_fee_re) >= 2:
                item['tuition_fee'] = int(''.join(tuition_fee_re[1]).replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            # if item['tuition_fee'] is None:
            #     print("tuition_fee 为空")
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='top-howWeTeachYou']
            assessment_en = response.xpath(
                "//div[@id='top-howWeTeachYou']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            item[
                'apply_proces_en'] = """<div><h1><span>How to apply for postgraduate courses</span></h1></div><div><h4>Postgraduate taught courses</h4><p>The quickest and easiest away to apply for postgraduate study at the University of Reading is through our <a>online application service</a>. The online service allows you to complete your application form and attach electronic copies of your academic transcripts, certificates and other supporting information. It also provides a tool for sending an email request to your referees, enabling them to send your supporting references directly to us.</p><p><span>If you are unable to apply online you can request a paper application form by telephoning </span><a>+44 (0) 118 378 5289</a><span> or writing to:</span></p><p>Admissions Office<br>University of Reading<br>Miller Building<br>Whiteknights<br>Reading, RG6 6AB<br>UK</p><h4>PGCE and School Direct</h4><p>Candidates for the PGCE and School Direct courses should submit an application via <a>UCAS Teacher Training</a>.</p><strong> Postgraduate research </strong><p>For more information on applying for postgraduate research opportunities, please visit our <a>graduate school website</a>.</p></div><div><div><div><h4>Entry requirements</h4><p>Please visit our <a>postgraduate entry requirements</a> page for information on academic qualifications and English language requirements.</p><h4>When to apply</h4><p>There is no specific deadline date for most courses and applications will be considered until the course is full. However, to allow time for us to process your application we recommend that you apply by the following dates for admission in September:</p><div><strong>UK applicants</strong> by 1 August</div><div><strong>International applicants</strong> by 1 June</div><div><br></div><p>Please note that the MSc Speech and Language Therapy has an earlier application deadline of 1 December. 
Applications for PGCE courses are made through UCAS (see above) and the deadline is 15 September of the year of entry though early applications are recommended.</p><p>Most of our taught courses start at the beginning of the autumn term (in September) but there are a number that also have a start at a different time of the year or have multiple starts throughout the year. Please see the individual subject pages for further details.</p><h4>After you apply</h4><p>As soon as you have submitted your&nbsp;completed application we will send&nbsp;you an email acknowledgement.&nbsp;We will also create an applicant&nbsp;account for you which will allow&nbsp;you to check on the progress of&nbsp;your application online and access&nbsp;other useful information about&nbsp;the University of Reading.</p><p> We aim to reach a decision on&nbsp;your application within 4 weeks.&nbsp;The length of time taken to reach&nbsp;a decision will vary as each&nbsp;application is considered on an individual basis according to your&nbsp;relevant strengths and merits. Once your application has been&nbsp;considered you will receive an&nbsp;email from the Admissions Office&nbsp;informing you of the decision. If&nbsp;your application has been successful,&nbsp;our email will explain the offer and&nbsp;any conditions attached to it and also&nbsp;give further details of the fees and&nbsp;other expenses associated with your&nbsp;course.&nbsp;</p><p>Our team of experienced&nbsp;admissions staff is here to help you&nbsp;throughout the application process&nbsp;so please do not hesitate to get in&nbsp;touch with us if you need any help&nbsp;with completing your application or&nbsp;have a question about the progress&nbsp;of your application. You can contact&nbsp;us at <a>[email protected]</a>.</p></div></div>"""

            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "Loughborough University"
        # item['country'] = 'England'
        # item['website'] = 'http://www.lboro.ac.uk/'
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # 学位名称
            degree_name = response.xpath(
                "//span[@class='emphasised']//text()|"
                "//h1[@class='degree-info__heading']/text()").extract()
            # print("degree_name: ", degree_name)
            item['degree_name'] = ''.join(degree_name).replace(
                ', PG certificate', '').strip()
            print("item['degree_name']: ", item['degree_name'])

            # 专业
            programme_en = response.xpath(
                "//h1[@id='top']/text()|"
                "//h1[@class='degree-info__heading']/span//text()").extract()
            clear_space(programme_en)
            item['programme_en'] = ''.join(programme_en).strip()
            print("item['programme_en']: ", item['programme_en'])

            # 学院
            item['department'] = response.meta.get(item['programme_en'])
            print("item['department']: ", item['department'])

            # 授课类型
            mode = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Full-time:')]//text()"
            ).extract()
            clear_space(mode)
            if len(mode) != 0:
                item['teach_time'] = 'fulltime'
            # print("item['teach_time']: ", item['teach_time'])

            duration = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Full-time:')]/following-sibling::dd//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            # if 'year' in ''.join(duration):
            #     item['duration'] = int(''.join(duration).replace('year', '').strip())
            #     item['duration_per'] = 1
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            start_date = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Start date:')]/following-sibling::dd//text()"
            ).extract()
            clear_space(start_date)
            # print(start_date)
            item['start_date'] = ''.join(start_date).replace(
                '(module restrictions apply)', '').strip()
            # print("item['start_date']: ", item['start_date'])

            # tuition_fee = response.xpath(
            #     "//dt[@class='list__item list__item--term'][contains(text(),'International fees:')]/following-sibling::dd//text()").extract()
            tuition_fee = response.xpath(
                "//span[contains(text(),'International fee')]/../following-sibling::dd//text()"
            ).extract()
            clear_space(tuition_fee)
            if "£" in ''.join(tuition_fee):
                item['tuition_fee_pre'] = '£'
                item['tuition_fee'] = ''.join(tuition_fee).replace(
                    '£', '').replace(',', '').strip()
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            location = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Location:')]/following-sibling::dd//text()"
            ).extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            allcontent = response.xpath(
                "//nav[@class='programme-nav nav']/following-sibling::*//text()"
            ).extract()
            clear_space(allcontent)
            # print("allcontent: ", allcontent)

            # 专业描述
            if "Overview" in allcontent:
                overviewIndex = allcontent.index("Overview")
                if "Entry requirements" in allcontent:
                    overviewIndexEnd = allcontent.index("Entry requirements")
                    overview = allcontent[overviewIndex + 1:overviewIndexEnd]
                    # clear_space(overview)
                    item['overview_en'] = clear_lianxu_space(overview).strip(
                        "Entry requirements").strip()
            if item['overview_en'] != "":
                item['overview_en'] = "<h2>Overview</h2><div>" + item[
                    'overview_en'] + "</div>"
            else:
                overview = response.xpath(
                    "//span[contains(text(),'Entry')]/../../../../preceding-sibling::div"
                ).extract()
                overview_en = remove_class(clear_lianxu_space(overview))
                item['overview_en'] = overview_en
            print("item['overview_en']: ", item['overview_en'])

            entry = response.xpath(
                "//h2[contains(text(),'Entry requirements')]/..//text()|"
                "//h2[contains(text(),'Entry Requirements')]/..//text()"
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(entry)
            # if item['rntry_requirements'] == "":
            #     print("entry_requ 为空")
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            # 学术要求
            # if "Entry requirements" in allcontent:
            #     entry_requirementsIndex = allcontent.index("Entry requirements")
            #     if "English Language requirements" in allcontent:
            #         entry_requirementsIndexEnd = allcontent.index("English Language requirements")
            #         entry_requirements = allcontent[entry_requirementsIndex:entry_requirementsIndexEnd]
            #         # clear_space(entry_requirements)
            #         item['rntry_requirements'] = clear_lianxu_space(entry_requirements).replace(
            #             "English Language requirements", "").strip()
            #     elif "English language requirements" in allcontent:
            #         entry_requirementsIndexEnd = allcontent.index("English language requirements")
            #         entry_requirements = allcontent[entry_requirementsIndex:entry_requirementsIndexEnd]
            #         # clear_space(entry_requirements)
            #         item['rntry_requirements'] = clear_lianxu_space(entry_requirements).strip().replace(
            #             "English language requirements", '').strip()
            #     elif "English Language Requirements" in allcontent:
            #         entry_requirementsIndexEnd = allcontent.index("English Language Requirements")
            #         entry_requirements = allcontent[entry_requirementsIndex:entry_requirementsIndexEnd]
            #         # clear_space(entry_requirements)
            #         item['rntry_requirements'] = clear_lianxu_space(entry_requirements).replace(
            #             "English Language Requirements", '').strip()
            # item['rntry_requirements'] = "Entry requirements " + item['rntry_requirements'].replace(
            #     "Entry requirements", "").strip()
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            # IELTS
            ielts_toefl = response.xpath(
                "//h2[contains(text(),'English')]/..//text()").extract()
            clear_space(ielts_toefl)
            if len(ielts_toefl) == 0:
                print("ielts_toefl 为空")
            ielts = "".join(ielts_toefl)
            # if "English Language requirements" in allcontent:
            #     ieltsIndex = allcontent.index("English Language requirements")
            #     if "What you'll study" in allcontent:
            #         ieltsIndexIndexEnd = allcontent.index("What you'll study")
            #         ielts = allcontent[ieltsIndex:ieltsIndexIndexEnd]
            #         clear_space(ielts)
            #         ielts = ''.join(ielts).strip()
            # elif "English language requirements" in allcontent:
            #     ieltsIndex = allcontent.index("English language requirements")
            #     if "What you'll study" in allcontent:
            #         ieltsIndexIndexEnd = allcontent.index("What you'll study")
            #         ielts = allcontent[ieltsIndex:ieltsIndexIndexEnd]
            #         clear_space(ielts)
            #         ielts = ''.join(ielts).strip()
            # elif "English Language Requirements" in allcontent:
            #     ieltsIndex = allcontent.index("English Language Requirements")
            #     if "What you'll study" in allcontent:
            #         ieltsIndexIndexEnd = allcontent.index("What you'll study")
            #         ielts = allcontent[ieltsIndex:ieltsIndexIndexEnd]
            #         clear_space(ielts)
            #         ielts = ''.join(ielts).strip()
            # elif "English Language Entry Requirements" in allcontent:
            #     ieltsIndex = allcontent.index("English Language Entry Requirements")
            #     if "What you'll study" in allcontent:
            #         ieltsIndexIndexEnd = allcontent.index("What you'll study")
            #         ielts = allcontent[ieltsIndex:ieltsIndexIndexEnd]
            #         clear_space(ielts)
            #         ielts = ''.join(ielts).strip()
            ielts_re = re.findall(r"IELTS.{1,80}", ielts)
            # print("ielts_re = ", ielts_re)
            toefl_re = re.findall(r"TOEFL.{1,80}", ielts)
            # print("toefl_re = ", toefl_re)

            item['ielts_desc'] = ''.join(ielts_re)
            print("item['ielts_desc']: ", item['ielts_desc'])
            item['toefl_desc'] = ''.join(toefl_re)
            print("item['toefl_desc']: ", item['toefl_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            toeflDict = get_toefl(item['toefl_desc'])
            item["toefl"] = toeflDict.get("TOEFL")  # float
            item["toefl_l"] = toeflDict.get("TOEFL_L")  # float
            item["toefl_s"] = toeflDict.get("TOEFL_S")  # float
            item["toefl_r"] = toeflDict.get("TOEFL_R")  # float
            item["toefl_w"] = toeflDict.get("TOEFL_W")
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # modules
            if "What you'll study" in allcontent:
                modulesIndex = allcontent.index("What you'll study")
                if "How you'll be assessed" in allcontent:
                    modulesIndexEnd = allcontent.index(
                        "How you'll be assessed")
                    modules = allcontent[modulesIndex:modulesIndexEnd]
                    # clear_space(modules)
                    item['modules_en'] = clear_lianxu_space(modules)
                elif "How you'll study" in allcontent:
                    modulesIndexEnd = allcontent.index("How you'll study")
                    modules = allcontent[modulesIndex:modulesIndexEnd]
                    # clear_space(modules)
                    item['modules_en'] = clear_lianxu_space(modules)
            item['modules_en'] = "<div>" + item['modules_en'] + "</div>"
            # module = response.xpath(r"//h3[@class='subheading'][contains(text(),'Modules')]/../../preceding-sibling::div[1]/following-sibling::div[@class='content-type content-type--toggle']").extract()
            # print("module: ", module)
            # print(len(module))
            # print("item['modules_en']: ", item['modules_en'])

            # teaching_assessment
            if "How you'll be assessed" in allcontent:
                teaching_assessmentIndex = allcontent.index(
                    "How you'll be assessed")
                if "Your personal and professional development" in allcontent:
                    teaching_assessmentIndexEnd = allcontent.index(
                        "Your personal and professional development")
                    teaching_assessment = allcontent[
                        teaching_assessmentIndex +
                        1:teaching_assessmentIndexEnd]
                    item[
                        'assessment_en'] = "<h2>How you'll be assessed</h2><div>" + clear_lianxu_space(
                            teaching_assessment) + "</div>"
            elif "How you'll study" in allcontent:
                teaching_assessmentIndex = allcontent.index("How you'll study")
                if "Your personal and professional development" in allcontent:
                    teaching_assessmentIndexEnd = allcontent.index(
                        "Your personal and professional development")
                    teaching_assessment = allcontent[
                        teaching_assessmentIndex +
                        1:teaching_assessmentIndexEnd]
                    item[
                        'assessment_en'] = "<h2>How you'll study</h2><div>" + clear_lianxu_space(
                            teaching_assessment) + "</div>"
            # print("item['assessment_en']: ", item['assessment_en'])

            # career
            if "Your personal and professional development" in allcontent:
                careerIndex = allcontent.index(
                    "Your personal and professional development")
                if "Fees and funding" in allcontent:
                    careerIndexEnd = allcontent.index("Fees and funding")
                    career = allcontent[careerIndex + 1:careerIndexEnd]
                    item['career_en'] = clear_lianxu_space(career)
            item[
                'career_en'] = "<h2>Your personal and professional development</h2><div>" + item[
                    'career_en'] + "</div>"
            # print("item['career_en']: ", item['career_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<div id="content-wrapper-wide" class="standard ">
<div class="content-wrapper">
<a name="d.en.1074686"></a>
<h3>Postgraduate</h3>
<p>Students are required to have a bachelor degree (4 years) for entry to a postgraduate programme. The University uses the <a href="http://rank2013.netbig.com/">Netbig 2013</a> university ranking to identify the required final mark, as outlined on the table below:&nbsp;</p>
<table border="1" cellpadding="0" cellspacing="0" style="width: 650px;">
<tbody>
<tr>
<td valign="top" width="121">
<p><strong>NETBIG rank 2013 </strong></p>
</td>
<td valign="top" width="130">
<p align="center"><strong>First</strong></p>
</td>
<td valign="top" width="130">
<p align="center"><strong>High 2:1 <br /> (65%)</strong></p>
</td>
<td valign="top" width="134">
<p align="center"><strong>2:1</strong></p>
</td>
<td valign="top" width="132">
<p align="center"><strong>High 2:2 <br /> (55-57%)</strong></p>
</td>
<td valign="top" width="132">
<p align="center"><strong>2:2</strong></p>
</td>
</tr>
<tr>
<td valign="top" width="121">
<p><strong>Top 150</strong></p>
</td>
<td valign="top" width="130">
<p align="center">84</p>
</td>
<td valign="top" width="130">
<p align="center">81</p>
</td>
<td valign="top" width="134">
<p align="center">80</p>
</td>
<td valign="top" width="132">
<p align="center">78</p>
</td>
<td valign="top" width="132">
<p align="center">77</p>
</td>
</tr>
<tr>
<td valign="top" width="121">
<p><strong>151-250</strong></p>
</td>
<td valign="top" width="130">
<p align="center">87</p>
</td>
<td valign="top" width="130">
<p align="center">83</p>
</td>
<td valign="top" width="134">
<p align="center">82</p>
</td>
<td valign="top" width="132">
<p align="center">80</p>
</td>
<td valign="top" width="132">
<p align="center">79</p>
</td>
</tr>
<tr>
<td valign="top" width="121">
<p><strong>251-500</strong></p>
</td>
<td valign="top" width="130">
<p align="center">89</p>
</td>
<td valign="top" width="130">
<p align="center">85</p>
</td>
<td valign="top" width="134">
<p align="center">84</p>
</td>
<td valign="top" width="132">
<p align="center">82</p>
</td>
<td valign="top" width="132">
<p align="center">80</p>
</td>
</tr>
<tr>
<td valign="top" width="121">
<p><strong>501+</strong></p>
</td>
<td valign="top" width="130">
<p align="center">92</p>
<p align="center"><span>(SBE: No Offer)</span></p>
</td>
<td valign="top" width="130">
<p align="center">87</p>
<p align="center">(SBE: No Offer)</p>
</td>
<td valign="top" width="134">
<p align="center">86</p>
<p align="center"><span>(SBE: No Offer)</span></p>
</td>
<td valign="top" width="132">
<p align="center">85</p>
<p align="center"><span>(SBE: No Offer)</span></p>
</td>
<td valign="top" width="132">
<p align="center">82</p>
<p align="center"><span>(SBE: No Offer)</span></p>
</td>
</tr>
</tbody>
</table>
<p>&nbsp;</p>
<div class="clear"></div>
</div><!-- #content-wrapper -->
</div><!-- #content-wrapper-wide -->
<div class="clear"></div>    <!-- ACCORDION - STANDARD - SINGLE -->
<div class="content-wrapper">
<h3 class="trigger fcbg1"><a href="#">Affiliated Colleges</a></h3>
<div class="toggle_container">
<p>The University will consider students from Colleges affiliated to 211 and 985 universities and universities in the top 150 Netbig 2013 rankings. &nbsp;Applicants from these Colleges will be considered as follows:</p>
<ul>
<li>School of Business and Economics with 82% &ndash; 85%</li>
<li>All other programmes with 79% &ndash; 83%.&nbsp;</li>
</ul>
<p>Students from Colleges affiliated to universities with a Netbig 2013 rank of 151 &ndash; 250 will be considered as follows:</p>
<ul>
<li>School of Business and Economics with 85% &ndash; 86%</li>
<li>All other programmes with 80% &ndash; 85%.</li>
</ul>
<p>Students from Colleges affiliated to universities with a Netbig 2013 rank of 251 &ndash; 500 will be considered as follows:</p>
<ul>
<li>School of Business and Economics: not considered</li>
<li>All other programmes with 82% &ndash; 86%.</li>
</ul>
</div>
</div><!-- #content-wrapper -->    <!-- ACCORDION - STANDARD - SINGLE -->
<div class="content-wrapper">
<h3 class="trigger fcbg1"><a href="#">Business and Economics</a></h3>
<div class="toggle_container">
<p>The School of Business and Economics will give special consideration to students who have studied at a university which specialises in business or has expertise in another area.&nbsp; A list of these universities and the grades required can be found here:&nbsp;<a href="/terminalfour/SiteManager?ctfn=download&amp;fnno=60&amp;ceid=273195225">SBE Chinese Universities</a>&zwnj;.&nbsp; Applicants from these universities will be considered with 77% - 84% (depending on programme applied to).</p>
<p>Students who do not meet the above requirements may be considered if they have a relevant degree, can show good grades in relevant subjects, and/or have substantial relevant work experience.</p>
</div>
</div>"""
                ]))
            print("item['require_chinese_en']: ", item['require_chinese_en'])

            item[
                'apply_proces_en'] = "http://www.lboro.ac.uk/study/postgraduate/apply/taught-applications/"
            print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item

        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Esempio n. 3
0
    def parse_main(self, response):
        """Build an item from a University of Glasgow taught-programme page.

        A fresh item is created with ``get_item1`` and pre-filled with fixed
        values (full-time, taught, Glasgow, 2018-9 start, 2018-7 deadline,
        "£" fee prefix).  The remaining fields are scraped from ``response``
        with XPath queries and cleaned by the project helpers.

        Args:
            response: page response exposing ``url`` and ``xpath`` (as used
                below).

        Yields:
            The populated item, but only when a programme title was found.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)

        # Fixed values shared by every programme handled by this callback.
        item['teach_time'] = 'fulltime'
        item['university'] = 'University of Glasgow'
        item['url'] = response.url
        item['location'] = 'Glasgow'
        item['start_date'] = '2018-9'
        item['deadline'] = '2018-7'
        item["tuition_fee_pre"] = "£"
        item['teach_type'] = 'taught'

        # Programme title is the <h1> text; the degree name sits in its <span>.
        programme = ''.join(
            response.xpath('//div[@id="prog-title"]/h1/text()').extract())
        item['programme_en'] = programme
        item['degree_name'] = ''.join(
            response.xpath('//div[@id="prog-title"]/h1/span/text()').extract())

        duration = clear_duration(
            response.xpath(
                '//li[contains(text(),"full-time")]/text()').extract())
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        overview = response.xpath(
            '//h2[contains(text(),"Why this programme")]/following-sibling::*'
        ).extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//h2[contains(text(),"Programme str")]/following-sibling::*'
        ).extract()
        item['modules_en'] = remove_class(clear_same_s(modules))

        career = response.xpath(
            '//h2[contains(text(),"Career")]/following-sibling::*').extract()
        item['career_en'] = remove_class(clear_same_s(career))

        # Only the #fees container is used; an earlier query on the
        # "Fees and" heading was overwritten before use and has been dropped.
        fees = response.xpath('//div[@id="fees"]//text()').extract()
        tuition_fee = getTuition_fee(fees)
        # NOTE(review): 2018 is presumably the academic year being picked up
        # instead of an amount; it is mapped to '0' -- confirm with
        # getTuition_fee.
        if tuition_fee == 2018:
            tuition_fee = '0'
        item['tuition_fee'] = tuition_fee

        ielts = get_ielts(
            response.xpath(
                '//*[contains(text(),"IELTS")]/../following-sibling::ul[1]//text()'
            ).extract())
        # Skip empty results ({} / []) and also None.
        if ielts:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']

        toefl = get_toefl(
            response.xpath(
                '//*[contains(text(),"TOEFL")]/..//text()').extract())
        if toefl:
            try:
                item['toefl_r'] = toefl[1]
                item['toefl_l'] = toefl[2]
                item['toefl_s'] = toefl[3]
                item['toefl_w'] = toefl[4]
                item['toefl'] = toefl[0]
            except (IndexError, KeyError, TypeError):
                # Best effort: the helper returned fewer values than the five
                # expected (or an unexpected shape); keep what was assigned.
                pass

        entry = response.xpath(
            '//h2[contains(text(),"Entry requirements")]/following-sibling::*'
        ).extract()
        item['rntry_requirements'] = remove_class(clear_same_s(entry))

        apply_d = response.xpath(
            '//h3[contains(text(),"Documents")]/following-sibling::ul[1]'
        ).extract()
        item['apply_proces_en'] = remove_class(clear_same_s(apply_d))

        # Pages without a programme title are not emitted.
        if programme:
            yield item
    def parse_data(self, response):
        """Build an item from a University of Edinburgh PhD programme page.

        The item is created with ``get_item1`` and tagged as a PhD
        (``teach_type='phd'``, ``degree_type=3``).  Every other field is
        scraped from ``response`` with XPath queries and cleaned by the
        project helpers.  Any exception raised while scraping is appended,
        together with the page URL, to ``<university><degree_type>.txt`` and
        printed; in that case no item is yielded.

        Args:
            response: page response exposing ``url`` and ``xpath`` (as used
                below).

        Yields:
            The populated item when scraping completes without an exception.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "The University of Edinburgh"
        item['url'] = response.url
        # Teaching mode
        item['teach_type'] = 'phd'
        # Degree type
        item['degree_type'] = 3
        print("===========================")
        print(response.url)
        try:
            # Programme title
            programme = response.xpath(
                "//h1[@itemprop='headline']//text()").extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            degree_name = response.xpath(
                "//span[contains(text(),'Awards:')]/../text()").extract()
            if degree_name:
                item['degree_name'] = degree_name[0]
            print("item['degree_name']: ", item['degree_name'])

            teach_time = response.xpath(
                "//span[contains(text(),'Study modes:')]/../text()").extract()
            item['teach_time'] = getTeachTime(''.join(teach_time))

            # Prefer the "College:" entry; fall back to the "School:" link.
            # NOTE(review): the class value here is 'col-xs 12' (with a space)
            # while the location query below uses 'col-xs-12' -- confirm
            # against the live markup before changing either.
            department = response.xpath(
                "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs 12']//ul//li//span[contains(text(),'College:')]/following-sibling::*//text()").extract()
            if not department:
                department = response.xpath(
                    "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs 12']//ul//li//span[contains(text(),'School:')]/following-sibling::a[1]/text()").extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()

            location = response.xpath(
                "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']/text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()

            # The first <option> of the "code2" select supplies the start date.
            start_date = response.xpath(
                "//select[@name='code2']//option//text()").extract()
            clear_space(start_date)
            if start_date:
                item['start_date'] = getStartDate(start_date[0].strip())

            overview = response.xpath(
                "//div[@id='proxy_collapseresearch_profile']/..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            # Duration is read from the third cell of the first table row.
            duration = response.xpath(
                "//table[@class='table table-striped']//tbody//tr[1]/td[3]//text()").extract()
            clear_space(duration)
            duration = ''.join(duration).strip()
            duration_int = re.findall(r"\d+", duration)
            if duration_int:
                # Use the first number only: concatenating every digit group
                # would turn e.g. "3-4 Years" into 34.
                item['duration'] = int(duration_int[0])

            # Unit code: 1 when the text mentions years, 3 when it mentions
            # months (months win if both appear).
            if "year" in duration or "Year" in duration:
                item['duration_per'] = 1
            if "month" in duration or "Month" in duration:
                item['duration_per'] = 3

            # Modules: the on-page "how taught" section, plus whatever
            # get_modules2 returns for the programme-structure link.
            modules1 = response.xpath(
                "//div[@id='proxy_collapsehow_taught']/div/*[position()<=last()]").extract()
            modules2url = response.xpath(
                "//html//tr[1]/td[5]/a/@href").extract()
            modules2 = ""
            if modules2url:
                # Follow the first link only; joining several hrefs would
                # produce an invalid URL.
                modules2 = self.get_modules2(modules2url[0])
            item['modules_en'] = remove_class(clear_lianxu_space(list(modules1)))
            if modules2:
                item['modules_en'] += "\n" + modules2

            career = response.xpath(
                "//div[@id='proxy_collapsecareer_opp']/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            entry_requirements = response.xpath(
                "//div[@id='proxy_collapseentry_req']/..//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            IELTS = response.xpath("//abbr[contains(text(),'IELTS')]/..//text()").extract()
            item['ielts_desc'] = ''.join(IELTS)
            print("item['ielts_desc']: ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            TOEFL = response.xpath("//abbr[contains(text(),'TOEFL')]/..//text()").extract()
            item['toefl_desc'] = ''.join(TOEFL)
            print("item['toefl_desc']: ", item['toefl_desc'])

            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                    item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # Tuition fee comes from the page behind the "Full..." link in the
            # fees section; only "£<digits>,<digits>" amounts are kept.
            tuition_fee_url = response.xpath("//div[@id='proxy_collapsefees']//ul/li/a[contains(text(),'Full')]/@href").extract()
            if tuition_fee_url:
                fee = self.parse_tuition_fee(tuition_fee_url[0])
                clear_space(fee)
                fee_re = re.findall(r"£\d+,\d+", ''.join(fee))
                item['tuition_fee'] = getTuition_fee(''.join(fee_re))
                item['tuition_fee_pre'] = "£"

            item['require_chinese_en'] = "https://www.ed.ac.uk/studying/international/postgraduate-entry/asia/china"
            item['apply_proces_en'] = "https://www.ed.ac.uk/studying/postgraduate/applying"

            yield item
        except Exception as e:
            # Callback boundary: record the failure and carry on with the
            # crawl instead of propagating.
            with open(item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                # Trailing newline keeps successive appended entries apart.
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)