Ejemplo n.º 1
0
    def parses(self, response):
        """Scrape a La Trobe University bachelor course page into items.

        A single item is created with ``get_item`` and filled from XPath
        queries against ``response`` (degree name, programme, start date,
        duration, tuition fee, overview, entry requirements, career
        outcomes, how-to-apply, location, IELTS and modules).

        Pages whose degree name contains "Bachelor" two or more times
        (after removing "(Honours)") yield nothing.  Otherwise:

        * when no major/discipline list is found, the item is yielded once;
        * when majors are found, the item is yielded once per major, with
          ``programme_en`` (and ``overview_en``) set for that major.  Majors
          that link to their own page are expanded via ``self.parse_major``.

        Args:
            response: The response for the course page; ``response.url`` and
                ``response.xpath`` are used.

        Yields:
            The populated item.  NOTE(review): the *same* item object is
            mutated and re-yielded for every major -- confirm downstream
            pipelines consume each yield before the next mutation.

        Any ``Exception`` raised while scraping is not propagated: it is
        appended to a per-university error file and printed.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = 'La Trobe University'
        item['url'] = response.url
        item['degree_type'] = 1
        print("================================================")
        print(response.url)
        try:
            # Degree name, taken from the "Bachelor of ..." heading.
            degree_name = response.xpath(
                '//h1[contains(text(),"Bachelor of")]/text()').extract()
            # Return value is ignored; presumably cleans the list in place.
            clear_space(degree_name)
            item['degree_name'] = ''.join(degree_name).strip()
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor",
                                item['degree_name'].replace("(Honours)", ""))
            if len(pro_re) >= 2:
                # "Bachelor" occurs more than once in the name: nothing is
                # scraped or yielded for such pages.
                return

            # Programme name: prefer the parenthesised part of the degree
            # name, otherwise strip the degree prefix from the full name.
            programme_re = re.findall(
                r"\(.+\)",
                item['degree_name'].replace("(Advanced)", "").replace(
                    "(Honours)", "").strip())
            if programme_re:
                item['programme_en'] = ''.join(programme_re).replace(
                    "(", "").replace(")", "").strip()
            else:
                item['programme_en'] = item['degree_name'].replace(
                    "Bachelor of", "").replace("(Honours)", "").replace(
                        "Master of ", "").strip()
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                '//div[contains(text(),"tart")]/following-sibling::div//text()'
            ).extract()
            item['start_date'] = getStartDateMonth(''.join(start_date))
            if item['start_date'] == "":
                # Fall back to the raw text when the helper returns "".
                item['start_date'] = ''.join(start_date).strip()

            duration = response.xpath(
                '//div[contains(text(),"uration")]/following-sibling::div//text()'
            ).extract()
            item['duration'] = ''.join(duration).strip()

            fee = response.xpath(
                '//h3[contains(text(),"tuition fee")]/following-sibling::p[1]/text()'
            ).extract()
            fee = ''.join(fee).strip()
            # Spaces removed, then truncated to at most 99 characters.
            item['tuition_fee'] = fee.replace(' ', '')[0:99]

            overview = response.xpath(
                '//section[@id="overview"]/div[@class="block"]').extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(overview))

            rntry = response.xpath(
                '//section[@id="entry-requirements"]').extract()
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(rntry))

            career = response.xpath(
                '//section[@id="career-outcomes"]').extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            htp = response.xpath('//section[@id="how-to-apply"]').extract()
            item['apply_desc_en'] = remove_class(clear_lianxu_space(htp))

            # Campus code -> location name, used only when the page itself
            # lists no location.
            location_dict = {
                'BU': 'Melbourne',
                'BE': 'Bendigo',
                'CI': 'City',
                'MI': 'Mildura',
                'OT': 'Other',
                'FS': 'Franklin Street',
                'SH': 'Shepparton',
                'SY': 'Sydney',
                'ON': 'Online',
                'WO': 'Albury-Wodonga',
            }
            location = response.xpath(
                "//ul[@class='list-arrows']//li//text()").extract()
            item['location'] = ''.join(location).replace("(Bundoora)",
                                                         "").strip()
            if item['location'] == "":
                # First path segment after the fixed URL prefix is looked up
                # as the campus code; unknown codes give None.
                location_key = response.url.replace(
                    "https://www.latrobe.edu.au/courses/data/2019/international/",
                    "").strip().split("/")[0]
                item['location'] = location_dict.get(location_key.upper())

            ielts = response.xpath(
                '//p[contains(text(),"IELTS")]/text()').extract()
            item['ielts_desc'] = ''.join(ielts).strip()

            ielts = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts.get('IELTS')
            item['ielts_l'] = ielts.get('IELTS_L')
            item['ielts_s'] = ielts.get('IELTS_S')
            item['ielts_r'] = ielts.get('IELTS_R')
            item['ielts_w'] = ielts.get('IELTS_W')

            modules_url = response.xpath(
                '//ul[@class="list-arrows"]/li[1]/a/@href').extract()
            clear_space(modules_url)
            if modules_url:
                try:
                    item['modules_en'] = self.parse_modules(modules_url[0])
                except Exception:
                    # Best effort: a failing modules page must not abort the
                    # whole course.  Narrowed from a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    item['modules_en'] = ""

            item[
                'apply_proces_en'] = "https://www.latrobe.edu.au/international/how-to-apply/undergraduate-and-postgraduate"

            item['overview_en'] = item['degree_overview_en']

            # XPath locations of the elements that list majors/disciplines.
            # The same locations are reused below to find each major's link.
            major_bases = (
                '//p[contains(text(),"Melbourne majors")]/following-sibling::ul[1]/li',
                '//p[contains(text(),"Choose from five majors")]/following-sibling::ul[1]/li',
                '//th[contains(text(),"Minors")]/../preceding-sibling::*//td[contains(text(),"Yes")][1]/preceding-sibling::td',
                '//p[contains(text(),"disciplines:")]/following-sibling::ul[1]/li',
                '//p[contains(text(),"subjects and electives including")]/following-sibling::ul[1]/li',
            )
            programme_major = response.xpath('|'.join(major_bases)).extract()
            print(len(programme_major))
            if not programme_major:
                # No majors listed: the degree itself is the only programme.
                yield item
                return

            for position, maj in enumerate(programme_major, 1):
                print("***************************" + str(position) +
                      "****************************")
                major_name = remove_tags(maj)
                # NOTE(review): major_name is embedded in a double-quoted
                # XPath literal; a name containing '"' would produce an
                # invalid expression -- confirm this cannot occur.
                link_xpath = '|'.join(
                    base + '//a[contains(text(),"' + major_name +
                    '")]/@href' for base in major_bases)
                programme_major1 = response.xpath(link_xpath).extract()
                if not programme_major1:
                    # Major without its own page: use the list text directly.
                    item['programme_en'] = major_name.replace(
                        "Yes", "").replace("*", "").strip()
                    # Reset the overview so it does not keep the value left
                    # behind by a previously processed linked major.
                    item['overview_en'] = item['degree_overview_en']
                    print("不用跳转的item['programme_en']_major: ",
                          item['programme_en'])
                    yield item
                    continue

                # Major with its own page: follow the first matching link.
                programme_dict_list = self.parse_major(
                    programme_major1[0], major_name)
                print("programme_dict_list: ", programme_dict_list)
                for programme_dict in programme_dict_list:
                    item['programme_en'] = programme_dict.get('programme_en')
                    item['overview_en'] = programme_dict.get('overview_en')
                    print("跳转之后的链接item['programme_en']_major: ",
                          item['programme_en'])
                    print("跳转之后的链接item['overview_en']_major: ",
                          item['overview_en'])
                    yield item
        except Exception as e:
            # Record the failure and the offending URL instead of raising.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Ejemplo n.º 2
0
    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "RMIT University"
        # item['country'] = 'Australia'
        # item['website'] = 'https://www.rmit.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        item['major_type1'] = response.meta.get(response.url)
        print("===========================")
        print(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                "//h1[@id='course-name']//text()|//h1[@class='highLight program-header']//text()"
            ).extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme).strip()
            if item['degree_name'] == "":
                print("***degree_name为空")
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor", item['degree_name'])
            # print("pre_re: ", pro_re)
            if len(pro_re) < 2:
                programme_re = re.findall(
                    r"\(.+\)", item['degree_name'].replace("(Honours)", ""))
                if len(programme_re) > 0:
                    item['programme_en'] = ''.join(programme_re).replace(
                        "(", "").replace(")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Bachelor of", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                location = response.xpath(
                    "//span[@class='icon-location']/..//text()|"
                    "//h4[@class='description'][contains(text(),'Location')]/following-sibling::*//text()"
                ).extract()
                clear_space(location)
                item['location'] = ' '.join(location).strip()
                if item['location'] == "":
                    print("***location为空")
                print("item['location']: ", item['location'])

                duration = response.xpath(
                    "//div[@class='b-program-content links b-international']//span[@class='icon-clock']/..//text()|"
                    "//div[@class='b-program-content links b-international  ']//span[@class='icon-clock']/..//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Duration')]/following-sibling::*//text()"
                ).extract()
                clear_space(duration)
                item['duration'] = ''.join(duration).strip()
                # if item['duration'] == "":
                #     print("***duration为空")
                # print("item['duration']: ", item['duration'])

                tuition_fee = response.xpath(
                    "//div[contains(@class,'b-program-content links b-international')]//span[@class='icon-fees']/..//text()|"
                    "//div[contains(@class,'b-program-content links b-international  ')]//span[@class='icon-fees']/..//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Fees')]/following-sibling::*//text()"
                ).extract()
                clear_space(tuition_fee)
                tuition_fee = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee'] = tuition_fee
                if item['tuition_fee'] == 0:
                    item['tuition_fee'] = None
                # print("item['tuition_fee']: ", item['tuition_fee'])

                start_date = response.xpath(
                    "//div[@class='b-program-content links b-international']//span[@class='icon-intake']/..//text()|"
                    "//div[@class='b-program-content links b-international  ']//span[@class='icon-intake']/..//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Next intake')]/following-sibling::*//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Next Intake')]/following-sibling::*//text()"
                ).extract()
                clear_space(start_date)
                item['start_date'] = getStartDateMonth(' '.join(start_date))
                if item['start_date'] == "":
                    print("***start_date 为空")
                print("item['start_date']: ", item['start_date'])

                overview = response.xpath(
                    "//div[@id='overview']/..|//div[@id='overview']/../following-sibling::div[1]|"
                    "//div[@id='Overview']/..|//div[@id='Overview']/../following-sibling::div[1]"
                ).extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(overview))

                modules_en_url = response.xpath(
                    "//table[@class='table  program-table']//td//a[contains(text(),'View plan')]/@href"
                ).extract()
                clear_space(modules_en_url)
                if len(modules_en_url) > 0:
                    url = "https://www.rmit.edu.au" + modules_en_url[0]
                    self.parse_modules1(url, item)
                else:
                    modules_en = response.xpath(
                        "//span[contains(text(),'Electives and program structure')]/../../../.."
                    ).extract()
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(modules_en))

                if item['degree_overview_en'] == "":
                    overviewModulesUrl = response.url + "/program-details"
                    self.parse_overviewModules1(overviewModulesUrl, item)

                if item['degree_overview_en'] == "":
                    print("***degree_overview_en 为空")
                print("item['degree_overview_en']: ",
                      item['degree_overview_en'])
                if item['modules_en'] == "":
                    print("***modules_en 为空")
                print("item['modules_en']: ", item['modules_en'])

                career = response.xpath(
                    "//div[@id='career']|//div[@id='career']/../following-sibling::div[1]|"
                    "//div[@id=' career']|//div[@id=' career']/../following-sibling::div[1]|"
                    "//div[@id='Career']|//div[@id='Career']/../following-sibling::div[1]|"
                    "//div[@id=' Career']|//div[@id=' Career']/../following-sibling::div[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                if item['career_en'] == "":
                    careerUrl = response.url + "/career"
                    self.parse_career1(careerUrl, item)
                if item['career_en'] == "":
                    print("***career_en 为空")
                print("item['career_en']: ", item['career_en'])

                rntry_requirements_en = response.xpath(
                    "//div[@id='admissions']/..|//div[@id='admissions']/../following-sibling::*[position()<last()-3]|"
                    "//div[@id='Admissions']/..|//div[@id='Admissions']/../following-sibling::*[position()<last()-3]"
                ).extract()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(rntry_requirements_en))
                if item['rntry_requirements_en'] == "":
                    entryUrl = response.url + "/entry-requirements"
                    self.parse_entryrequirements1(entryUrl, item)
                if item['rntry_requirements_en'] == "":
                    print("***rntry_requirements_en 为空")
                # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

                ielts_desc = response.xpath(
                    "//li[contains(text(),'IELTS (Academic): ')]//text()"
                ).extract()
                item['ielts_desc'] += clear_lianxu_space(ielts_desc)
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_d = get_ielts(item['ielts_desc'])
                item["ielts"] = ielts_d.get('IELTS')
                item["ielts_l"] = ielts_d.get('IELTS_L')
                item["ielts_s"] = ielts_d.get('IELTS_S')
                item["ielts_r"] = ielts_d.get('IELTS_R')
                item["ielts_w"] = ielts_d.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                toefl_desc = response.xpath(
                    "//*[contains(text(),'TOEFL (Internet Based Test - IBT): ')]//text()"
                ).extract()
                item['toefl_desc'] += clear_lianxu_space(toefl_desc)
                # print("item['toefl_desc']: ", item['toefl_desc'])

                ielts_d = get_toefl(item['toefl_desc'])
                item["toefl"] = ielts_d.get('TOEFL')
                item["toefl_l"] = ielts_d.get('TOEFL_L')
                item["toefl_s"] = ielts_d.get('TOEFL_S')
                item["toefl_r"] = ielts_d.get('TOEFL_R')
                item["toefl_w"] = ielts_d.get('TOEFL_W')
                # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))
                # programme = response.xpath("//div[@class='program-name']/h1/text()").extract()
                # ucascode = response.xpath("//html//div[@class='c-summary c-summary-2-col mb-lg-lg-lg clearfix']/div[1]/span[2]/text()").extract()
                # clear_space(ucascode)
                # # item['ucas_code'] = ''.join(ucascode)
                # # print("item['ucas_code']2: ", item['ucas_code'])
                #
                # duration = response.xpath(
                #     "//div[@data-duration][2]/span[2]/text()").extract()
                # clear_space(duration)
                # item['duration'] = ''.join(duration)
                # print("item['duration']2: ", item['duration'])
                #
                # start_date = response.xpath(
                #     "//div[@data-intake][2]/span[2]/text()").extract()
                # clear_space(start_date)
                # item['start_date'] = ''.join(start_date)
                # print("item['start_date']2: ", item['start_date'])
                #
                # location = response.xpath(
                #     "//div[@class='c-summary-cell not-hide']/span[2]//text()").extract()
                # clear_space(location)
                # item['location'] = ''.join(location)
                # print("item['location']2: ", item['location'])
                #
                # department = response.xpath(
                #     "//html//div[@class='c-summary c-summary-2-col mb-lg-lg-lg clearfix']/div[7]/span[2]/text()").extract()
                # clear_space(department)
                # item['department'] = ''.join(department)
                # print("item['department']2: ", item['department'])
                #
                # overview = response.xpath(
                #     "//html//div[@class='program-summary-section-overview mb-md-md-md']/div[position()<last()-1]").extract()
                # item['degree_overview_en'] = remove_class(clear_lianxu_space(overview))
                # print("item['degree_overview_en']2: ", item['degree_overview_en'])
                #
                #
                # # //html//div[@class='panel-group accordion']/div/div[4]
                # career = response.xpath(
                #     "//html//div[@class='panel-group accordion']/div/div[@class='panel panel-default Yes'][3]").extract()
                # if "Career outlook" not in career:
                #     career = response.xpath(
                #     "//html//div[@class='panel-group accordion']/div/div[@class='panel panel-default Yes'][4]").extract()
                # item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']2: ", item['career_en'])
                #
                # modulesUrl = response.url + "/program-structure"
                # self.parse_modules2(modulesUrl, item)
                #
                # how_to_applyUrl = response.url + "/how-to-apply"
                # self.parse_how_to_apply2(how_to_applyUrl, item)
                #
                # entryUrl = response.url + "/entry-requirements"
                # self.entryrequirements2(entryUrl, item)
                #
                # feeUrl = response.url + "/fees"
                # self.fees2(feeUrl, item)

                item['apply_proces_en'] = remove_class(
                    clear_lianxu_space([
                        """<div class="share-heading hide">How to Apply</div>
  </div>
                </div>
			<div class="standard-content-article mb-lg-md-md clearfix">
				<div class="org-area-module-detail-view accordian ">
					<div class="row">
						<div class="col-xs-12 ">
							<div class="clearfix">
  <p class="lead">A step-by-step guide for international students on how to apply to study at RMIT.</p>
  <div class="lower-image-container"></div>
							</div>
							<!-- Parsys -->
							<!-- This Parsys will be used to Put all Main Body Components -->
<div class="floated-image-container pull-right">
<div class="detail-img-list not-hide image-square">
	<figure>
		<div class="c-detail-image c-detail-image-square">
			<img data-ri-xxs="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-800x800/image.jpg" data-ri-sm="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-640x640/image.jpg" class="c-responsive-image bg-cover offset-content">
		</div>
		<div class="c-detail-image c-detail-image-portrait">
			<img data-ri-xxs="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-800x1068/image.jpg" data-ri-sm="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-640x854/image.jpg" class="c-responsive-image bg-cover offset-content">
		</div>
	</figure>
</div>
</div>
<div>
    <div class="extended-desc not-hidden">
        <p>If you want to study for only one or two semesters, you can apply for a&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students/study-abroad-and-exchange/study-abroad.html">study abroad program</a>&nbsp;or&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students/study-abroad-and-exchange/student-exchange.html">student exchange</a>&nbsp;at RMIT.</p>
<h3>Applying for a research degree?</h3>
<p>If you want to apply for a research program, <a href="/content/rmit-ui/en/research/phds-and-other-research-degrees/how-to-apply.html">follow this process and apply here</a> instead.<br>
</p>
<h2>Step 1: Find a program</h2>
<p>Search for a program in your&nbsp;<a href="/content/rmit-ui/en/study-with-us.html">interest area</a>&nbsp;or browse by&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students.html">level of study</a>. Some programs are not available in the July intake, in which case, you will need to apply for the next available intake.</p>
<p>You can also use the&nbsp;<a href="https://www.international.rmit.edu.au/info/programfees.asp" title="Programs, intakes and tuition fees database">Programs, intakes and tuition fees database</a>&nbsp;to search for programs.</p>
<h2>Step 2: Check the entry requirements</h2>
<p>Check that you qualify for the program's entry requirements including:</p>
<ul>
<li>English language requirements</li>
<li>academic entry requirement (see equivalent&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/entry-requirements/country-equivalency.html">entry requirements by country</a>)</li>
<li>pre-requisites</li>
<li>selection tasks.</li>
</ul>
<p>If you don’t meet the entry requirements for your preferred program, you can consider a range of programs that may provide&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/pathways-and-credit-transfer.html">pathways</a>&nbsp;to your preferred program.</p>
<p>If you are&nbsp;​currently ​studying ​an​ Australian Year 12 ​(in Australia or overseas) ​​or​ International Baccalaureate ​(​in Australia or New Zealand) and ​applying &nbsp;for a&nbsp;Bachelor, Associate or Honours degree, you will need to apply via VTAC. You should <a href="http://www.rmit.edu.au/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/international-students-studying-vce-or-ib">check the VTAC entry requirements</a>.<br>
</p>
<h2>Step 3: Collect required documents</h2>
<p>To avoid delays in admission processing, submit&nbsp;a&nbsp;complete set of supporting documents&nbsp;including:</p>
<ul>
<li>passport</li>
<li>certified copies of academic transcripts&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>certified copies of all graduation certificates in both the original&nbsp;language and English&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>evidence of English language proficiency&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>any documentation relating to selection tasks (pre-selection kits,&nbsp;folios etc.)</li>
<li>CV, work reference letter, referee report etc if applicable</li>
</ul>
<p>&nbsp;Please note that documents submitted will not be returned.</p>
<h2>Step 4: Submit your application</h2>
<p>Submit your&nbsp;application online&nbsp;with&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/documentation-required.html">all the required documents</a>.</p>
<h4>Students completing an Australian Year 12 (in Australia or overseas), or the International Baccalaureate (in Australia or New Zealand)</h4>
<ul>
<li>Apply for <strong>Higher Education</strong> programs (Bachelor, Associate Degree and Honours) through the Victorian Tertiary Admissions Centre (VTAC).<br>
<br>
<a href="http://www.vtac.edu.au/applying.html">Apply now via VTAC</a></li>
<li>Apply for <strong>Vocational Education</strong> programs (Foundation Studies, ELICOS, VCE, Certificate IV, Diploma and Advanced Diplomas) via iApply, the online application system for international students.<br>
<br>
<a href="https://iapply.rmit.edu.au/sitsvision/wrd/SIW_LGN">Apply now via iApply</a></li>
</ul>
<h4>Studying fully online<br>
</h4>
<ul>
<li>If your program is delivered fully online, use the online application system for local students and follow the local student application process. Note: fully online programs do not qualify for an Australian Student Visa.<br>
<br>
<a href="https://rmit.service-now.com/rmit-admissions/">Apply now via Admissions</a></li>
</ul>
<h4>All other international students<br>
</h4>
<ul>
<li>If you are applying for on-campus study in a coursework program use iApply, the online application system for international students.<br>
<br>
<a href="https://iapply.rmit.edu.au/sitsvision/wrd/SIW_LGN">Apply now via iApply</a></li>
</ul>
<h4>Application fee</h4>
<p>You will need to pay an application fee if you are from one of these&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/application-fee.html">countries classified as high risk</a>.</p>
<h2>Need help?</h2>
<p>If you need assistance,&nbsp;<a href="https://connect.prospectivestudent.info/RMITInt?_ga=1.241036611.1742672422.1416265787">contact us</a>&nbsp;or one of&nbsp;<a href="https://www.international.rmit.edu.au/info/agentlist/">RMIT’s appointed representatives</a>&nbsp;(agents).</p>
<h2>Next steps:</h2>
<p>Your application will be assessed in line with RMIT’s policies and procedures. If you are successful, you will receive an offer letter. You can then&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/accept-your-offer.html">accept your offer</a>&nbsp;by following the instructions in your offer letter.&nbsp;</p>
<p>RMIT will normally advise you on the outcome of your application within 10 business days. If you are applying from Australia you should hear within 24 hours. If you don't hear back within the time frame above please <a href="https://rmit.au1.qualtrics.com/jfe/form/SV_0fbt3k9dEkNATZ3">contact Admissions Helpdesk</a>.</p>
<p>If you are applying via VTAC <a href="http://www.vtac.edu.au/dates.html">check the VTAC website</a> for important dates.</p>
    </div>
</div>
						</div>
					</div>"""
                    ]))
                item['apply_documents_en'] = remove_class(
                    clear_lianxu_space([
                        """<p>To avoid delays in admission processing, submit&nbsp;a&nbsp;complete set of supporting documents&nbsp;including:</p>
<ul>
<li>passport</li>
<li>certified copies of academic transcripts&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>certified copies of all graduation certificates in both the original&nbsp;language and English&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>evidence of English language proficiency&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>any documentation relating to selection tasks (pre-selection kits,&nbsp;folios etc.)</li>
<li>CV, work reference letter, referee report etc if applicable</li>
</ul>
<p>&nbsp;Please note that documents submitted will not be returned.</p>"""
                    ]))
                yield item
        except Exception as e:
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Ejemplo n.º 3
0
    def parse(self, response):
        """Scrape a Deakin University bachelor course page into items.

        The degree name is read from the page banner.  Pages whose title
        contains "Bachelor" two or more times produce no item.  For the
        remaining pages the English requirements, duration, campuses,
        overview, course structure, start date, entry requirements, tuition
        fee, career outcomes and application notes are extracted.

        If the page links to major-sequence pages (hrefs containing
        ``"major"``), each of them is fetched synchronously with ``requests``
        and one item is yielded per major, with ``url``, ``programme_en``,
        ``location``, ``overview_en`` and ``modules_en`` taken from the major
        page.  Otherwise a single item is yielded for the course page itself.

        Any exception raised during extraction is appended to a text file
        under ``scrapySchool_Australian_ben/error/`` and printed; it is not
        re-raised, so a failing page (or a failing major request) silently
        ends the callback.

        Args:
            response: the downloaded course page; must provide ``url``,
                ``xpath()`` and ``urljoin()``.

        Yields:
            The populated item(s) created by
            ``get_item(ScrapyschoolAustralianBenItem)``.
        """
        import copy  # local import: only the per-major snapshot needs it

        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Deakin University"
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        # Build the course-URL -> programme-name lookup used for major_type1.
        # The two lists are parallel: links[i] corresponds to
        # programme_list[i].
        links = [
            "http://www.deakin.edu.au/course/bachelor-arts-international",
            "http://www.deakin.edu.au/course/bachelor-arts-honours-international",
            "http://www.deakin.edu.au/course/bachelor-arts-psychology-international",
            "http://www.deakin.edu.au/course/bachelor-arts-psychology-honours-international",
            "http://www.deakin.edu.au/course/bachelor-arts-advanced-honours-international",
            "http://www.deakin.edu.au/course/bachelor-arts-chinese-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-arts-master-teaching-secondary-international",
            "http://www.deakin.edu.au/course/bachelor-arts-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-arts-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-arts-bachelor-science-international",
            "http://www.deakin.edu.au/course/bachelor-arts-master-arts-international-relations-international",
            "http://www.deakin.edu.au/course/bachelor-biomedical-science-international",
            "http://www.deakin.edu.au/course/bachelor-business-international",
            "http://www.deakin.edu.au/course/bachelor-business-sport-management-international",
            "http://www.deakin.edu.au/course/bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-commerce-bachelor-information-systems-international",
            "http://www.deakin.edu.au/course/bachelor-commerce-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-commerce-bachelor-science-international",
            "http://www.deakin.edu.au/course/bachelor-communication-advertising-international",
            "http://www.deakin.edu.au/course/bachelor-communication-digital-media-international",
            "http://www.deakin.edu.au/course/bachelor-communication-honours-international",
            "http://www.deakin.edu.au/course/bachelor-communication-journalism-international",
            "http://www.deakin.edu.au/course/bachelor-communication-public-relations-international",
            "http://www.deakin.edu.au/course/bachelor-computer-science-international",
            "http://www.deakin.edu.au/course/bachelor-construction-management-honours-international",
            "http://www.deakin.edu.au/course/bachelor-creative-arts-drama-international",
            "http://www.deakin.edu.au/course/bachelor-creative-arts-honours-international",
            "http://www.deakin.edu.au/course/bachelor-creative-arts-photography-international",
            "http://www.deakin.edu.au/course/bachelor-creative-arts-visual-arts-international",
            "http://www.deakin.edu.au/course/bachelor-creative-writing-international",
            "http://www.deakin.edu.au/course/bachelor-criminology-international",
            "http://www.deakin.edu.au/course/bachelor-criminology-bachelor-cyber-security-international",
            "http://www.deakin.edu.au/course/bachelor-criminology-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-criminology-bachelor-psychological-science-international",
            "http://www.deakin.edu.au/course/bachelor-cyber-security-international",
            "http://www.deakin.edu.au/course/bachelor-design-3d-animation-international",
            "http://www.deakin.edu.au/course/bachelor-design-architecture-international",
            "http://www.deakin.edu.au/course/bachelor-design-architecture-bachelor-construction-management-honours-international",
            "http://www.deakin.edu.au/course/bachelor-design-digital-technologies-international",
            "http://www.deakin.edu.au/course/bachelor-design-visual-communication-international",
            "http://www.deakin.edu.au/course/bachelor-education-early-years-international",
            "http://www.deakin.edu.au/course/bachelor-education-primary-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-engineering-honours-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-science-environmental-management-and-sustainability-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-science-marine-biology-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-science-wildlife-and-conservation-biology-international",
            "http://www.deakin.edu.au/course/bachelor-exercise-and-sport-science-international",
            "http://www.deakin.edu.au/course/bachelor-exercise-and-sport-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-exercise-and-sport-science-bachelor-business-sport-management-international",
            "http://www.deakin.edu.au/course/bachelor-film-television-and-animation-international",
            "http://www.deakin.edu.au/course/bachelor-food-and-nutrition-sciences-honours-international",
            "http://www.deakin.edu.au/course/bachelor-forensic-science-international",
            "http://www.deakin.edu.au/course/bachelor-forensic-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-forensic-science-bachelor-criminology-international",
            "http://www.deakin.edu.au/course/bachelor-health-sciences-international",
            "http://www.deakin.edu.au/course/bachelor-health-sciences-honours-international",
            "http://www.deakin.edu.au/course/bachelor-health-sciences-bachelor-arts-international",
            "http://www.deakin.edu.au/course/bachelor-health-and-medical-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-health-and-physical-education-international",
            "http://www.deakin.edu.au/course/bachelor-information-systems-international",
            "http://www.deakin.edu.au/course/bachelor-information-systems-bachelor-information-technology-international",
            "http://www.deakin.edu.au/course/bachelor-information-technology-international",
            "http://www.deakin.edu.au/course/bachelor-information-technology-honours-international",
            "http://www.deakin.edu.au/course/bachelor-international-studies-international",
            "http://www.deakin.edu.au/course/bachelor-international-studies-global-scholar-international",
            "http://www.deakin.edu.au/course/bachelor-international-studies-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-laws-bachelor-international-studies-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-honours-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-bachelor-midwifery-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-bachelor-psychological-science-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-bachelor-public-health-and-health-promotion-international",
            "http://www.deakin.edu.au/course/bachelor-nutrition-science-international",
            "http://www.deakin.edu.au/course/bachelor-nutrition-science-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-occupational-therapy-international",
            "http://www.deakin.edu.au/course/bachelor-property-and-real-estate-international",
            "http://www.deakin.edu.au/course/bachelor-property-and-real-estate-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-property-and-real-estate-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-psychological-science-international",
            "http://www.deakin.edu.au/course/bachelor-psychological-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-public-health-and-health-promotion-international",
            "http://www.deakin.edu.au/course/bachelor-public-health-and-health-promotion-honours-international",
            "http://www.deakin.edu.au/course/bachelor-public-health-and-health-promotion-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-science-international",
            "http://www.deakin.edu.au/course/bachelor-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-science-master-teaching-secondary-international",
            "http://www.deakin.edu.au/course/bachelor-science-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-social-work-international",
            "http://www.deakin.edu.au/course/bachelor-software-engineering-honours-international",
            "http://www.deakin.edu.au/course/bachelor-sport-development-international",
            "http://www.deakin.edu.au/course/bachelor-vision-science-master-optometry-international",
            "http://www.deakin.edu.au/course/bachelor-zoology-and-animal-science-international",
            "http://www.deakin.edu.au/course/bachelor-zoology-and-animal-science-honours-international",
        ]
        programme_list = [
            "Bachelor of Arts",
            "Bachelor of Arts (Honours)",
            "Bachelor of Arts (Psychology)",
            "Bachelor of Arts (Psychology) (Honours)",
            "Bachelor of Arts - Advanced (Honours)",
            "Bachelor of Arts - Chinese/Bachelor of Commerce",
            "Bachelor of Arts / Master of Teaching (Secondary)",
            "Bachelor of Arts/Bachelor of Commerce",
            "Bachelor of Arts/Bachelor of Laws",
            "Bachelor of Arts/Bachelor of Science",
            "Bachelor of Arts/Master of Arts (International Relations)",
            "Bachelor of Biomedical Science",
            "Bachelor of Business",
            "Bachelor of Business (Sport Management)",
            "Bachelor of Commerce",
            "Bachelor of Commerce/Bachelor of Information Systems",
            "Bachelor of Commerce/Bachelor of Laws",
            "Bachelor of Commerce/Bachelor of Science",
            "Bachelor of Communication (Advertising)",
            "Bachelor of Communication (Digital Media)",
            "Bachelor of Communication (Honours)",
            "Bachelor of Communication (Journalism)",
            "Bachelor of Communication (Public Relations)",
            "Bachelor of Computer Science",
            "Bachelor of Construction Management (Honours)",
            "Bachelor of Creative Arts (Drama)",
            "Bachelor of Creative Arts (Honours)",
            "Bachelor of Creative Arts (Photography)",
            "Bachelor of Creative Arts (Visual Arts)",
            "Bachelor of Creative Writing",
            "Bachelor of Criminology",
            "Bachelor of Criminology/Bachelor of Cyber Security",
            "Bachelor of Criminology/Bachelor of Laws",
            "Bachelor of Criminology/Bachelor of Psychological Science",
            "Bachelor of Cyber Security",
            "Bachelor of Design (3D Animation)",
            "Bachelor of Design (Architecture)",
            "Bachelor of Design (Architecture)/Bachelor of Construction Management (Honours)",
            "Bachelor of Design (Digital Technologies)",
            "Bachelor of Design (Visual Communication)",
            "Bachelor of Education (Early Years)",
            "Bachelor of Education (Primary)",
            "Bachelor of Environmental Engineering (Honours)",
            "Bachelor of Environmental Science (Environmental Management and Sustainability)",
            "Bachelor of Environmental Science (Honours)",
            "Bachelor of Environmental Science (Marine Biology)",
            "Bachelor of Environmental Science (Wildlife and Conservation Biology)",
            "Bachelor of Exercise and Sport Science",
            "Bachelor of Exercise and Sport Science (Honours)",
            "Bachelor of Exercise and Sport Science/Bachelor of Business (Sport Management)",
            "Bachelor of Film, Television and Animation",
            "Bachelor of Food and Nutrition Sciences (Honours)",
            "Bachelor of Forensic Science",
            "Bachelor of Forensic Science (Honours)",
            "Bachelor of Forensic Science/Bachelor of Criminology",
            "Bachelor of Health Sciences",
            "Bachelor of Health Sciences (Honours)",
            "Bachelor of Health Sciences/Bachelor of Arts",
            "Bachelor of Health and Medical Science (Honours)",
            "Bachelor of Health and Physical Education",
            "Bachelor of Information Systems",
            "Bachelor of Information Systems/Bachelor of Information Technology",
            "Bachelor of Information Technology",
            "Bachelor of Information Technology (Honours)",
            "Bachelor of International Studies",
            "Bachelor of International Studies (Global Scholar)",
            "Bachelor of International Studies/Bachelor of Commerce",
            "Bachelor of Laws",
            "Bachelor of Laws/Bachelor of International Studies",
            "Bachelor of Nursing",
            "Bachelor of Nursing (Honours)",
            "Bachelor of Nursing/Bachelor of Midwifery",
            "Bachelor of Nursing/Bachelor of Psychological Science",
            "Bachelor of Nursing/Bachelor of Public Health and Health Promotion",
            "Bachelor of Nutrition Science",
            "Bachelor of Nutrition Science/Bachelor of Commerce",
            "Bachelor of Occupational Therapy",
            "Bachelor of Property and Real Estate",
            "Bachelor of Property and Real Estate/Bachelor of Commerce",
            "Bachelor of Property and Real Estate/Bachelor of Laws",
            "Bachelor of Psychological Science",
            "Bachelor of Psychological Science (Honours)",
            "Bachelor of Public Health and Health Promotion",
            "Bachelor of Public Health and Health Promotion (Honours)",
            "Bachelor of Public Health and Health Promotion/Bachelor of Commerce",
            "Bachelor of Science",
            "Bachelor of Science (Honours)",
            "Bachelor of Science / Master of Teaching (Secondary)",
            "Bachelor of Science/Bachelor of Laws",
            "Bachelor of Social Work",
            "Bachelor of Software Engineering (Honours)",
            "Bachelor of Sport Development",
            "Bachelor of Vision Science/Master of Optometry",
            "Bachelor of Zoology and Animal Science",
            "Bachelor of Zoology and Animal Science (Honours)",
        ]
        programme_dict = dict(zip(links, programme_list))
        # None when the response URL is not one of the listed course URLs.
        item['major_type1'] = programme_dict.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                "//div[@class='module__banner-title']/h1//text()").extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme).strip()
            print("item['degree_name']: ", item['degree_name'])

            # Titles mentioning "Bachelor" more than once yield nothing.
            if len(re.findall(r"Bachelor", item['degree_name'])) >= 2:
                return

            # Prefer the parenthesised specialisation (ignoring "(Honours)")
            # as the programme name; otherwise strip the degree prefix.
            programme_re = re.findall(
                r"\(.+\)", item['degree_name'].replace("(Honours)", ""))
            if programme_re:
                item['programme_en'] = ''.join(programme_re).replace(
                    "(", "").replace(")", "").strip()
            else:
                item['programme_en'] = item['degree_name'].replace(
                    "(Honours)", "").replace("Master of ", "").replace(
                        "Bachelor of", "").strip()
            item['programme_en'] = item['programme_en'].replace(
                "  ", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

            # Alternative location:
            # //div[@class='module__summary--items']/div[1]/div[2]
            ielts = response.xpath(
                "//h3[contains(text(),'English language requirements')]/../following-sibling::*[1]//text()"
            ).extract()
            clear_space(ielts)
            item['ielts_desc'] = ''.join(ielts).strip()
            print("item['ielts_desc']: ", item['ielts_desc'])

            # get_ielts is expected to return a mapping; missing bands
            # become None through .get().
            ielts_d = get_ielts(item['ielts_desc'])
            item["ielts"] = ielts_d.get('IELTS')
            item["ielts_l"] = ielts_d.get('IELTS_L')
            item["ielts_s"] = ielts_d.get('IELTS_S')
            item["ielts_r"] = ielts_d.get('IELTS_R')
            item["ielts_w"] = ielts_d.get('IELTS_W')

            duration = response.xpath(
                "//h3[contains(text(),'Duration')]/../following-sibling::div//text()"
            ).extract()
            clear_space(duration)
            # Keep only the text up to and including "full time"/"full-time".
            duration_re = re.findall(r".*full[\s\-]time",
                                     ''.join(duration).strip())
            item['duration'] = ''.join(duration_re).strip()

            location = response.xpath(
                "//div[@class='module__summary--icon-wrapper']//h3[@class='course__subheading'][contains(text(),'Campuses')]/../following-sibling::div//text()"
            ).extract()
            clear_space(location)
            item['location'] = ' '.join(location).strip()
            # Remembered as the fallback for major pages without campuses.
            location_tmp = item['location']

            # Alternative location:
            # //div[@id='navigation__course']/following-sibling::div
            overview = response.xpath(
                "//h2[contains(text(),'Course information')]/../.."
            ).extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(overview))

            modules = response.xpath(
                "//div[@id='module__course-structure']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            start_date = response.xpath(
                "//li[contains(text(),'Start date:')]//text()").extract()
            clear_space(start_date)
            item['start_date'] = getStartDateMonth(
                ' '.join(start_date).strip())

            entry_requirements = response.xpath(
                "//div[@data-section='entry requirements']").extract()
            # NOTE: 'rntry_requirements_en' is the (misspelled) field name
            # used by the item; it must not be "corrected" here.
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))

            # Alternative location:
            # //div[@data-section='fees and scholarships']
            tuition_fee = response.xpath(
                "//div[@class='module__content-panel']//div[@class='module__key-information--item-content']/text()"
            ).extract()
            clear_space(tuition_fee)
            item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
            # A fee of 0 is stored as None instead.
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None

            career = response.xpath(
                "//div[@data-section='graduate outcomes']|//div[@data-section='graduate outcomes']/following-sibling::div[1]|"
                "//h3[contains(text(),'Career outcomes')]/../..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            # Alternative location:
            # //div[@data-section='application information']/following-sibling::div[2]
            how_to_apply = response.xpath(
                "//h3[contains(text(),'How to apply')]/../..").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(how_to_apply))

            major_list_url = response.xpath(
                "//h3[contains(text(), 'Major Sequences')]/..//a/@href|"
                "//h3[contains(text(), 'Major sequences')]/..//a/@href|"
                "//h3[contains(text(), 'Major sequences')]/following-sibling::ul[1]//a/@href|"
                "//td[contains(text(),'Major')]/preceding-sibling::td//a/@href"
            ).extract()
            clear_space(major_list_url)
            print("major_list_url: ", major_list_url)
            print(len(major_list_url))

            # Only links whose href mentions "major" are major pages.
            major_url_l = [
                major_url for major_url in major_list_url
                if "major" in major_url
            ]
            print("major_url_l: ", major_url_l)
            print(len(major_url_l))
            if not major_url_l:
                item['url'] = response.url
                print("item['url']2: ", item['url'])
                yield item
                return

            headers_base = {
                'User-Agent':
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            }
            for major_url in major_url_l:
                # Resolve relative hrefs against the course page; absolute
                # URLs pass through unchanged.
                major_url = response.urljoin(major_url)
                # Bounded wait so one stalled major page cannot hang the
                # callback; a timeout is reported by the handler below.
                data = requests.get(
                    major_url, headers=headers_base, timeout=30)
                response_major = etree.HTML(data.text)
                item['url'] = major_url
                print("item['url']_major: ", item['url'])

                programme_major = response_major.xpath(
                    "//div[@class='module__banner-title']/h1//text()")
                item['programme_en'] = ''.join(programme_major).strip()
                print("item['programme_en']_major: ", item['programme_en'])

                location_major = response_major.xpath(
                    "//*[contains(text(),'Campuses')]/../following-sibling::div[1]//text()"
                )
                item['location'] = ''.join(location_major).strip()
                if item['location'] == "":
                    item['location'] = location_tmp

                overview_en = response_major.xpath(
                    "//h2[contains(text(),'Overview')]/../..")
                overview_en_str = ''.join(
                    etree.tostring(o, encoding='unicode', method='html')
                    for o in overview_en)
                item['overview_en'] = remove_class(
                    clear_lianxu_space([overview_en_str]))

                modules_en = response_major.xpath(
                    "//h2[contains(text(),'Explore units')]/../..")
                modules_en_str = ''.join(
                    etree.tostring(o, encoding='unicode', method='html')
                    for o in modules_en)
                item['modules_en'] = remove_class(
                    clear_lianxu_space([modules_en_str]))

                # `item` is reused and overwritten on the next iteration, so
                # hand out an independent snapshot for each major.
                yield copy.deepcopy(item)
        except Exception as e:
            # Best-effort error log: record the failure and the page URL,
            # then let the callback end without re-raising.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Ejemplo n.º 4
0
    def parse_data(self, response):
        """Scrape a University of South Australia degree page.

        If the page has a "Specialisations" section containing links, one
        request per link is yielded (handled by this same callback) and the
        current page itself is not scraped. Otherwise a single
        ``ScrapyschoolAustralianBenItem`` is filled from the page.

        Args:
            response: the downloaded page. Only ``response.xpath`` and
                ``response.url`` are used here.

        Yields:
            ``scrapy.Request`` objects for specialisation links, or at most
            one populated item. No item is yielded when the degree name
            mentions "Bachelor" two or more times, or when it contains the
            word "research".

        Any exception raised while filling the item is caught, appended to a
        per-university text file under ``scrapySchool_Australian_ben/error/``
        and printed; it is not re-raised.
        """
        # Check whether the degree has specialisations listed beneath it.
        specialisations = response.xpath(
            "//h2[contains(text(),'Specialisations')]/following-sibling::*//a/@href"
        ).extract()
        if specialisations:
            for link in specialisations:
                # Links without "http" are treated as site-relative and are
                # prefixed with the study site root.
                if "http" in link:
                    url = link
                else:
                    url = "http://study.unisa.edu.au" + link
                yield scrapy.Request(url, callback=self.parse_data)
            return

        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "University of South Australia"
        item['url'] = response.url
        print("===========================")
        print(response.url)
        item['degree_type'] = 1
        try:
            programme = response.xpath(
                "//div[@class='title-row']/h1/text()").extract()
            # clear_space is called for its effect on the list (the return
            # value is unused); presumably it cleans whitespace entries in
            # place -- TODO confirm against the helper.
            clear_space(programme)
            item['degree_name'] = ''.join(programme).replace(
                "(International)", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor", item['degree_name'])
            print("pre_re: ", pro_re)
            if len(pro_re) >= 2:
                # Names that mention "Bachelor" more than once (presumably
                # double degrees) are not scraped any further.
                return

            programme_re = re.findall(r"\(.+\)", item['degree_name'])
            print("programme_re: ", programme_re)
            bracketed = ''.join(programme_re)
            if programme_re and bracketed.strip() != "(Honours)":
                # The bracketed part of the name is the programme.
                item['programme_en'] = bracketed.replace("(", "").replace(
                    ")", "").strip()
            else:
                # No bracketed programme (or only "(Honours)"): fall back to
                # the degree name without its "Bachelor of" prefix.
                bare_name = item['degree_name'].replace(
                    "Bachelor of", "").replace("(Honours)", "").strip()
                # Drop only a standalone word "in" at either end.
                # str.strip("in") must not be used here: it removes the
                # *characters* 'i' and 'n', turning "Design" into "Desig".
                item['programme_en'] = re.sub(r"^in\b|\bin$", "",
                                              bare_name).strip()
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//span[contains(text(), 'Start')]/../text()").extract()
            clear_space(start_date)
            item['start_date'] = getStartDateMonth(', '.join(start_date))
            print("item['start_date']: ", item['start_date'])

            location = response.xpath(
                "//span[contains(text(),'Campus')]/../a//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            print("item['location']: ", item['location'])

            duration = response.xpath(
                "//span[contains(text(),'Duration')]/../text()").extract()
            clear_space(duration)
            item['duration'] = ''.join(duration).strip()
            print("item['duration']: ", item['duration'])

            tuition_fee = response.xpath(
                "//span[contains(text(),'2019: AUD$')]//text()|"
                "//span[contains(text(),'Fees')]/../text()").extract()
            print("tuition_fee: ", tuition_fee)
            clear_space(tuition_fee)
            tuition_fee = getTuition_fee(''.join(tuition_fee))
            item['tuition_fee'] = str(tuition_fee)
            # A parsed fee of 0 is stored as "no value" rather than "0".
            if item['tuition_fee'] == '0':
                item['tuition_fee'] = None
            print("item['tuition_fee']: ", item['tuition_fee'])

            ielts = response.xpath(
                "//span[contains(text(),'English Language Requirements')]/../ul//text()"
            ).extract()
            clear_space(ielts)
            item['ielts_desc'] = ' '.join(ielts).strip()
            print("item['ielts_desc']: ", item['ielts_desc'])

            # The first number found in the description is used as the
            # overall IELTS score.
            ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            if len(ieltlsrw) > 0:
                item["ielts"] = ieltlsrw[0]

            # Per-skill scores appear in the description as
            # "<skill> [<score>]"; keep only the text inside the brackets.
            skill_fields = (
                ("listening", "ielts_l"),
                ("speaking", "ielts_s"),
                ("reading", "ielts_r"),
                ("writing", "ielts_w"),
            )
            for skill, field in skill_fields:
                skill_matches = re.findall(skill + r"\s\[.*?\]",
                                           item['ielts_desc'])
                item[field] = ''.join(skill_matches).replace(
                    skill, "").replace("[", "").replace("]", "").strip()
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            # HTML sections stored on the item: (item key, XPath selecting the
            # section's container element). The key 'rntry_requirements_en'
            # is spelled this way wherever this file uses it, so it is kept.
            html_sections = (
                ('rntry_requirements_en',
                 "//div[@class='page-info-block-inner']//ul[@id='entry-requirements']"),
                ('degree_overview_en',
                 "//h2[contains(text(),'Degree overview')]/../../.."),
                ('overview_en',
                 "//h2[contains(text(),'Snapshot')]/..|"
                 "//h3[contains(text(),'Snapshot')]/.."),
                ('modules_en',
                 "//h2[@class='theme-white'][contains(text(), 'Degree structure')]/../..|"
                 "//h3[contains(text(),'Degree structure')]/../.."),
                ('career_en',
                 "//h2[contains(text(),'Your career')]/../../..|"
                 "//h3[contains(text(),'Your career')]/.."),
                ('apply_desc_en',
                 "//h2[contains(text(),'How to apply')]/../../.."),
            )
            for field, section_xpath in html_sections:
                section_html = response.xpath(section_xpath).extract()
                item[field] = remove_class(clear_lianxu_space(section_html))
                print("item['%s']: " % field, item[field])

            # Degrees whose name contains "research" are not emitted.
            if "research" not in item['degree_name']:
                yield item
        except Exception as e:
            # Record the failure (message + URL) and keep the crawl running.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) +
                      ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Ejemplo n.º 5
0
    def parse_data(self, response):
        """Scrape a University of Melbourne course page into item(s).

        Fills a ``ScrapyschoolAustralianBenItem`` from the page and from the
        linked sub-pages, which are fetched by calling ``self.parse_career``,
        ``self.parse_entry``, ``self.parse_fee`` and ``self.parse_modules``
        directly with the absolute URL. IELTS/TOEFL requirements are
        hard-coded.

        Args:
            response: the downloaded page. ``response.meta['department']``
                is copied onto the item when present.

        Yields:
            Nothing when the location is "online". Otherwise one item per
            major returned by ``self.parse_modules`` (``programme_en`` and
            ``overview_en`` are overwritten for each), or a single item when
            no majors were returned.

        Any exception raised while filling the item is caught, appended to a
        per-university text file under ``scrapySchool_Australian_ben/error/``
        and printed; it is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "The University of Melbourne"
        print("================================================")
        print(response.url)
        item['url'] = response.url
        item['degree_type'] = 1
        item['department'] = response.meta.get('department')
        print("item['department']: ", item['department'])
        try:
            degree_name = response.xpath(
                "//div[@class='headline']/h1/text()|//h1[@id='page-header']//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_name).strip()
            print("item['degree_name']: ", item['degree_name'])

            # A bracketed part "(...)" or a trailing "- ..." part of the name
            # is treated as the programme and removed from the degree name.
            programme = re.findall(r"\(.*\)|\-.*", item['degree_name'])
            print(programme)
            if programme:
                # Remove each matched fragment on its own: when there are
                # several, their concatenation is not a substring of the
                # name, so replacing the joined text would remove nothing.
                stripped_name = item['degree_name']
                for fragment in programme:
                    stripped_name = stripped_name.replace(fragment, '')
                item['degree_name'] = stripped_name.strip()
                item['programme_en'] = ''.join(programme).replace(
                    "(", "").replace(")", "").replace("-", "").strip()
            else:
                # This callback sets degree_type = 1 and uses the
                # undergraduate English requirements, so the names it sees
                # start with "Bachelor of"; "Master of" is still stripped as
                # before for backward compatibility.
                item['programme_en'] = item['degree_name'].replace(
                    "Bachelor of", "").replace("Master of", "").strip()
            print("item['degree_name']=: ", item['degree_name'])
            print("item['programme_en']: ", item['programme_en'])

            duration = response.xpath(
                "//div[@class='course-length icn icn-duration']/text()|//li[contains(text(),'full time')]//text()"
            ).extract()
            clear_space(duration)
            # getIntDuration is expected to return a (value, unit) pair;
            # anything else is ignored -- TODO confirm against the helper.
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            location = response.xpath(
                "//li[@id='course-overview-campus']//text()|//li[contains(text(),'Campus')]//text()"
            ).extract()
            item['location'] = ''.join(location).replace(
                "On Campus", "").replace("(", "").replace(")", "").strip()
            print("item['location']: ", item['location'])

            # Online-only courses are not scraped any further.
            if item['location'].lower() == "online":
                return

            start_date = response.xpath(
                "//li[@id='course-overview-entryPeriods']//text()").extract()
            item['start_date'] = getStartDateMonth(''.join(start_date))

            overview_en = response.xpath(
                "//div[@class='course-content']").extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(overview_en))

            career_url = response.xpath(
                "//a[contains(text(),'Where will this take me?')]/@href"
            ).extract_first()
            if career_url:
                item['career_en'] = self.parse_career(
                    parse.urljoin(response.url, career_url))

            entry_url = response.xpath(
                "//a[contains(text(),'Entry requirements')]/@href"
            ).extract_first()
            if entry_url:
                item['rntry_requirements_en'] = self.parse_entry(
                    parse.urljoin(response.url, entry_url))
            print("item['rntry_requirements_en']: ",
                  item['rntry_requirements_en'])

            fee_url = response.xpath(
                "//a[contains(text(),'Fees & scholarships')]/@href"
            ).extract_first()
            if fee_url:
                item['tuition_fee'] = self.parse_fee(
                    parse.urljoin(response.url, fee_url))
            print("item['tuition_fee']: ", item['tuition_fee'])

            # Hard-coded English requirements, taken from:
            # https://study.unimelb.edu.au/how-to-apply/english-language-requirements/undergraduate-english-language-requirements
            item['ielts_desc'] = " you need a score of 6.5 or more in the Academic International English Language Testing System (IELTS), with no bands less than 6.0."
            item["ielts"] = '6.5'
            item["ielts_l"] = '6.0'
            item["ielts_s"] = '6.0'
            item["ielts_r"] = '6.0'
            item["ielts_w"] = '6.0'
            item['toefl_desc'] = "a score of 79 and scores of 21 for writing, 18 for speaking, 13 for reading, 13 for listening for an internet-based test. To submit your scores when you apply, use our TOEFL Institution Code: 0974."
            item["toefl"] = '79'
            item["toefl_l"] = '13'
            item["toefl_s"] = '18'
            item["toefl_r"] = '13'
            item["toefl_w"] = '21'

            print("ielts: ", item['ielts'], ' - ', item['ielts_l'], ' - ',
                  item['ielts_s'], ' - ', item['ielts_r'], ' - ',
                  item['ielts_w'])
            print("toefl: ", item['toefl'], ' - ', item['toefl_l'], ' - ',
                  item['toefl_s'], ' - ', item['toefl_r'], ' - ',
                  item['toefl_w'])

            # Follow the "What will I study?" link to obtain the modules and
            # the list of majors with their overview nodes.
            modules_url = response.xpath(
                "//a[contains(text(),'What will I study?')]/@href"
            ).extract_first()
            major_list = []
            major_overview_list = []
            if modules_url:
                # parse_modules is indexed as
                # (modules html, major names, per-major overview nodes).
                modules = self.parse_modules(
                    parse.urljoin(response.url, modules_url))
                print("modules: ", modules)
                item['modules_en'] = modules[0]
                major_list = modules[1]
                major_overview_list = modules[2]
                print(len(major_list), "=====", len(major_overview_list))
                print(major_list)
            print("item['modules_en']: ", item['modules_en'])

            # Emit one item per major when majors exist, otherwise one item
            # for the course itself.
            if not major_list:
                yield item
            elif len(major_list) == len(major_overview_list):
                # NOTE(review): the same item object is mutated and yielded
                # for every major; confirm the pipelines consume each item
                # before the next one is produced.
                for major_name, overview_nodes in zip(major_list,
                                                      major_overview_list):
                    item['programme_en'] = major_name
                    major_overview_str = ''.join(
                        etree.tostring(node,
                                       encoding='unicode',
                                       pretty_print=False,
                                       method='html')
                        for node in overview_nodes)
                    item['overview_en'] = remove_class(major_overview_str)
                    print("item['overview']==: ", item['overview_en'])
                    yield item
            # NOTE(review): when the two lists differ in length nothing is
            # yielded at all (unchanged from the original) -- confirm that
            # dropping the course is intended.

        except Exception as e:
            # Record the failure (message + URL) and keep the crawl running.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)