def parse_fee(self, feeUrl, timeout=30):
    """Download a fee page and return the international annual tuition fee.

    The fee is read from a JSON fragment embedded in the page source of the
    form ``"international":{"ff-indicative":"$...","ff-year":"$...``; only
    the ``"ff-year"`` part of that fragment is kept.

    Args:
        feeUrl: URL of the fee page to download.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without a timeout a stalled connection would
            block the caller forever.

    Returns:
        The result of ``getTuition_fee`` applied to the extracted
        ``"ff-year"`` fragment with "$" and "AUD" removed.
    """
    print("feeUrl: ", feeUrl)
    data = requests.get(feeUrl, headers=self.headers_base, timeout=timeout)

    # First narrow the page down to the international fee object, then pull
    # the yearly figure out of it, so fee blocks for other student types
    # cannot be picked up by the second pattern.
    international_blocks = re.findall(
        r"""\"international\"\:\{\"ff\-indicative\":\"\$[\d,]*?\",\"ff\-year\"\:\"\$[\d,]*""",
        data.text)
    tuition_fee1 = re.findall(r"\"ff\-year\"\:\"\$[\d,]*",
                              ''.join(international_blocks))
    print("tuition_feetmp: ", tuition_fee1)

    # The return value is not used, so clear_space presumably cleans the
    # list in place -- TODO confirm against its definition.
    clear_space(tuition_fee1)
    tuition_fee = getTuition_fee(
        ''.join(tuition_fee1).replace("$", "").replace("AUD", ""))
    return tuition_fee
def parse_data(self, response):
    """Build one RMIT University bachelor item from a programme page.

    Fields are read from ``response`` with XPath. When the overview, career
    or entry-requirement sections come back empty, the matching
    ``self.parse_*1`` helper is called with a sub-page URL and the item.
    Any exception raised while the item is being filled is appended to a
    per-university error file and printed; it is not re-raised.

    Args:
        response: Response for the programme page. ``response.meta`` is
            looked up by the page URL to obtain ``major_type1``.

    Yields:
        The populated item.
    """

    def extract_clean(expression):
        # The return value of clear_space is never used in this method, so
        # it presumably cleans the extracted list in place.
        found = response.xpath(expression).extract()
        clear_space(found)
        return found

    def extract_html(expression):
        # Raw node markup, whitespace-normalised and with classes removed.
        return remove_class(
            clear_lianxu_space(response.xpath(expression).extract()))

    item = get_item(ScrapyschoolAustralianBenItem)
    item['university'] = "RMIT University"
    item['url'] = response.url
    item['degree_type'] = 1
    item['major_type1'] = response.meta.get(response.url)
    print("===========================")
    print(response.url)
    print("item['major_type1']: ", item['major_type1'])

    try:
        # ---- degree name and programme name ----
        title_parts = extract_clean(
            "//h1[@id='course-name']//text()|//h1[@class='highLight program-header']//text()"
        )
        item['degree_name'] = ''.join(title_parts).strip()
        if item['degree_name'] == "":
            print("***degree_name为空")
        print("item['degree_name']: ", item['degree_name'])

        bachelor_hits = re.findall(r"Bachelor", item['degree_name'])
        if len(bachelor_hits) < 2:
            # Single degree: prefer the bracketed specialisation, ignoring
            # the "(Honours)" marker; otherwise drop the "Bachelor of" lead.
            bracketed = re.findall(
                r"\(.+\)", item['degree_name'].replace("(Honours)", ""))
            if len(bracketed) > 0:
                programme_name = ''.join(bracketed).replace(
                    "(", "").replace(")", "").strip()
            else:
                programme_name = item['degree_name'].replace(
                    "Bachelor of", "").strip()
            item['programme_en'] = programme_name
        print("item['programme_en']: ", item['programme_en'])

        # ---- location ----
        location_parts = extract_clean(
            "//span[@class='icon-location']/..//text()|"
            "//h4[@class='description'][contains(text(),'Location')]/following-sibling::*//text()"
        )
        item['location'] = ' '.join(location_parts).strip()
        if item['location'] == "":
            print("***location为空")
        print("item['location']: ", item['location'])

        # ---- duration ----
        duration_parts = extract_clean(
            "//div[@class='b-program-content links b-international']//span[@class='icon-clock']/..//text()|"
            "//div[@class='b-program-content links b-international ']//span[@class='icon-clock']/..//text()|"
            "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Duration')]/following-sibling::*//text()"
        )
        item['duration'] = ''.join(duration_parts).strip()

        # ---- tuition fee (a parsed value of 0 is stored as None) ----
        fee_parts = extract_clean(
            "//div[contains(@class,'b-program-content links b-international')]//span[@class='icon-fees']/..//text()|"
            "//div[contains(@class,'b-program-content links b-international ')]//span[@class='icon-fees']/..//text()|"
            "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Fees')]/following-sibling::*//text()"
        )
        item['tuition_fee'] = getTuition_fee(''.join(fee_parts))
        if item['tuition_fee'] == 0:
            item['tuition_fee'] = None

        # ---- intake ----
        intake_parts = extract_clean(
            "//div[@class='b-program-content links b-international']//span[@class='icon-intake']/..//text()|"
            "//div[@class='b-program-content links b-international ']//span[@class='icon-intake']/..//text()|"
            "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Next intake')]/following-sibling::*//text()|"
            "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Next Intake')]/following-sibling::*//text()"
        )
        item['start_date'] = getStartDateMonth(' '.join(intake_parts))
        if item['start_date'] == "":
            print("***start_date 为空")
        print("item['start_date']: ", item['start_date'])

        # ---- overview and modules ----
        item['degree_overview_en'] = extract_html(
            "//div[@id='overview']/..|//div[@id='overview']/../following-sibling::div[1]|"
            "//div[@id='Overview']/..|//div[@id='Overview']/../following-sibling::div[1]"
        )

        plan_links = extract_clean(
            "//table[@class='table program-table']//td//a[contains(text(),'View plan')]/@href"
        )
        if len(plan_links) > 0:
            # A "View plan" link exists: hand the plan page to parse_modules1.
            self.parse_modules1("https://www.rmit.edu.au" + plan_links[0],
                                item)
        else:
            item['modules_en'] = extract_html(
                "//span[contains(text(),'Electives and program structure')]/../../../.."
            )

        if item['degree_overview_en'] == "":
            self.parse_overviewModules1(response.url + "/program-details",
                                        item)
        if item['degree_overview_en'] == "":
            print("***degree_overview_en 为空")
        print("item['degree_overview_en']: ", item['degree_overview_en'])
        if item['modules_en'] == "":
            print("***modules_en 为空")
        print("item['modules_en']: ", item['modules_en'])

        # ---- career ----
        item['career_en'] = extract_html(
            "//div[@id='career']|//div[@id='career']/../following-sibling::div[1]|"
            "//div[@id=' career']|//div[@id=' career']/../following-sibling::div[1]|"
            "//div[@id='Career']|//div[@id='Career']/../following-sibling::div[1]|"
            "//div[@id=' Career']|//div[@id=' Career']/../following-sibling::div[1]"
        )
        if item['career_en'] == "":
            self.parse_career1(response.url + "/career", item)
        if item['career_en'] == "":
            print("***career_en 为空")
        print("item['career_en']: ", item['career_en'])

        # ---- entry requirements ----
        item['rntry_requirements_en'] = extract_html(
            "//div[@id='admissions']/..|//div[@id='admissions']/../following-sibling::*[position()<last()-3]|"
            "//div[@id='Admissions']/..|//div[@id='Admissions']/../following-sibling::*[position()<last()-3]"
        )
        if item['rntry_requirements_en'] == "":
            self.parse_entryrequirements1(
                response.url + "/entry-requirements", item)
        if item['rntry_requirements_en'] == "":
            print("***rntry_requirements_en 为空")

        # ---- IELTS ----
        ielts_texts = response.xpath(
            "//li[contains(text(),'IELTS (Academic): ')]//text()").extract()
        item['ielts_desc'] += clear_lianxu_space(ielts_texts)
        ielts_scores = get_ielts(item['ielts_desc'])
        for field, key in (("ielts", 'IELTS'), ("ielts_l", 'IELTS_L'),
                           ("ielts_s", 'IELTS_S'), ("ielts_r", 'IELTS_R'),
                           ("ielts_w", 'IELTS_W')):
            item[field] = ielts_scores.get(key)

        # ---- TOEFL ----
        toefl_texts = response.xpath(
            "//*[contains(text(),'TOEFL (Internet Based Test - IBT): ')]//text()"
        ).extract()
        item['toefl_desc'] += clear_lianxu_space(toefl_texts)
        toefl_scores = get_toefl(item['toefl_desc'])
        for field, key in (("toefl", 'TOEFL'), ("toefl_l", 'TOEFL_L'),
                           ("toefl_s", 'TOEFL_S'), ("toefl_r", 'TOEFL_R'),
                           ("toefl_w", 'TOEFL_W')):
            item[field] = toefl_scores.get(key)

        # ---- fixed "how to apply" copy shared by every programme ----
        how_to_apply_html = """<div class="share-heading hide">How to Apply</div> </div> </div> <div class="standard-content-article mb-lg-md-md clearfix"> <div class="org-area-module-detail-view accordian "> <div class="row"> <div class="col-xs-12 "> <div class="clearfix"> <p class="lead">A step-by-step guide for international students on how to apply to study at RMIT.</p> <div class="lower-image-container"></div> </div> <!-- Parsys --> <!-- This Parsys will be used to Put all Main Body Components --> <div class="floated-image-container pull-right"> <div class="detail-img-list not-hide image-square"> <figure> <div class="c-detail-image c-detail-image-square"> <img data-ri-xxs="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-800x800/image.jpg" data-ri-sm="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-640x640/image.jpg" class="c-responsive-image bg-cover offset-content"> </div> <div class="c-detail-image c-detail-image-portrait"> <img data-ri-xxs="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-800x1068/image.jpg" data-ri-sm="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-640x854/image.jpg" class="c-responsive-image bg-cover offset-content"> </div> </figure> </div> </div> <div> <div class="extended-desc not-hidden"> <p>If you want to study for only one or two semesters, you can apply for a <a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students/study-abroad-and-exchange/study-abroad.html">study abroad program</a> or <a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students/study-abroad-and-exchange/student-exchange.html">student exchange</a> at RMIT.</p> <h3>Applying for a research degree?</h3> <p>If you want to apply for a research program, <a href="/content/rmit-ui/en/research/phds-and-other-research-degrees/how-to-apply.html">follow this process and apply here</a> instead.<br> </p> <h2>Step 1: Find a program</h2> <p>Search for a program in your <a href="/content/rmit-ui/en/study-with-us.html">interest area</a> or browse by <a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students.html">level of study</a>. Some programs are not available in the July intake, in which case, you will need to apply for the next available intake.</p> <p>You can also use the <a href="https://www.international.rmit.edu.au/info/programfees.asp" title="Programs, intakes and tuition fees database">Programs, intakes and tuition fees database</a> to search for programs.</p> <h2>Step 2: Check the entry requirements</h2> <p>Check that you qualify for the program's entry requirements including:</p> <ul> <li>English language requirements</li> <li>academic entry requirement (see equivalent <a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/entry-requirements/country-equivalency.html">entry requirements by country</a>)</li> <li>pre-requisites</li> <li>selection tasks.</li> </ul> <p>If you don’t meet the entry requirements for your preferred program, you can consider a range of programs that may provide <a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/pathways-and-credit-transfer.html">pathways</a> to your preferred program.</p> <p>If you are currently studying an Australian Year 12 (in Australia or overseas) or International Baccalaureate (in Australia or New Zealand) and applying for a Bachelor, Associate or Honours degree, you will need to apply via VTAC. You should <a href="http://www.rmit.edu.au/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/international-students-studying-vce-or-ib">check the VTAC entry requirements</a>.<br> </p> <h2>Step 3: Collect required documents</h2> <p>To avoid delays in admission processing, submit a complete set of supporting documents including:</p> <ul> <li>passport</li> <li>certified copies of academic transcripts (not required for current RMIT students applying to another RMIT program)</li> <li>certified copies of all graduation certificates in both the original language and English (not required for current RMIT students applying to another RMIT program)</li> <li>evidence of English language proficiency (not required for current RMIT students applying to another RMIT program)</li> <li>any documentation relating to selection tasks (pre-selection kits, folios etc.)</li> <li>CV, work reference letter, referee report etc if applicable</li> </ul> <p> Please note that documents submitted will not be returned.</p> <h2>Step 4: Submit your application</h2> <p>Submit your application online with <a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/documentation-required.html">all the required documents</a>.</p> <h4>Students completing an Australian Year 12 (in Australia or overseas), or the International Baccalaureate (in Australia or New Zealand)</h4> <ul> <li>Apply for <strong>Higher Education</strong> programs (Bachelor, Associate Degree and Honours) through the Victorian Tertiary Admissions Centre (VTAC).<br> <br> <a href="http://www.vtac.edu.au/applying.html">Apply now via VTAC</a></li> <li>Apply for <strong>Vocational Education</strong> programs (Foundation Studies, ELICOS, VCE, Certificate IV, Diploma and Advanced Diplomas) via iApply, the online application system for international students.<br> <br> <a href="https://iapply.rmit.edu.au/sitsvision/wrd/SIW_LGN">Apply now via iApply</a></li> </ul> <h4>Studying fully online<br> </h4> <ul> <li>If your program is delivered fully online, use the online application system for local students and follow the local student application process. Note: fully online programs do not qualify for an Australian Student Visa.<br> <br> <a href="https://rmit.service-now.com/rmit-admissions/">Apply now via Admissions</a></li> </ul> <h4>All other international students<br> </h4> <ul> <li>If you are applying for on-campus study in a coursework program use iApply, the online application system for international students.<br> <br> <a href="https://iapply.rmit.edu.au/sitsvision/wrd/SIW_LGN">Apply now via iApply</a></li> </ul> <h4>Application fee</h4> <p>You will need to pay an application fee if you are from one of these <a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/application-fee.html">countries classified as high risk</a>.</p> <h2>Need help?</h2> <p>If you need assistance, <a href="https://connect.prospectivestudent.info/RMITInt?_ga=1.241036611.1742672422.1416265787">contact us</a> or one of <a href="https://www.international.rmit.edu.au/info/agentlist/">RMIT’s appointed representatives</a> (agents).</p> <h2>Next steps:</h2> <p>Your application will be assessed in line with RMIT’s policies and procedures. If you are successful, you will receive an offer letter. You can then <a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/accept-your-offer.html">accept your offer</a> by following the instructions in your offer letter. </p> <p>RMIT will normally advise you on the outcome of your application within 10 business days. If you are applying from Australia you should hear within 24 hours. If you don't hear back within the time frame above please <a href="https://rmit.au1.qualtrics.com/jfe/form/SV_0fbt3k9dEkNATZ3">contact Admissions Helpdesk</a>.</p> <p>If you are applying via VTAC <a href="http://www.vtac.edu.au/dates.html">check the VTAC website</a> for important dates.</p> </div> </div> </div> </div>"""
        item['apply_proces_en'] = remove_class(
            clear_lianxu_space([how_to_apply_html]))

        # ---- fixed "required documents" copy ----
        required_documents_html = """<p>To avoid delays in admission processing, submit a complete set of supporting documents including:</p> <ul> <li>passport</li> <li>certified copies of academic transcripts (not required for current RMIT students applying to another RMIT program)</li> <li>certified copies of all graduation certificates in both the original language and English (not required for current RMIT students applying to another RMIT program)</li> <li>evidence of English language proficiency (not required for current RMIT students applying to another RMIT program)</li> <li>any documentation relating to selection tasks (pre-selection kits, folios etc.)</li> <li>CV, work reference letter, referee report etc if applicable</li> </ul> <p> Please note that documents submitted will not be returned.</p>"""
        item['apply_documents_en'] = remove_class(
            clear_lianxu_space([required_documents_html]))

        yield item
    except Exception as e:
        # Best-effort crawl: record the failing URL and carry on with the
        # next page instead of aborting the spider.
        error_path = ("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt")
        with open(error_path, 'a', encoding="utf-8") as f:
            f.write(
                str(e) + "\n" + response.url + "\n========================\n")
        print("异常:", str(e))
        print("报错url:", response.url)
def parse_data(self, degree_url, item, timeout=30):
    """Download a degree page and copy its details into ``item``.

    The page is fetched synchronously with ``requests`` and parsed with
    ``etree.HTML``. Entry requirements and the tuition fee come from a
    second request made by ``self.parse_rntry_tuition_fee``.

    Args:
        degree_url: URL of the degree page.
        item: Mapping-like item that is filled in place. The keys written
            are ``degree_name``, ``department``, ``duration``,
            ``duration_per``, ``location``, ``degree_overview_en``,
            ``rntry_requirements_en``, ``tuition_fee`` and
            ``tuition_fee_pre``.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without a timeout a stalled connection would
            block the caller forever.

    Returns:
        None. All results are written into ``item``.
    """
    print("学位类型链接============" + degree_url + "===============")
    data = requests.get(degree_url, headers=self.headers, timeout=timeout)
    response = etree.HTML(data.text)

    degree_name = response.xpath(
        "//div[@class='headline']/h1/text()|//h1[@id='page-header']//text()"
    )
    item['degree_name'] = ''.join(degree_name).strip()
    print("item['degree_name']: ", item['degree_name'])

    # The department is the degree name without its "Bachelor of" lead; it
    # stays empty for names that do not follow that pattern.
    department = ''
    if "Bachelor of " in item['degree_name']:
        department = item['degree_name'].replace("Bachelor of", "").strip()
    item['department'] = department
    print("item['department']: ", item['department'])

    duration = response.xpath(
        "//div[@class='course-length icn icn-duration']/text()|//li[contains(text(),'full time')]//text()"
    )
    # The return value is not used, so clear_space presumably cleans the
    # list in place -- TODO confirm against its definition.
    clear_space(duration)
    print("duration:", duration)
    duration_list = getIntDuration(''.join(duration))
    # Only a two-element result is accepted: first the duration value, last
    # its unit.
    if len(duration_list) == 2:
        item['duration'] = duration_list[0]
        item['duration_per'] = duration_list[-1]
    print("item['duration']: ", item['duration'])
    print("item['duration_per']: ", item['duration_per'])

    location = response.xpath(
        "//div[@class='course-location icn icn-location']//text()|//li[contains(text(),'campus')]//text()"
    )
    item['location'] = ''.join(location).strip()
    print("item['location']: ", item['location'])

    degree_description = response.xpath(
        "//div[@class='primary']//div[@class='description']|"
        "//section[@id='course-overview']//div[@class='course-section__main course-section__main-with-aside']"
    )
    # Serialise every matched node back to HTML and concatenate them; an
    # empty match list simply yields an empty string.
    degree_description_str = "".join(
        etree.tostring(deg_desc, encoding='unicode', method='html')
        for deg_desc in degree_description)
    item['degree_overview_en'] = remove_class(
        clear_lianxu_space([degree_description_str]))
    print("item['degree_overview_en']: ", item['degree_overview_en'])

    # data.url is used instead of degree_url so any redirect is followed.
    rntry_tuition_fee_url = data.url + ".inline?profile_citizenship=international&profile_qualification=76&profile_year=2019"
    rntry_tuition_fee_list = self.parse_rntry_tuition_fee(
        rntry_tuition_fee_url)
    item['rntry_requirements_en'] = rntry_tuition_fee_list[0]
    print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

    item['tuition_fee'] = getTuition_fee(rntry_tuition_fee_list[1])
    # A parsed fee of 0 is stored as None; the currency prefix is only set
    # when a fee was found.
    if item['tuition_fee'] == 0:
        item['tuition_fee'] = None
    else:
        item['tuition_fee_pre'] = "AUD$"
    print("item['tuition_fee']: ", item['tuition_fee'])
def parse_data(self, response):
    """Parse a University of South Australia degree page.

    When the page lists specialisations, a new request is yielded for each
    linked page with this method as its callback. Otherwise the page itself
    is scraped into a bachelor item.

    The programme name is derived from the degree title. A standalone
    leading or trailing word "in" is removed as a whole word. The earlier
    ``str.strip("in")`` removed any leading or trailing ``i``/``n``
    characters, which turned "Design" into "Desig".

    Any exception raised while the item is being filled is appended to a
    per-university error file and printed; it is not re-raised.

    Args:
        response: Response for the degree page.

    Yields:
        ``scrapy.Request`` objects for specialisation pages, or the
        populated item when "research" does not occur in its degree name.
    """

    def drop_edge_in(text):
        # Remove the word "in" only when it stands alone at either end.
        return re.sub(r"^in\b\s*|\s*\bin$", "", text.strip()).strip()

    def extract_html(expression):
        # Raw node markup, whitespace-normalised and with classes removed.
        return remove_class(
            clear_lianxu_space(response.xpath(expression).extract()))

    # Check whether the degree has specialisation pages underneath it.
    specialisations = response.xpath(
        "//h2[contains(text(),'Specialisations')]/following-sibling::*//a/@href"
    ).extract()
    if len(specialisations) > 0:
        for link in specialisations:
            if "http" in link:
                url = link
            else:
                url = "http://study.unisa.edu.au" + link
            yield scrapy.Request(url, callback=self.parse_data)
    else:
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "University of South Australia"
        item['url'] = response.url
        print("===========================")
        print(response.url)
        item['degree_type'] = 1
        try:
            programme = response.xpath(
                "//div[@class='title-row']/h1/text()").extract()
            # The return value is not used, so clear_space presumably
            # cleans the list in place -- TODO confirm.
            clear_space(programme)
            item['degree_name'] = ''.join(programme).replace(
                "(International)", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor", item['degree_name'])
            print("pre_re: ", pro_re)
            if len(pro_re) < 2:
                programme_re = re.findall(r"\(.+\)", item['degree_name'])
                print("programme_re: ", programme_re)
                if len(programme_re) > 0:
                    if ''.join(programme_re).strip() != "(Honours)":
                        # A bracketed specialisation is the programme name.
                        item['programme_en'] = ''.join(
                            programme_re).replace("(", "").replace(
                                ")", "").strip()
                    else:
                        item['programme_en'] = drop_edge_in(
                            item['degree_name'].replace(
                                "Bachelor of", "").replace("(Honours)", ""))
                else:
                    item['programme_en'] = drop_edge_in(
                        item['degree_name'].replace("Bachelor of", ""))
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//span[contains(text(), 'Start')]/../text()").extract()
            clear_space(start_date)
            item['start_date'] = getStartDateMonth(', '.join(start_date))
            print("item['start_date']: ", item['start_date'])

            location = response.xpath(
                "//span[contains(text(),'Campus')]/../a//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            print("item['location']: ", item['location'])

            duration = response.xpath(
                "//span[contains(text(),'Duration')]/../text()").extract()
            clear_space(duration)
            item['duration'] = ''.join(duration).strip()
            print("item['duration']: ", item['duration'])

            tuition_fee = response.xpath(
                "//span[contains(text(),'2019: AUD$')]//text()|"
                "//span[contains(text(),'Fees')]/../text()").extract()
            print("tuition_fee: ", tuition_fee)
            clear_space(tuition_fee)
            tuition_fee = getTuition_fee(''.join(tuition_fee))
            # The fee is stored as a string; a parsed "0" is stored as None.
            item['tuition_fee'] = str(tuition_fee)
            if item['tuition_fee'] == '0':
                item['tuition_fee'] = None
            print("item['tuition_fee']: ", item['tuition_fee'])

            ielts = response.xpath(
                "//span[contains(text(),'English Language Requirements')]/../ul//text()"
            ).extract()
            clear_space(ielts)
            item['ielts_desc'] = ' '.join(ielts).strip()
            print("item['ielts_desc']: ", item['ielts_desc'])

            # The first number in the description is taken as the overall
            # IELTS score.
            ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            if len(ieltlsrw) > 0:
                item["ielts"] = ieltlsrw[0]
            # Band scores appear as "<skill> [<score>]" in the description.
            for field, skill in (("ielts_l", "listening"),
                                 ("ielts_s", "speaking"),
                                 ("ielts_r", "reading"),
                                 ("ielts_w", "writing")):
                band = re.findall(skill + r"\s\[.*?\]", item['ielts_desc'])
                item[field] = ''.join(band).replace(skill, "").replace(
                    "[", "").replace("]", "").strip()
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            item['rntry_requirements_en'] = extract_html(
                "//div[@class='page-info-block-inner']//ul[@id='entry-requirements']"
            )
            print("item['rntry_requirements_en']: ",
                  item['rntry_requirements_en'])

            item['degree_overview_en'] = extract_html(
                "//h2[contains(text(),'Degree overview')]/../../..")
            print("item['degree_overview_en']: ", item['degree_overview_en'])

            item['overview_en'] = extract_html(
                "//h2[contains(text(),'Snapshot')]/..|"
                "//h3[contains(text(),'Snapshot')]/..")
            print("item['overview_en']: ", item['overview_en'])

            item['modules_en'] = extract_html(
                "//h2[@class='theme-white'][contains(text(), 'Degree structure')]/../..|"
                "//h3[contains(text(),'Degree structure')]/../..")
            print("item['modules_en']: ", item['modules_en'])

            item['career_en'] = extract_html(
                "//h2[contains(text(),'Your career')]/../../..|"
                "//h3[contains(text(),'Your career')]/..")
            print("item['career_en']: ", item['career_en'])

            item['apply_desc_en'] = extract_html(
                "//h2[contains(text(),'How to apply')]/../../..")
            print("item['apply_desc_en']: ", item['apply_desc_en'])

            # Degrees whose name contains "research" are not emitted.
            if "research" not in item['degree_name']:
                yield item
        except Exception as e:
            # Best-effort crawl: record the failing URL and carry on with
            # the next page instead of aborting the spider.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
def parse(self, response):
    """Scrape one Deakin University bachelor course page into item(s).

    The course URL is mapped to a display name through a hard-coded table;
    the page is then queried for name, IELTS text, duration, campuses,
    overview, structure, start date, entry requirements, fee, career and
    application sections.

    Yields:
        One item for the course page itself when it links to no major
        pages; otherwise one item per linked major page, with ``url``,
        ``programme_en``, ``location``, ``overview_en`` and ``modules_en``
        taken from the major page.

    Any exception raised while extracting is caught, appended to a text
    file under ``scrapySchool_Australian_ben/error/`` and printed; it is
    not re-raised.

    NOTE(review): the same ``item`` object is mutated and yielded again for
    every major. If a downstream consumer keeps a reference instead of
    processing the item immediately, later majors would overwrite earlier
    ones -- confirm against the pipelines.
    """
    item = get_item(ScrapyschoolAustralianBenItem)
    item['university'] = "Deakin University"
    item['degree_type'] = 1
    print("===========================")
    print(response.url)

    # Course page URLs and their display names; the two lists are
    # index-aligned and combined into a lookup dictionary below.
    links = [
        "http://www.deakin.edu.au/course/bachelor-arts-international",
        "http://www.deakin.edu.au/course/bachelor-arts-honours-international",
        "http://www.deakin.edu.au/course/bachelor-arts-psychology-international",
        "http://www.deakin.edu.au/course/bachelor-arts-psychology-honours-international",
        "http://www.deakin.edu.au/course/bachelor-arts-advanced-honours-international",
        "http://www.deakin.edu.au/course/bachelor-arts-chinese-bachelor-commerce-international",
        "http://www.deakin.edu.au/course/bachelor-arts-master-teaching-secondary-international",
        "http://www.deakin.edu.au/course/bachelor-arts-bachelor-commerce-international",
        "http://www.deakin.edu.au/course/bachelor-arts-bachelor-laws-international",
        "http://www.deakin.edu.au/course/bachelor-arts-bachelor-science-international",
        "http://www.deakin.edu.au/course/bachelor-arts-master-arts-international-relations-international",
        "http://www.deakin.edu.au/course/bachelor-biomedical-science-international",
        "http://www.deakin.edu.au/course/bachelor-business-international",
        "http://www.deakin.edu.au/course/bachelor-business-sport-management-international",
        "http://www.deakin.edu.au/course/bachelor-commerce-international",
        "http://www.deakin.edu.au/course/bachelor-commerce-bachelor-information-systems-international",
        "http://www.deakin.edu.au/course/bachelor-commerce-bachelor-laws-international",
        "http://www.deakin.edu.au/course/bachelor-commerce-bachelor-science-international",
        "http://www.deakin.edu.au/course/bachelor-communication-advertising-international",
        "http://www.deakin.edu.au/course/bachelor-communication-digital-media-international",
        "http://www.deakin.edu.au/course/bachelor-communication-honours-international",
        "http://www.deakin.edu.au/course/bachelor-communication-journalism-international",
        "http://www.deakin.edu.au/course/bachelor-communication-public-relations-international",
        "http://www.deakin.edu.au/course/bachelor-computer-science-international",
        "http://www.deakin.edu.au/course/bachelor-construction-management-honours-international",
        "http://www.deakin.edu.au/course/bachelor-creative-arts-drama-international",
        "http://www.deakin.edu.au/course/bachelor-creative-arts-honours-international",
        "http://www.deakin.edu.au/course/bachelor-creative-arts-photography-international",
        "http://www.deakin.edu.au/course/bachelor-creative-arts-visual-arts-international",
        "http://www.deakin.edu.au/course/bachelor-creative-writing-international",
        "http://www.deakin.edu.au/course/bachelor-criminology-international",
        "http://www.deakin.edu.au/course/bachelor-criminology-bachelor-cyber-security-international",
        "http://www.deakin.edu.au/course/bachelor-criminology-bachelor-laws-international",
        "http://www.deakin.edu.au/course/bachelor-criminology-bachelor-psychological-science-international",
        "http://www.deakin.edu.au/course/bachelor-cyber-security-international",
        "http://www.deakin.edu.au/course/bachelor-design-3d-animation-international",
        "http://www.deakin.edu.au/course/bachelor-design-architecture-international",
        "http://www.deakin.edu.au/course/bachelor-design-architecture-bachelor-construction-management-honours-international",
        "http://www.deakin.edu.au/course/bachelor-design-digital-technologies-international",
        "http://www.deakin.edu.au/course/bachelor-design-visual-communication-international",
        "http://www.deakin.edu.au/course/bachelor-education-early-years-international",
        "http://www.deakin.edu.au/course/bachelor-education-primary-international",
        "http://www.deakin.edu.au/course/bachelor-environmental-engineering-honours-international",
        "http://www.deakin.edu.au/course/bachelor-environmental-science-environmental-management-and-sustainability-international",
        "http://www.deakin.edu.au/course/bachelor-environmental-science-honours-international",
        "http://www.deakin.edu.au/course/bachelor-environmental-science-marine-biology-international",
        "http://www.deakin.edu.au/course/bachelor-environmental-science-wildlife-and-conservation-biology-international",
        "http://www.deakin.edu.au/course/bachelor-exercise-and-sport-science-international",
        "http://www.deakin.edu.au/course/bachelor-exercise-and-sport-science-honours-international",
        "http://www.deakin.edu.au/course/bachelor-exercise-and-sport-science-bachelor-business-sport-management-international",
        "http://www.deakin.edu.au/course/bachelor-film-television-and-animation-international",
        "http://www.deakin.edu.au/course/bachelor-food-and-nutrition-sciences-honours-international",
        "http://www.deakin.edu.au/course/bachelor-forensic-science-international",
        "http://www.deakin.edu.au/course/bachelor-forensic-science-honours-international",
        "http://www.deakin.edu.au/course/bachelor-forensic-science-bachelor-criminology-international",
        "http://www.deakin.edu.au/course/bachelor-health-sciences-international",
        "http://www.deakin.edu.au/course/bachelor-health-sciences-honours-international",
        "http://www.deakin.edu.au/course/bachelor-health-sciences-bachelor-arts-international",
        "http://www.deakin.edu.au/course/bachelor-health-and-medical-science-honours-international",
        "http://www.deakin.edu.au/course/bachelor-health-and-physical-education-international",
        "http://www.deakin.edu.au/course/bachelor-information-systems-international",
        "http://www.deakin.edu.au/course/bachelor-information-systems-bachelor-information-technology-international",
        "http://www.deakin.edu.au/course/bachelor-information-technology-international",
        "http://www.deakin.edu.au/course/bachelor-information-technology-honours-international",
        "http://www.deakin.edu.au/course/bachelor-international-studies-international",
        "http://www.deakin.edu.au/course/bachelor-international-studies-global-scholar-international",
        "http://www.deakin.edu.au/course/bachelor-international-studies-bachelor-commerce-international",
        "http://www.deakin.edu.au/course/bachelor-laws-international",
        "http://www.deakin.edu.au/course/bachelor-laws-bachelor-international-studies-international",
        "http://www.deakin.edu.au/course/bachelor-nursing-international",
        "http://www.deakin.edu.au/course/bachelor-nursing-honours-international",
        "http://www.deakin.edu.au/course/bachelor-nursing-bachelor-midwifery-international",
        "http://www.deakin.edu.au/course/bachelor-nursing-bachelor-psychological-science-international",
        "http://www.deakin.edu.au/course/bachelor-nursing-bachelor-public-health-and-health-promotion-international",
        "http://www.deakin.edu.au/course/bachelor-nutrition-science-international",
        "http://www.deakin.edu.au/course/bachelor-nutrition-science-bachelor-commerce-international",
        "http://www.deakin.edu.au/course/bachelor-occupational-therapy-international",
        "http://www.deakin.edu.au/course/bachelor-property-and-real-estate-international",
        "http://www.deakin.edu.au/course/bachelor-property-and-real-estate-bachelor-commerce-international",
        "http://www.deakin.edu.au/course/bachelor-property-and-real-estate-bachelor-laws-international",
        "http://www.deakin.edu.au/course/bachelor-psychological-science-international",
        "http://www.deakin.edu.au/course/bachelor-psychological-science-honours-international",
        "http://www.deakin.edu.au/course/bachelor-public-health-and-health-promotion-international",
        "http://www.deakin.edu.au/course/bachelor-public-health-and-health-promotion-honours-international",
        "http://www.deakin.edu.au/course/bachelor-public-health-and-health-promotion-bachelor-commerce-international",
        "http://www.deakin.edu.au/course/bachelor-science-international",
        "http://www.deakin.edu.au/course/bachelor-science-honours-international",
        "http://www.deakin.edu.au/course/bachelor-science-master-teaching-secondary-international",
        "http://www.deakin.edu.au/course/bachelor-science-bachelor-laws-international",
        "http://www.deakin.edu.au/course/bachelor-social-work-international",
        "http://www.deakin.edu.au/course/bachelor-software-engineering-honours-international",
        "http://www.deakin.edu.au/course/bachelor-sport-development-international",
        "http://www.deakin.edu.au/course/bachelor-vision-science-master-optometry-international",
        "http://www.deakin.edu.au/course/bachelor-zoology-and-animal-science-international",
        "http://www.deakin.edu.au/course/bachelor-zoology-and-animal-science-honours-international",
    ]
    programme_list = [
        "Bachelor of Arts",
        "Bachelor of Arts (Honours)",
        "Bachelor of Arts (Psychology)",
        "Bachelor of Arts (Psychology) (Honours)",
        "Bachelor of Arts - Advanced (Honours)",
        "Bachelor of Arts - Chinese/Bachelor of Commerce",
        "Bachelor of Arts / Master of Teaching (Secondary)",
        "Bachelor of Arts/Bachelor of Commerce",
        "Bachelor of Arts/Bachelor of Laws",
        "Bachelor of Arts/Bachelor of Science",
        "Bachelor of Arts/Master of Arts (International Relations)",
        "Bachelor of Biomedical Science",
        "Bachelor of Business",
        "Bachelor of Business (Sport Management)",
        "Bachelor of Commerce",
        "Bachelor of Commerce/Bachelor of Information Systems",
        "Bachelor of Commerce/Bachelor of Laws",
        "Bachelor of Commerce/Bachelor of Science",
        "Bachelor of Communication (Advertising)",
        "Bachelor of Communication (Digital Media)",
        "Bachelor of Communication (Honours)",
        "Bachelor of Communication (Journalism)",
        "Bachelor of Communication (Public Relations)",
        "Bachelor of Computer Science",
        "Bachelor of Construction Management (Honours)",
        "Bachelor of Creative Arts (Drama)",
        "Bachelor of Creative Arts (Honours)",
        "Bachelor of Creative Arts (Photography)",
        "Bachelor of Creative Arts (Visual Arts)",
        "Bachelor of Creative Writing",
        "Bachelor of Criminology",
        "Bachelor of Criminology/Bachelor of Cyber Security",
        "Bachelor of Criminology/Bachelor of Laws",
        "Bachelor of Criminology/Bachelor of Psychological Science",
        "Bachelor of Cyber Security",
        "Bachelor of Design (3D Animation)",
        "Bachelor of Design (Architecture)",
        "Bachelor of Design (Architecture)/Bachelor of Construction Management (Honours)",
        "Bachelor of Design (Digital Technologies)",
        "Bachelor of Design (Visual Communication)",
        "Bachelor of Education (Early Years)",
        "Bachelor of Education (Primary)",
        "Bachelor of Environmental Engineering (Honours)",
        "Bachelor of Environmental Science (Environmental Management and Sustainability)",
        "Bachelor of Environmental Science (Honours)",
        "Bachelor of Environmental Science (Marine Biology)",
        "Bachelor of Environmental Science (Wildlife and Conservation Biology)",
        "Bachelor of Exercise and Sport Science",
        "Bachelor of Exercise and Sport Science (Honours)",
        "Bachelor of Exercise and Sport Science/Bachelor of Business (Sport Management)",
        "Bachelor of Film, Television and Animation",
        "Bachelor of Food and Nutrition Sciences (Honours)",
        "Bachelor of Forensic Science",
        "Bachelor of Forensic Science (Honours)",
        "Bachelor of Forensic Science/Bachelor of Criminology",
        "Bachelor of Health Sciences",
        "Bachelor of Health Sciences (Honours)",
        "Bachelor of Health Sciences/Bachelor of Arts",
        "Bachelor of Health and Medical Science (Honours)",
        "Bachelor of Health and Physical Education",
        "Bachelor of Information Systems",
        "Bachelor of Information Systems/Bachelor of Information Technology",
        "Bachelor of Information Technology",
        "Bachelor of Information Technology (Honours)",
        "Bachelor of International Studies",
        "Bachelor of International Studies (Global Scholar)",
        "Bachelor of International Studies/Bachelor of Commerce",
        "Bachelor of Laws",
        "Bachelor of Laws/Bachelor of International Studies",
        "Bachelor of Nursing",
        "Bachelor of Nursing (Honours)",
        "Bachelor of Nursing/Bachelor of Midwifery",
        "Bachelor of Nursing/Bachelor of Psychological Science",
        "Bachelor of Nursing/Bachelor of Public Health and Health Promotion",
        "Bachelor of Nutrition Science",
        "Bachelor of Nutrition Science/Bachelor of Commerce",
        "Bachelor of Occupational Therapy",
        "Bachelor of Property and Real Estate",
        "Bachelor of Property and Real Estate/Bachelor of Commerce",
        "Bachelor of Property and Real Estate/Bachelor of Laws",
        "Bachelor of Psychological Science",
        "Bachelor of Psychological Science (Honours)",
        "Bachelor of Public Health and Health Promotion",
        "Bachelor of Public Health and Health Promotion (Honours)",
        "Bachelor of Public Health and Health Promotion/Bachelor of Commerce",
        "Bachelor of Science",
        "Bachelor of Science (Honours)",
        "Bachelor of Science / Master of Teaching (Secondary)",
        "Bachelor of Science/Bachelor of Laws",
        "Bachelor of Social Work",
        "Bachelor of Software Engineering (Honours)",
        "Bachelor of Sport Development",
        "Bachelor of Vision Science/Master of Optometry",
        "Bachelor of Zoology and Animal Science",
        "Bachelor of Zoology and Animal Science (Honours)",
    ]
    programme_dict = dict(zip(links, programme_list))

    # None when the response URL is not one of the listed course pages.
    item['major_type1'] = programme_dict.get(response.url)
    print("item['major_type1']: ", item['major_type1'])

    # Loop-invariant settings for the blocking per-major fetches below.
    headers_base = {
        'User-Agent':
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    # Without a timeout a stalled server would block this callback forever;
    # a timeout error is handled by the except clause like any other failure.
    request_timeout = 30

    try:
        programme = response.xpath(
            "//div[@class='module__banner-title']/h1//text()").extract()
        clear_space(programme)
        item['degree_name'] = ''.join(programme).strip()
        print("item['degree_name']: ", item['degree_name'])

        # programme_en is derived only for single-degree names (fewer than
        # two "Bachelor" occurrences): prefer the bracketed specialisation,
        # otherwise strip the degree prefix.
        pro_re = re.findall(r"Bachelor", item['degree_name'])
        if len(pro_re) < 2:
            programme_re = re.findall(
                r"\(.+\)", item['degree_name'].replace("(Honours)", ""))
            if programme_re:
                item['programme_en'] = ''.join(programme_re).replace(
                    "(", "").replace(")", "").strip()
            else:
                item['programme_en'] = item['degree_name'].replace(
                    "(Honours)", "").replace("Master of ", "").replace(
                        "Bachelor of", "").strip()
            # NOTE(review): as written this replace looks like a no-op; the
            # first argument may originally have been a double or
            # non-breaking space -- confirm against the original file.
            item['programme_en'] = item['programme_en'].replace(
                " ", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

        ielts = response.xpath(
            "//h3[contains(text(),'English language requirements')]/../following-sibling::*[1]//text()"
        ).extract()
        clear_space(ielts)
        item['ielts_desc'] = ''.join(ielts).strip()
        print("item['ielts_desc']: ", item['ielts_desc'])

        ielts_d = get_ielts(item['ielts_desc'])
        item["ielts"] = ielts_d.get('IELTS')
        item["ielts_l"] = ielts_d.get('IELTS_L')
        item["ielts_s"] = ielts_d.get('IELTS_S')
        item["ielts_r"] = ielts_d.get('IELTS_R')
        item["ielts_w"] = ielts_d.get('IELTS_W')

        duration = response.xpath(
            "//h3[contains(text(),'Duration')]/../following-sibling::div//text()"
        ).extract()
        clear_space(duration)
        # Keep only the text up to and including "full time"/"full-time".
        duration_re = re.findall(r".*full[\s\-]time",
                                 ''.join(duration).strip())
        item['duration'] = ''.join(duration_re).strip()

        location = response.xpath(
            "//div[@class='module__summary--icon-wrapper']//h3[@class='course__subheading'][contains(text(),'Campuses')]/../following-sibling::div//text()"
        ).extract()
        clear_space(location)
        item['location'] = ' '.join(location).strip()
        # Remembered as the fallback for major pages that list no campus.
        location_tmp = item['location']

        overview = response.xpath(
            "//h2[contains(text(),'Course information')]/../.."
        ).extract()
        item['degree_overview_en'] = remove_class(
            clear_lianxu_space(overview))

        modules = response.xpath(
            "//div[@id='module__course-structure']").extract()
        item['modules_en'] = remove_class(clear_lianxu_space(modules))

        start_date = response.xpath(
            "//li[contains(text(),'Start date:')]//text()").extract()
        clear_space(start_date)
        item['start_date'] = getStartDateMonth(
            ' '.join(start_date).strip())

        entry_requirements = response.xpath(
            "//div[@data-section='entry requirements']").extract()
        item['rntry_requirements_en'] = remove_class(
            clear_lianxu_space(entry_requirements))

        tuition_fee = response.xpath(
            "//div[@class='module__content-panel']//div[@class='module__key-information--item-content']/text()"
        ).extract()
        clear_space(tuition_fee)
        tuition_fee = getTuition_fee(''.join(tuition_fee))
        item['tuition_fee'] = tuition_fee
        # A fee of 0 is stored as None.
        if item['tuition_fee'] == 0:
            item['tuition_fee'] = None

        career = response.xpath(
            "//div[@data-section='graduate outcomes']|//div[@data-section='graduate outcomes']/following-sibling::div[1]|"
            "//h3[contains(text(),'Career outcomes')]/../..").extract()
        item['career_en'] = remove_class(clear_lianxu_space(career))

        how_to_apply = response.xpath(
            "//h3[contains(text(),'How to apply')]/../..").extract()
        item['apply_desc_en'] = remove_class(
            clear_lianxu_space(how_to_apply))

        major_list_url = response.xpath(
            "//h3[contains(text(), 'Major Sequences')]/..//a/@href|"
            "//h3[contains(text(), 'Major sequences')]/..//a/@href|"
            "//h3[contains(text(), 'Major sequences')]/following-sibling::ul[1]//a/@href|"
            "//td[contains(text(),'Major')]/preceding-sibling::td//a/@href"
        ).extract()
        clear_space(major_list_url)
        print("major_list_url: ", major_list_url)
        print(len(major_list_url))

        # Only hrefs containing "major" are treated as major pages.
        major_url_l = [href for href in major_list_url if "major" in href]
        print("major_url_l: ", major_url_l)
        print(len(major_url_l))

        if not major_url_l:
            item['url'] = response.url
            print("item['url']2: ", item['url'])
            yield item
        else:
            for major_url in major_url_l:
                # hrefs may be relative; resolve them against the course
                # page so requests receives an absolute URL (absolute hrefs
                # pass through unchanged).
                major_url = response.urljoin(major_url)
                data = requests.get(major_url,
                                    headers=headers_base,
                                    timeout=request_timeout)
                response_major = etree.HTML(data.text)

                item['url'] = major_url
                print("item['url']_major: ", item['url'])

                programme_major = response_major.xpath(
                    "//div[@class='module__banner-title']/h1//text()")
                item['programme_en'] = ''.join(programme_major).strip()
                print("item['programme_en']_major: ", item['programme_en'])

                location_major = response_major.xpath(
                    "//*[contains(text(),'Campuses')]/../following-sibling::div[1]//text()"
                )
                item['location'] = ''.join(location_major).strip()
                if item['location'] == "":
                    item['location'] = location_tmp

                # Serialise every matched element back to HTML; no match
                # leaves an empty string to be cleaned and stored.
                overview_nodes = response_major.xpath(
                    "//h2[contains(text(),'Overview')]/../..")
                overview_en_str = ''.join(
                    etree.tostring(node, encoding='unicode', method='html')
                    for node in overview_nodes)
                item['overview_en'] = remove_class(
                    clear_lianxu_space([overview_en_str]))

                modules_nodes = response_major.xpath(
                    "//h2[contains(text(),'Explore units')]/../..")
                modules_en_str = ''.join(
                    etree.tostring(node, encoding='unicode', method='html')
                    for node in modules_nodes)
                item['modules_en'] = remove_class(
                    clear_lianxu_space([modules_en_str]))

                yield item
    except Exception as e:
        # Best-effort scraping: record the failure and the URL, then carry on.
        with open("scrapySchool_Australian_ben/error/" + item['university'] +
                  str(item['degree_type']) + ".txt",
                  'a',
                  encoding="utf-8") as f:
            f.write(
                str(e) + "\n" + response.url + "\n========================\n")
        print("异常:", str(e))
        print("报错url:", response.url)