def parse(self, response):
    """Build one De Montfort University course item from a course page.

    Args:
        response: Response object for a single course page; only its
            ``url`` attribute and ``xpath()`` method are used.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.

    Raises:
        IndexError: If the page has no course title ``<h1>``.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    print(response.url)

    course_title = response.xpath(
        '//div[@class="block__details block__details--overlay block__details--courseOverlay"]//h1[@class="block__details__title"]//text()'
    ).extract()[0]
    course_title = course_title.strip()

    # A short capitalised token followed by a bracketed word, e.g. "Xx (Yyy)",
    # is cut out of the programme name first.
    bracketed = ''.join(
        re.findall(r'[A-Z]{1}[A-Za-z]{1,3}\s?\([a-zA-Z]*\)', course_title))
    programme = course_title.replace(bracketed, '')
    if bracketed == '':
        degree = ''.join(re.findall('MA|MSc', course_title))
    else:
        # NOTE(review): when the bracketed form matches, the degree name is
        # deliberately left empty by the original code - confirm this is
        # intended rather than storing the matched text.
        degree = ''

    # Programme description
    overview = remove_class(response.xpath(
        '//div[@class="block large-8 columns course-col2"]').extract())
    overview = clear_same_s(overview)

    # Tuition fee
    tuition_fee = getTuition_fee(
        response.xpath('//*[contains(text(),"£")]//text()').extract())

    # Course length; teach_time is '1' when the text mentions "full"
    # (case-insensitive), otherwise '2'.
    duration_text = response.xpath(
        '//*[contains(text(),"uration")]/..//text()').extract()
    mode = '1' if re.findall('(?i)full', ''.join(duration_text)) else '2'
    try:
        duration = clear_duration(duration_text)
    except Exception:
        # Best effort: an unparsable duration must not drop the whole item.
        duration = {'duration_per': None, 'duration': None}
    print(duration)

    # Entry requirements
    standard = remove_class(response.xpath(
        '//div[@class="row row--block course-section course-section--criteria"]'
    ).extract())
    standard = clear_same_s(standard)

    # Modules and assessment (both fields come from the same page section)
    evaluation_method = remove_class(response.xpath(
        '//div[@id="cycle-slideshow_course"]').extract())
    evaluation_method = clear_same_s(evaluation_method)
    teaching_assessment = evaluation_method.strip()

    # Careers
    career = remove_class(response.xpath(
        '//div[@class="row row--block course-section course-section--opps"]'
    ).extract())
    career = clear_same_s(career)

    ielts = get_ielts(response.xpath(
        '//*[contains(text(),"IELTS")]//text()').extract())
    if ielts != {} and ielts != []:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']
    else:
        item['ielts'] = ''
        item['ielts_l'] = ''
        item['ielts_s'] = ''
        item['ielts_r'] = ''
        item['ielts_w'] = ''

    programme = programme.replace(degree, '').strip()

    item["university"] = 'De Montfort University'
    # Fixed: the original literal was 'Lestat de Lioncourt' (a fictional
    # character); De Montfort University is located in Leicester.
    item["location"] = 'Leicester'
    item["department"] = ''
    item["programme_en"] = programme
    item["degree_name"] = degree
    item['degree_type'] = 2
    item["teach_time"] = mode
    item['teach_type'] = 'taught'
    item["overview_en"] = overview
    item["assessment_en"] = teaching_assessment
    item["career_en"] = career
    item["tuition_fee"] = tuition_fee
    item['tuition_fee_pre'] = '£'
    item["modules_en"] = evaluation_method
    item["duration"] = duration['duration']
    item['duration_per'] = duration['duration_per']
    item["start_date"] = '2018-9'
    item["rntry_requirements"] = standard
    item["url"] = response.url
    yield item
def parse_main(self, response):
    """Build an Anglia Ruskin University course item from a course page.

    Args:
        response: Response object for a single course page; only its
            ``url`` attribute and ``xpath()`` method are used.

    Returns:
        None. NOTE(review): the final ``yield item`` is commented out in the
        original source, so the populated item is currently discarded -
        confirm whether that is intentional before re-enabling it.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Anglia Ruskin University'
    item['url'] = response.url
    item['teach_time'] = '1'

    # The <h1> text is split on CRLF; a four-part split carries the programme
    # name in part 1 and the degree in part 2.
    title_parts = ''.join(
        response.xpath('//h1/text()').extract()).split('\r\n')
    if len(title_parts) == 4:
        prog = title_parts[1].strip()
        item['degree_name'] = title_parts[2].strip()
    else:
        prog = ''.join(title_parts)
    item['programme_en'] = prog

    location = response.xpath(
        '//div[@class="course-summary__text"]/p[@class="course-summary__locations"]/a/text()'
    ).extract()
    item['location'] = ','.join(set(location))

    start_date = response.xpath(
        '//div[@class="course-summary__text"]/p[@class="course-summary__entry"]/text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    duration = response.xpath(
        '//div[@class="course-summary__teaching"]/p[1]/text()').extract()
    try:
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']
    except Exception:
        # Best effort: leave the duration fields untouched when parsing fails.
        pass

    item['overview_en'] = remove_class(
        response.xpath('//div[@id="overview"]').extract())
    item['career_en'] = remove_class(
        response.xpath('//div[@id="careers"]').extract())

    modules = remove_class(
        response.xpath('//div[@id="modulesassessment"]').extract())
    # remove_class is applied twice, exactly as in the original.
    item['modules_en'] = remove_class(modules)

    item['ielts'] = '6.5'
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    item['ielts_desc'] = 'Our standard entry criteria for postgraduate courses is IELTS 6.5 or equivalent, with nothing lower than 5.5 in any of the four elements (listening, speaking, reading and writing).'
    item['toefl'] = '88'
    item['toefl_l'] = '17'
    item['toefl_s'] = '20'
    item['toefl_r'] = '18'
    item['toefl_w'] = '17'
    item['toefl_desc'] = "TOEFL iBT with 88 overall and a minimum of 17 in Writing and Listening, 18 in Reading and 20 in Speaking"

    tuition_fee = getTuition_fee(
        response.xpath('//div[@id="feesfunding"]//text()').extract())
    if tuition_fee == 2018:
        # A parsed "fee" of 2018 is treated as no fee (presumably a year
        # picked up from the page text - TODO confirm).
        tuition_fee = 0
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = '£'

    # Department name is derived from the last path segment of the
    # "Visit your ..." link.
    department = response.xpath(
        '//a[contains(text(),"Visit your")]/@href').extract()
    department = ''.join(department).split('/')[-1]
    item['department'] = department.title().replace('-', ' ')

    # Fixed: steps 4 and 5 were listed in the wrong order.
    how_to_apply = [
        "<p>Step 1 - Choose your course</p>",
        "<p>Step 2 - Submit your application form</p>",
        "<p>Step 3 - Check your email regularly</p>",
        "<p>Step 4 - Receive our decision on your application</p>",
        "<p>Step 5 - Start your visa application</p>",
    ]
    item['apply_proces_en'] = '\n'.join(how_to_apply)

    apply_d = [
        "<ul><li>Qualification certificates and transcripts, including certified translations, where applicable</li>",
        "<li>A personal statement. You can download and complete our Personal Statement Form.</li>",
        "<li>References/recommendation letters</li>",
        "<li>Curriculum vitae/resume</li>",
        "<li>Passport</li>",
        "<li>Current and previous visa(s) (if applicable)</li>",
        "<li>Proof of name change (if applicable)</li>",
        "<li>Portfolio (if applicable)</li></ul>",
    ]
    item['apply_documents_en'] = '\n'.join(apply_d)

    courseid = response.xpath('//input[@id="erastracode"]/@value').extract()
    # Fixed: a missing input yields [] (not ['']), which used to fall through
    # to courseid[0] and raise IndexError.
    if not courseid or courseid == ['']:
        rntry = response.xpath(
            '//h4[contains(text(),"ain")]/following-sibling::*').extract()
        item['rntry_requirements'] = remove_class(rntry)
    else:
        cid = re.findall('[A-Z0-9]+', courseid[0])
        rntry_url = 'https://www.anglia.ac.uk/api/coursewidget/multipleentryrequirements?academicYears=2017%2C2018&moaCode=FT&astraCode=' + '%20'.join(cid)
        try:
            # A timeout keeps a stalled API call from blocking the callback
            # indefinitely.
            payload = json.loads(requests.get(rntry_url, timeout=30).text)
            rntry_content = '<div>' + payload[0]['GroupItems'][0]['Text'][0] + '</div>'
        except Exception:
            # Best effort: network, JSON or shape errors give an empty value.
            rntry_content = ''
        item['rntry_requirements'] = rntry_content
    # yield item
def parses(self, response):
    """Build a Staffordshire University course item from a course page.

    Args:
        response: Response object for a single course page; only its
            ``url`` attribute and ``xpath()`` method are used.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Staffordshire University'
    item['url'] = response.url
    item['location'] = 'Staffordshire'

    # The original computed programme/degree twice and unconditionally
    # overwrote the first result; only the effective computation is kept.
    programme = ''.join(response.xpath(
        '//div[@class="col-sm-9"]/h1/text()|//div[@id="main"]//h1/text()'
    ).extract()).strip()
    degree = re.findall(r'[A-Z]{2}[/a-zA-Z\s]*', programme)
    programme = programme.replace(''.join(degree), '').strip()
    if not degree:
        # No degree token in the title: fall back to the hero sub-heading.
        degree = response.xpath(
            '//h2[@class="hero_header text-center"]/text()').extract()
    item['degree_name'] = ''.join(degree).strip()
    item['programme_en'] = programme

    duration = clear_duration(response.xpath(
        '//th[contains(text(),"Duration")]/following-sibling::td/text()|//dt[contains(text(),"Duration")]/following-sibling::dd[1]/text()'
    ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    start_date = response.xpath(
        '//dt[contains(text(),"Academic year:")]/following-sibling::dd/text()'
    ).extract()
    if start_date == []:
        start_date = response.xpath(
            '//th[contains(text(),"Course start")]/following-sibling::td/text()'
        ).extract()
    start_date = tracslateDate(start_date)
    item['start_date'] = ','.join(start_date).strip()

    department = response.xpath(
        '//th[contains(text(),"School")]/following-sibling::td/text()'
    ).extract()
    item['department'] = ''.join(department).strip()

    fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    overview = response.xpath(
        '//div[@id="key-features"]|'
        '//section[@class="course-details_section summary-section"]//div[@class="medium-8 medium-pull-4 large-pull-3 column"]'
    ).extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//div[@id="course-content"]|//section[@id="contents"]|//div[@id="course-summary"]'
    ).extract()
    item['modules_en'] = remove_class(modules)

    rntry = response.xpath(
        '//div[@id="course-entry-requirements"]|//section[@id="entry"]'
    ).extract()
    item['rntry_requirements'] = remove_class(rntry)

    career = response.xpath(
        '//div[@id="graduate-destinations"]|//section[@id="careers"]'
    ).extract()
    item['career_en'] = remove_class(career)

    ielts_text = ''.join(response.xpath(
        '//*[contains(text(),"IELTS")]//text()').extract()).strip()
    item['ielts_desc'] = ielts_text
    ielts = get_ielts(ielts_text)
    # Fixed: the original guard used `or`, which is always true; the other
    # callbacks in this file use `and`.
    if ielts != [] and ielts != {}:
        try:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']
        except (KeyError, TypeError):
            # Best effort: an incomplete IELTS result leaves the rest unset.
            pass

    assessment = response.xpath(
        '//a[contains(text(),"ssessment")]/../following-sibling::div[1]'
    ).extract()
    item['assessment_en'] = remove_class(assessment)
    yield item
def parses(self, response):
    """Build a Buckinghamshire New University course item from a course page.

    Args:
        response: Response object for a single course page; only its
            ``url`` attribute and ``xpath()`` method are used.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    item['url'] = response.url
    item['university'] = 'Buckinghamshire New University'

    location = response.xpath(
        '//ul[@class="course-details"]/li[contains(text(),"Location")]/text()'
    ).extract()
    location = ''.join(location).replace('Location:', '').strip()

    programme = response.xpath(
        '//h1[@class="banner-title"]/text()').extract()
    item['programme_en'] = ''.join(programme).strip()
    degree_name = response.xpath(
        '//p[@class="school-code"]/text()').extract()
    item['degree_name'] = ''.join(degree_name).strip()
    item['location'] = location

    duration = clear_duration(response.xpath(
        '//ul[@class="course-details"]/li[contains(text(),"Duration")]/text()'
    ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    start_date = tracslateDate(response.xpath(
        '//ul[@class="course-details"]/li[contains(text(),"Start Date")]/text()'
    ).extract())
    # Fixed: the translated start date was computed but never stored. It is
    # joined with ',' like the other callbacks in this file do.
    item['start_date'] = ','.join(start_date)

    overview = response.xpath(
        '//h2[contains(text(),"Course Overview")]/..').extract()
    item['overview_en'] = remove_class(overview)
    modules = response.xpath(
        '//h2[contains(text(),"Course Modules")]/..').extract()
    item['modules_en'] = remove_class(modules)
    career = response.xpath(
        '//h2[contains(text(),"Employability")]/..').extract()
    item['career_en'] = remove_class(career)

    entry = response.xpath(
        '//h3[contains(text(),"What are the course entry requirements?")]/following-sibling::p[position()<=3]'
    ).extract()
    # Diagnostic output: the URL when nothing matched, else the raw nodes.
    if entry == []:
        print(response.url)
    else:
        print(entry)
    item['rntry_requirements'] = remove_class(entry)

    item['tuition_fee'] = '11500'
    # Added for consistency with the other callbacks, which always pair a
    # tuition fee with its currency prefix.
    item['tuition_fee_pre'] = '£'

    chi = [
        ' <div> ',
        ' <p>Academic entry requirements</p ><p>We require successful completion of a 学士学位 (Bachelor degree) or successful completion of a three-year 本科毕业证书 (Benke) with an overall pass from a UK NARIC-recognised or Ministry of Education-listed institution.</p ><p>Mathematics entry requirements</p ><p>Students need the equivalent of GCSE Mathematics grade C/4.</p > ',
        ' </div> ',
    ]
    htp = [
        '<p>There’s still time to apply for September 2018. Visit our <a hre>clearing section</a> to find out more.</p><p><strong>Check you meet the entry requirements</strong></p><p>Once you’ve had a good look at our course information, and chosen which one feels right for you, before applying it’s worth checking that you meet the entry requirements for your country.</p><p>We welcome applications from students with a wide range of qualifications from around the world. You’ll find details of the exact academic and English language requirements for your country on our <a hre>country pages</a>.</p><p>Every student studying with us also needs to meet our <a hre>English language requirements</a> and we will ask you to provide evidence to show you have good enough English to study a higher education course in the UK.</p><p><strong>Different ways to apply</strong></p><p>When you are ready to apply for your course, you can do so in one of three ways:</p><ul><li>directly through our <a href="https://www.applycpd.com/bucks?tabid=21">application portal</a></li><li>through <a hre>UCAS</a>, or</li><li>through a recruitment agent in your country (see <a hre>your country page</a> for details of agents we work with who are operating locally to you).</li></ul><p>It doesn’t matter which of these routes you use, but we advise you to apply early to give yourself enough time to prepare for moving to the UK and arranging your visa, if you need one.</p><p>If you’ve missed out on your first choices, declined any offers made to you, or you’re applying to university after 30 June, you can also apply to us through <a hre>Clearing</a>.</p>',
    ]
    item['require_chinese_en'] = remove_class(chi)
    item['apply_desc_en'] = remove_class(htp)

    item['ielts'] = '6.0'
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    yield item
def parse(self, response):
    """Build a St Mary's University, Twickenham course item from a course page.

    Args:
        response: Response object for a single course page; only its
            ``url`` attribute and ``xpath()`` method are used.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = "St Mary's University, Twickenham"
    item['url'] = response.url
    item['location'] = 'London'

    rntry = response.xpath(
        '//h2[contains(text(),"Entry requirements")]/following-sibling::div'
    ).extract()
    item['rntry_requirements'] = remove_class(rntry)

    modules = response.xpath(
        '//h2[contains(text(),"Course")]/../following-sibling::div//ul'
    ).extract()
    item['modules_en'] = remove_class(modules)

    overview = response.xpath(
        '//div[@id="overview"]//div[@class="large-8 columns content"]'
    ).extract()
    item['overview_en'] = remove_class(overview)

    duration = clear_duration(response.xpath(
        '//p[contains(text(),"uration")]/preceding-sibling::p/text()'
    ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    # Two <h1> text nodes are read as programme name followed by degree.
    title_nodes = response.xpath('//h1/text()').extract()
    if len(title_nodes) == 2:
        item['programme_en'] = title_nodes[0]
        item['degree_name'] = title_nodes[1]
    else:
        item['programme_en'] = ''.join(title_nodes).strip()

    career = response.xpath('//section[@id="careers"]').extract()
    item['career_en'] = remove_class(career)

    fee = response.xpath(
        '//h2[contains(text(),"Tuition")]/following-sibling::*/text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'
    item['deadline'] = '2019-7-31'

    apply_d = [
        "<ul><li>Copies of academic transcripts and certificates</li>",
        "<li>A Copy of your English language requirements (if needed)</li>",
        "<li>A Copy of your passport</li>",
        "<li>Visa history questionnaire</li></ul>",
    ]
    item['apply_documents_en'] = '\n'.join(apply_d)

    ielts_text = ''.join(response.xpath(
        '//h4[contains(text(),"International re")]/following-sibling::p[1]/text()'
    ).extract()).strip()
    ielts = get_ielts(ielts_text)
    # Fixed: the original guard used `or`, which is always true.
    if ielts != [] and ielts != {}:
        try:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']
        except (KeyError, TypeError):
            # Best effort: an incomplete IELTS result leaves the rest unset.
            pass

    assessment = response.xpath(
        '//h2[contains(text(),"ssessment")]/following-sibling::p[position()<=5]'
    ).extract()
    # Diagnostic output kept from the original.
    if assessment == []:
        print(response.url)
    else:
        print('sssssssssssssssssssssss')
    item['assessment_en'] = remove_class(assessment)
    yield item
def parse_main(self, response):
    """Build a University of Bedfordshire course item from a course page.

    Args:
        response: Response object for a single course page; only its
            ``url`` attribute and ``xpath()`` method are used.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University of Bedfordshire'
    item['url'] = response.url

    title = ''.join(response.xpath(
        '//div[@id="inner-course-content"]/h1/text()').extract())
    item['tuition_fee_pre'] = '£'
    # Fees are hard-coded: one figure for titles containing "MBA", another
    # for everything else.
    item['tuition_fee'] = '14000' if 'MBA' in title else '12750'

    # A title of the form "<programme> - <degree>" is split on the hyphen.
    title_parts = title.split('-')
    if len(title_parts) == 2:
        prog = title_parts[0].strip()
        degr = title_parts[1].strip()
        item['degree_name'] = degr
        # startswith() replaces degr[0] inside a bare `except: pass`; an
        # empty degree now simply skips both branches.
        if degr.startswith('M'):
            item['degree_type'] = '2'
        elif degr.startswith('P'):
            item['degree_type'] = '3'
    else:
        prog = ''.join(title_parts).strip()
    item['programme_en'] = prog

    location = response.xpath(
        '//strong[contains(text(),"Campus Location")]/../text()').extract()
    item['location'] = ''.join(location).replace('-', '').strip()

    duration = clear_duration(response.xpath(
        '//strong[contains(text(),"Duration")]/../text()').extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    # teach_time is '1' when the attendance text mentions "full"
    # (case-insensitive), otherwise '2'.
    attendance = ''.join(response.xpath(
        '//strong[contains(text(),"Attendance")]/../text()').extract())
    item['teach_time'] = '1' if re.findall('(?i)full', attendance) else '2'

    start_date = tracslateDate(response.xpath(
        '//strong[contains(text(),"Start")]/../text()').extract())
    item['start_date'] = ','.join(start_date)

    item['overview_en'] = remove_class(
        response.xpath('//div[@id="why_content"]').extract())
    item['modules_en'] = remove_class(
        response.xpath('//div[@id="unit_content"]').extract())
    item['assessment_en'] = remove_class(
        response.xpath('//div[@id="how_content"]').extract())

    rntry = response.xpath(
        '//h2[@id="entry"]/following-sibling::div/ul[@class="tab-content"]/div[3]'
    ).extract()
    item['rntry_requirements'] = remove_class(rntry)

    item['ielts'] = '6.0'
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    # The overall TOEFL score is not set here (its assignment is commented
    # out in the original); only the per-skill values are stored.
    item['toefl_l'] = '17'
    item['toefl_s'] = '20'
    item['toefl_r'] = '18'
    item['toefl_w'] = '17'

    item['career_en'] = remove_class(
        response.xpath('//div[@id="career_content"]').extract())

    apply_d = [
        '<p>There are two ways you can make a direct application to the University of Bedfordshire:</p><ul><li><a href="https://evision.beds.ac.uk/urd/sits.urd/run/siw_ipp_lgn.login?process=siw_ipp_app&code1=OA_FORM&code2=0007">Apply online now for 2017/18</a> Courses starting from 1 August 2017 to 31 July 2018</li><li>Download <span class="include_asset_summary"><a href="https://www.beds.ac.uk/__data/assets/pdf_file/0006/441798/International-Application-web-2018.pdf">an application form - <img src="https://www.beds.ac.uk/__data/asset_types/pdf_file/icon.png" alt="" title="" height="16" width="16" class="sq-icon" /> PDF 1.0 MB ',
        '</a></span> and submit it to our <a href="https://www.beds.ac.uk/international/international-applications/contactus">Admissions Team</a> along with scans of your supporting documents, via email, post or in person at the International Office.</li></ul><p>You can post your completed form to:</p><p>University of Bedfordshire International Admissions/International Office/University Square/Luton/Bedfordshire/LU1 3JU/United Kingdom</p><h4>Please note</h4><ul><li><strong>BSc (Hons) Nursing Studies</strong> Level 3 and <strong>MSc Advanced Nursing Studies</strong> are available to overseas students - please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a></li><li><strong>Healthcare, Nursing and Midwifery students</strong> - many of these courses are not available to overseas students due to UK immigration law in regard to bursary funding. Please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a> to find out if you are eligible to apply.</li></ul><p>*Please note that international students studying on a Tier 4 Student Visa must choose a full-time Undergraduate or Postgraduate course and are not eligible for part-time study.</p><p>Watch some more tips and advice on making your application to Bedfordshire:</p>',
    ]
    item['apply_documents_en'] = '\n'.join(apply_d)
    yield item
def pro_parse(self, response):
    """Build a London South Bank University course item from a course page.

    Args:
        response: Response object for a single course page; only its
            ``url`` attribute and ``xpath()`` method are used.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    print(response.url)
    item['url'] = response.url
    item['university'] = 'London South Bank University'
    item['location'] = 'London'
    item['tuition_fee_pre'] = '£'

    # The last breadcrumb holds "<programme> - <degree>".
    crumbs = response.xpath('//div[@id="breadcrumbs"]//span/text()').extract()
    # Guard against pages without breadcrumbs (was an IndexError on pro[-1]).
    title = crumbs[-1] if crumbs else ''
    title_parts = title.split('-')
    if len(title_parts) == 2:
        programme = title_parts[0].strip()
        degree = title_parts[1].strip()
        item['degree_name'] = degree
        # startswith() avoids an IndexError when the degree part is empty.
        if degree.startswith('M'):
            item['degree_type'] = '2'
        elif degree.startswith('P'):
            item['degree_type'] = '3'
    else:
        # Fixed: the original stored the split *list* in programme_en here;
        # keep the breadcrumb text as a string instead.
        programme = title.strip()
    item['programme_en'] = programme

    fee = response.xpath(
        '//div[@id="tab_fees_and_funding"]//*[contains(text(),"£")]//text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(fee)

    item['overview_en'] = remove_class(
        response.xpath('//div[@id="tab_overview"]').extract())
    item['modules_en'] = remove_class(
        response.xpath('//div[@id="tab_modules"]').extract())
    item['career_en'] = remove_class(
        response.xpath('//div[@id="tab_employability"]').extract())

    rntry = remove_class(
        response.xpath('//div[@id="tab_entry_requirements"]').extract())
    item['rntry_requirements'] = rntry

    # IELTS scores are extracted from the cleaned entry-requirements text.
    ielts = get_ielts(rntry)
    if ielts != [] and ielts != {}:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']

    item['apply_desc_en'] = remove_class(
        response.xpath('//div[@id="tab_how_to_apply"]').extract())

    duration = clear_duration(response.xpath(
        '//td/span[contains(text(),"Duration")]/following-sibling::div/text()'
    ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    # teach_time is '1' when any mode text mentions "full"
    # (case-insensitive), otherwise '2'.
    mode = response.xpath(
        '//td/span[contains(text(),"Mode")]/following-sibling::div/text()'
    ).extract()
    mode_text = ''.join(set(mode))
    item['teach_time'] = '1' if re.findall('(?i)full', mode_text) else '2'

    start_date = response.xpath(
        '//td/span[contains(text(),"Start")]/following-sibling::div/text()'
    ).extract()
    try:
        months = list(set(tracslateDate(start_date)))
        # The year is hard-coded to 2019 and prefixed to each translated value.
        item['start_date'] = ','.join('2019' + '-' + m for m in months)
    except Exception:
        # Best effort: leave start_date unset when translation fails.
        pass

    item['department'] = ''.join(
        response.xpath('//a[contains(text(),"School of")]/text()').extract())
    yield item
def parses(self, response):
    """Build a University College London course item from a course page.

    Args:
        response: Response object for a single course page; only its
            ``url`` attribute and ``xpath()`` method are used.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University College London'
    item['url'] = response.url
    item['tuition_fee_pre'] = '£'

    location = response.xpath(
        '//div/strong[contains(text(),"Location")]/../text()').extract()
    item['location'] = ''.join(location).strip()

    programme = ''.join(
        response.xpath('//h1[@class="heading"]//text()').extract())
    # Degree tokens start with M or B followed by one or two capitals.
    degree_name = re.findall('[MB][A-Z]{1,2}[a-z]*', programme)
    degree_name = ''.join(set(degree_name)).strip()
    item['programme_en'] = programme.replace(degree_name, '')
    item['degree_name'] = degree_name
    item['degree_type'] = '2'

    # teach_time is 1 when any text node on the page contains "FT", else 2.
    mode = response.xpath('//*[contains(text(),"FT")]//text()').extract()
    item['teach_time'] = 1 if mode != [] else 2

    department = response.xpath(
        '//h5[contains(text(),"Department website")]/following-sibling::p/a/text()'
    ).extract()
    item['department'] = ''.join(department).strip()

    overview = response.xpath(
        '//article[@class="article"]/h1/following-sibling::article/p[1]'
    ).extract()
    item['overview_en'] = remove_class(overview)

    application_open_date = tracslateDate(response.xpath(
        '//div[contains(text(),"Open")]/text()').extract())
    item['application_open_date'] = ','.join(set(application_open_date))

    deadline = tracslateDate(response.xpath(
        '//div[contains(text(),"Close")]/text()').extract())
    item['deadline'] = ','.join(set(deadline))

    item['tuition_fee'] = getTuition_fee(
        response.xpath('//*[contains(text(),"£")]//text()').extract())

    duration = clear_duration(response.xpath(
        '//h4[contains(text(),"uration")]/following-sibling::div/text()'
    ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    start_date = tracslateDate(response.xpath(
        '//h4[contains(text(),"tarts")]/following-sibling::p//text()'
    ).extract())
    item['start_date'] = ','.join(set(start_date))

    item['apply_fee'] = '75'
    item['apply_pre'] = '£'

    # The page names an English level; each level maps to fixed IELTS and
    # TOEFL description texts.
    english_requirements = {
        'Standard': (
            'Overall grade of 6.5 with a minimum of 6.0 in each of the subtests.',
            'Overall score of 92 with 24/30 in reading and writing and 20/30 in speaking and listening.',
        ),
        'Good': (
            'Overall grade of 7.0 with a minimum of 6.5 in each of the subtests.',
            'Overall score of 100 with 24/30 in reading and writing and 20/30 in speaking and listening.',
        ),
        'Advanced': (
            'Overall grade of 7.5 with a minimum of 6.5 in each of the subtests.',
            'Overall score of 109 with 24/30 in reading and writing and 20/30 in speaking and listening.',
        ),
    }
    eng_level = ''.join(response.xpath(
        '//p[contains(text(),"English language")]/strong/text()'
    ).extract()).strip()
    ielts, toefl = english_requirements.get(eng_level, ('', ''))

    ieltss = get_ielts(ielts)
    if ieltss != {} and ieltss != []:
        item['ielts_l'] = ieltss['IELTS_L']
        item['ielts_s'] = ieltss['IELTS_S']
        item['ielts_r'] = ieltss['IELTS_R']
        item['ielts_w'] = ieltss['IELTS_W']
        item['ielts'] = ieltss['IELTS']

    toefls = re.findall(r'\d{1,3}', toefl)
    if len(toefls) == 5:
        # Numbers in order: overall, reading/writing score, "30",
        # speaking/listening score, "30".
        # Fixed: the original stored the two "/30" denominators as the
        # listening and writing scores.
        item['toefl'] = toefls[0]
        item['toefl_r'] = toefls[1]
        item['toefl_w'] = toefls[1]
        item['toefl_s'] = toefls[3]
        item['toefl_l'] = toefls[3]
    elif len(toefls) == 2:
        scores = list(map(int, toefls))
        item['toefl'] = max(scores)
        item['toefl_l'] = min(scores)
        item['toefl_w'] = min(scores)
        item['toefl_r'] = min(scores)
        item['toefl_s'] = min(scores)
    item['ielts_desc'] = ielts
    item['toefl_desc'] = toefl

    rntry_requirements = response.xpath(
        '//h4[contains(text(),"ntry")]/following-sibling::p[1]').extract()
    item['rntry_requirements'] = remove_class(rntry_requirements)

    chinese_reuqirement = [
        "<div>Equivalent qualifications for China",
        "Bachelor's degree with a minimum overall average mark of 80%. Please note that a number of programmes / departments will require higher marks.",
        "ALTERNATIVE QUALIFICATIONS",
        "Medical/ Dental/ Master's degree; Doctorate.</div>",
    ]
    item['require_chinese_en'] = '\n'.join(chinese_reuqirement)

    modules = response.xpath(
        '//h2[contains(text(),"About this")]/following-sibling::div'
    ).extract()
    item['modules_en'] = remove_class(modules)

    career = response.xpath(
        '//h2[contains(text(),"Career")]/following-sibling::div').extract()
    item['career_en'] = remove_class(career)
    yield item
def parse_main(self, response):
    """Build a City, University of London course item from a course page.

    Args:
        response: Response object for a single course page. Its ``url``,
            ``xpath()`` and ``meta`` are used; ``meta`` must carry the keys
            ``programme``, ``degree_name`` and ``department``.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.

    Raises:
        KeyError: If one of the required ``meta`` keys is missing.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = "City, University of London"
    item['url'] = response.url
    item['location'] = 'London'
    item['programme_en'] = response.meta['programme']
    item['degree_name'] = response.meta['degree_name']
    item['tuition_fee_pre'] = '£'
    item['teach_type'] = 'taught'

    # Duplicates in the department value are removed before joining.
    department = set(response.meta['department'])
    item['department'] = ' '.join(department)

    fee = response.xpath(
        '//h3[contains(text(),"Fee")]/../../following-sibling::div//text()'
    ).extract()
    tuition_fee = getTuition_fee(fee)
    if tuition_fee == 0:
        # Fallback: look at any <span> that mentions a pound amount.
        fee = response.xpath(
            '//span[contains(text(),"£")]//text()').extract()
        tuition_fee = getTuition_fee(fee)
    item['tuition_fee'] = tuition_fee

    overview = response.xpath(
        '//h2[contains(text(),"Who is it")]/following-sibling::*|'
        '//h2[contains(text(),"Overview")]/following-sibling::*').extract()
    item['overview_en'] = clear_same_s(remove_class(overview))

    modules = response.xpath(
        '//h2[contains(text(),"Structure")]/following-sibling::*|'
        '//h2[contains(text(),"Modules")]/following-sibling::*').extract()
    item['modules_en'] = clear_same_s(remove_class(modules))

    rntry_requirement = response.xpath(
        '//h3[contains(text(),"Entry")]/following-sibling::*|//div[@id="entryreq"]'
    ).extract()
    item['rntry_requirements'] = clear_same_s(remove_class(rntry_requirement))

    ielts = get_ielts(response.xpath(
        '//*[contains(text(),"IELTS")]//text()').extract())
    if ielts != {} and ielts != []:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']

    career = response.xpath(
        '//h2[contains(text(),"Career")]/following-sibling::*').extract()
    item['career_en'] = clear_same_s(remove_class(career))

    duration = response.xpath(
        '//span[contains(text(),"Duration")]/../following-sibling::div//text()|'
        '//h3[contains(text(),"Duration")]/following-sibling::*//text()'
    ).extract()
    # teach_time is '1' when the duration text mentions "full"
    # (case-insensitive), otherwise '2'.
    item['teach_time'] = '1' if re.findall('(?i)full', ''.join(duration)) else '2'
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    start_date = tracslateDate(response.xpath(
        '//h3[contains(text(),"Start date")]/following-sibling::p/text()'
    ).extract())
    item['start_date'] = ','.join(start_date)

    apply_desc_en = response.xpath(
        '//h3[contains(text(),"How to apply")]/following-sibling::*|//div[@id="howtoapply"]'
    ).extract()
    item['apply_proces_en'] = remove_class(apply_desc_en)

    require_chinese = "<p>Applicants will be considered for most postgraduate courses with a good Chinese bachelor’s degree from a recognised University.Students who don’t meet the requirements for direct entry may have the option to undertake our Graduate Diploma programme at INTO City, which then offers the opportunity for guaranteed entry into City’s Masters programmes.</p>"
    item['require_chinese_en'] = require_chinese

    assessment = response.xpath(
        '//h2[contains(text(),"Teaching and learning")]/following-sibling::*|//h3[contains(text(),"ssessment")]/following-sibling::*'
    ).extract()
    item['assessment_en'] = remove_class(assessment)

    # Fixed: the original built the item but never emitted it, unlike every
    # other callback in this file.
    yield item
def parse_main(self, response):
    """Build and yield a course item from a Middlesex University course page.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item (one per call).
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    print(response.url)
    item['university'] = 'Middlesex University'
    item['url'] = response.url
    item['location'] = 'London'

    # The banner title holds "<programme> <DEGREE ...>"; everything from the
    # first run of two or more capitals onwards is treated as the degree name.
    programme = ''.join(response.xpath(
        '//div[@class="course-page-banner__texts"]/h1/text()').extract())
    degree_name = ''.join(re.findall('[A-Z]{2,}.*', programme))
    if degree_name != programme:
        programme = programme.replace(degree_name, '')
    item['programme_en'] = programme
    item['degree_name'] = degree_name

    # startswith() is safe on an empty degree name, so no try/except needed.
    if degree_name.startswith('M'):
        item['degree_type'] = '2'
    elif degree_name.startswith('P'):
        item['degree_type'] = '3'

    start_date = response.xpath(
        '//span[contains(text(),"Start")]/../following-sibling::div//text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    duration = response.xpath(
        '//span[contains(text(),"Duration")]/../following-sibling::div//text()'
    ).extract()
    mode = re.findall('(?i)full', ''.join(duration))
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']
    item['teach_time'] = '1' if mode else '2'

    fee = response.xpath(
        '//span[contains(text(),"Fees")]/../following-sibling::div//text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    overview = response.xpath(
        '//h2[contains(text(),"Overview")]/following-sibling::*').extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//h2[contains(text(),"Course content")]/following-sibling::*').extract()
    item['modules_en'] = remove_class(modules)

    rntry = response.xpath(
        '//h2[contains(text(),"Entry requirements")]/following-sibling::*').extract()
    item['rntry_requirements'] = remove_class(rntry)

    ielts_text = ''.join(response.xpath(
        '//p[contains(text(),"IELTS")]//text()').extract())
    item['ielts_desc'] = ielts_text
    ielts = get_ielts(ielts_text)
    # The original test used "or", which is always true and leaned on a bare
    # except to hide the resulting lookup errors; "and" skips empty results.
    if ielts != [] and ielts != {}:
        try:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']
        except (KeyError, IndexError, TypeError):
            # Best effort: an unexpected result shape leaves the scores unset.
            pass

    career = response.xpath(
        '//h2[contains(text(),"Careers")]/following-sibling::*').extract()
    item['career_en'] = remove_class(career)
    yield item
def parse(self, response):
    """Fill a course item from a University of York course page.

    Each field is located with a union of XPath alternatives, each
    alternative matching a different heading or container.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Returns:
        None.  NOTE(review): the item is populated but never yielded or
        returned by the visible code -- confirm whether a trailing
        ``yield item`` was lost.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University of York'
    item['url'] = response.url
    item['location'] = 'York'
    item['tuition_fee_pre'] = '£'

    start_date = response.xpath(
        '//h4[contains(text(),"Start date")]/following-sibling::p//text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    overview = response.xpath(
        '//div[@class="o-grid__box o-grid__box--half o-grid__box--half@medium"]|'
        '//h2[contains(text(),"verview")]/following-sibling::*|'
        '//h2[contains(text(),"At a glance")]/following-sibling::*|'
        '//h2[contains(text(),"Course summary")]/following-sibling::*|'
        '//h2[contains(text(),"At a Glance")]/following-sibling::*|'
        '//div[@id="mdcolumn"]/h1/following-sibling::*[position()<5]'
    ).extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//div[@id="content_modules"]|'
        '//h2[contains(text(),"Course structure")]/following-sibling::*|'
        '//th[contains(text(),"Module")]/../../..|'
        '//h2[contains(text(),"ontent")]/following-sibling::*|'
        '//h3[contains(text(),"What does the course cover?")]/following-sibling::p[1]|'
        '//strong[contains(text(),"Course structure")]/../following-sibling::*[position()<=5]|'
        '//h2[contains(text(),"Structure and ethos")]/..|'
        '//h2[contains(text(),"Modules")]/following-sibling::*|'
        '//h2[contains(text(),"Structure and Ethos")]/following-sibling::*|'
        '//h2[contains(text(),"module")]/following-sibling::*').extract()
    item['modules_en'] = remove_class(modules)

    tuition_fee = response.xpath(
        '//div[@id="fees"]/following-sibling::div[1]//*[contains(text(),"£")]//text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(tuition_fee)

    assessment = response.xpath(
        '//h2[contains(text(),"Teaching and assessment")]/../../following-sibling::div[1]'
        '|//h2[contains(text(),"ssessment")]/following-sibling::*|'
        '//h2[contains(text(),"ssessment")]/following-sibling::*[position()<=5]|'
        '//strong[contains(text(),"Specialist training tailored to your interests and aspirations")]/../following-sibling::*|'
        '//span[contains(text(),"ssessment")]/../following-sibling::*[position()<=3]|'
        '//h3[contains(text(),"ssessment")]/following-sibling::*[position()<=3]|'
        '//strong[contains(text(),"SUMMER TERM")]/../following-sibling::*|'
        '//strong[contains(text(),"ssessment")]/../following-sibling::*[position()<=3]|'
        '//h2[contains(text(),"Teaching")]/following-sibling::*|'
        '//blockquote[@class="rightBox"]/following-sibling::*[1]|'
        '//h2[contains(text(),"Dissertation")]/following-sibling::p[1]|'
        '//p[contains(text(),"This programme aims: ")]/following-sibling::table[1]'
    ).extract()
    item['assessment_en'] = remove_class(assessment)

    entry_requirements = response.xpath(
        '//div[@id="entry"]|'
        '//h2[contains(text(),"requirement")]/following-sibling::*|'
        '//h2[contains(text(),"pplicants")]/following-sibling::*|'
        '//h3[contains(text(),"Entry Requirements")]/following-sibling::*|'
        '//h2[contains(text(),"Entry")]/following-sibling::*[position()>1]|'
        '//h3[contains(text(),"International students")]/following-sibling::*|'
        '//h3[contains(text(),"Entry requirements")]/following-sibling::*[position()<4]|'
        '//h2[contains(text(),"English Language Requirements")]/following-sibling::*[position()<3]'
    ).extract()
    item['rntry_requirements'] = remove_class(entry_requirements)

    ielts = get_ielts(response.xpath(
        '//*[contains(text(),"IELTS")]//text()').extract())
    if ielts != {} and ielts != []:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']

    toefl_text = ''.join(response.xpath(
        '//*[contains(text(),"TOEFL")]//text()').extract()).strip()
    item['toefl_desc'] = toefl_text
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    toefl_scores = re.findall(r'\d{2,3}', toefl_text)
    if len(toefl_scores) == 2:
        # With exactly two numbers, the larger is taken as the overall score
        # and the smaller as the minimum for every component.
        toefl_scores = list(map(int, toefl_scores))
        item['toefl'] = max(toefl_scores)
        item['toefl_l'] = min(toefl_scores)
        item['toefl_w'] = min(toefl_scores)
        item['toefl_r'] = min(toefl_scores)
        item['toefl_s'] = min(toefl_scores)

    career = response.xpath(
        '//div[@class="o-grid__box o-grid__box--half"]|'
        '//h2[contains(text(),"areer")]/following-sibling::*|'
        '//h2[contains(text(),"Employment relevance")]/following-sibling::*|'
        '//p[contains(text(),"employment,")]/following-sibling::ul[1]|'
        '//p[contains(text(),"This programme aims: ")]/following-sibling::ul[1]|'
        '//h3[contains(text(),"areers")]/following-sibling::ul[1]|'
        '//h2[contains(text(),"Employment outcomes")]/following-sibling::*|'
        '//h3[contains(text(),"What can it lead to?")]/following-sibling::p[1]'
    ).extract()
    item['career_en'] = remove_class(career)

    department = response.xpath(
        '//h4[contains(text(),"Department")]/following-sibling::p//text()|//div[@id="location"]/h1//text()'
    ).extract()
    item['department'] = ''.join(department)

    # Programme name: the page title with any "<word> in " prefix removed.
    title = ''.join(response.xpath(
        '//div[@id="mdcolumn"]/h1/text()|//div[@class="c-figure__content c-figure__content--left c-figure__content--half"]/h1/text()'
    ).extract())
    prefix = ''.join(re.findall('[A-Za-z]+ in ', title))
    item['programme_en'] = title.replace(prefix, '').strip()

    duration = response.xpath(
        '//h4[contains(text(),"Length")]/following-sibling::p//text()'
    ).extract()
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    major_type1 = ''.join(response.xpath(
        '//div[@class="c-figure__content c-figure__content--left c-figure__content--half"]/h1/text()|//div[@id="content-container"]//h1/text()'
    ).extract())
    item['major_type1'] = major_type1

    # Degree name: every token starting with two capitals, joined by "/".
    degree_tokens = re.findall('[A-Z]{2}[a-zA-Z]*', major_type1)
    item['degree_name'] = '/'.join(degree_tokens).strip()
def programme(self, response):
    """Build a course item from a Birkbeck, University of London course page.

    The item is only yielded when the duration text mentions "full"
    (any case); otherwise a message is printed and nothing is yielded.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item, for courses whose duration mentions "full".
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)

    # Title looks like "<name> (<degree>): <suffix>"; strip the parenthesised
    # degree and anything from the first colon onwards to get the name.
    title = ''.join(response.xpath('//h1/text()').extract())
    deg = ''.join(re.findall(r'\(.*\)', title))
    suffix = ''.join(re.findall(':.*', title))
    item['programme_en'] = title.replace(suffix, '').replace(deg, '').strip()
    item['degree_name'] = deg.replace('(', '').replace(')', '').strip()
    item['url'] = response.url

    start_date = response.xpath(
        '//dt[contains(text(),"tart date")]/following-sibling::dd[1]//text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))
    item['university'] = 'Birkbeck, University of London'
    item['location'] = ''.join(response.xpath(
        '//dt[contains(text(),"ocation")]/following-sibling::dd[1]//text()'
    ).extract())

    duration = response.xpath(
        '//dt[contains(text(),"uration")]/following-sibling::dd[1]//text()'
    ).extract()
    duration_text = ''.join(duration)
    mode = re.findall('(?i)full', duration_text)
    # Only the text leading up to "full" is handed to the duration parser.
    dura = clear_duration(re.findall(r'[a-zA-Z0-9\s]+full', duration_text))
    item['duration'] = dura['duration']
    item['duration_per'] = dura['duration_per']

    overview = response.xpath(
        '//h2[contains(text(),"Highlights")]/preceding-sibling::div[1]').extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//h2[contains(text(),"Course structure")]/following-sibling::section'
    ).extract()
    item['modules_en'] = remove_class(modules)

    entry = response.xpath(
        '//h2[contains(text(),"ntry requirements")]/following-sibling::*').extract()
    item['rntry_requirements'] = remove_class(entry)

    # Fixed HTML describing entry requirements for Chinese applicants.
    chinese = ['<h3 class="content-show">Postgraduate entry requirements</h3>',
               "<ul><li>Please <a>check your postgraduate course online</a> to see if your programme of study has an entry requirement of a UK undergraduate degree with a 2:1 or a 2:2 classification. </li><li>To study a Master's degree that requires a UK undergraduate degree with a <strong>2:2 classification</strong>, you will typically need to have one of the following:</li><ul><li>a Bachelor's degree (<i>Xueshi</i><span>) from a 211, 985 or top national university with an overall average grade of 70% </span></li><li>a Bachelor's degree from a national university with an overall average grade of 75% </li><li>a Bachelor's degree from a high-ranking private university with an overall average grade of 75% </li><li>a Master's degree with an overall average grade of 60%. </li></ul><li>To study a Master's degree that requires a UK undergraduate degree with a <strong>2:1 classification</strong>, you will typically need to have one of the following: </li><ul><li>a Bachelor's degree (<i>Xueshi</i><span>) from a 211, 985 or top national university with an overall average grade of 75% </span></li><li>a Bachelor's degree from a national university with an overall average grade of 80% </li><li>a Bachelor's degree from a high-ranking private university with an overall average grade of 80% </li><li>a Master's degree with an overall average grade of 70%. </li></ul><li>If you do not meet these criteria, you can apply for Birkbeck’s <a>International Foundation Programme</a><span>, which acts as a bridge between undergraduate and postgraduate study, preparing students to study a Master’s degree in the UK. There are progression pathways onto various courses at Birkbeck.</span></li><li>Another option is the <a>Master's Foundation programme</a>, at our partner provider OnCampus London, which is available for two- or three-term progression onto a wide range of Master’s Degrees at Birkbeck.</li><li>If your transcript is provided in GPA format and not a percentage value, <a>please contact our International Office</a> to check your equivalency. For most institutions: </li><ul><li>80% is equivalent to 4/5 or 3.3/4 </li><li>75% is equivalent to 3.5/5 or 2.7/4. </li></ul>"]
    item['require_chinese_en'] = remove_class(chinese)

    item['toefl_desc'] = 'overall score of 92, with 22 in Reading, 21 in Listening, 23 in Speaking, 24 in Writing.'
    # Component scores follow the description above; listening was previously
    # stored as '22' although the description states 21.
    item['toefl_l'] = '21'
    item['toefl_s'] = '23'
    item['toefl_r'] = '22'
    item['toefl_w'] = '24'

    ielts = response.xpath('//*[contains(text(),"IELTS")]//text()').extract()
    ies = re.findall(r'\d\.?\d?', ''.join(ielts))
    if len(ies) == 2:
        # Two numbers: larger is the overall band, smaller the per-skill band.
        ies = list(map(float, ies))
        item['ielts'] = max(ies)
        item['ielts_l'] = min(ies)
        item['ielts_s'] = min(ies)
        item['ielts_r'] = min(ies)
        item['ielts_w'] = min(ies)
    item['ielts_desc'] = '\n'.join(ielts).strip()

    assessment = response.xpath(
        '//h2[contains(text(),"Assessment")]/following-sibling::*').extract()
    item['assessment_en'] = remove_class(assessment)

    department = response.xpath('//a[contains(text(),"isit the")]/text()').extract()
    item['department'] = ''.join(department).replace('Visit the', '').strip()

    howtoapply = response.xpath(
        '//h2[contains(text(),"How to apply")]/following-sibling::*').extract()
    item['apply_proces_en'] = remove_class(howtoapply)

    if mode != []:
        print('这个专业要')
        yield item
    else:
        print('这个专业只有兼职,不要!!!')
def parse(self, response):
    """Build and yield a course item from a University for the Creative Arts page.

    IELTS scores, tuition fee and deadline are fixed values; the rest is
    read from the page.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item (one per call).
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University for the Creative Arts'
    item['url'] = response.url
    item['programme_en'] = ''.join(response.xpath('//h1/text()').extract())

    # The paragraph under the title is split on "-": three parts means the
    # first is the degree name, four parts is recorded as a pre-degree course.
    degr = ''.join(response.xpath(
        '//h1/following-sibling::p[1]/text()').extract()).split('-')
    if len(degr) == 3:
        degree_name = degr[0]
        item['degree_name'] = degree_name
        # startswith() is safe on an empty string, so no try/except needed.
        if degree_name.startswith('M'):
            item['degree_type'] = '2'
        elif degree_name.startswith('P'):
            item['degree_type'] = '3'
    elif len(degr) == 4:
        item['degree_name'] = 'Pre-degree'
        item['degree_type'] = '2'

    duration = response.xpath(
        '//p[contains(text(),"Length of study")]/following-sibling::p/text()'
    ).extract()
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    location = response.xpath(
        '//p[contains(text(),"Campus")]/following-sibling::p/text()'
    ).extract()
    item['location'] = ''.join(location)

    start_date = response.xpath(
        '//p[contains(text(),"Start month")]/following-sibling::p/text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    overview = response.xpath('//div[@class="cell overview"]').extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//div[@id="syllabus"]/following-sibling::section[@class="article-content-area"][1]'
    ).extract()
    item['modules_en'] = remove_class(modules)

    career = response.xpath(
        '//div[contains(text(),"Career")]/following-sibling::div').extract()
    item['career_en'] = remove_class(career)

    # Fixed IELTS values applied to every course.
    item['ielts'] = '6'
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'

    rntry = response.xpath(
        '//h3[contains(text(),"UK entry requirements")]/following-sibling::*'
    ).extract()
    item['rntry_requirements'] = remove_class(rntry)

    portfolio = response.xpath(
        '//h3[contains(text(),"Your portfolio")]/following-sibling::*'
    ).extract()
    item['portfolio_desc_en'] = remove_class(portfolio)

    item['tuition_fee'] = '13540'
    item['tuition_fee_pre'] = '£'
    item['deadline'] = '2019-3'
    yield item
def parse_main(self, response):
    """Build course items from a University of Leicester course page.

    A page can list several course/qualification/duration/start-date rows;
    one item state is yielded per row, skipping PGDip, PGCert and PGCE.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The item, updated in place for each qualifying row.
        NOTE(review): the same item object is re-used and yielded on every
        iteration -- confirm downstream consumers do not hold on to it.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University of Leicester'
    item['url'] = response.url
    item['tuition_fee_pre'] = '£'

    department = response.xpath(
        '//dt[contains(text(),"Department")]/following-sibling::dd/text()'
    ).extract()
    item['department'] = ''.join(department).strip()

    overview = response.xpath(
        '//h2[contains(text(),"Course description")]/following-sibling::*'
    ).extract()
    item['overview_en'] = remove_class(overview)

    # Fixed text describing requirements for Chinese applicants.
    chinese_require = [
        "<p>",
        "If you have completed a four-year Bachelors degree in China, you can be considered for entry to a Masters degree at Leicester. Our requirements depend on the rank of the university from which you graduated and your chosen Masters degree. The following is intended as a guide to our requirements:</p>",
        "<p>If you have graduated from a 'top 200' university in China, you may be asked for 70% overall if you are applying for an Engineering or Science degree, or 75% for an Arts, Humanities, Law or Social Science degree. You may need to have scores of at least 80% in modules that are particularly relevant to your chosen Master’s degree. The School of Museum Studies requires at least 80% overall.</p>",
        "<p>If you graduated from a Chinese university ranked below the top 200 you may require higher scores (80-85%).</p>",
        "<p>If you have completed a three-year college diploma from a Chinese university, you will need to take an accepted one-year Pre-Masters course or upgrade your diploma to a Bachelor’s degree before applying for a Master’s degree.</p>",
    ]
    item['require_chinese_en'] = remove_class(chinese_require)

    rntry = response.xpath(
        '//h2[contains(text(),"Entry requirements")]/following-sibling::*'
    ).extract()
    # Drop the boilerplate that links to the country-specific pages.
    rntry = remove_class(rntry).replace(
        'International Qualifications', ''
    ).replace('Countries list', '').replace(
        'Find your country in this list to check equivalent qualifications, scholarships and additional requirements.',
        '')
    item['rntry_requirements'] = rntry

    fee = response.xpath(
        '//h3[contains(text(),"International Students")]/following-sibling::*//text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(fee)

    career = response.xpath('//div[@id="careers"]').extract()
    item['career_en'] = remove_class(career)

    modules = response.xpath('//div[@id="course-structure"]').extract()
    item['modules_en'] = remove_class(modules)

    assessment = response.xpath(
        '//h2[contains(text(),"Teaching and learning")]/following-sibling::div'
    ).extract()
    item['assessment_en'] = remove_class(assessment)

    ielts = get_ielts(response.xpath(
        '//*[contains(text(),"IELTS")]//text()').extract())
    # Guard against both empty shapes, as the other callbacks in this file do.
    if ielts != {} and ielts != []:
        item['ielts'] = ielts['IELTS']
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']

    # Derive a TOEFL total from the IELTS overall band.
    if item['ielts'] == 6.0:
        item['toefl'] = 80
    elif item['ielts'] == 6.5:
        item['toefl'] = 90
    elif item['ielts'] == 7.0:
        item['toefl'] = 100
    if item['toefl'] is not None:
        item['toefl_l'] = '17'
        item['toefl_s'] = '20'
        item['toefl_r'] = '18'
        item['toefl_w'] = '17'

    programme = response.xpath(
        '//span[contains(text(),"Course")]/following-sibling::span/text()'
    ).extract()
    degree_name = response.xpath(
        '//span[contains(text(),"Qualification")]/following-sibling::span/text()'
    ).extract()
    duration = response.xpath(
        '//span[contains(text(),"Duration")]/following-sibling::span/text()'
    ).extract()
    start_date = response.xpath(
        '//span[contains(text(),"Start Dates")]/following-sibling::span/text()'
    ).extract()
    if start_date == []:
        # Placeholders so zip() below is not cut to zero rows when the page
        # lists no start dates.
        start_date = ['', '', '', '']

    skipped_degrees = ('PGDip', 'PGCert', 'PGCE')
    for pro, deg, dur, sta in zip(programme, degree_name, duration, start_date):
        item['programme_en'] = pro
        item['degree_name'] = deg
        dura = clear_duration(dur)
        item['duration'] = dura['duration']
        item['duration_per'] = dura['duration_per']
        item['start_date'] = ','.join(tracslateDate(sta))
        item['teach_time'] = 'fulltime' if re.findall('(?i)full', dur) else 'parttime'
        if deg not in skipped_degrees:
            yield item
def parse_main(self, response):
    """Build and yield a course item from an Oxford Brookes University page.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item (one per call).
    """
    print('进入一个详情页')
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Oxford Brookes University'
    item['url'] = response.url
    # NOTE(review): location is hard-coded to 'London' for Oxford Brookes --
    # confirm this is intended.
    item['location'] = 'London'

    programme = response.xpath('//h1/text()').extract()
    item['programme_en'] = ''.join(programme).strip()

    degree_name = response.xpath(
        '//h1/following-sibling::h2/text()').extract()
    item['degree_name'] = ''.join(degree_name).strip()

    department = response.xpath(
        '//h1/following-sibling::h2/following-sibling::p/a/text()'
    ).extract()
    item['department'] = ''.join(department).strip()

    start_date = response.xpath(
        '//h3[contains(text(),"Available")]/following-sibling::p[1]/text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    duration = response.xpath(
        '//h3[contains(text(),"Course length")]/following-sibling::ul//text()'
    ).extract()
    if re.findall('(?i)full', ''.join(duration)):
        item['teach_time'] = 'fulltime'
    else:
        item['teach_time'] = 'parttime'
    try:
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']
    except Exception:
        # Best effort: an unparseable duration leaves the fields unset, but
        # interpreter-exit signals are no longer swallowed.
        pass

    overview = response.xpath(
        '//h1/following-sibling::h2/following-sibling::p/following-sibling::*'
    ).extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath('//div[@id="section-two"]').extract()
    item['modules_en'] = remove_class(modules)

    fee = response.xpath('//p[contains(text(),"£")]/text()').extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    rntry = response.xpath('//div[@id="section-four"]').extract()
    item['rntry_requirements'] = remove_class(rntry)

    career = response.xpath('//div[@id="section-five"]').extract()
    item['career_en'] = remove_class(career)

    ielts_text = ''.join(response.xpath(
        '//*[contains(text(),"IELTS")]/text()').extract())
    scores = re.findall(r'\d\.\d', ielts_text)
    count = len(scores)
    if count == 2:
        # Two bands: larger is overall, smaller applies to every skill.
        # Only this branch converts to float; the others keep the strings.
        scores = list(map(float, scores))
        overall = max(scores)
        listening = speaking = reading = writing = min(scores)
    elif count == 3:
        # Three bands: overall, then reading/writing, then listening/speaking.
        overall = scores[0]
        listening = speaking = scores[2]
        reading = writing = scores[1]
    elif count == 1:
        overall = listening = speaking = reading = writing = scores[0]
    elif count > 3:
        overall = max(scores)
        listening = speaking = reading = writing = min(scores)
    if count:
        item['ielts'] = overall
        item['ielts_l'] = listening
        item['ielts_s'] = speaking
        item['ielts_r'] = reading
        item['ielts_w'] = writing
    yield item
def parse_main(self, response):
    """Build a course item from a University of Glasgow programme page.

    Teach time, location, start date and deadline are fixed values.  The
    item is only yielded when a programme title was found.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item, when the programme title is non-empty.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    print(response.url)
    item['teach_time'] = 'fulltime'
    item['university'] = 'University of Glasgow'
    item['url'] = response.url
    item['location'] = 'Glasgow'
    item['start_date'] = '2018-9'
    item['deadline'] = '2018-7'
    item["tuition_fee_pre"] = "£"
    item['teach_type'] = 'taught'

    programme = ''.join(response.xpath(
        '//div[@id="prog-title"]/h1/text()').extract())
    item['programme_en'] = programme

    degree_type = response.xpath(
        '//div[@id="prog-title"]/h1/span/text()').extract()
    item['degree_name'] = ''.join(degree_type)

    duration = response.xpath(
        '//li[contains(text(),"full-time")]/text()').extract()
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    overview = response.xpath(
        '//h2[contains(text(),"Why this programme")]/following-sibling::*'
    ).extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//h2[contains(text(),"Programme str")]/following-sibling::*'
    ).extract()
    item['modules_en'] = remove_class(clear_same_s(modules))

    career = response.xpath(
        '//h2[contains(text(),"Career")]/following-sibling::*').extract()
    item['career_en'] = remove_class(clear_same_s(career))

    # Only the #fees container is used; an earlier heading-based lookup was
    # computed and immediately discarded, so it has been dropped.
    fees = response.xpath('//div[@id="fees"]//text()').extract()
    tuition_fee = getTuition_fee(fees)
    if tuition_fee == 2018:
        # A parsed value of 2018 is replaced with '0' rather than stored.
        tuition_fee = '0'
    item['tuition_fee'] = tuition_fee

    ielts = get_ielts(response.xpath(
        '//*[contains(text(),"IELTS")]/../following-sibling::ul[1]//text()'
    ).extract())
    if ielts != {} and ielts != []:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']

    toefl = get_toefl(response.xpath(
        '//*[contains(text(),"TOEFL")]/..//text()').extract())
    if toefl != []:
        try:
            item['toefl_r'] = toefl[1]
            item['toefl_l'] = toefl[2]
            item['toefl_s'] = toefl[3]
            item['toefl_w'] = toefl[4]
            item['toefl'] = toefl[0]
        except (IndexError, KeyError, TypeError):
            # Best effort: a short or unexpected result keeps whatever was
            # assigned before the failing lookup.
            pass

    entry = response.xpath(
        '//h2[contains(text(),"Entry requirements")]/following-sibling::*'
    ).extract()
    item['rntry_requirements'] = remove_class(clear_same_s(entry))

    apply_d = response.xpath(
        '//h3[contains(text(),"Documents")]/following-sibling::ul[1]'
    ).extract()
    item['apply_proces_en'] = remove_class(clear_same_s(apply_d))

    if programme != '':
        yield item
def parse_career(self, response):
    """Fill a course item from a University of Liverpool career-prospects page.

    Fields gathered on earlier pages arrive through ``response.meta``; this
    page contributes the career text, department, programme/degree name
    (taken from the URL) and the full-time duration.

    Args:
        response: Page response exposing ``url``, ``xpath()`` and a ``meta``
            mapping with ``overview``, ``modules``, ``ielts``, ``toefl``,
            ``rntry_requirements``, ``tuition_fee`` and ``apply``.

    Returns:
        None.  NOTE(review): the item is populated but never yielded or
        returned by the visible code -- confirm whether a trailing
        ``yield item`` was lost.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    meta = response.meta
    item['overview_en'] = meta['overview']
    item['modules_en'] = meta['modules']
    ielts = meta['ielts']
    toefls = meta['toefl']
    item['rntry_requirements'] = meta['rntry_requirements']
    item['tuition_fee'] = meta['tuition_fee']
    item['tuition_fee_pre'] = '£'
    item['apply_documents_en'] = meta['apply']

    career = response.xpath('//section[@class="content"]').extract()
    item['career_en'] = remove_class(career)

    # Department is the last breadcrumb link; when there is no breadcrumb the
    # field keeps whatever get_item1() initialised it to.
    crumbs = response.xpath(
        '//nav[@id="breadcrumb"]/ul/li/a/text()').extract()
    if crumbs:
        item['department'] = crumbs[-1]

    item['university'] = 'University of Liverpool'
    item['url'] = response.url.replace('career-prospects', 'overview')
    item['location'] = 'Liverpool'

    # Programme and degree come from the URL slug, e.g. "some-course-msc".
    programme = response.url.split('/')[-3].replace('-', ' ').title()
    degree_name = ' '.join(
        re.findall(r'\sM[sarbm][a-z]{0,2}', programme)).strip()
    item['programme_en'] = programme.replace(degree_name, '').strip()
    # The pattern also matches the start of words such as "Management";
    # strip that fragment from the stored degree name.
    item['degree_name'] = degree_name.replace('Mana', '')

    item['toefl_desc'] = ''.join(toefls)
    item['ielts_desc'] = ''.join(ielts)
    ielts = get_ielts(ielts)
    if ielts != {} and ielts != []:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']

    toefl = re.findall(r'\d{1,3}', ''.join(toefls))
    if len(toefl) == 4:
        item['toefl'] = toefl[0]
        item['toefl_l'] = toefl[1]
        item['toefl_w'] = toefl[1]
        item['toefl_r'] = toefl[2]
        item['toefl_s'] = toefl[3]
    elif len(toefl) == 2:
        # Two numbers: larger is the total, smaller the per-skill minimum.
        toefl = list(map(int, toefl))
        item['toefl'] = max(toefl)
        item['toefl_l'] = min(toefl)
        item['toefl_w'] = min(toefl)
        item['toefl_r'] = min(toefl)
        item['toefl_s'] = min(toefl)

    duration = response.xpath(
        '//li[contains(text(),"duration")]/span/text()').extract()
    # Keep only full-time entries.  Build a new list: deleting from the list
    # while iterating over it skipped the element after each removal.
    duration = [entry for entry in duration if 'Full' in entry]
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']
def parse(self, response):
    """Scrape a Leeds Trinity University course page into an item.

    Args:
        response: Scrapy response for a single course page.

    Yields:
        The populated item.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['location'] = 'Leeds'
    item['university'] = 'Leeds Trinity University'
    item['url'] = response.url

    programme = response.xpath(
        '//h1[@class="course-title"]/text()').extract()
    degree_name = response.xpath(
        '//h2[@class="course-title"]/text()').extract()
    item['degree_type'] = '2'
    item['programme_en'] = ''.join(programme).strip()
    item['degree_name'] = ''.join(degree_name).strip()

    overview = response.xpath(
        '//h2/a[contains(text(),"Overview")]/../following-sibling::*'
    ).extract()
    item['overview_en'] = remove_class(overview)

    duration = response.xpath(
        '//div[contains(text(),"Course type")]/span/text()').extract()
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    modules = response.xpath(
        '//div[contains(@class,"structure")]').extract()
    item['modules_en'] = remove_class(modules)

    fee = response.xpath(
        '//div[contains(@class,"fees")]//text()').extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    entry = response.xpath('//div[contains(@class,"entry")]').extract()
    # IELTS scores are parsed from the raw entry-requirement markup before it
    # is cleaned for storage.
    ielts = get_ielts(entry)
    item['rntry_requirements'] = remove_class(entry)

    # The previous check (`ielts != [] or ielts != {}`) was always true and
    # relied on a bare except to hide the resulting errors. Copy only the
    # scores that are actually present instead.
    # NOTE(review): assumes get_ielts returns a dict on success - confirm.
    if isinstance(ielts, dict):
        for field, key in (
            ('ielts_l', 'IELTS_L'),
            ('ielts_s', 'IELTS_S'),
            ('ielts_r', 'IELTS_R'),
            ('ielts_w', 'IELTS_W'),
            ('ielts', 'IELTS'),
        ):
            if key in ielts:
                item[field] = ielts[key]

    career = response.xpath('//div[contains(@class,"graduate")]').extract()
    item['career_en'] = remove_class(career)

    # Static application-process text.
    # NOTE(review): "[email protected]" looks like an obfuscated e-mail
    # address copied from the site; left untouched because the real address
    # is not visible here.
    apply_steps = [
        "Choose a course and check its entry requirements using our course finder. You can find out more about us and your chosen course by coming to an Open Day.",
        "Apply for your chosen course by downloading the relevant application form below. Complete the application form and return it, along with your references (if they’re required) to the Admissions team at [email protected] or by post to: Admissions Team, Leeds Trinity University, Horsforth, Leeds, LS18 5HD",
        "The Admissions team will acknowledge receipt of your application by email, process your application and forward it to the relevant Programme Leader within three days of receipt.",
        "The Programme Leader will review your application and either make a decision based on your application or invite you to attend an Interview Day at Leeds Trinity University. Those selected for an interview will be contacted with the details of the interview within ten days of your application being processed.",
        "The Admissions team will notify you of your interview outcome in writing within five working days of receiving a decision from the Programme Leader.",
        "Made an offer? You should reply to accept or decline your offer at [email protected]. If you accept, you’ll need to prove that you satisfy the conditions outlined in your offer letter, usually by presenting the relevant supporting documentation in person to Leeds Trinity University, Student Administration Office (AM36).",
    ]
    item['apply_proces_en'] = (
        '<ul><li>' + '</li><li>'.join(apply_steps) + '</li></ul>')
    yield item
def parse_main(self, response):
    """Scrape a Norwich University of the Arts course page into an item.

    Args:
        response: Scrapy response for a single course page.

    Yields:
        The populated item.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Norwich University of the Arts'
    item['url'] = response.url
    item['location'] = 'Norfolk'

    programme = response.xpath(
        '//span[contains(text(),"Course")]/../../following-sibling::span/span/text()'
    ).extract()
    # Drop duplicate fragments but keep document order; a plain set() made the
    # joined name depend on hash ordering.
    programme = ''.join(dict.fromkeys(programme)).strip()

    # Runs of two or more capitals (e.g. "MA") are taken as the degree name
    # and removed from the programme title.
    degree_name = ''.join(re.findall('[A-Z]{2,}', programme)).strip()
    programme = programme.replace(degree_name, '').strip()
    item['programme_en'] = programme
    item['degree_name'] = degree_name

    # degree_type is left untouched when no degree abbreviation was found.
    if degree_name.startswith('M'):
        item['degree_type'] = '2'
    elif degree_name.startswith('P'):
        item['degree_type'] = '3'

    duration = response.xpath(
        '//strong[contains(text(),"Course length")]/../text()').extract()
    is_full_time = bool(re.findall('(?i)full', ''.join(duration)))
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']
    item['teach_time'] = '1' if is_full_time else '2'

    overview = response.xpath(
        '//strong[contains(text(),"Course length")]/../../following-sibling::*'
    ).extract()
    item['overview_en'] = remove_class(overview)

    career = response.xpath(
        '//h3[contains(text(),"career")]/following-sibling::ul').extract()
    item['career_en'] = remove_class(career)

    # The English requirement is hard-coded rather than scraped.
    item['ielts_desc'] = "BA and MA applicants are required to have a minimum UKVI approved IELTS exam score of 6.0 overall, with a minimum of 5.5 in each section"
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    item['ielts'] = '6.0'

    entry = response.xpath('//div[@id="entry-requirements"]').extract()
    item['rntry_requirements'] = remove_class(entry)

    fee = response.xpath('//div[@id="fees-funding"]//text()').extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    # NOTE(review): the previous code first stored the
    # '//div[@id="portfolio-guidance"]' markup in apply_proces_en and then
    # overwrote it with the how-to-apply markup below, so the portfolio text
    # never reached the output. The dead store was removed; if the item has a
    # dedicated portfolio field, the guidance should be stored there - confirm.
    how_to_apply = response.xpath('//div[@id="how-to-apply"]').extract()
    item['apply_proces_en'] = remove_class(how_to_apply)
    yield item
def parse_main(self, response):
    """Scrape a Leeds Beckett University course page into an item.

    Args:
        response: Scrapy response for a single course page.

    Yields:
        The populated item.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Leeds Beckett University'
    item['url'] = response.url
    # The location is fixed; the value scraped from the page was never used.
    item['location'] = 'Leeds'

    degree_name = response.xpath(
        '//div[@class="course-hero__label"]/text()').extract()
    item['degree_name'] = ''.join(degree_name).strip()

    programme = response.xpath(
        '//h1[@class="course-hero__title"]/text()').extract()
    item['programme_en'] = ''.join(programme).strip()

    department = response.xpath(
        '//div[@class="course-hero__labels"]/a/text()').extract()
    item['department'] = ''.join(department)

    attendance = response.xpath(
        '//div[contains(text(),"Attendance")]/following-sibling::div//text()'
    ).extract()
    is_full_time = bool(re.findall('(?i)full', ''.join(attendance)))
    item['teach_time'] = '1' if is_full_time else '2'

    start_date = response.xpath(
        '//div[contains(text(),"Start Date")]/following-sibling::div//text()'
    ).extract()
    start_date = tracslateDate(start_date)
    # De-duplicate while keeping page order; set() produced an arbitrary order.
    item['start_date'] = ','.join(dict.fromkeys(start_date))

    duration = response.xpath(
        '//div[contains(text(),"Duration")]/following-sibling::span//text()'
    ).extract()
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    overview = response.xpath(
        '//h2[contains(text(),"Overview")]/../following-sibling::div'
    ).extract()
    item['overview_en'] = remove_class(overview)

    entry = response.xpath(
        '//h2[contains(text(),"Entry Requirements")]/../following-sibling::div'
    ).extract()
    item['rntry_requirements'] = remove_class(entry)

    ielts_text = response.xpath(
        '//div[@class="entry-ielts"]/text()').extract()
    ielts = get_ielts(ielts_text)
    # The previous check (`ielts!=[] or ielts!={}`) was always true and relied
    # on a bare except; copy only the scores that are actually present.
    # NOTE(review): assumes get_ielts returns a dict on success - confirm.
    if isinstance(ielts, dict):
        for field, key in (
            ('ielts_l', 'IELTS_L'),
            ('ielts_s', 'IELTS_S'),
            ('ielts_r', 'IELTS_R'),
            ('ielts_w', 'IELTS_W'),
            ('ielts', 'IELTS'),
        ):
            if key in ielts:
                item[field] = ielts[key]

    career = response.xpath(
        '//h3[contains(text(),"Careers")]/following-sibling::div').extract()
    item['career_en'] = remove_class(career)

    modules = response.xpath(
        '//div[@class="course-modules__table-modules"]//div[@class="course-modules__dropdowns"]'
    ).extract()
    item['modules_en'] = remove_class(modules)

    fee_text = ''.join(
        response.xpath('//div[contains(text(),"£")]/text()').extract()).strip()
    # Accept both "£14,000" and "£14000". The old pattern (£\d{3,}) stripped
    # commas only after matching, so comma-grouped amounts never matched.
    amounts = [
        int(amount.replace(',', ''))
        for amount in re.findall(r'£(\d{1,3}(?:,\d{3})+|\d{3,})', fee_text)
    ]
    if amounts:
        # Several fees may be listed; the highest one is stored.
        item['tuition_fee'] = max(amounts)
    item['tuition_fee_pre'] = '£'

    # Static application-document and application-process text.
    apply_documents = [
        "Academic Certificates.",
        "Evidence of your English language ability (see below).",
        "A photocopy of your passport.",
        "A reference to support your application – either academic or professional.",
        "A completed Agent Consent Form (required if you are applying via or with the help of an agent).",
    ]
    item['apply_documents_en'] = '\n'.join(apply_documents)

    apply_process = [
        "Applying for a postgraduate course",
        "Once you have found the course you want to study in our online prospectus you will then click on the ‘Apply Now’ button located at the top of the online course page. ",
        "You will be asked to create an account on our application portal and complete your application via your Leeds Beckett account. Once you have submitted your application you should receive a decision within six weeks of applying. The exception to this is if the course you have applied for has a closing date specified. In this case, we will wait until the closing date has passed before we contact you",
    ]
    item['apply_proces_en'] = '\n'.join(apply_process)
    yield item
def parses(self, response):
    """Scrape a Durham University course page into an item.

    Args:
        response: Scrapy response for a single course page.

    Yields:
        The populated item.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Durham University'
    item['url'] = response.url
    item['location'] = 'Durham'
    item['tuition_fee_pre'] = '£'

    programme = response.xpath(
        '//div[@id="course"]/div[@class="row-fluid titlebar"]/h1/span[@class="span7 title"]/text()'
    ).extract()
    item['programme_en'] = ''.join(programme).strip()

    degree_type = response.xpath(
        '//div[@id="course"]/div[@class="row-fluid titlebar"]/h1//span[@class="type"]/text()'
    ).extract()
    item['degree_name'] = ''.join(degree_type).strip()

    duration = response.xpath(
        '//th[contains(text(),"Duration")]/following-sibling::td//text()'
    ).extract()
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    mode = response.xpath(
        '//th[contains(text(),"Mode")]/following-sibling::td//text()'
    ).extract()
    # Decide from the text of the Mode cell; previously any non-empty cell
    # (including a part-time one) was recorded as full-time.
    item['teach_time'] = 1 if 'full' in ''.join(mode).lower() else 2

    tuition = response.xpath(
        '//th[contains(text(),"nternational")]/following-sibling::td/text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(tuition)

    department = response.xpath(
        '//div[@id="department"]/h3[1]/text()').extract()
    item['department'] = ' '.join(department)

    # Everything in the course content before the "Structure" heading is the
    # overview; everything after that heading is the module list.
    course_content = response.xpath('//div[@id="coursecontent"]//*').extract()
    structure_heading = response.xpath(
        '//div[@id="coursecontent"]/h2[contains(text(),"Structure")]/self::*'
    ).extract()
    if structure_heading:
        overview = course_content[:course_content.index(structure_heading[0])]
    else:
        overview = course_content
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//div[@id="coursecontent"]/h2[contains(text(),"Structure")]/following-sibling::*'
    ).extract()
    item['modules_en'] = remove_class(modules)

    # Language requirements are hard-coded rather than scraped.
    item['ielts'] = '6.5'
    item['ielts_l'] = '6.0'
    item['ielts_s'] = '6.0'
    item['ielts_r'] = '6.0'
    item['ielts_w'] = '6.0'
    item['toefl'] = '92'
    # Previously toefl_l was assigned four times, so the speaking, reading and
    # writing minimums were never set.
    item['toefl_l'] = '23'
    item['toefl_s'] = '23'
    item['toefl_r'] = '23'
    item['toefl_w'] = '23'
    item['ielts_desc'] = '6.5 (no component under 6.0)'
    item['toefl_desc'] = 'TOEFL iBT (internet based test): 92 (no component under 23)'

    assessment = response.xpath('//div[@id="learning"]').extract()
    item['assessment_en'] = remove_class(assessment)

    entry = response.xpath('//div[@id="admissions"]').extract()
    item['rntry_requirements'] = remove_class(entry)

    start_date = response.xpath(
        '//th[contains(text(),"tart Date")]/following-sibling::td/text()'
    ).extract()
    start_date = ''.join(start_date)
    # Any listed start date is normalised to the fixed October 2019 intake.
    if start_date != '':
        start_date = '2019-10'
    item['start_date'] = start_date

    # Static application text; each list is joined into <p>...</p> paragraphs.
    apply_process = [
        "<p>Apply Online",
        "Stage One: Check entry requirements",
        "Stage Two: Complete the application form",
        "Stage Three: We process your application",
        "Stage Four: We communicate a decision",
        "Stage Five: Next steps</p>",
    ]
    item['apply_proces_en'] = '</p><p>'.join(apply_process)

    apply_documents = [
        "<p>Personal details",
        "Your education and qualifications already achieved and details of any qualifications that you are currently studying for, if applicable",
        "The names and addresses of two academic referees",
        "A Personal Statement",
        "Supporting documents (for example, degree certificates / transcripts, English Language evidence if you are not a native English speaker, CV, samples of academic work).</p>",
    ]
    item['apply_documents_en'] = '</p><p>'.join(apply_documents)

    # The commas between these literals were missing, so they were silently
    # concatenated into one string and the paragraph join had no effect.
    apply_desc = [
        "<p>The standard minimum entry requirement to study a postgraduate programme at Durham University is normally achievement of an upper second class UK honours degree (2:1) or equivalent qualification and two satisfactory academic references. Full details of qualification equivalencies by country can be found here. For applicants who are not Native English speakers, English language evidence may also be required.",
        "However, some Academic Departments and programmes have different or additional entry requirements. Therefore, before you apply, it is important to check the appropriate course listing in the courses database or departmental web page to ensure that you meet or are able to meet before the programme commencement date:",
        "• The Academic Department and specific programme’s entry requirements and, if applicable, any English language requirements",
        "• The financial requirements of the programme you are interested in (including deposit payment, tuition fees and any other associated costs).</p>",
    ]
    item['apply_desc_en'] = '</p><p>'.join(apply_desc)

    career = response.xpath('//div[@id="opportunities"]').extract()
    item['career_en'] = remove_class(career)
    yield item
def parses(self, response):
    """Scrape a Manchester Metropolitan University course page into an item.

    A 404 response is not parsed: its URL is appended to ``errorurl.txt`` and
    nothing is yielded.

    Args:
        response: Scrapy response for a single course page.

    Yields:
        The populated item (only for non-404 responses).
    """
    print('开始下载', response.url, '的数据')

    # Handle missing pages before touching the markup. Previously this check
    # ran after all parsing, so an exception raised while parsing an error
    # page would prevent the URL from being recorded.
    if response.status == 404:
        print("****404****")
        with open("errorurl.txt", 'a+') as f:
            f.write(response.url + "\n")
        return

    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Manchester Metropolitan University'
    item['url'] = response.url
    item['location'] = 'Manchester'

    degree_name = response.xpath('//h1/span/text()').extract()
    item['degree_name'] = ''.join(degree_name)

    programme = response.xpath('//h1/text()').extract()
    item['programme_en'] = ''.join(programme).strip()
    item['degree_type'] = 2

    overview = response.xpath(
        '//h2[contains(text(),"Overview")]/following-sibling::article'
    ).extract()
    item['overview_en'] = remove_class(overview)

    career = response.xpath(
        '//h2[contains(text(),"Career")]/following-sibling::p').extract()
    item['career_en'] = remove_class(career)

    entry = response.xpath(
        '//h2[contains(text(),"Entry")]/following-sibling::p').extract()
    item['rntry_requirements'] = remove_class(entry)

    modules = response.xpath(
        '//h2[contains(text(),"Course")]/following-sibling::div').extract()
    item['modules_en'] = remove_class(modules)

    fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    # Language requirements are hard-coded rather than scraped.
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    item['ielts'] = '6.5'
    item['ielts_desc'] = 'For Postgraduate courses, we usually ask for IELTS 6.5 (No less than 5.5 in any section) or equivalent.'
    item['toefl_desc'] = 'Overall score: 89 With no individual test score below: Listening: 17 Reading: 18 Speaking: 20 Writing : 17'
    item['toefl'] = '89'
    item['toefl_l'] = '17'
    item['toefl_s'] = '20'
    item['toefl_r'] = '18'
    item['toefl_w'] = '17'

    length = response.xpath(
        '//li[contains(text(),"Length")]/span//text()').extract()
    duration = clear_duration(length)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    start_date = response.xpath(
        '//li[contains(text(),"Start")]/span//text()').extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    department = response.xpath(
        '//span[@id="department_name"]/text()').extract()
    item['department'] = ''.join(department).strip()

    yield item
def parse(self, response): item = get_item1(ScrapyschoolEnglandItem1) print(response.url) item['location'] = 'Newcastle' item['university'] = 'Northumbria University' item['url'] = response.url programme = response.xpath( '//div[@class="col-sm-6"]/h1/text()|//div[@class="hero-content"]/h1/text()|//header[@class="course-heading"]/h1/text()' ).extract() programme = ''.join(programme).strip() degree_name = re.findall('[A-Z]{2,}.*', programme) degree_name = ''.join(degree_name) if degree_name != programme: programme = programme.replace(degree_name, '') item['programme_en'] = programme item['degree_name'] = degree_name try: if degree_name[0] == 'M': item['degree_type'] = '2' elif degree_name[0] == 'P': item['degree_type'] = '3' except: pass dur = response.xpath( '//strong[contains(text(),"Mode")]/../text()|//span[contains(text(),"uration")]/../text()' ).extract() # print(dur) duration = clear_duration(dur) # print(duration) item['duration'] = duration['duration'] item['duration_per'] = duration['duration_per'] item['teach_time'] = '1' start_date = response.xpath( '//strong[contains(text(),"Start")]/../text()|//span[contains(text(),"Start")]/../text()' ).extract() start_date = list(set(start_date)) # print(start_date) start_date = tracslateDate(start_date) # print(start_date) start_date = ','.join(start_date) item['start_date'] = start_date deadline = response.xpath( '//span[contains(text(),"deadline")]/../text()').extract() deadline = list(set(deadline)) # print(deadline) deadline = tracslateDate(deadline) # print(deadline) deadline = ''.join(deadline) item['deadline'] = deadline ielts = response.xpath( '//*[contains(text(),"IELTS")]/text()').extract() item['ielts_desc'] = ''.join(ielts).strip() ielts = get_ielts(ielts) try: if ielts != [] or ielts != {}: item['ielts_l'] = ielts['IELTS_L'] item['ielts_s'] = ielts['IELTS_S'] item['ielts_r'] = ielts['IELTS_R'] item['ielts_w'] = ielts['IELTS_W'] item['ielts'] = ielts['IELTS'] except: pass if ielts == []: ielts = 
response.xpath( '//*[contains(text(),"English Language requirements")]/../text()' ).extract() ielts = get_ielts(ielts) try: if ielts != [] or ielts != {}: item['ielts_l'] = ielts['IELTS_L'] item['ielts_s'] = ielts['IELTS_S'] item['ielts_r'] = ielts['IELTS_R'] item['ielts_w'] = ielts['IELTS_W'] item['ielts'] = ielts['IELTS'] except: pass # print(ielts) overview = response.xpath( '//div[@id="tab-0"]//div[@class="rich-text"]|//h3[contains(text(),"Overview")]/following-sibling::p' ).extract() overview = remove_class(overview) # print(overview) item['overview_en'] = overview modules = response.xpath( '//div[@id="tab-1"]//div[@class="rich-text"]|//div[@id="modules"]' ).extract() modules = remove_class(modules) # print(modules) item['modules_en'] = modules rntry = response.xpath( '//*[contains(text(),"English Language requirements")]/..' ).extract() rntry = remove_class(rntry) # print(rntry) item['rntry_requirements'] = rntry howtoapply = response.xpath('//div[@id="how-to-apply"]').extract() howtoapply = remove_class(howtoapply) item['apply_proces_en'] = howtoapply department = response.xpath( '//strong[contains(text(),"Department")]/../text()').extract() department = ''.join(department).strip() item['department'] = department fee = response.xpath('//*[contains(text(),"£")]//text()').extract() # print(fee) tuition_fee = getTuition_fee(fee) # print(tuition_fee) item['tuition_fee'] = tuition_fee item['tuition_fee_pre'] = '£' career = response.xpath( '//h1[contains(text(),"career")]/../following-sibling::div|//div[@id="tab-5"]' ).extract() career = remove_class(career) # print(career) item['career_en'] = career