def parse(self, response):
    """Extract one Bournemouth University course page into an item.

    Field values are pulled from ``response`` with XPath queries, cleaned with
    the project helpers (``remove_tags``, ``remove_class``,
    ``clear_space_str``, ``tracslateDate``, ``getTuition_fee``) and stored on
    the item created by ``get_item1(ScrapyschoolEnglandItem1)``.

    Args:
        response: Page response exposing ``xpath()`` and ``url``.

    Yields:
        The populated item (exactly one per call).
    """

    def joined(query):
        # Concatenate the serialised nodes matched by an XPath query.
        return ''.join(response.xpath(query).extract())

    item = get_item1(ScrapyschoolEnglandItem1)

    # --- location ---------------------------------------------------------
    location = remove_tags(
        joined("//*[contains(text(),'Location:')]//following-sibling::p"))

    # --- programme_en / degree_name ---------------------------------------
    # The first whitespace-separated token of the heading is taken as the
    # degree name; an empty heading yields an empty degree name.
    title = remove_tags(joined('/html/body/div/section//h1'))
    title_tokens = title.split()
    degree_name = title_tokens[0] if title_tokens else ''
    programme_en = title.replace('-', '').replace(degree_name, '')
    programme_en = clear_space_str(programme_en)
    if '–' in programme_en:
        programme_en = programme_en.replace('–', '').strip()
    programme_en = programme_en.replace('&', '')

    # --- duration / duration_per ------------------------------------------
    # duration_per is the unit code used throughout this table: 1 for the
    # "year" phrases and 3 for the "month" phrases.
    duration_text = remove_tags(
        joined("//*[contains(text(),'Duration:')]//following-sibling::p"))
    # Checked in order; the first phrase found wins.  '18-36 months' must be
    # tested before '36 months', otherwise the shorter phrase shadows it.
    known_durations = (
        ('1 year', 1, 1),
        ('12-18 months', 12, 3),
        ('18-36 months', 18, 3),
        ('36 months', 36, 3),
        ('1 to 2 years', 1, 1),
        ('2 years', 2, 1),
        ('3-5 years', 3, 1),
        ('48 months', 48, 3),
        ('12 months', 12, 3),
        ('5 years', 5, 1),
        ('3 years', 3, 1),
        ('14 months', 14, 3),
        ('15 months', 15, 3),
        ('18-24 months', 18, 3),
        ('27 months', 27, 3),
        ('8 months', 8, 3),
        ('Nine months', 9, 3),
    )
    duration, duration_per = 1, 1  # default when nothing can be recognised
    for phrase, amount, unit in known_durations:
        # The look-behind stops a phrase from matching the tail of a longer
        # number (e.g. '8 months' inside '18 months').
        if re.search(r'(?<!\d)' + re.escape(phrase), duration_text):
            duration, duration_per = amount, unit
            break
    else:
        # Not one of the known phrasings: fall back to the first
        # "<number> year(s)/month(s)" found in the text.
        generic = re.search(r'(\d+)\s*(year|month)', duration_text)
        if generic:
            duration = int(generic.group(1))
            duration_per = 1 if generic.group(2) == 'year' else 3

    # --- overview_en ------------------------------------------------------
    overview_en = clear_space_str(remove_class(
        joined('//*[@id="main-content"]/div/section[2]/p')))

    # --- teach_time -------------------------------------------------------
    # NOTE(review): the 'Delivery:' field used to be scraped here, but the
    # result was always replaced by this constant before being stored, so
    # only the constant is kept.  Confirm the constant is really intended.
    teach_time = 'full time'

    # --- modules_en -------------------------------------------------------
    modules_en = clear_space_str(remove_class(joined(
        "//section[@id='course-details']//div[@id='accordion-1']")))

    # --- start_date -------------------------------------------------------
    start_text = clear_space_str(remove_tags(joined(
        "//*[contains(text(),'Next start date:')]//following-sibling::p")))
    start_date = ','.join(tracslateDate(start_text))

    # --- rntry_requirements -----------------------------------------------
    rntry_requirements = clear_space_str(remove_class(joined(
        "//*[contains(text(),'Entry requirements')]/../following-sibling::div[1]"
    )))

    # --- ielts ------------------------------------------------------------
    # With one to four "d.d" numbers in the requirements text, the last one
    # is used for every component band and the one before it (or the only
    # one) as the overall score.  Any other count leaves all scores unset.
    scores = re.findall(r'\d\.\d', rntry_requirements)
    if 1 <= len(scores) <= 4:
        ielts = scores[-2] if len(scores) > 1 else scores[0]
        ielts_band = scores[-1]
    else:
        ielts = ielts_band = None

    # --- career_en --------------------------------------------------------
    career_en = clear_space_str(remove_class(joined(
        "//*[contains(text(),'Careers')]/../following-sibling::*|//*[contains(text(),'Careers')]//following-sibling::*"
    )))

    # --- tuition_fee ------------------------------------------------------
    tuition_fee = getTuition_fee(joined(
        '//*[@id="fees-box"]/div/div/span|//*[@id="fees-box"]/div[2]/div[2]/p[2]|//*[@id="fees-box"]/div/div[2]/ul/li[1]'
    ))

    # --- fixed texts ------------------------------------------------------
    apply_proces_en = (
        '<p>Step 1: Application Complete all sections of your country’s application form. '
        'Step 2: Terms and conditions You must read, understand and agree to be bound by the terms and conditions before moving on to the next step. '
        'Step 3: Confirmation Sign the application form to confirm you have provided correct details and you agree to the terms and conditions. '
        'Step 4: Other documents Scan, attach and send in your additional documents to the email address on the application form: Academic transcripts and exam results English test score if required If you do not have academic transcripts or English test results, you can still apply and we can make you a conditional offer, the conditions of which you will need to satisfy before we confirm your place. '
        'Step 5: Assessment We’ll contact you to arrange one or more of the following if required: English language test Mathematics test Interview This will allow us to further assess your suitability for the program. '
        'Step 6: Admission decision Receive an admission decision and, if your application is successful, accompanying offer letter. '
        'Step 7: Deposit Accept your offer by paying the deposit. '
        'Step 8: Pre-arrival Receive confirmation of program acceptance, pre-arrival information, plus guidance on finding local accommodation if you are coming from outside the USPP host city to study. For conditional offers, these items are issued once we receive proof that the conditions of the offer have been met. '
        'Step 9: Travel Arrange travel to USPP location if applicable.Arrive at your USPP center for orientation before classes begin. '
        'Step 10: Begin your USPP program USPP teaching begins. View the program timeline for next steps.</p>'
    )
    require_chinese_en = (
        "<p>This is a guide to the normal entry requirements, assuming you’ve followed the Chinese education system. "
        "An admissions tutor will study your application, so make sure you include your academic background and personal information when you apply.Entry requirements vary depending on what sort of course you’re coming to BU to study. "
        "BU International College Foundation Certificates You can undertake a Foundation Certificate before going on to an undergraduate course if you’ve completed 11 years of schooling or Senior High School Year 2 in China and have a minimum of IELTS (Academic) 5.0. "
        "Undergraduate courses You can apply to study a Bachelor's degree from year one if you hold a Chinese Senior High School Diploma plus successful completion of a relevant first-year undergraduate programme in a recognised Chinese university, or a Diploma from Specialized College (zhongzhuan). "
        "Chinese Senior High School certificate of graduation with overall HuiKao result grade B average, transcripts of 3 years with 85% average (85% also eligible for AES). "
        "Top-up courses You need to hold a College Graduation Diploma (Dazhuan awarded by a university/college on completion of two to three years of study), or a BTEC Higher National Diploma or Foundation degree in a relevant subject.Postgraduate courses You need to have a Bachelor's (Honours) degree from a recognised Chinese university, normally from a four-year undergraduate programme, or a Bachelors degree from Higher Education Self-Study Examinations, or a Top-up degree or university-recognised Pre-Master’s Foundation programme. "
        "Grade requirements from Chinese Bachelor's degree holders are as below: Applicants from 985 or 211 universities Media studies and other subjects equivalent to UK 2:1 degree 65% + GPA 2.25 + Business and subjects equivalent to UK 2:2 degree 60% + GPA 2.0 + Academic Excellence Scholarship (automatic award of £3500) 75% + GPA 2.75 + Applicants from other universities Media studies and other subjects equivalent to UK 2:1 degree 70% + GPA 2.5 + Business and subjects equivalent to UK 2:2 degree 65% + GPA 2.25 + Academic Excellence Scholarship (automatic award of £3500) 80% + GPA 3.0 + Research programmes You need a good postgraduate degree to be considered for a BU research programme. "
        "Please see more detail on the postgraduate research page.You can find more information about English language requirements for entry to BU on our English language requirements page. "
        "Full information about preparatory courses is available on the Bournemouth University International College website.If you need help with your visa or want more information about the immigration process, you can find it on our immigration information page.</p>"
    )

    # --- populate the item ------------------------------------------------
    item['require_chinese_en'] = require_chinese_en
    item['apply_proces_en'] = apply_proces_en
    item['apply_fee'] = 0
    item['apply_pre'] = '£'
    item['university'] = 'Bournemouth University'
    item['location'] = location
    item['programme_en'] = programme_en
    item['degree_name'] = degree_name
    item['degree_type'] = 2
    item['teach_time'] = teach_time
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['overview_en'] = overview_en
    item['modules_en'] = modules_en
    item['start_date'] = start_date
    item['rntry_requirements'] = rntry_requirements
    item['ielts'] = ielts
    item['ielts_r'] = ielts_band
    item['ielts_s'] = ielts_band
    item['ielts_w'] = ielts_band
    item['ielts_l'] = ielts_band
    item['career_en'] = career_en
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = '£'
    item['url'] = response.url
    item['application_open_date'] = '2018-7-18'
    yield item
def parse(self, response):
    """Extract one Edge Hill University course page into an item.

    Field values are pulled from ``response`` with XPath queries, cleaned with
    the project helpers (``remove_tags``, ``remove_class``, ``tracslateDate``,
    ``getTuition_fee``) and stored on the item created by
    ``get_item1(ScrapyschoolEnglandItem1)``.

    Args:
        response: Page response exposing ``xpath()`` and ``url``.

    Yields:
        The populated item (exactly one per call).
    """

    def joined(query):
        # Concatenate the serialised nodes matched by an XPath query.
        return ''.join(response.xpath(query).extract())

    item = get_item1(ScrapyschoolEnglandItem1)

    # --- programme_en / degree_name ---------------------------------------
    # The first whitespace-separated token of the heading is taken as the
    # degree name.  An empty heading yields an empty degree name instead of
    # raising IndexError.
    title = remove_tags(joined('//*[@id="primary"]/header/h1/a'))
    title_tokens = title.split()
    degree_name = title_tokens[0] if title_tokens else ''
    programme_en = title.replace(degree_name, '').strip()

    # --- teach_time / duration / duration_per -----------------------------
    length_text = remove_tags(
        joined("//*[contains(text(),'Length:')]//following-sibling::*"))
    # First run of digits in the "Length:" text, kept as a string; None when
    # the text holds no number at all.
    numbers = re.findall(r'\d+', length_text)
    duration = numbers[0] if numbers else None
    # Unit code: 3 when the text mentions 'Months', 4 for 'Weeks', else 1.
    if 'Months' in length_text:
        duration_per = 3
    elif 'Weeks' in length_text:
        duration_per = 4
    else:
        duration_per = 1
    teach_time = 'Full-Time' if 'Full-Time' in length_text else 'Part-Time'

    # --- start_date -------------------------------------------------------
    start_text = remove_tags(
        joined("//*[contains(text(),'Dates:')]//following-sibling::*"))
    start_date = ','.join(tracslateDate(start_text))

    # --- department / location --------------------------------------------
    department = remove_tags(
        joined("//*[contains(text(),'Department:')]//following-sibling::*"))
    location = remove_tags(
        joined("//*[contains(text(),'Location:')]//following-sibling::*"))

    # --- overview_en ------------------------------------------------------
    overview_en = '<p>' + remove_class(
        joined('//*[@id="overview"]/div[1]/div/ul/li/text()')) + '</p>'

    # --- assessment_en / modules_en / rntry_requirements -------------------
    assessment_en = remove_class(joined(
        "//*[contains(text(),'How will I be assessed?')]//following-sibling::*[1]"
    ))
    modules_en = remove_class(joined('//*[@id="modules"]/h4/strong'))
    rntry_requirements = remove_class(joined(
        "//*[contains(text(),'Entry Requirements')]//following-sibling::*"))

    # --- ielts ------------------------------------------------------------
    english_text = remove_tags(joined(
        "//*[contains(text(),'English Language Requirements')]//following-sibling::*[1]"
    ))
    scores = re.findall(r'\d\.\d', english_text)
    if len(scores) == 1:
        # A single score is used for the overall result and every band.
        ielts = ielts_band = scores[0]
    elif len(scores) == 2:
        # First score is the overall result, second applies to every band.
        ielts, ielts_band = scores
    else:
        # No score, or more than two: fall back to fixed defaults.
        ielts, ielts_band = 6.5, 6.0

    # --- career_en --------------------------------------------------------
    career_en = remove_class(joined(
        "//*[contains(text(),'What are my career prospects?')]//following-sibling::*"
    ))

    # --- tuition_fee ------------------------------------------------------
    tuition_fee = getTuition_fee(remove_tags(
        joined("//*[contains(text(),'Tuition Fees')]//following-sibling::*")))

    # --- apply_proces_en --------------------------------------------------
    apply_proces_en = remove_class(
        joined("//h4[contains(text(),'How to Apply')]//following-sibling::*"))

    # --- populate the item ------------------------------------------------
    item['apply_pre'] = '£'
    item['university'] = 'Edge Hill University'
    item['url'] = response.url
    item['programme_en'] = programme_en
    item['degree_type'] = 2
    item['degree_name'] = degree_name
    item['teach_time'] = teach_time
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['start_date'] = start_date
    item['department'] = department
    item['location'] = location
    item['overview_en'] = overview_en
    item['assessment_en'] = assessment_en
    item['modules_en'] = modules_en
    item['rntry_requirements'] = rntry_requirements
    item['ielts'] = ielts
    item['ielts_r'] = ielts_band
    item['ielts_w'] = ielts_band
    item['ielts_l'] = ielts_band
    item['ielts_s'] = ielts_band
    item['career_en'] = career_en
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = '£'
    item['apply_proces_en'] = apply_proces_en
    yield item
def parse(self, response):
    """Extract one Bournemouth University course page (with UCAS data).

    Field values are pulled from ``response`` with XPath queries, cleaned with
    the project helpers (``remove_tags``, ``remove_class``,
    ``clear_space_str``, ``tracslateDate``, ``getTuition_fee``) and stored on
    the item created by ``get_item1(ScrapyschoolEnglandItem1)``.

    Args:
        response: Page response exposing ``xpath()`` and ``url``.

    Yields:
        The populated item (exactly one per call).
    """

    def joined(query):
        # Concatenate the serialised nodes matched by an XPath query.
        return ''.join(response.xpath(query).extract())

    item = get_item1(ScrapyschoolEnglandItem1)

    # --- location ---------------------------------------------------------
    location = remove_tags(
        joined("//*[contains(text(),'Location:')]//following-sibling::p"))

    # --- programme_en / degree_name ---------------------------------------
    # The first whitespace-separated token of the heading is taken as the
    # degree name; an empty heading yields an empty degree name.
    title = remove_tags(joined('/html/body/div/section//h1'))
    title_tokens = title.split()
    degree_name = title_tokens[0] if title_tokens else ''
    programme_en = title.replace('-', '').replace(degree_name, '')
    programme_en = clear_space_str(programme_en)
    if '–' in programme_en:
        programme_en = programme_en.replace('–', '').strip()
    programme_en = programme_en.replace('&', '').replace('(Hons)', '').strip()

    # --- ucascode ---------------------------------------------------------
    ucascode = remove_tags(
        joined("//*[contains(text(),'UCAS Code:')]//following-sibling::*"))

    # --- duration / duration_per ------------------------------------------
    duration_text = remove_tags(
        joined("//*[contains(text(),'Duration:')]//following-sibling::p"))
    if 'Four years' in duration_text:
        duration = 4
    else:
        # First whole number in the text, as an int so both branches store
        # the same type; None when the text holds no number at all.
        number = re.search(r'\d+', duration_text)
        duration = int(number.group()) if number else None
    duration_per = 1

    # --- overview_en ------------------------------------------------------
    overview_en = clear_space_str(remove_class(
        joined('//*[@id="main-content"]/div/section[3]/p')))

    # --- alevel -----------------------------------------------------------
    # The second <p> preceding the 'GCSEs' node is used; 'N/A' when fewer
    # than two such paragraphs exist.
    alevel_paragraphs = response.xpath(
        "//*[contains(text(),'GCSEs')]//preceding-sibling::p").extract()
    if len(alevel_paragraphs) > 1:
        alevel = remove_tags(alevel_paragraphs[1])
    else:
        alevel = 'N/A'

    # --- modules_en -------------------------------------------------------
    modules_en = clear_space_str(remove_class(joined(
        "//section[@id='course-details']//div[@id='accordion-1']")))

    # --- start_date -------------------------------------------------------
    start_text = clear_space_str(remove_tags(joined(
        "//*[contains(text(),'Next start date:')]//following-sibling::p")))
    start_date = ','.join(tracslateDate(start_text))

    # --- ib ---------------------------------------------------------------
    # Capped at 500 characters.
    ib = remove_tags(joined(
        "//*[contains(text(),'International Baccalaureate')]/.."))[:500]

    # --- ielts ------------------------------------------------------------
    # With one to four "d.d" numbers in the entry-requirements markup, the
    # last one is used for every component band and the one before it (or
    # the only one) as the overall score.  Any other count leaves all
    # scores unset.
    requirements_html = joined('//*[@id="entry-requirements"]/div')
    scores = re.findall(r'\d\.\d', requirements_html)
    if 1 <= len(scores) <= 4:
        ielts = scores[-2] if len(scores) > 1 else scores[0]
        ielts_band = scores[-1]
    else:
        ielts = ielts_band = None

    # --- career_en --------------------------------------------------------
    career_en = clear_space_str(remove_class(
        joined("//*[contains(text(),'Careers')]//following-sibling::*")))

    # --- tuition_fee ------------------------------------------------------
    tuition_fee = getTuition_fee(joined(
        '//*[@id="fees-box"]/div/div/span|//*[@id="fees-box"]/div[2]/div[2]/p[2]|//*[@id="fees-box"]/div/div[2]/ul/li[1]'
    ))

    # --- fixed texts ------------------------------------------------------
    apply_proces_en = (
        '<p>Step 1: Application Complete all sections of your country’s application form. '
        'Step 2: Terms and conditions You must read, understand and agree to be bound by the terms and conditions before moving on to the next step. '
        'Step 3: Confirmation Sign the application form to confirm you have provided correct details and you agree to the terms and conditions. '
        'Step 4: Other documents Scan, attach and send in your additional documents to the email address on the application form: Academic transcripts and exam results English test score if required If you do not have academic transcripts or English test results, you can still apply and we can make you a conditional offer, the conditions of which you will need to satisfy before we confirm your place. '
        'Step 5: Assessment We’ll contact you to arrange one or more of the following if required: English language test Mathematics test Interview This will allow us to further assess your suitability for the program. '
        'Step 6: Admission decision Receive an admission decision and, if your application is successful, accompanying offer letter. '
        'Step 7: Deposit Accept your offer by paying the deposit. '
        'Step 8: Pre-arrival Receive confirmation of program acceptance, pre-arrival information, plus guidance on finding local accommodation if you are coming from outside the USPP host city to study. For conditional offers, these items are issued once we receive proof that the conditions of the offer have been met. '
        'Step 9: Travel Arrange travel to USPP location if applicable.Arrive at your USPP center for orientation before classes begin. '
        'Step 10: Begin your USPP program USPP teaching begins. View the program timeline for next steps.</p>'
    )

    # --- assessment_en ----------------------------------------------------
    assessment_en = remove_class(joined(
        "//*[contains(text(),'How you will be assessed')]//following-sibling::p|//*[@id='accordion-1']/div[6]"
    ))

    # --- populate the item ------------------------------------------------
    item['assessment_en'] = assessment_en
    item['alevel'] = alevel
    item['ib'] = ib
    item['ucascode'] = ucascode
    item['apply_proces_en'] = apply_proces_en
    item['apply_fee'] = 0
    item['apply_pre'] = '£'
    item['university'] = 'Bournemouth University'
    item['location'] = location
    item['programme_en'] = programme_en
    item['degree_name'] = degree_name
    item['degree_type'] = 1
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['overview_en'] = overview_en
    item['modules_en'] = modules_en
    item['start_date'] = start_date
    item['ielts'] = ielts
    item['ielts_r'] = ielts_band
    item['ielts_s'] = ielts_band
    item['ielts_w'] = ielts_band
    item['ielts_l'] = ielts_band
    item['career_en'] = career_en
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = '£'
    item['url'] = response.url
    item['application_open_date'] = '2018-7-18'
    yield item
def parse(self, response):
    """Extract one London School of Economics programme page into an item.

    Field values are pulled from ``response`` with XPath queries, cleaned with
    the project helpers (``remove_tags``, ``remove_class``,
    ``clear_space_str``, ``tracslateDate``, ``getTuition_fee``) and stored on
    the item created by ``get_item1(ScrapyschoolEnglandItem1)``.

    Args:
        response: Page response exposing ``xpath()`` and ``url``.

    Yields:
        The populated item (exactly one per call).
    """

    def joined(query):
        # Concatenate the serialised nodes matched by an XPath query.
        return ''.join(response.xpath(query).extract())

    item = get_item1(ScrapyschoolEnglandItem1)
    url = response.url

    # --- teach_type -------------------------------------------------------
    research_markers = ('2018/VRS', '2018/MResPhD', '2018/MPhilPhD')
    if any(marker in url for marker in research_markers):
        teach_type = 'research'
    else:
        teach_type = 'taught'

    # --- programme_en / degree_name ---------------------------------------
    title = remove_tags(joined('//*[@id="form1"]/header[2]/div/div[2]/h1'))
    # Checked in order; the first label contained in the title wins.
    known_degrees = ('MSc', 'LSE', 'MPA', 'LLM', 'Diploma', 'MA',
                     'MPhil/PhD', 'MRes/PhD', 'Visiting Research')
    degree_name = next(
        (label for label in known_degrees if label in title), 'N/A')
    programme_en = title.replace(degree_name, '').strip().replace('-', '')

    # --- department -------------------------------------------------------
    department = remove_tags(
        joined('//*[@id="form1"]/div[4]/div/div[1]/div/ul/li[2]'))

    # --- overview_en ------------------------------------------------------
    overview_en = clear_space_str(remove_class(joined(
        '//*[@id="form1"]/div[3]/div/div[2]/div/p|//*[@id="form1"]/div[4]/div/div[2]/div/p'
    )))

    # --- start_date -------------------------------------------------------
    start_text = remove_tags(
        joined("//*[contains(text(),'Start date')]//following-sibling::*"))
    placeholder_words = ('Introductory', 'Early', 'First', 'Mandatory',
                         'Suspended', 'Late', 'Intake')
    try:
        start_date = tracslateDate(start_text)[0]
        # A result containing one of these words is replaced by a fixed
        # intake month.
        if any(word in start_date for word in placeholder_words):
            start_date = '2018-9'
    except Exception:
        # Best effort: any failure to obtain a usable first date (empty
        # result, non-string entry, helper error) is recorded as 'N/A'.
        start_date = 'N/A'

    # --- deadline ---------------------------------------------------------
    deadline_text = remove_tags(joined(
        "//*[contains(text(),'Application deadline')]//following-sibling::*"))
    deadline = ''.join(tracslateDate(deadline_text))
    for noise in ('None', 'However', 'Apply', 'Sciences'):
        deadline = deadline.replace(noise, '')

    # --- duration / duration_per ------------------------------------------
    # duration_per is the unit code used by this chain: 3 for the "months"
    # branches and 1 for the "years" branches.
    duration_text = remove_tags(
        joined("//*[contains(text(),'Duration')]//following-sibling::*"))
    first_number = re.search(r'\d+', duration_text)
    if 'Nine months' in duration_text:
        duration, duration_per = 9, 3
    elif 'Ten months' in duration_text:
        duration, duration_per = 10, 3
    elif 'months' in duration_text and first_number:
        # Stored as an int like every other branch; a "months" text without
        # digits falls through to the checks below instead of raising.
        duration, duration_per = int(first_number.group()), 3
    elif 'Three-four years' in duration_text:
        duration, duration_per = 3, 1
    elif 'Three to four years' in duration_text:
        duration, duration_per = 3, 1
    elif 'Four to five' in duration_text:
        duration, duration_per = 4, 1
    elif 'Five years' in duration_text:
        duration, duration_per = 5, 1
    elif '3 to 4 years' in duration_text:
        duration, duration_per = 3, 1
    elif 'Six years' in duration_text:
        duration, duration_per = 6, 1
    else:
        duration, duration_per = 1, 1

    # --- tuition_fee ------------------------------------------------------
    tuition_fee = getTuition_fee(remove_tags(joined(
        "//*[contains(text(),'Tuition fee')]//following-sibling::*[1]")))

    # --- location ---------------------------------------------------------
    location = remove_tags(
        joined("//*[contains(text(),'Location')]//following-sibling::*[1]"))

    # --- rntry_requirements -----------------------------------------------
    rntry_requirements = clear_space_str(remove_class(joined(
        "//*[contains(text(),'Minimum entry requirement')]//following-sibling::*[1]"
    )))

    # --- modules_en -------------------------------------------------------
    modules_en = clear_space_str(remove_class(joined(
        "//*[contains(text(),'Programme structure and courses')]/../../following-sibling::*"
    )))

    # --- assessment_en ----------------------------------------------------
    assessment_en = clear_space_str(remove_class(joined(
        "//h3[contains(text(),'ssessment')]//following-sibling::*")))

    # --- career_en --------------------------------------------------------
    # Whitespace is cleared before remove_class here, unlike the fields above.
    career_en = remove_class(clear_space_str(joined(
        "//*[contains(text(),'Support for your career')]//preceding-sibling::*"
    )))

    # --- ielts / toefl ----------------------------------------------------
    # The checks run against the full title: programme_en has already had
    # the degree label removed, so testing it would almost never reach the
    # 'MPhil' and 'LLM' branches.
    if 'MPhil' in title:
        ielts, ielts_r, ielts_l, ielts_w, ielts_s = 7.0, 6.5, 6.5, 7.0, 6.5
        toefl, toefl_r, toefl_l, toefl_w, toefl_s = 100, 23, 22, 27, 22
    elif 'LLM' in title:
        ielts, ielts_r, ielts_l, ielts_w, ielts_s = 7.5, 6.5, 7.0, 7.0, 6.5
        toefl, toefl_r, toefl_l, toefl_w, toefl_s = 109, 23, 24, 27, 22
    else:
        ielts, ielts_r, ielts_l, ielts_w, ielts_s = 7.0, 6.5, 6.5, 6.0, 6.0
        toefl, toefl_r, toefl_l, toefl_w, toefl_s = 100, 23, 22, 24, 22

    # --- fixed texts ------------------------------------------------------
    require_chinese_en = (
        "<p>Graduate entry requirements for applicants from China Taught master's programmes (MSc/MA/MPA/LLM)To be considered for admission to a taught master's programme, we would normally require a bachelor's degree with an overall mark of 85 per cent from "
        "applicants who have attended a highly regarded institution in China, with all other applicants we would normally require a mark of at least 90 per cent.Research programmes (MPhil/MRes/PhD)To be considered for admission to a research programme, we would normally require a master's degree with an overall mark of 85 per cent/B from applicants who have attended a highly regarded institution, while all other applicants are normally required to obtain a mark of 90 per cent/A.</p>"
    )
    apply_documents_en = (
        '<p>We welcome applications from all suitably qualified prospective students and want to recruit students with the very best academic merit, potential and motivation, irrespective of their background. '
        'We carefully consider each application on an individual basis, taking into account all the information presented on your application form, including your: academic achievement (including predicted and achieved grades) personal statement two references CV</p>'
    )

    # --- populate the item ------------------------------------------------
    item['toefl'] = toefl
    item['toefl_r'] = toefl_r
    item['toefl_w'] = toefl_w
    item['toefl_l'] = toefl_l
    item['toefl_s'] = toefl_s
    item['apply_pre'] = '£'
    item['apply_documents_en'] = apply_documents_en
    item['university'] = 'London School of Economics and Political Science'
    item['url'] = url
    item['teach_type'] = teach_type
    item['programme_en'] = programme_en
    item['degree_type'] = 2
    item['degree_name'] = degree_name
    item['department'] = department
    item['overview_en'] = overview_en
    item['start_date'] = start_date
    item['deadline'] = deadline
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['tuition_fee'] = tuition_fee
    item['location'] = location
    item['rntry_requirements'] = rntry_requirements
    item['modules_en'] = modules_en
    item['assessment_en'] = assessment_en
    item['career_en'] = career_en
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_s'] = ielts_s
    item['ielts_l'] = ielts_l
    item['require_chinese_en'] = require_chinese_en
    item['apply_proces_en'] = 'http://www.lse.ac.uk/study-at-lse/Graduate/Prospective-students/How-to-Apply'
    item['teach_time'] = 'Full time'
    item['tuition_fee_pre'] = '£'
    yield item