def crawl(test=False):
    """Scrape Astra Zeneca's job board for thesis ('diploma') openings in Sweden.

    Returns a list of dicts (title/location/link/company), or [] when nothing
    matched.  In test mode an empty run leaves the browser open.
    """
    browser = browserobject.start_browser("http://jobs.astrazeneca.com", test)
    COMPANY = 'Astra Zeneca'
    # Narrow the search: country filter first, then the student category.
    browser.find_element_by_id("316").click()  # Sweden
    time.sleep(1)
    browser.find_element_by_id("1257").click()  # student opportunities
    time.sleep(1)
    browser.find_element_by_class_name('submit').click()
    findings = []
    for entry in browser.find_elements_by_class_name('job-res-description'):
        heading = entry.find_element_by_class_name('job-title')
        if 'diploma' not in heading.text.lower():
            continue
        place = entry.find_element_by_class_name('locations')
        findings.append(dict(title=heading.text,
                             location=place.text,
                             link=heading.get_attribute('href'),
                             company=COMPANY))
    if not findings and test:
        return []
    browser.quit()
    return findings
def crawl(test=False):
    """Find thesis ('diploma') jobs on Astra Zeneca's careers site.

    Filters the board to Sweden + student opportunities, then keeps every
    result whose title mentions 'diploma'.  Returns a list of job dicts.
    """
    browser = browserobject.start_browser("http://jobs.astrazeneca.com", test)
    COMPANY = 'Astra Zeneca'
    browser.find_element_by_id("316").click()      # Sweden filter
    time.sleep(1)
    browser.find_element_by_id("1257").click()     # student opportunities filter
    time.sleep(1)
    browser.find_element_by_class_name('submit').click()
    job_cards = browser.find_elements_by_class_name('job-res-description')
    collected = []
    for card in job_cards:
        anchor = card.find_element_by_class_name('job-title')
        if 'diploma' in anchor.text.lower():
            loc = card.find_element_by_class_name('locations')
            collected.append(dict(title=anchor.text,
                                  location=loc.text,
                                  link=anchor.get_attribute('href'),
                                  company=COMPANY))
    if collected:
        browser.quit()
        return collected
    if not test:
        browser.quit()
    return []
def crawl(test=False):
    """Scrape SAAB's Swedish vacancy list for thesis ('examen') openings.

    Returns a list of dicts with title/location/link/company keys, or []
    when nothing matched.  An empty run in test mode leaves the browser open.
    """
    browser = browserobject.start_browser(
        "http://saabgroup.com/sv/career/job-opportunities/?&c=Sweden", test)
    COMPANY = "SAAB"
    list_of_thesis = []
    table = browser.find_element_by_class_name('vacancies')
    for row in table.find_elements_by_tag_name('li'):
        # 'examen' matches Swedish 'examensarbete' (degree/thesis project).
        if 'examen' in row.text.lower():
            title = row.find_element_by_class_name('title')
            location = row.find_element_by_class_name('location')
            link = row.find_element_by_tag_name('a')
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       link=link.get_attribute('href'),
                                       company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape Tetra Pak's Taleo job search for thesis openings.

    Returns a list of dicts with title/location/link/company keys, or [].
    """
    browser = browserobject.start_browser(
        "https://tetrapak.taleo.net/careersection/3/jobsearch.ftl?lang=en", test)
    COMPANY = "Tetra Pak"
    list_of_thesis = []
    time.sleep(0.3)  # give the results table a moment to render
    table = browser.find_element_by_class_name('table')
    for row in table.find_elements_by_tag_name('tr'):
        if 'thesis' in row.text.lower():
            title = row.find_element_by_tag_name('a')
            location = row.find_element_by_xpath('./td[2]')
            link = title.get_attribute('href')  # fetch once, reuse below
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       link=link,
                                       company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape the Volvo Group job search (Brassring) for thesis openings.

    The search URL itself is stored as each result's link (the per-row href
    is deliberately not used, matching the original's commented-out output).
    """
    url = ('https://xjobs.brassring.com/TGWebHost/searchresults.aspx'
           '?partnerid=25079&siteid=5171&Codes=Volvo&AgentID=9780452&Function=runquery')
    browser = browserobject.start_browser(url, test)
    COMPANY = "Volvo Group"
    list_of_thesis = []
    time.sleep(0.3)  # give the results table a moment to render
    table = browser.find_element_by_id('idSearchresults')
    for row in table.find_elements_by_tag_name('tr'):
        if 'thesis' in row.text.lower():
            title = row.find_element_by_tag_name('a')
            location = row.find_element_by_xpath('./td[5]')
            # NOTE(review): this skips rows whose location contains 'sweden',
            # which looks inverted for a Swedish thesis aggregator -- confirm.
            if 'sweden' in location.text.lower():
                continue
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       company=COMPANY,
                                       link=url))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape IFS's job list (embedded in an iframe) for thesis openings.

    Returns a list of dicts with title/location/link/company keys, or [].
    """
    browser = browserobject.start_browser(
        "http://www.ifsworld.com/se/verksamheten/arbeta-hos-oss/lediga-jobb/apply-for-a-job/",
        test)
    COMPANY = "IFS"
    browser.switch_to_frame("riframe")  # the job table lives inside this iframe
    table = browser.find_element_by_id('jobsTable')
    list_of_thesis = []
    for row in table.find_elements_by_tag_name('tr')[1:]:  # [1:] skips the header row
        if 'thesis' in row.text.lower():
            title = row.find_element_by_tag_name('a')
            location = row.find_element_by_xpath('./td[3]')
            link = title.get_attribute('href')  # fetch once, reuse below
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       link=link,
                                       company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape General Electric's job search for thesis openings in Sweden.

    The search is pre-filtered on 'thesis' via the URL; each row must also
    mention both 'thesis' and 'sweden' to be kept.
    """
    browser = browserobject.start_browser(
        "http://jobs.gecareers.com/search?q=thesis", test)
    COMPANY = "General Electric"
    list_of_thesis = []
    table = browser.find_element_by_id('searchresults')
    for row in table.find_elements_by_tag_name('tr')[2:]:  # [2:] skips header rows
        text = row.text.lower()
        if 'thesis' in text and 'sweden' in text:
            title = row.find_element_by_tag_name('a')
            location = row.find_element_by_xpath('./td[2]')
            link = title.get_attribute('href')
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       link=link,
                                       company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape Vattenfall's internship listing (Sweden) and collect every row.

    The URL pre-filters on country=Sweden and position=Internship, so no
    per-row keyword filter is applied here.
    """
    browser = browserobject.start_browser(
        "http://corporate.vattenfall.se/jobba-hos-oss/jobb/lediga-jobb/?country=Sweden&location=&function=&position=Internship&education=",
        test)
    COMPANY = "Vattenfall"
    list_of_thesis = []
    table = browser.find_element_by_id('DataTables_Table_0')
    for row in table.find_elements_by_tag_name('tr')[1:]:  # [1:] skips the header row
        title = row.find_element_by_tag_name('a')
        location = row.find_element_by_xpath('./td[2]')
        link = title.get_attribute('href')  # fetch once, reuse below
        list_of_thesis.append(dict(title=title.text,
                                   location=location.text,
                                   link=link,
                                   company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape Boliden's career site; detail links are rebuilt from the job id.

    List items that are not job postings (missing anchor or FieldValue) are
    skipped.  Returns a list of dicts with title/location/link/company keys.
    """
    browser = browserobject.start_browser(
        "https://boliden.csod.com/ats/careersite/search.aspx?site=5&c=boliden", test)
    baseurl = "https://boliden.csod.com/ats/careersite/JobDetails.aspx?id="
    COMPANY = 'Boliden'
    list_of_thesis = []
    for item in browser.find_elements_by_xpath('''//ul/li'''):
        try:
            title = item.find_element_by_tag_name('a')
            location_data = item.find_element_by_class_name("FieldValue").text
        except Exception:  # was a bare except: -- non-job <li> entries lack these
            continue
        # Field text looks like "(department | location)"; only location is kept.
        _, location = re.findall(r'\((.*?)\|(.*?)\)', location_data)[0]
        # The href carries an id=... query parameter used to rebuild a stable link.
        match = re.findall(r'.*?id=([^"]*)', title.get_attribute('href'))
        list_of_thesis.append(dict(title=title.text,
                                   location=location,
                                   link=baseurl + match[0],
                                   company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Collect thesis projects from Billerud's exjobb page (all rows kept)."""
    browser = browserobject.start_browser(
        "http://www.billerudkorsnas.com/sv/Karriar/Lediga-jobb/Exjobb2/", test)
    COMPANY = 'Billerud'
    rows = browser.find_elements_by_xpath(
        '''//*[@id="primarycontent"]/article/table/tbody/tr''')
    found = []
    for row in rows:
        heading = row.find_element_by_class_name('listTitle')
        place = row.find_element_by_xpath('''.//td[2]''')
        anchor = row.find_element_by_tag_name('a')
        found.append(dict(title=heading.text,
                          location=place.text,
                          link=anchor.get_attribute('href'),
                          company=COMPANY))
    if not found and test:
        return []
    browser.quit()
    return found
def crawl(test=False):
    """Collect thesis openings from the IFS job table (inside an iframe)."""
    browser = browserobject.start_browser(
        "http://www.ifsworld.com/se/verksamheten/arbeta-hos-oss/lediga-jobb/apply-for-a-job/",
        test)
    COMPANY = "IFS"
    browser.switch_to_frame("riframe")  # the table is rendered in this iframe
    rows = browser.find_element_by_id('jobsTable').find_elements_by_tag_name('tr')
    found = []
    for row in rows[1:]:  # first row is the header
        if 'thesis' not in row.text.lower():
            continue
        anchor = row.find_element_by_tag_name('a')
        place = row.find_element_by_xpath('./td[3]')
        posted = row.find_element_by_xpath('./td[2]')
        found.append(dict(title=anchor.text,
                          location=place.text,
                          link=anchor.get_attribute('href'),
                          company=COMPANY))
    if not found and test:
        return []
    browser.quit()
    return found
def crawl(test=False):
    """Scrape Sandvik's examensarbete table; every listed row is collected.

    The dict's title field is taken from the row's subject column (th[2]),
    as in the original implementation.
    """
    browser = browserobject.start_browser(
        "http://www.home.sandvik/se/karriar/student/examensarbete/examensarbeten/",
        test)
    COMPANY = "Sandvik"
    list_of_thesis = []
    # NOTE(review): rows are read from <thead> (with <th> cells), not <tbody> --
    # unusual markup; confirm against the live page if this stops matching.
    table = browser.find_element_by_tag_name('thead')
    for row in table.find_elements_by_tag_name('tr')[1:]:  # [1:] skips the label row
        subject = row.find_element_by_xpath('./th[2]')
        location = row.find_element_by_xpath('./th[3]')
        link = row.find_element_by_tag_name('a')
        list_of_thesis.append(dict(title=subject.text,
                                   location=location.text,
                                   link=link.get_attribute('href'),
                                   company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape Göteborgs Energi's thesis page; location is always Gothenburg.

    The page lists only thesis projects, so every table row is collected.
    """
    browser = browserobject.start_browser(
        "http://www.goteborgenergi.se/Om_oss/Karriar/Student/Examensarbete", test)
    COMPANY = "Göteborgs Energi"
    time.sleep(0.3)  # give the content table a moment to render
    table = browser.find_element_by_class_name('Dx-Content-Table')
    list_of_thesis = []
    for row in table.find_elements_by_tag_name('tr')[1:]:  # [1:] skips the header row
        title = row.find_element_by_tag_name('a')
        link = title.get_attribute('href')  # fetch once, reuse below
        list_of_thesis.append(dict(title=title.text,
                                   location='Gothenburg',  # single-site employer
                                   link=link,
                                   company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape Scania's job search for thesis ('examen') openings.

    Returns a list of dicts with title/location/link/company keys, or [].
    """
    browser = browserobject.start_browser(
        "http://jobsearch.scania.com/segerjoblist/search.aspx", test)
    COMPANY = "Scania"
    table = browser.find_element_by_id('dgSearchResult')
    list_of_thesis = []
    for row in table.find_elements_by_tag_name('tr')[1:]:  # [1:] skips the header row
        # 'examen' matches Swedish 'examensarbete' (degree/thesis project).
        if 'examen' in row.text.lower():
            title = row.find_element_by_tag_name('a')
            location = row.find_element_by_xpath('./td[5]')
            link = title.get_attribute('href')  # fetch once, reuse below
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       link=link,
                                       company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape ABB's Swedish job list after applying the site's job-type filter.

    Clicks the fifth dropdown option (data-option-array-index='4') and then
    collects every row of the filtered table.
    """
    browser = browserobject.start_browser(
        "http://new.abb.com/se/jobba-hos-oss/lediga-tjanster", test)
    COMPANY = 'ABB'
    time.sleep(1)  # page load; an explicit wait would be more robust
    # The filter element must be scrolled into view before it can be clicked.
    find_button = browser.find_element_by_class_name("""findButton""")
    filter_selection = browser.find_element_by_xpath(
        """//*[@id="Content_C001_Col00"]/div/div[2]/div[3]/span[4]/div/a/span""")
    browser.execute_script("return arguments[0].scrollIntoView();", find_button)
    filter_selection.click()
    browser.find_element_by_xpath("""//*[@data-option-array-index='4']""").click()
    time.sleep(0.5)  # time for the filtered content to load
    job_list = browser.find_elements_by_xpath("""//*[@id="jobOffers"]/tbody/tr""")
    list_of_thesis = []
    for row in job_list:
        title = row.find_element_by_tag_name('a')
        # Cells: [?, location, department, job type, ?]; only location is used.
        location = row.find_elements_by_tag_name('td')[1]
        list_of_thesis.append(dict(title=title.text,
                                   location=location.text,
                                   link=title.get_attribute('href'),
                                   company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Collect thesis ('examen') rows from Scania's job search table."""
    browser = browserobject.start_browser(
        "http://jobsearch.scania.com/segerjoblist/search.aspx", test)
    COMPANY = "Scania"
    result_table = browser.find_element_by_id('dgSearchResult')
    found = []
    for row in result_table.find_elements_by_tag_name('tr')[1:]:  # skip header
        if 'examen' not in row.text.lower():
            continue
        anchor = row.find_element_by_tag_name('a')
        posted = row.find_element_by_xpath('./td[1]')
        place = row.find_element_by_xpath('./td[5]')
        deadline = row.find_element_by_xpath('./td[6]')
        found.append(dict(title=anchor.text,
                          location=place.text,
                          link=anchor.get_attribute('href'),
                          company=COMPANY))
    if not found and test:
        return []
    browser.quit()
    return found
def crawl(test=False):
    """Collect thesis ('examen') items from SAAB's Swedish vacancy list."""
    browser = browserobject.start_browser(
        "http://saabgroup.com/sv/career/job-opportunities/?&c=Sweden", test)
    COMPANY = "SAAB"
    found = []
    vacancies = browser.find_element_by_class_name('vacancies')
    for item in vacancies.find_elements_by_tag_name('li'):
        if 'examen' not in item.text.lower():
            continue
        heading = item.find_element_by_class_name('title')
        place = item.find_element_by_class_name('location')
        posted = item.find_element_by_class_name('date')
        anchor = item.find_element_by_tag_name('a')
        found.append(dict(title=heading.text,
                          location=place.text,
                          link=anchor.get_attribute('href'),
                          company=COMPANY))
    if not found and test:
        return []
    browser.quit()
    return found
def crawl(test=False):
    """Scrape Bombardier's 'final thesis' search and keep Swedish ('SE') rows.

    Returns a list of dicts with title/location/link/company keys, or [].
    """
    browser = browserobject.start_browser(
        "https://jobs.bombardier.com/key/final-thesis-bombardier-jobs.html", test)
    COMPANY = "Bombardier"
    table = browser.find_element_by_id('searchresults')
    list_of_thesis = []
    for row in table.find_elements_by_tag_name('tr')[2:]:  # [2:] skips header rows
        if 'thesis' in row.text.lower():
            location = row.find_element_by_xpath('./td[2]')
            if 'SE' not in location.text:  # keep only Swedish postings
                continue
            title = row.find_element_by_tag_name('a')
            link = title.get_attribute('href')  # fetch once, reuse below
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       link=link,
                                       company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape Volvo Cars' job search for thesis positions.

    Returns a list of dicts with title/location/link/company keys, or [].
    """
    browser = browserobject.start_browser(
        "http://www.volvocars.com/intl/about/our-company/careers/job-search", test)
    COMPANY = 'Volvo Cars'
    cards = browser.find_elements_by_xpath("""//*[@id="volvo"]/div[3]/div/div""")
    list_of_thesis = []
    # First card holds the column labels; the last is the "we are sorry..." row.
    for card in cards[1:-1]:
        position = card.find_element_by_tag_name('dt')
        if 'thesis' not in position.text.lower():
            continue
        # The two <dd> cells are location and last-application date (date unused).
        location, _app_date = card.find_elements_by_tag_name('dd')
        link_description = card.find_element_by_css_selector(
            'a').get_attribute('href')
        list_of_thesis.append(dict(title=position.text,
                                   location=location.text,
                                   link=link_description,
                                   company=COMPANY))
        # (A commented-out block that opened each link and extracted the
        # readability summary was removed here; see VCS history if needed.)
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape Sweco's job list for thesis ('examen'/'exjobb') openings.

    Clicks the "show more" link until it stops working so the full list is
    loaded before filtering the table rows.
    """
    browser = browserobject.start_browser(
        "http://www.sweco.se/karriar/lediga-jobb/", test)
    COMPANY = "Sweco"
    list_of_thesis = []
    more_jobs_button = browser.find_element_by_class_name('jobsearchresult__link')
    try:
        while True:
            more_jobs_button.click()
            time.sleep(0.2)  # delay to wait for the next chunk to load
    except WebDriverException:
        pass  # button no longer clickable: all jobs are loaded
    table = browser.find_element_by_class_name('jobsearchresult__table')
    for row in table.find_elements_by_tag_name('tr')[1:]:  # [1:] skips the header row
        if any(word in row.text.lower() for word in ('examen', 'exjobb')):
            title = row.find_element_by_tag_name('a')
            location = row.find_element_by_xpath('./td[3]')
            link = title.get_attribute('href')  # fetch once, reuse below
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       link=link,
                                       company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape SKF's job search (pre-filtered on 'thesis' via the URL).

    Returns a list of dicts, [] when nothing matched, or None when the site
    shows an error alert instead of results.
    """
    browser = browserobject.start_browser(
        "https://skf.tms.hrdepartment.com/cgi-bin/a/searchjobs_quick.cgi?kand=thesis&country=&qty=25&sj=1&order=jobs.timedate+DESC&search=Search+Jobs",
        test)
    COMPANY = "SKF"
    list_of_thesis = []
    # An .alert element means the search produced an error page, not results.
    try:
        alert = browser.find_element_by_class_name('alert')
        print(alert.text)
        # BUG FIX: an unreachable sys.exit() after this return was removed.
        # NOTE(review): this path returns None (not []) and leaves the browser
        # open, unlike the sibling crawlers -- confirm callers handle it.
        return None
    except NoSuchElementException:
        pass  # no alert: results are present
    table = browser.find_element_by_tag_name('tbody')
    for row in table.find_elements_by_tag_name('tr'):
        if 'thesis' in row.text.lower():
            title = row.find_element_by_xpath('./td[1]')
            location = row.find_element_by_xpath('./td[2]')
            link = title.find_element_by_tag_name('a').get_attribute('href')
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       link=link,
                                       company=COMPANY))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Collect every row of ABB's job table after applying the site filter."""
    browser = browserobject.start_browser(
        "http://new.abb.com/se/jobba-hos-oss/lediga-tjanster", test)
    COMPANY = 'ABB'
    time.sleep(1)  # crude wait for the page to load
    # Scroll the search button into view so the nearby filter is clickable.
    search_btn = browser.find_element_by_class_name("""findButton""")
    dropdown = browser.find_element_by_xpath(
        """//*[@id="Content_C001_Col00"]/div/div[2]/div[3]/span[4]/div/a/span""")
    browser.execute_script("return arguments[0].scrollIntoView();", search_btn)
    dropdown.click()
    browser.find_element_by_xpath("""//*[@data-option-array-index='4']""").click()
    time.sleep(0.5)  # let the filtered table render
    found = []
    for row in browser.find_elements_by_xpath("""//*[@id="jobOffers"]/tbody/tr"""):
        anchor = row.find_element_by_tag_name('a')
        # Row must have exactly five cells; only the location cell is used.
        _, place, dept, kind, _ = row.find_elements_by_tag_name('td')
        found.append(dict(title=anchor.text,
                          location=place.text,
                          link=anchor.get_attribute('href'),
                          company=COMPANY))
    if not found and test:
        return []
    browser.quit()
    return found
def crawl(test=False):
    """Scrape Ericsson's Sweden job search; follows at most one extra page.

    The result header shows "1-25 of N"; when N > 25 the last-page link is
    clicked once and the second batch of rows is scanned as well.
    """
    browser = browserobject.start_browser(
        "https://jobs.ericsson.com/search/?q=&locationsearch=sweden", test)
    COMPANY = "Ericsson"
    # BUG FIX: second_page was previously unbound when <= 25 results,
    # raising NameError at the first `if second_page` check below.
    second_page = False
    if int(browser.find_element_by_xpath(
            """//*[@id="content"]/div[3]/div/div/span/span[1]/b[2]""").text) > 25:
        second_page = True
    list_of_thesis = []
    table_rows = browser.find_elements_by_css_selector('tr.data-row.clickable')
    for _ in range(2):  # visit at most two result pages
        for row in table_rows:
            if 'thesis' in row.text.lower():
                title = row.find_element_by_tag_name('a')
                location = row.find_element_by_xpath('./td[2]')
                link = title.get_attribute('href')
                list_of_thesis.append(dict(title=title.text,
                                           location=location.text,
                                           link=link,
                                           company=COMPANY))
        if second_page:
            browser.find_element_by_class_name('paginationItemLast').click()
            table_rows = browser.find_elements_by_css_selector('tr.data-row.clickable')
        else:
            break
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Collect every thesis row from Billerud's exjobb listing page."""
    browser = browserobject.start_browser(
        "http://www.billerudkorsnas.com/sv/Karriar/Lediga-jobb/Exjobb2/", test)
    COMPANY = 'Billerud'
    collected = []
    for entry in browser.find_elements_by_xpath(
            '''//*[@id="primarycontent"]/article/table/tbody/tr'''):
        heading = entry.find_element_by_class_name('listTitle')
        place = entry.find_element_by_xpath('''.//td[2]''')
        anchor = entry.find_element_by_tag_name('a')
        collected.append(dict(title=heading.text,
                              location=place.text,
                              link=anchor.get_attribute('href'),
                              company=COMPANY))
    if collected:
        browser.quit()
        return collected
    if not test:
        browser.quit()
    return []
def crawl(test=False):
    """Scrape the Volvo Group job search (Brassring) for thesis openings.

    Each result's link is the stable search URL (the per-row href is
    deliberately not stored, matching the original's commented-out output).
    """
    url = ('https://xjobs.brassring.com/TGWebHost/searchresults.aspx'
           '?partnerid=25079&siteid=5171&Codes=Volvo&AgentID=9780452&Function=runquery')
    browser = browserobject.start_browser(url, test)
    COMPANY = "Volvo Group"
    list_of_thesis = []
    time.sleep(0.3)  # give the results table a moment to render
    table = browser.find_element_by_id('idSearchresults')
    for row in table.find_elements_by_tag_name('tr'):
        # (A stray no-op `row.text` statement was removed here.)
        if 'thesis' in row.text.lower():
            title = row.find_element_by_tag_name('a')
            location = row.find_element_by_xpath('./td[5]')
            # NOTE(review): skipping locations containing 'sweden' looks
            # inverted for a Swedish thesis aggregator -- confirm intent.
            if 'sweden' in location.text.lower():
                continue
            list_of_thesis.append(dict(title=title.text,
                                       location=location.text,
                                       company=COMPANY,
                                       link=url))
    if list_of_thesis:
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Scrape ÅF's job list for thesis ('exjobb') openings.

    Opens the category dropdown and clicks the job-filter option; when that
    option is missing there are no exjobb listings and [] is returned.
    """
    browser = browserobject.start_browser(
        "http://www.afconsult.com/sv/jobba-hos-oss/lediga-jobb/", test)
    COMPANY = 'ÅF'
    list_of_thesis = []
    time.sleep(1)  # crude wait for the page to load
    browser.find_element_by_id("""dk1-combobox""").click()  # open category filter
    try:
        browser.find_element_by_id("dk1--2024534255__jobFilters").click()
        content = browser.find_element_by_id("contentItemListing_6373")
        for item in content.find_elements_by_css_selector('.block.col.regular-12'):
            title = item.find_element_by_css_selector(".col.regular-6")
            if "exjobb" in title.text.lower():
                link = item.find_element_by_tag_name("a")
                # Two .regular-3 cells: location and application date (date unused).
                location, _app_date = item.find_elements_by_css_selector(
                    ".col.regular-3")
                list_of_thesis.append(dict(title=title.text,
                                           location=location.text,
                                           link=link.get_attribute('href'),
                                           company=COMPANY))
    except NoSuchElementException:
        pass  # filter option absent: nothing available right now
    if list_of_thesis:
        # BUG FIX: browser.quit() was placed after the return and never ran,
        # leaking the browser process on every successful crawl.
        browser.quit()
        return list_of_thesis
    else:
        if not test:  # test mode leaves the browser open (presumably for inspection)
            browser.quit()
        return []
def crawl(test=False):
    """Walk Boliden's listing items and rebuild detail links from the job id."""
    browser = browserobject.start_browser(
        "https://boliden.csod.com/ats/careersite/search.aspx?site=5&c=boliden", test)
    baseurl = "https://boliden.csod.com/ats/careersite/JobDetails.aspx?id="
    COMPANY = 'Boliden'
    collected = []
    for node in browser.find_elements_by_xpath('''//ul/li'''):
        try:
            anchor = node.find_element_by_tag_name('a')
            field_text = node.find_element_by_class_name("FieldValue").text
        except:  # noqa: E722 -- kept from original: skip non-job list items
            continue
        # Field text has the form "(department | location)".
        dept, place = re.findall(r'\((.*?)\|(.*?)\)', field_text)[0]
        job_id = re.findall(r'.*?id=([^"]*)', anchor.get_attribute('href'))
        collected.append(dict(title=anchor.text,
                              location=place,
                              link=baseurl + job_id[0],
                              company=COMPANY))
    if collected:
        browser.quit()
        return collected
    if not test:
        browser.quit()
    return []
def crawl(test=False):
    """Collect every examensarbete row from Sandvik's table (read via <thead>)."""
    browser = browserobject.start_browser(
        "http://www.home.sandvik/se/karriar/student/examensarbete/examensarbeten/",
        test)
    COMPANY = "Sandvik"
    collected = []
    header_block = browser.find_element_by_tag_name('thead')
    for row in header_block.find_elements_by_tag_name('tr')[1:]:  # skip labels
        first_cell = row.find_element_by_xpath('./th[1]')
        topic = row.find_element_by_xpath('./th[2]')
        place = row.find_element_by_xpath('./th[3]')
        deadline = row.find_element_by_xpath('./th[4]')
        anchor = row.find_element_by_tag_name('a')
        # The topic column (th[2]) is used as the title, as before.
        collected.append(dict(title=topic.text,
                              location=place.text,
                              link=anchor.get_attribute('href'),
                              company=COMPANY))
    if not collected and test:
        return []
    browser.quit()
    return collected
def crawl(test=False):
    """Collect Swedish ('SE') thesis rows from Bombardier's search results."""
    browser = browserobject.start_browser(
        "https://jobs.bombardier.com/key/final-thesis-bombardier-jobs.html", test)
    COMPANY = "Bombardier"
    results = browser.find_element_by_id('searchresults')
    collected = []
    for row in results.find_elements_by_tag_name('tr')[2:]:  # skip header rows
        if 'thesis' not in row.text.lower():
            continue
        place = row.find_element_by_xpath('./td[2]')
        if 'SE' not in place.text:  # only Swedish postings
            continue
        anchor = row.find_element_by_tag_name('a')
        posted = row.find_element_by_xpath('./td[3]')
        collected.append(dict(title=anchor.text,
                              location=place.text,
                              link=anchor.get_attribute('href'),
                              company=COMPANY))
    if not collected and test:
        return []
    browser.quit()
    return collected
# -*- coding: utf-8 -*-
# Standalone script: print Volvo Cars thesis positions found on their job search.
import browserobject
import sys
import os
import time

sys.path.insert(1, os.path.join(sys.path[0], '..'))
from json_append import *
from readability.readability import Document

browser = browserobject.start_browser(
    "http://www.volvocars.com/intl/about/our-company/careers/job-search")
job_list = browser.find_elements_by_xpath("""//*[@id="volvo"]/div[3]/div/div""")
list_of_thesis = []
# Index 0 holds the column labels; the final entry is the "we are sorry..." row.
for idx in range(1, len(job_list) - 1):
    card = job_list[idx]
    position = card.find_element_by_tag_name('dt')
    if 'thesis' in position.text.lower():
        location, app_date = card.find_elements_by_tag_name('dd')
        link_description = card.find_element_by_css_selector(
            'a').get_attribute('href')
        print(' Position: {} \n Location: {} \n Last Application Date: {}'.format(
            position.text, location.text, app_date.text))
# -*- coding: utf-8 -*-
# Standalone script: print Volvo Cars thesis positions from their job search.
import browserobject
import sys
import os
import time

sys.path.insert(1, os.path.join(sys.path[0], '..'))
from json_append import *
from readability.readability import Document

browser = browserobject.start_browser(
    "http://www.volvocars.com/intl/about/our-company/careers/job-search")
cards = browser.find_elements_by_xpath("""//*[@id="volvo"]/div[3]/div/div""")
list_of_thesis = []
# First card is the label row; last is the "we are sorry..." footer.
for i in range(1, len(cards) - 1):
    position = cards[i].find_element_by_tag_name('dt')
    if 'thesis' in position.text.lower():
        location, app_date = cards[i].find_elements_by_tag_name('dd')
        link_description = cards[i].find_element_by_css_selector(
            'a').get_attribute('href')
        print(' Position: {} \n Location: {} \n Last Application Date: {}'.format(
            position.text, location.text, app_date.text))
        #### list_of_thesis.append(dict(title=position.text, location=location.text, link=link_description)) ####
# Standalone script: print and collect ABB's filtered Swedish job listings.
import browserobject
import time  # BUG FIX: time.sleep() below was used without importing time

browser = browserobject.start_browser(
    "http://new.abb.com/se/jobba-hos-oss/lediga-tjanster")

# The filter element must be scrolled into view before it can be clicked.
find_button = browser.find_element_by_class_name("""findButton""")
filter_selection = browser.find_element_by_xpath(
    """//*[@id="Content_C001_Col00"]/div/div[2]/div[3]/span[4]/div/a/span""")
browser.execute_script("return arguments[0].scrollIntoView();", find_button)
filter_selection.click()
browser.find_element_by_xpath("""//*[@data-option-array-index='4']""").click()
time.sleep(0.5)  # time for the filtered content to load

job_list = browser.find_elements_by_xpath("""//*[@id="jobOffers"]/tbody/tr""")
list_of_thesis = []
for row in job_list:
    title = row.find_element_by_tag_name('a')
    # Cells: [?, location, department, job type, ?].
    _, location, department, job_type, _ = row.find_elements_by_tag_name('td')
    print('Title: {}\nLocation: {}\nDepartment: {}\nLink: {}\n\n'.format(
        title.text, location.text, department.text, title.get_attribute('href')))
    list_of_thesis.append(dict(title=title.text,
                               location=location.text,
                               link=title.get_attribute('href')))

if list_of_thesis:
    #json_append.update_json(list_of_thesis)
    pass