import json

import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(15, "Raytheon", "https://www.raytheon.com/", "None")

    # Getting the total number of jobs from the base link
    page_link = 'https://jobs.raytheon.com/search-jobs/United%20States?orgIds=4679&alp=6252001&alt=2'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    totalJobs = page_content.find("h1", {"role": "status"}).text
    totalJobs = totalJobs[:4]  # The heading begins with the four-digit job count

    # Editing the results link to request every job at once
    page_link = 'https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&CurrentPage=1&RecordsPerPage=' + totalJobs + '&Distance=50&RadiusUnitType=0&Keywords=&Location=United+States&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&FacetFilters%5B0%5D.ID=6252001&FacetFilters%5B0%5D.FacetType=2&FacetFilters%5B0%5D.Count=3478&FacetFilters%5B0%5D.Display=United+States&FacetFilters%5B0%5D.IsApplied=true&FacetFilters%5B0%5D.FieldName=&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=1&SearchType=6&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=4679&PostalCode=&fc=&fl=&fcf=&afc=&afl=&afcf='
    page_response = requests.get(page_link, timeout=timeout)
    page_json = json.loads(page_response.content)
    page_content = BeautifulSoup(page_json["results"], "html.parser")
    jobsContainer = page_content.findAll("li")

    # Removing the country list item
    jobsContainer = jobsContainer[1:]

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in jobsContainer:
        title = job.find("h2").text
        location = job.find("span", attrs={"class": "job-location"}).text
        link = 'https://jobs.raytheon.com' + job.a["href"]
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        desc = page_content.find("div", attrs={"class": "ats-description"}).text
        title = str(titles[i])
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
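# NOTE: the two request URLs above differ only in their RecordsPerPage value.
# A hypothetical standard-library helper (not part of this project) could
# rewrite that single query parameter instead of splicing the long query
# string by hand:
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit


def with_records_per_page(url, count):
    """Return `url` with its RecordsPerPage query parameter set to `count`."""
    parts = urlsplit(url)
    query = dict(parse_qsl(parts.query, keep_blank_values=True))
    query["RecordsPerPage"] = str(count)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(query), parts.fragment))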
import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # 1. Initialise the Company object with all the information needed to attach to a job page.
    # 1.5. Go into WordPress and create an account for this company; the company ID is the
    #      most important field.
    company = JC.Company(0, "Company Name", "www.companyurl.com", "Company Email")
    print("Scraping %s..." % company.name)

    # 2. Add the URL of the company's careers page
    company_careers_url = 'https://www.companyname.com/careers'
    page_response = requests.get(company_careers_url, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    # 3. Identify the parts of the webpage that hold all the job information.
    #    If the page is plain HTML, it will probably be some sort of list of <div>s with an
    #    identifying class, which you can find with findAll.
    #    If the website uses JavaScript, the process will be more involved (see the Selenium
    #    scrapers).
    jobPostingsOnWebpage = page_content.findAll('div', attrs={"class": "posting"})

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in jobPostingsOnWebpage:
        # 4. From each HTML element, separate the link to the job, the job title,
        #    and the location (if it is there)
        link = job.a["href"]
        title = job.h5.text
        location = job.find('span', attrs={"class": "sort-by-location"}).text
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        title = str(titles[i])
        # 5. Identify the job description and isolate it. Keep the HTML formatting
        #    (we use it to keep the job pretty on our site)
        desc = str(page_content.find('div', {"class": "content"}))
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
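# For step 1, the Company object only needs to carry the four values every
# scraper passes in. The real definition lives in
# spacejobscrape/helperscripts/JobClasses.py; this is just a sketch of the
# shape the scrapers rely on (the attribute names here are assumptions):
class Company:
    def __init__(self, id, name, url, email):
        self.id = id        # WordPress company ID (see step 1.5)
        self.name = name    # display name, read back for log messages
        self.url = url      # company homepage
        self.email = email  # contact email, or "None"/"N/A" when unknown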
import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(10, "OneWeb", "https://www.oneweb.world/", "*****@*****.**")

    page_link = 'https://boards.greenhouse.io/embed/job_board?for=oneweb&b=https%3A%2F%2Fwww.oneweb.world%2Fcareers'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    jobContainer = page_content.findAll("div", attrs={"class": "opening"})

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in jobContainer:
        title = job.a.text.strip()
        location = job.span.text.strip()
        tempLink = job.a["href"]
        # The Greenhouse job token starts at character 50 of the board URL
        ID = tempLink[50:]
        link = ("https://boards.greenhouse.io/embed/job_app?for=oneweb&token=" + ID
                + "&b=https%3A%2F%2Foneweb.world%2Fcareers-opportunities%2F")
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        title = str(titles[i])
        # Join every paragraph into one description so that descriptions stays
        # aligned one-to-one with titles and locations
        descContainer = page_content.findAll("p")
        desc = "\n".join(par.text for par in descContainer)
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(9, "Bigelow", "https://bigelowaerospace.com/", "*****@*****.**")

    page_link = "https://bigelowaerospace.com/pages/job-opportunities/"
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    alljobswebpage = page_content.findAll("p")

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in alljobswebpage:
        try:
            title = job.a.u.text
            link = job.a["href"]
            location = "North Las Vegas, NV, USA"
            titles.append(title)
            locations.append(location)
            links.append(link)
        except AttributeError:
            # The job links come first on the page; stop at the first
            # paragraph that is not an underlined job link
            break

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        title = str(titles[i])
        # The second <ul> on the page holds the job description
        descContainer = page_content.findAll("ul")
        desc = descContainer[1].text
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(2, "Astranis", "www.astranis.com", "*****@*****.**")

    page_link = 'https://jobs.lever.co/astranis'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    alljobswebpage = page_content.findAll('div', attrs={"class": "posting"})

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in alljobswebpage:
        link = job.a["href"]
        title = job.h5.text
        location = job.find('span', attrs={"class": "sort-by-location"}).text
        # TODO: Check whether Lever adds a country code, especially if Astranis
        # expands beyond America
        location = location + ', USA'
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        title = str(titles[i])
        desc = str(page_content.find('div', {"class": "content"}))
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    # Matches the createjoblist(titles, locations, descriptions, company)
    # signature used by every other scraper
    createjoblist(titles, locations, descriptions, company)
    return True
import json

import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(17, "Odyssey Space Research", "https://www.odysseysr.com/", "None")

    page_link = 'https://www.odysseysr.com/jm-ajax/get_listings/'
    page_response = requests.get(page_link, timeout=timeout)
    json_content = json.loads(page_response.content)
    page_content = BeautifulSoup(json_content["html"], "html.parser")
    jobContainer = page_content.findAll("a")

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in jobContainer:
        title = job.find("h3").text
        location = job.find("div", attrs={"class": "location"}).text.strip()
        link = job["href"]
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        desc = page_content.find("div", attrs={"class": "job_description"}).text
        title = str(titles[i])
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(6, "Rocket Lab", "www.rocketlabusa.com", "None")

    page_link = 'https://www.rocketlabusa.com/careers/positions/'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    jobsContainer = page_content.findAll("a", attrs={"class": "job"})

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in jobsContainer:
        title = job.h3.text
        location = job.h5.text
        link = "https://www.rocketlabusa.com" + job["href"]
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        # attrs must be a dict, not a set, for BeautifulSoup to match the class
        desc = page_content.find("div", attrs={"class": "job__info-subtitle"}).text
        descriptions.append(desc)
        title = str(titles[i])
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
'''
Created on Tuesday May 23 12:47 PM
Author: JJ Fiedler
'''
import json
import os
import shlex
import subprocess

from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist

company = JC.Company(4, "Blue Origin", "https://www.blueorigin.com/", "*****@*****.**")

# Running the external Workday scrape script
args = shlex.split(
    "python3 WorkdayScrape.py -u 'https://blueorigin.wd5.myworkdayjobs.com/BlueOrigin' -d './BlueOriginJSON'"
)
print("Running external workday scrape script...")
subprocess.run(args)

# Base path for the folder containing the JSON files
basepath = "/Users/JJ/Documents/ProgrammingStuff/PythonFiles/JobSiteWebscrape/BlueOriginJSON/"

# Creates lists of titles, locations and links to the application website
titles = []
locations = []
links = []
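# The rest of this file is not shown above. As a rough sketch only (the JSON
# schema that WorkdayScrape.py writes is not shown here, so the per-job field
# access is left as a placeholder), the follow-on step would presumably walk
# the dumped files in basepath:
for filename in sorted(os.listdir(basepath)):
    if not filename.endswith(".json"):
        continue
    with open(os.path.join(basepath, filename)) as f:
        job_data = json.load(f)
    # ...extract the title / location / link fields from job_data here...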
from bs4 import BeautifulSoup
from selenium import webdriver

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(16, "OrbitalInsight", "https://orbitalinsight.com/", "*****@*****.**")

    # Uses webdriver and chromedriver to get the HTML rendered by JavaScript
    chromedriver = "/Users/JJ/Documents/ProgrammingStuff/chromedriver"
    driver = webdriver.Chrome(chromedriver)
    driver.get("https://orbitalinsight.com/company/careers/#positions")
    html = driver.execute_script("return document.documentElement.outerHTML")
    page_content = BeautifulSoup(html, "html.parser")
    driver.close()

    # No overarching div contains all the jobs, so each field is collected in its own list
    titleContainer = page_content.find_all("h3", attrs={"class": "career__title"})
    locationContainer = page_content.findAll("div", attrs={"class": "career__meta--location"})
    linkContainer = page_content.findAll("a", attrs={"class": "btn"})

    # Removing the Join Us link and the Email link
    numJobs = len(titleContainer)
    linkContainer = linkContainer[1:numJobs + 1]

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for i in range(len(titleContainer)):
        title = titleContainer[i].text
        location = locationContainer[i].text
        link = linkContainer[i]["href"]
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        driver = webdriver.Chrome(chromedriver)
        driver.get(page_link)
        html = driver.execute_script("return document.documentElement.outerHTML")
        page_content = BeautifulSoup(html, "html.parser")
        driver.close()
        desc = page_content.find("div", attrs={"class": "columns small-12 medium-8"}).text
        title = str(titles[i])
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
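# Sketch (not the project's current setup): running Chrome headless avoids
# popping up a visible browser window for every scraped page. This assumes
# the Selenium 3-era API used above, where the chromedriver path is passed
# positionally.
def make_headless_driver(chromedriver_path):
    from selenium.webdriver.chrome.options import Options
    options = Options()
    options.add_argument("--headless")
    return webdriver.Chrome(chromedriver_path, options=options)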
import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(7, "Aerospace Corporation", "https://aerospace.org/", "None")

    page_link = 'https://careers.aerospace.org/go/View-All-Jobs/2443100/?q=&sortColumn=referencedate&sortDirection=desc'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    # Getting the base pagination links
    paginationLinks = page_content.find("ul", attrs={"class": "pagination"})
    jobLinkContainer = paginationLinks.findAll("li")

    # List to store all page links
    mainLinkContainer = []
    for link in jobLinkContainer:
        jobsLink = "https://careers.aerospace.org" + link.a["href"]
        # Editing links to obtain links for all pages on the website
        if jobsLink == "https://careers.aerospace.org/go/View-All-Jobs/2443100/100/?q=&sortColumn=referencedate&sortDirection=desc":
            mainLinkContainer.append(jobsLink)
            # Calculating the number of pages left
            numPagesLeft = int(((250 - 100) / 25) - 1)
            # The offset in the URL increases by 25, starting at 125
            paginationAmount = 125
            # Editing the links
            for i in range(numPagesLeft):
                jobsLink = "https://careers.aerospace.org/go/View-All-Jobs/2443100/%s/?q=&sortColumn=referencedate&sortDirection=desc" % (str(paginationAmount))
                paginationAmount += 25
                mainLinkContainer.append(jobsLink)
        else:
            mainLinkContainer.append(jobsLink)

    # Removing duplicate links
    mainLinkContainer = list(dict.fromkeys(mainLinkContainer))

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []

    # Start of the main web scrape
    for link in mainLinkContainer:
        page_response = requests.get(link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        titleContainer = page_content.findAll("a", attrs={"class": "jobTitle-link"})
        locationContainer = page_content.select("span.jobLocation.visible-phone")
        # Removing duplicates from the list
        titleContainer = list(dict.fromkeys(titleContainer))
        for item in titleContainer:
            title = item.text
            link = "https://careers.aerospace.org" + item["href"]
            titles.append(title)
            links.append(link)
        for loc in locationContainer:
            location = (loc.span.text).strip()
            locations.append(location)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        # Join the description spans so descriptions stays aligned with titles
        descContainer = page_content.findAll("span", attrs={"class": "jobdescription"})
        desc = "\n".join(items.text for items in descContainer)
        descriptions.append(desc)
        title = str(titles[i])
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
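# The page math above hard-codes a 250-job total and a 25-per-page step, so
# the scrape silently truncates once the site grows. A sketch of an
# open-ended alternative, assuming the numeric path segment is a result
# offset and an out-of-range page simply returns no jobTitle-link anchors:
def iter_result_pages(timeout, start=100, step=25):
    template = ("https://careers.aerospace.org/go/View-All-Jobs/2443100/%s/"
                "?q=&sortColumn=referencedate&sortDirection=desc")
    offset = start
    while True:
        response = requests.get(template % offset, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        if not soup.findAll("a", attrs={"class": "jobTitle-link"}):
            break
        yield soup
        offset += step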
'''
Date Created: August 24, 2019 9:51 pm
Author: JJ Fiedler
'''
import json

import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist

# Sets the company for the script. Change for each company
company = JC.Company(18, "Boeing", "https://www.boeing.com/", "None")

page_link = 'https://jobs.boeing.com/search-jobs/results?ActiveFacetID=6252001&CurrentPage=1&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&FacetFilters%5B0%5D.ID=6252001&FacetFilters%5B0%5D.FacetType=2&FacetFilters%5B0%5D.Count=914&FacetFilters%5B0%5D.Display=United+States&FacetFilters%5B0%5D.IsApplied=true&FacetFilters%5B0%5D.FieldName=&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&PostalCode=&fc=&fl=&fcf=&afc=&afl=&afcf='
page_response = requests.get(page_link)
json_content = json.loads(page_response.content)
page_content = BeautifulSoup(json_content["results"], "html.parser")

# Getting the total number of jobs from the initial page link
totalJobs = page_content.find("h2").text
totalJobs = totalJobs[0:5].strip()  # The heading begins with the job count

# Editing the page link to reflect the total number of jobs and grabbing the new HTML from the JSON
page_link = 'https://jobs.boeing.com/search-jobs/results?ActiveFacetID=6252001&CurrentPage=1&RecordsPerPage=' + totalJobs + '&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&FacetFilters%5B0%5D.ID=6252001&FacetFilters%5B0%5D.FacetType=2&FacetFilters%5B0%5D.Count=914&FacetFilters%5B0%5D.Display=United+States&FacetFilters%5B0%5D.IsApplied=true&FacetFilters%5B0%5D.FieldName=&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&PostalCode=&fc=&fl=&fcf=&afc=&afl=&afcf='
page_response = requests.get(page_link)
json_content = json.loads(page_response.content)
page_content = BeautifulSoup(json_content["results"], "html.parser")
# attrs must be a dict, not a set, for BeautifulSoup to match the class
jobList = page_content.find("ul", attrs={"class": "sr-main-jobs js-watch js-watch-once"})
jobContainer = jobList.findAll("li")
import json

import requests
from bs4 import BeautifulSoup

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(12, "Lockheed Martin", "https://www.lockheedmartin.com/en-us/index.html", "*****@*****.**")

    # Getting the page to find the total number of jobs
    page_link = 'https://www.lockheedmartinjobs.com/search-jobs/results?ActiveFacetID=Space&CurrentPage=1&RecordsPerPage=1000&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&FacetFilters%5B0%5D.ID=4566966&FacetFilters%5B0%5D.FacetType=2&FacetFilters%5B0%5D.Count=13&FacetFilters%5B0%5D.Display=Puerto+Rico&FacetFilters%5B0%5D.IsApplied=true&FacetFilters%5B0%5D.FieldName=&FacetFilters%5B1%5D.ID=6252001&FacetFilters%5B1%5D.FacetType=2&FacetFilters%5B1%5D.Count=4897&FacetFilters%5B1%5D.Display=United+States&FacetFilters%5B1%5D.IsApplied=true&FacetFilters%5B1%5D.FieldName=&FacetFilters%5B2%5D.ID=Space&FacetFilters%5B2%5D.FacetType=5&FacetFilters%5B2%5D.Count=1082&FacetFilters%5B2%5D.Display=Space&FacetFilters%5B2%5D.IsApplied=true&FacetFilters%5B2%5D.FieldName=job_level&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=0&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&PostalCode=&fc=&fl=6252001%2C4566966&fcf=&afc=&afl=&afcf='
    page_response = requests.get(page_link, timeout=timeout)
    page_json = json.loads(page_response.content)
    page_content = BeautifulSoup(page_json["results"], "html.parser")

    # Getting the total number of jobs
    totalJobsStr = page_content.find("p").text
    totalJobs = ''
    for i in totalJobsStr:
        if i.isdigit():
            totalJobs = totalJobs + str(i)
    # Removing the number of jobs per page from the string
    totalJobs = totalJobs[2:]

    # Editing page_link and fetching again to reflect the total number of jobs
    page_link = 'https://www.lockheedmartinjobs.com/search-jobs/results?ActiveFacetID=Space&CurrentPage=1&RecordsPerPage=' + totalJobs + '&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&FacetFilters%5B0%5D.ID=4566966&FacetFilters%5B0%5D.FacetType=2&FacetFilters%5B0%5D.Count=13&FacetFilters%5B0%5D.Display=Puerto+Rico&FacetFilters%5B0%5D.IsApplied=true&FacetFilters%5B0%5D.FieldName=&FacetFilters%5B1%5D.ID=6252001&FacetFilters%5B1%5D.FacetType=2&FacetFilters%5B1%5D.Count=4897&FacetFilters%5B1%5D.Display=United+States&FacetFilters%5B1%5D.IsApplied=true&FacetFilters%5B1%5D.FieldName=&FacetFilters%5B2%5D.ID=Space&FacetFilters%5B2%5D.FacetType=5&FacetFilters%5B2%5D.Count=1082&FacetFilters%5B2%5D.Display=Space&FacetFilters%5B2%5D.IsApplied=true&FacetFilters%5B2%5D.FieldName=job_level&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=0&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&PostalCode=&fc=&fl=6252001%2C4566966&fcf=&afc=&afl=&afcf='
    page_response = requests.get(page_link, timeout=timeout)
    page_json = json.loads(page_response.content)
    page_content = BeautifulSoup(page_json["results"], "html.parser")

    titleContainer = page_content.findAll('span', attrs={'class': 'job-title'})
    locationContainer = page_content.findAll('span', attrs={'class': 'job-location'})
    linkContainer = page_content.findAll('a')
    # Removing unnecessary <a> tags
    linkContainer = linkContainer[2:]

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for i in range(len(titleContainer)):
        title = titleContainer[i].text
        location = locationContainer[i].text
        link = 'https://www.lockheedmartinjobs.com' + linkContainer[i]["href"]
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        title = str(titles[i])
        desc = page_content.find('div', attrs={'class': 'ats-description'}).text
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
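# Concatenating every digit in the sentence and slicing off the first two
# assumes the per-page count always comes first and is always two digits. A
# sketch of a less brittle extraction, assuming the total is the last number
# mentioned in that paragraph:
import re


def extract_total_jobs(text):
    numbers = re.findall(r"\d[\d,]*", text)
    return numbers[-1].replace(",", "") if numbers else "0"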
import json
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(8, "Astrobotic", "https://www.astrobotic.com/", "*****@*****.**")

    # Uses webdriver and chromedriver to get the HTML rendered by JavaScript
    chromedriver = "/Users/JJ/Documents/ProgrammingStuff/chromedriver"
    driver = webdriver.Chrome(chromedriver)
    driver.get("https://www.astrobotic.com/careers")
    html = driver.execute_script("return document.documentElement.outerHTML")
    page_content = BeautifulSoup(html, "html.parser")
    jobContainer = page_content.findAll("div", attrs={"class": "rbox-opening-li"})
    driver.close()

    # Creates lists of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in jobContainer:
        title = job.find("a", attrs={"class": "rbox-opening-li-title"}).text
        location = job.find("div", attrs={"class": "rbox-job-shortdesc"}).text
        link = job.find("a", attrs={"class": "rbox-opening-li-title"})["href"]
        # Removing information not needed for the location (i.e. Full-time, Location:, etc.)
        # by slicing at the character offsets Recruiterbox currently uses
        if location.find("Location") != -1 or location.find("Full-time") != -1 or location.find("Contract") != -1:
            if location.find("Full-time") == 54:
                location = location[13:54]
            elif location.find("Full-time") == 29:
                location = location[13:29]
            elif location.find("Full-time") == 25:
                location = location[13:25]
            else:
                location = location[13:54]
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        # Getting the job ID from the page link
        jobID = "".join(re.findall(r"\d", page_link))
        # Editing the description link
        descLink = "https://app.recruiterbox.com/widget/4972/opening/%s/" % (jobID)
        page_response = requests.get(descLink, timeout=timeout)
        jsonData = json.loads(page_response.content)
        page_content = BeautifulSoup(jsonData["description"], "html.parser")
        # Join paragraphs and list items into one description so that
        # descriptions stays aligned one-to-one with titles and locations
        descContainer = page_content.findAll("p") + page_content.findAll("li")
        desc = "\n".join(par.text for par in descContainer)
        descriptions.append(desc)
        title = str(titles[i])
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
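# The fixed character offsets in the location cleanup above (13, 54, 29, 25)
# break silently whenever Recruiterbox changes its short-description text. A
# suggested alternative (a sketch, not the project's method) strips the known
# label and employment-type tokens instead of slicing:
def clean_location(raw):
    text = re.sub(r"^\s*Location:?\s*", "", raw.strip())
    text = re.split(r"\b(Full-time|Part-time|Contract)\b", text)[0]
    return text.strip().strip(",")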
import json

import requests
from bs4 import BeautifulSoup

import FireflyAuto  # project-local GUI-automation helper; exact import path may differ
import spacejobscrape.helperscripts.JobClasses as JC
from spacejobscrape.helperscripts.writeXML import createjoblist


def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change for each company
    company = JC.Company(13, "Firefly", "https://firefly.com/", "N/A")

    # Running autogui for Firefly to get the JSON links
    jsonLinks = FireflyAuto.run()
    jsonURL_1 = jsonLinks[0]
    jsonURL_2 = jsonLinks[1]

    # Requesting the JSON data
    jsonResponse_1 = requests.get(jsonURL_1, timeout=timeout)
    data_1 = json.loads(jsonResponse_1.content)
    jsonResponse_2 = requests.get(jsonURL_2, timeout=timeout)
    data_2 = json.loads(jsonResponse_2.content)

    # Creates lists of titles, locations and link IDs for the application website
    titles = []
    locations = []
    linkIDs = []

    # Parsing the JSON data to get titles, locations, and link ID numbers.
    # A posting without a link ID cannot be scraped, so it is skipped entirely;
    # this keeps the three lists aligned one-to-one
    for items in data_1["jobRequisitions"] + data_2["jobRequisitions"]:
        try:
            linkID = items["customFieldGroup"]["stringFields"][0]["stringValue"]
        except IndexError:
            continue
        titles.append(items["requisitionTitle"])
        linkIDs.append(linkID)
        try:
            locations.append(items["requisitionLocations"][0]["nameCode"]["shortName"])
        except IndexError:
            locations.append("N/A")

    print("There are %s jobs to scrape. Starting scrape..." % str(len(titles)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(titles)):
        # Editing the description link based on the link ID numbers
        link = ('https://workforcenow.adp.com/mascsr/default/careercenter/public/events/staffing/v1/job-requisitions/'
                + linkIDs[i]
                + '?cid=241aedef-e1d0-4fca-8d2a-bb3ff0afed85&timeStamp=1563828327993&lang=en_US&ccId=19000101_000001&locale=en_US')
        # The HTML is inside the JSON data: get the JSON response first, then parse the HTML
        jsonResponse = requests.get(link, timeout=timeout)
        data = json.loads(jsonResponse.content)
        jsonHTML = BeautifulSoup(data["requisitionDescription"], "html.parser")
        desc = jsonHTML.text
        title = str(titles[i])
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True