Code Example #1
def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change each company
    company = JC.Company(15, "Raytheon", "https://www.raytheon.com/", "None")

    # Getting total number of jobs from base link
    page_link = 'https://jobs.raytheon.com/search-jobs/United%20States?orgIds=4679&alp=6252001&alt=2'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    totalJobs = page_content.find("h1", {"role": "status"}).text
    # Keep only the digits of the job count string
    totalJobs = ''.join(char for char in totalJobs if char.isdigit())

    #Editing the link to request every job on a single page, using the total job count
    page_link = 'https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&CurrentPage=1&RecordsPerPage=' + totalJobs + '&Distance=50&RadiusUnitType=0&Keywords=&Location=United+States&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&FacetFilters%5B0%5D.ID=6252001&FacetFilters%5B0%5D.FacetType=2&FacetFilters%5B0%5D.Count=3478&FacetFilters%5B0%5D.Display=United+States&FacetFilters%5B0%5D.IsApplied=true&FacetFilters%5B0%5D.FieldName=&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=1&SearchType=6&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=4679&PostalCode=&fc=&fl=&fcf=&afc=&afl=&afcf='
    page_response = requests.get(page_link, timeout=timeout)
    page_json = json.loads(page_response.content)
    page_content = BeautifulSoup(page_json["results"], "html.parser")
    jobsContainer = page_content.findAll("li")

    #Removing country list item
    jobsContainer = jobsContainer[1:]

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []

    for job in jobsContainer:
        title = job.find("h2").text
        location = job.find("span", attrs={"class": "job-location"}).text
        link = 'https://jobs.raytheon.com' + job.a["href"]

        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        desc = page_content.find("div", attrs={
            "class": "ats-description"
        }).text

        title = str(titles[i])

        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
Code Example #2
def runScrape(verbose, upload, alljobs, timeout):
    # 1. Initialise the Company object with all the information needed to attach to a job page.
    # 1.5. Go into WordPress and create a company account for this company. The company ID is the most important field
    company = JC.Company(0, "Company Name", "www.companyurl.com",
                         "Company Email")
    print("Scraping %s..." % company.name)

    # 2. Add URL of the company's career page
    company_careers_url = 'https://www.companyname.com/careers'

    page_response = requests.get(company_careers_url, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    # 3. Identify the parts of the webpage that hold all the job information.
    # If the page is plain HTML, it will probably be a list of <div> elements with an identifying class, which you can find with the findAll function
    # If the website renders its jobs with JavaScript, the process is more involved (see the Selenium-based examples below)
    jobPostingsOnWebpage = page_content.findAll('div',
                                                attrs={"class": "posting"})

    # Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in jobPostingsOnWebpage:
        # 4. From the HTML objects, separate the link to the job, the job title, and the location (if it is there)
        link = job.a["href"]
        title = job.h5.text
        location = job.find('span', attrs={"class": "sort-by-location"}).text

        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        title = str(titles[i])

        # 5. Identify the job description, and isolate it. Include the HTML formatting (we use it to keep the job pretty on our site)
        desc = str(page_content.find('div', {"class": "content"}))

        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
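
Every example here leans on a JC.Company container and a createjoblist helper defined elsewhere in the project; neither is shown in these snippets. The sketch below is only an inferred, minimal stand-in, with fields and signature read off the calls above rather than taken from the project:

# Hypothetical stand-ins for the project's helpers, inferred from usage
class Company:
    def __init__(self, companyid, name, url, email):
        self.companyid = companyid  # WordPress company ID (see step 1.5 above)
        self.name = name
        self.url = url
        self.email = email

def createjoblist(titles, locations, descriptions, company):
    # The three lists are parallel: index i describes one job posting
    for title, location, desc in zip(titles, locations, descriptions):
        print("[%s] %s | %s | %d chars" % (company.name, title, location, len(desc)))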
Code Example #3
File: OneWeb.py  Project: danhirst98/jobsitewebscrape
def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change each company
    company = JC.Company(10, "OneWeb", "https://www.oneweb.world/",
                         "*****@*****.**")

    page_link = 'https://boards.greenhouse.io/embed/job_board?for=oneweb&b=https%3A%2F%2Fwww.oneweb.world%2Fcareers'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    jobContainer = page_content.findAll("div", attrs={"class": "opening"})

    # Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []

    for job in jobContainer:
        title = job.a.text.strip()
        location = job.span.text.strip()
        tempLink = job.a["href"]
        # NB: assumes the Greenhouse URL prefix before the token is always 50 characters
        ID = tempLink[50:]
        link = "https://boards.greenhouse.io/embed/job_app?for=oneweb&token=" + ID + "&b=https%3A%2F%2Foneweb.world%2Fcareers-opportunities%2F"

        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    #Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        title = str(titles[i])

        descContainer = page_content.findAll("p")

        # Join every paragraph into a single description so the descriptions
        # list stays aligned with titles and locations
        desc = "\n".join(par.text for par in descContainer)
        descriptions.append(desc)

        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
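
The ID slice above (tempLink[50:]) assumes the Greenhouse URL prefix before the job token is always exactly 50 characters. A more defensive sketch pulls the trailing run of digits instead; the regex is an assumption about the link format, not taken from the original:

import re

def greenhouse_token(href):
    # Take the last run of digits in the link rather than slicing at a fixed offset
    match = re.search(r"(\d+)\D*$", href)
    return match.group(1) if match else ""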
Code Example #4
def runScrape(verbose, upload, alljobs, timeout):
    #Sets the company for the script. Change each company
    company = JC.Company(9, "Bigelow", "https://bigelowaerospace.com/",
                         "*****@*****.**")

    page_link = "https://bigelowaerospace.com/pages/job-opportunities/"
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    alljobswebpage = page_content.findAll("p")

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in alljobswebpage:
        try:
            title = job.a.u.text
            link = job.a["href"]
            location = "North Las Vegas, NV, USA"
            titles.append(title)
            locations.append(location)
            links.append(link)
        except AttributeError:
            # The first paragraph without a linked title marks the end of the job list
            break

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    #Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        title = str(titles[i])

        descContainer = page_content.findAll("ul")
        # NB: assumes the second <ul> on the page holds the job description
        desc = descContainer[1].text
        descriptions.append(desc)

        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
Code Example #5
def runScrape(verbose, upload, alljobs, timeout):
    #Sets the company for the script. Change each company
    company = JC.Company(2, "Astranis", "www.astranis.com",
                         "*****@*****.**")

    page_link = 'https://jobs.lever.co/astranis'
    page_response = requests.get(page_link, timeout=timeout)

    page_content = BeautifulSoup(page_response.content, "html.parser")

    alljobswebpage = page_content.findAll('div', attrs={"class": "posting"})

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    for job in alljobswebpage:
        link = job.a["href"]
        title = job.h5.text
        location = job.find('span', attrs={"class": "sort-by-location"}).text
        #TODO: Add a check to see if they add a country code. Especially if Astranis expands beyond America
        location = location + ', USA'
        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    #Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        title = str(titles[i])

        desc = str(page_content.find('div', {"class": "content"}))

        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(verbose, upload, alljobs, titles, locations, descriptions,
                  company)
    return True
Code Example #6
def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change each company
    company = JC.Company(17, "Odyssey Space Research",
                         "https://www.odysseysr.com/", "None")

    page_link = 'https://www.odysseysr.com/jm-ajax/get_listings/'
    page_response = requests.get(page_link, timeout=timeout)
    json_content = json.loads(page_response.content)
    page_content = BeautifulSoup(json_content["html"], "html.parser")
    jobContainer = page_content.findAll("a")

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []

    for job in jobContainer:
        title = job.find("h3").text
        location = job.find("div", attrs={"class": "location"}).text.strip()
        link = job["href"]

        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        desc = page_content.find("div", attrs={
            "class": "job_description"
        }).text

        title = str(titles[i])

        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
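
Examples #1 and #6 (and #12 below) share one pattern: the careers site exposes a JSON endpoint whose payload embeds rendered HTML. A generic sketch of that pattern; the field name is passed in because each site names it differently ("results" vs "html"):

import json
import requests
from bs4 import BeautifulSoup

def html_from_json_endpoint(url, field, timeout=10):
    # The endpoint answers with JSON; the job markup hides inside one field
    payload = json.loads(requests.get(url, timeout=timeout).content)
    return BeautifulSoup(payload[field], "html.parser")

# jobContainer = html_from_json_endpoint('https://www.odysseysr.com/jm-ajax/get_listings/', "html").findAll("a")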
Code Example #7
def runScrape(verbose, upload, alljobs, timeout):
    #Sets the company for the script. Change each company
    company = JC.Company(6, "Rocket Lab", "www.rocketlabusa.com", "None")

    page_link = 'https://www.rocketlabusa.com/careers/positions/'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    jobsContainer = page_content.findAll("a", attrs={"class":"job"})

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []

    for jobs in jobsContainer:
        title = jobs.h3.text
        location = jobs.h5.text
        link = "https://www.rocketlabusa.com" + jobs["href"]

        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    #Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        # attrs takes a dict; the original set literal ({"class", ...}) was a typo
        desc = page_content.find("div", attrs={"class": "job__info-subtitle"}).text
        descriptions.append(desc)

        title = str(titles[i])

        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
Code Example #8
File: SpaceX.py  Project: danhirst98/jobsitewebscrape
def runScrape(verbose, upload, alljobs, timeout):
    #Sets the company for the script. Change each company
    company = Company(1, "SpaceX", "www.spacex.com", "*****@*****.**")

    page_link = 'https://www.spacex.com/careers/list/robots.txt'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    oddJobs = page_content.findAll('tr', attrs={"class": "odd"})
    evenJobs = page_content.findAll('tr', attrs={"class": "even"})

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []
    # NB: `and` between two lists returns only the second one; concatenate to visit both
    for job in oddJobs + evenJobs:
        link = job.a["href"]
        title = job.a.text
        location = job.findAll(
            "div", {"class": "field-name-field-job-location"})[0].text

        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    #Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        title = str(titles[i])

        desc = str(page_content.find('div', {"id": "content"}))
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
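
A note on the loop fixed above: in Python, `and` between two non-empty lists evaluates to the second list, so the original `for job in oddJobs and evenJobs` silently skipped every odd-numbered row. A quick demonstration:

odd = ["row1", "row3"]
even = ["row2", "row4"]
print(odd and even)  # ['row2', 'row4'] -- `and` returns its second operand
print(odd + even)    # ['row1', 'row3', 'row2', 'row4'] -- concatenation visits both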
Code Example #9
            with open(basepath + entry.name) as json_file:
                data = json.load(json_file)
                #Getting title, location, description and link from the JSON file
                title = data['labels'][0]
                location = data['labels'][1]
                desc = data['description']
                link = data['link']
                #Removing extra string that's on location
                if location.find(", More...") == -1:
                    location = location + ", USA"
                else:
                    location = location[:-9] + ", USA"

                titles.append(title)
                locations.append(location)
                links.append(link)
                #Appending the description here keeps it aligned with the other
                #lists (descriptions is assumed to be initialised alongside
                #titles, locations and links above this excerpt)
                descriptions.append(desc)

print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

#Descriptions were already read from the JSON files above
for i in range(len(links)):
    title = str(titles[i])
    print("Job %s scraped - %s" % (str(i + 1), str(title)))

createjoblist(titles, locations, descriptions, company)
Code Example #10
def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change each company
    company = JC.Company(16, "OrbitalInsight", "https://orbitalinsight.com/",
                         "*****@*****.**")

    # Uses webdriver and chromedriver to get html from javascript
    chromedriver = "/Users/JJ/Documents/ProgrammingStuff/chromedriver"
    driver = webdriver.Chrome(chromedriver)
    driver.get("https://orbitalinsight.com/company/careers/#positions")
    html = driver.execute_script("return document.documentElement.outerHTML")
    page_content = BeautifulSoup(html, "html.parser")
    driver.close()

    #No overarching div that contains all jobs. Getting each item individually in a list
    titleContainer = page_content.findAll("h3",
                                          attrs={"class": "career__title"})
    locationContainer = page_content.findAll(
        "div", attrs={"class": "career__meta--location"})
    linkContainer = page_content.findAll("a", attrs={"class": "btn"})

    # Removing Join Us link and Email link
    numJobs = len(titleContainer)
    linkContainer = linkContainer[1:numJobs + 1]

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []

    for i in range(len(titleContainer)):
        title = titleContainer[i].text
        location = locationContainer[i].text
        link = linkContainer[i]["href"]

        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        driver = webdriver.Chrome(chromedriver)
        driver.get(page_link)
        html = driver.execute_script(
            "return document.documentElement.outerHTML")
        page_content = BeautifulSoup(html, "html.parser")
        driver.close()

        desc = page_content.find("div",
                                 attrs={
                                     "class": "columns small-12 medium-8"
                                 }).text

        title = str(titles[i])

        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
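
The example above launches a fresh Chrome instance for every job page, which is slow and leaks sessions if a scrape fails mid-run. A sketch of a reusable fetch helper that keeps one browser alive for the whole scrape (the helper itself is an assumption, not part of the original; the chromedriver path is machine-specific):

from bs4 import BeautifulSoup
from selenium import webdriver

def fetch_rendered(driver, url):
    # Render the page with the shared browser session and hand back parsed HTML
    driver.get(url)
    html = driver.execute_script("return document.documentElement.outerHTML")
    return BeautifulSoup(html, "html.parser")

# driver = webdriver.Chrome("/path/to/chromedriver")
# page_content = fetch_rendered(driver, "https://orbitalinsight.com/company/careers/#positions")
# ...one fetch_rendered call per job link...
# driver.quit()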
Code Example #11
def runScrape(verbose, upload, alljobs, timeout):
    #Sets the company for the script. Change each company
    company = JC.Company(7, "Aerospace Corporation", "https://aerospace.org/",
                         "None")

    page_link = 'https://careers.aerospace.org/go/View-All-Jobs/2443100/?q=&sortColumn=referencedate&sortDirection=desc'
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    #Getting base pagination links
    paginationLinks = page_content.find("ul", attrs={"class": "pagination"})
    jobLinkContainer = paginationLinks.findAll("li")

    #List to store all links
    mainLinkContainer = []

    for link in jobLinkContainer:
        jobsLink = "https://careers.aerospace.org" + link.a["href"]

        #Editing links to obtain links for all pages on website
        if jobsLink == "https://careers.aerospace.org/go/View-All-Jobs/2443100/100/?q=&sortColumn=referencedate&sortDirection=desc":
            mainLinkContainer.append(jobsLink)

            #Calculating the number of pages left (assumes the site lists at most 250 jobs)
            numPagesLeft = int(((250 - 100) / 25) - 1)

            #Num items on page increases by 25 starting at 125
            paginationAmount = 125

            #Editing the links
            for i in range(numPagesLeft):
                jobsLink = "https://careers.aerospace.org/go/View-All-Jobs/2443100/%s/?q=&sortColumn=referencedate&sortDirection=desc" % (
                    str(paginationAmount))
                paginationAmount += 25

                mainLinkContainer.append(jobsLink)
        else:
            mainLinkContainer.append(jobsLink)

    #Removing duplicate links
    mainLinkContainer = list(dict.fromkeys(mainLinkContainer))

    # Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []

    #Start of main web scrape
    for link in mainLinkContainer:

        page_response = requests.get(link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        titleContainer = page_content.findAll("a",
                                              attrs={"class": "jobTitle-link"})
        locationContainer = page_content.select(
            "span.jobLocation.visible-phone")

        #Removing duplicates from the list
        titleContainer = list(dict.fromkeys(titleContainer))

        for item in titleContainer:
            title = item.text
            link = "https://careers.aerospace.org" + item["href"]
            titles.append(title)
            links.append(link)

        for loc in locationContainer:
            location = (loc.span.text).strip()
            locations.append(location)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        descContainer = page_content.findAll("span",
                                             attrs={"class": "jobdescription"})

        # Join the matched spans; an empty result no longer reuses the
        # previous job's description
        desc = "\n".join(items.text for items in descContainer)

        descriptions.append(desc)
        title = str(titles[i])
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
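
The pagination logic above hard-codes a 250-job ceiling, a 100-job first page and a step of 25. The same arithmetic with those constants as parameters, purely as a restatement of the logic above (not code from the project):

def pagination_links(base, first=100, step=25, max_jobs=250):
    # The first results page shows `first` jobs; each further link raises the count by `step`
    links = [base % first]
    count = first + step
    while count < max_jobs:
        links.append(base % count)
        count += step
    return links

# base = "https://careers.aerospace.org/go/View-All-Jobs/2443100/%s/?q=&sortColumn=referencedate&sortDirection=desc"
# pagination_links(base) yields the links for 100, 125, 150, 175, 200 and 225 records per page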
Code Example #12
def runScrape(verbose, upload, alljobs, timeout):
    #Sets the company for the script. Change each company
    company = JC.Company(12, "Lockheed Martin", "https://www.lockheedmartin.com/en-us/index.html", "*****@*****.**")

    #Getting page to find total number of jobs
    page_link = 'https://www.lockheedmartinjobs.com/search-jobs/results?ActiveFacetID=Space&CurrentPage=1&RecordsPerPage=1000&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&FacetFilters%5B0%5D.ID=4566966&FacetFilters%5B0%5D.FacetType=2&FacetFilters%5B0%5D.Count=13&FacetFilters%5B0%5D.Display=Puerto+Rico&FacetFilters%5B0%5D.IsApplied=true&FacetFilters%5B0%5D.FieldName=&FacetFilters%5B1%5D.ID=6252001&FacetFilters%5B1%5D.FacetType=2&FacetFilters%5B1%5D.Count=4897&FacetFilters%5B1%5D.Display=United+States&FacetFilters%5B1%5D.IsApplied=true&FacetFilters%5B1%5D.FieldName=&FacetFilters%5B2%5D.ID=Space&FacetFilters%5B2%5D.FacetType=5&FacetFilters%5B2%5D.Count=1082&FacetFilters%5B2%5D.Display=Space&FacetFilters%5B2%5D.IsApplied=true&FacetFilters%5B2%5D.FieldName=job_level&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=0&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&PostalCode=&fc=&fl=6252001%2C4566966&fcf=&afc=&afl=&afcf='
    page_response = requests.get(page_link, timeout=timeout)
    page_json = json.loads(page_response.content)
    page_content = BeautifulSoup(page_json["results"], "html.parser")

    #Getting total number of jobs
    totalJobsStr = page_content.find("p").text
    totalJobs = ''
    for i in totalJobsStr:
        if i.isdigit():
            totalJobs = totalJobs + str(i)

    #Removing number of jobs per page from string
    totalJobs = totalJobs[2:]

    #Editing page_link and getting new page to reflect the total number of jobs
    page_link = 'https://www.lockheedmartinjobs.com/search-jobs/results?ActiveFacetID=Space&CurrentPage=1&RecordsPerPage=' + totalJobs + '&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&FacetFilters%5B0%5D.ID=4566966&FacetFilters%5B0%5D.FacetType=2&FacetFilters%5B0%5D.Count=13&FacetFilters%5B0%5D.Display=Puerto+Rico&FacetFilters%5B0%5D.IsApplied=true&FacetFilters%5B0%5D.FieldName=&FacetFilters%5B1%5D.ID=6252001&FacetFilters%5B1%5D.FacetType=2&FacetFilters%5B1%5D.Count=4897&FacetFilters%5B1%5D.Display=United+States&FacetFilters%5B1%5D.IsApplied=true&FacetFilters%5B1%5D.FieldName=&FacetFilters%5B2%5D.ID=Space&FacetFilters%5B2%5D.FacetType=5&FacetFilters%5B2%5D.Count=1082&FacetFilters%5B2%5D.Display=Space&FacetFilters%5B2%5D.IsApplied=true&FacetFilters%5B2%5D.FieldName=job_level&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=0&SortDirection=0&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&PostalCode=&fc=&fl=6252001%2C4566966&fcf=&afc=&afl=&afcf='
    page_response = requests.get(page_link, timeout=timeout)
    page_json = json.loads(page_response.content)
    page_content = BeautifulSoup(page_json["results"], "html.parser")
    titleContainer = page_content.findAll('span', attrs={'class':'job-title'})
    locationContainer = page_content.findAll('span', attrs={'class':'job-location'})
    linkContainer = page_content.findAll('a')

    #Removing unnecessary "a" tags
    linkContainer = linkContainer[2:]

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []

    for i in range(len(titleContainer)):
        title = titleContainer[i].text
        location = locationContainer[i].text
        link = 'https://www.lockheedmartinjobs.com' + linkContainer[i]["href"]

        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    #Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        title = str(titles[i])

        desc = page_content.find('div', attrs={'class':'ats-description'}).text
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
Code Example #13
def runScrape(verbose, upload, alljobs, timeout):
    #Sets the company for the script. Change each company
    company = JC.Company(8, "Astrobotic", "https://www.astrobotic.com/",
                         "*****@*****.**")

    #Uses webdriver and chromedriver to get html from javascript
    chromedriver = "/Users/JJ/Documents/ProgrammingStuff/chromedriver"
    driver = webdriver.Chrome(chromedriver)
    driver.get("https://www.astrobotic.com/careers")
    html = driver.execute_script("return document.documentElement.outerHTML")
    page_content = BeautifulSoup(html, "html.parser")
    jobContainer = page_content.findAll("div",
                                        attrs={"class": "rbox-opening-li"})
    driver.close()

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    links = []

    for job in jobContainer:
        title = job.find("a", attrs={"class": "rbox-opening-li-title"}).text
        location = job.find("div", attrs={"class": "rbox-job-shortdesc"}).text
        link = job.find("a", attrs={"class": "rbox-opening-li-title"})["href"]

        #Removing information not necessary for location (i.e. Full-time, Location:, etc)
        #NB: the slice offsets below are hard-coded for the formats observed on the page
        if location.find("Location") != -1 or location.find(
                "Full-time") != -1 or location.find("Contract") != -1:
            if location.find("Full-time") == 54:
                location = location[13:54]
            elif location.find("Full-time") == 29:
                location = location[13:29]
            elif location.find("Full-time") == 25:
                location = location[13:25]
            else:
                location = location[13:54]

        titles.append(title)
        locations.append(location)
        links.append(link)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    # Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]

        #Getting jobID from the page link (all digits, joined together)
        jobID = "".join(re.findall(r"\d", page_link))

        #Editing the description link
        descLink = "https://app.recruiterbox.com/widget/4972/opening/%s/" % (
            jobID)

        page_response = requests.get(descLink, timeout=timeout)
        jsonData = json.loads(page_response.content)
        page_content = BeautifulSoup(jsonData["description"], "html.parser")
        descContainer = page_content.findAll("p") + page_content.findAll("li")

        # Join paragraphs and list items into one description so the
        # descriptions list stays aligned with titles and locations
        desc = "\n".join(par.text for par in descContainer)
        descriptions.append(desc)

        title = str(titles[i])
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True
Code Example #14
def runScrape(verbose, upload, alljobs, timeout):
    # Sets the company for the script. Change each company
    company = Company(5, "Relativity Space", "www.relativityspace.com", "None")

    company_careers_url = "https://boards.greenhouse.io/embed/job_board?for=relativity&b=https%3A%2F%2Fwww.relativityspace.com%2Fcareers"
    page_response = requests.get(company_careers_url, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    jobContainer = page_content.findAll("div", attrs={"class": "opening"})

    # Creates list of titles, locations and links to the application website
    titles = []
    links = []
    locations = []

    for jobs in jobContainer:
        title = jobs.a.text
        location = jobs.span.text
        link = jobs.a["href"]

        titles.append(title)
        links.append(link)
        locations.append(location)

    print("There are %s jobs to scrape. Starting scrape..." % str(len(links)))

    #Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(links)):
        page_link = links[i]
        page_response = requests.get(page_link, timeout=timeout)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        descContainer = page_content.findAll(
            "script", attrs={"type": "application/ld+json"})

        descJSON = json.loads(descContainer[0].text)

        descContent = BeautifulSoup(descJSON["description"], "html.parser")
        descContainer_2 = descContent.findAll("p")
        descContainer_3 = descContent.findAll("div")

        # Always append exactly one description per job so the list stays
        # aligned with titles and locations
        if descContainer_2[0].text == "Team and Role Overview":
            if str(descContainer_2[1]) == "<p> </p>":
                desc = descContainer_3[0].text
            else:
                desc = descContainer_2[1].text
        else:
            try:
                if descContainer_3[0].text[0:4] == "Team":
                    if str(descContainer_3[1]) == "<div> </div>":
                        desc = descContainer_3[2].text
                    else:
                        desc = descContainer_3[1].text
                else:
                    desc = ""
            except IndexError:
                desc = ""
        descriptions.append(desc)

    createjoblist(titles, locations, descriptions, company)
    return True
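
Example #14 digs the description out of the page's application/ld+json script block. A generic sketch of that pattern, assuming the block holds a schema.org JobPosting (the original does not check @type, so that guard is an addition):

import json
import requests
from bs4 import BeautifulSoup

def jobposting_description(url, timeout=10):
    page = BeautifulSoup(requests.get(url, timeout=timeout).content, "html.parser")
    for tag in page.findAll("script", attrs={"type": "application/ld+json"}):
        data = json.loads(tag.text)
        if not isinstance(data, dict):
            continue
        if data.get("@type") == "JobPosting" and data.get("description"):
            # The description field itself holds HTML, so parse it again
            return BeautifulSoup(data["description"], "html.parser").get_text("\n")
    return ""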
Code Example #15
def runScrape(verbose, upload, alljobs, timeout):
    #Sets the company for the script. Change each company
    company = JC.Company(13, "Firefly", "https://firefly.com/", "N/A")

    #Running autogui for Firefly to get json links
    jsonLinks = FireflyAuto.run()

    jsonURL_1 = jsonLinks[0]
    jsonURL_2 = jsonLinks[1]

    #Requesting json data
    jsonResponse_1 = requests.get(jsonURL_1, timeout=timeout)
    data_1 = json.loads(jsonResponse_1.content)

    jsonResponse_2 = requests.get(jsonURL_2, timeout=timeout)
    data_2 = json.loads(jsonResponse_2.content)

    #Creates list of titles, locations and links to the application website
    titles = []
    locations = []
    linkIDs = []

    #Parsing json data to get titles, locations, and link ID numbers
    for data in (data_1, data_2):
        for items in data["jobRequisitions"]:
            titles.append(items["requisitionTitle"])
            try:
                locations.append(
                    items["requisitionLocations"][0]["nameCode"]["shortName"])
            except IndexError:
                locations.append("N/A")
            try:
                linkIDs.append(
                    items["customFieldGroup"]["stringFields"][0]["stringValue"])
            except IndexError:
                #Keep the lists aligned even when a job has no link ID
                linkIDs.append("")

    print("There are %s jobs to scrape. Starting scrape..." % str(len(titles)))

    #Visits each job page and scrapes further info
    descriptions = []
    for i in range(len(titles)):

        if not linkIDs[i]:
            #No link ID was found for this job; keep the lists aligned
            descriptions.append("N/A")
            continue

        #Editing description link based on link ID numbers
        link = 'https://workforcenow.adp.com/mascsr/default/careercenter/public/events/staffing/v1/job-requisitions/' + linkIDs[i] + '?cid=241aedef-e1d0-4fca-8d2a-bb3ff0afed85&timeStamp=1563828327993&lang=en_US&ccId=19000101_000001&locale=en_US'

        #HTML is found in json data. Getting json response first then parsing the html
        jsonResponse = requests.get(link, timeout=timeout)
        data = json.loads(jsonResponse.content)
        jsonHTML = BeautifulSoup(data["requisitionDescription"], "html.parser")
        desc = jsonHTML.text

        title = str(titles[i])
        descriptions.append(desc)
        print("Job %s scraped - %s" % (str(i + 1), str(title)))

    createjoblist(titles, locations, descriptions, company)
    return True