def run(url):
    """Scrape the job listings linked from the <h4> headings at *url*.

    Each heading's link is followed to read the job type and the posting
    date from the listing page.

    Returns the sum of the values returned by job_insert() for every
    listing.
    """
    soup = get_soup(url)
    insert_count = 0
    for job_entry in soup.find_all('h4'):
        # A fresh Job per listing: with one shared instance, a listing whose
        # page states neither job type silently kept the previous listing's
        # full_or_part value.
        job_class = Job(organization, "")
        job_class.organization_id = organization_id
        job_class.title = job_entry.a.text
        job_class.info_link = job_entry.a['href']
        listing_body = get_soup(job_class.info_link).body
        if listing_body.find_all('p', string="Job Type: Full-time"):
            job_class.full_or_part = 'Full-time'
        elif listing_body.find_all('p', string="Job Type: Part-time"):
            job_class.full_or_part = 'Part-time'
        # The first 'subtitle' span is split on whitespace; tokens 2, 3 and 4
        # are read as month name, day (minus its last character) and year.
        date_text = listing_body.find_all(
            'span', {'class': 'subtitle'})[0].text.split()
        month = month_to_num(date_text[2])
        day = int(date_text[3][:-1])
        year = int(date_text[4])
        job_class.post_date = datetime(year, month, day)
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Collect jobs from the 'job-result-table' table found at *url*.

    Title, link and location come from each 'job-result' row; the
    employment type and posting date are read from the job's own page.
    One shared Job object is refilled and handed to job_insert() per row.

    Returns the sum of the job_insert() return values.
    """
    results_table = get_soup(url).find('table', {'id': 'job-result-table'})
    job_class = Job(organization, "")
    job_class.post_date = ""
    job_class.organization_id = organization_id

    def detail_text(page, item_class):
        # Text of the 'secondary-text-color' div inside the given <li>.
        item = page.find('li', {'class': item_class})
        return item.find('div', {'class': 'secondary-text-color'}).text

    insert_count = 0
    for row in results_table.find_all('tr', {'class': 'job-result'}):
        title_link = row.find('td', {'class': 'job-result-title-cell'}).a
        job_class.title = title_link.text.strip()
        job_class.info_link = (
            'https://pennylanecenters.jobs.net' + title_link['href'])
        location_line = row.find('div', {'class': 'job-location-line'})
        job_class.location = clean_location(location_line.text)
        job_class.zip_code = city_to_zip(job_class.location)

        # Remaining details live on the individual job page.
        job_page = get_soup(job_class.info_link)
        job_class.full_or_part = detail_text(job_page, 'job-employee-type')
        job_class.post_date = string_to_date(
            detail_text(job_page, 'job-date-posted'))
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape the <h4>-linked job listings at *url* into the globals module.

    For every heading the title, link, summary (set to the link), job type
    and posting date are written to attributes of ``globals`` and
    update_db(organization) is called.
    """
    for heading in get_soup(url).find_all('h4'):
        link = heading.a
        globals.job_title = link.text
        globals.info_link = link['href']
        globals.job_summary = globals.info_link

        page_body = get_soup(globals.info_link).body
        # Job type is only assigned when one of the two exact strings exists.
        if page_body.find_all('p', string="Job Type: Full-time"):
            globals.full_or_part = 'Full-time'
        elif page_body.find_all('p', string="Job Type: Part-time"):
            globals.full_or_part = 'Part-time'

        # Tokens 2-4 of the first 'subtitle' span: month name, day, year.
        subtitle = page_body.find_all('span', {'class': 'subtitle'})[0]
        tokens = subtitle.text.split()
        day = int(tokens[3][:-1])
        year = int(tokens[4])
        month = month_to_num(tokens[2])
        globals.job_post_date = datetime(year, month, day)
        update_db(organization)
def run(url):
    """Scrape the job lists inside the 'post' div at *url*.

    All <ul> elements of that div except the last are treated as job lists:
    the first holds 'Full-Time' jobs, the second 'Part-Time' jobs and any
    further ones 'On-Call' jobs. Location and salary are read from each
    job's own page when it has a 'Job Details' block.

    Returns the sum of the job_insert() return values.
    """
    soup = get_soup(url)
    job_lists = soup.find('div', {'class': 'post'}).find_all('ul')[:-1]
    employment_types = ('Full-Time', 'Part-Time')
    insert_count = 0
    for index, job_list in enumerate(job_lists):
        # The employment type depends only on the list's position.
        if index < len(employment_types):
            full_or_part = employment_types[index]
        else:
            full_or_part = 'On-Call'
        for job_entry in job_list.find_all('li'):
            # A fresh Job per entry: with one shared instance, a page without
            # a 'Job Details' block kept the previous entry's location and
            # salary.
            job_class = Job(organization, "")
            job_class.organization_id = organization_id
            job_class.full_or_part = full_or_part
            job_class.title = job_entry.a.text
            job_class.info_link = job_entry.a['href']
            job_soup = get_soup(job_class.info_link)
            job_details = job_soup.find('div', {'aria-label': 'Job Details'})
            if job_details:
                # Either span may be absent; skip it instead of crashing.
                location_span = job_details.find(
                    'span', {'aria-label': 'Job Location'})
                if location_span is not None:
                    job_class.location = location_span.text
                salary_span = job_details.find(
                    'span', {'aria-label': 'Salary Range'})
                if salary_span is not None:
                    job_class.salary = salary_span.text
            insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape every page of 'views-row' job listings starting at *url*.

    The first page is ``url + initialPath``; further pages are reached by
    following the 'Go to next page' link until none is present. Salary and
    weekly hours are read from each job's own page.

    Returns the sum of the job_insert() return values.
    """
    soup = get_soup(url + initialPath)
    insert_count = 0
    while soup:
        for html_element in soup.find_all('div', {'class': 'views-row'}):
            title = html_element.find('span', {
                'class': 'field-content'
            }).a.text
            job = Job(organization, title)
            job.organization_id = organization_id

            location_div = html_element.find(
                'div', {'class': 'views-field-field-job-city'})
            if location_div:
                job.location = location_div.find('span', {
                    'class': 'field-content'
                }).text

            # The summary block is optional; guard both the div and its span.
            summary_div = html_element.find(
                'div', {'class': 'views-field views-field-body-summary'})
            summary_span = summary_div.span if summary_div is not None else None
            if summary_span is not None:
                job.summary = summary_span.text

            info_div = html_element.find('div', {'class': 'views-field-url'})
            job.info_link = info_div.find('span', {
                'class': 'field-content'
            }).a['href']

            info_soup = get_soup(job.info_link)
            salary_div = info_soup.find(
                'div', {'class': 'views-field-field-compensation-range'})
            if salary_div:
                job.salary = salary_div.find('span', {
                    'class': 'field-content'
                }).text
            hours_div = info_soup.find(
                'div', {'class': 'views-field-field-hours-week'})
            if hours_div:
                hours = hours_div.find('span', {'class': 'field-content'}).text
                job.full_or_part = hours + ' hours/week'
            insert_count += job_insert(job)

        # Follow the pager once per page; stop when there is no next link.
        next_page_button = soup.find('a', {'title': 'Go to next page'})
        if next_page_button:
            soup = get_soup(url + next_page_button['href'])
        else:
            soup = None
    return insert_count
def run(url):
    """Scrape the 'lcp_catlist' job list at *url* into the globals module.

    For every list item the title and link are stored on ``globals``, the
    job page is fetched, and the text around the "Position Purpose:" marker
    becomes the summary before update_db(organization) is called.

    Raises:
        globals.ParseError: if a job page has no "Position Purpose:" text.
    """
    catalog = get_soup(url).find('ul', {'class': 'lcp_catlist'})
    for item in catalog.find_all('li'):
        anchor = item.a
        globals.job_title = anchor.text.strip()
        globals.info_link = anchor['href']

        detail_page = get_soup(globals.info_link)
        purpose_node = detail_page.find(text=re.compile("Position Purpose:"))
        if purpose_node is None:
            raise globals.ParseError(globals.info_link,
                                     'Cannot find job summary')
        # The summary is the text of the marker's grandparent element.
        globals.job_summary = purpose_node.parent.parent.text
        update_db(organization)
def run(url):
    """Scrape the 'ReqRowClick' job rows shown after the search click.

    The page is loaded through get_javascript_soup_delayed_and_click() with
    the 'hrmSearchButton' argument. Title, link and location come from each
    row; the summary is read from the job's own page.

    Returns the sum of the job_insert() return values.
    """
    soup = get_javascript_soup_delayed_and_click(url, 'hrmSearchButton')
    rows = soup.find_all('tr', {'class': 'ReqRowClick'})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    insert_count = 0
    for row in rows:
        title_cell = row.find('td', {'class': 'posTitle'})
        job_class.title = title_cell.text.strip()
        job_class.info_link = ('http://mhala.hrmdirect.com/employment/' +
                               title_cell.a['href'])
        job_class.location = row.find('td', {'class': 'cities'}).text
        job_class.zip_code = globals.city_to_zip(job_class.location)

        detail_page = get_soup(job_class.info_link)
        label = detail_page.find(string=["Summary:", "Summary: "])
        if label:
            # Empty the element holding the "Summary:" label so that only
            # the rest of the enclosing paragraph remains as text.
            label_holder = label.parent
            label_holder.clear()
            job_class.summary = label_holder.find_parent("p").text.strip()
        else:
            job_class.summary = ''
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape the links inside the first 'JobGrid-' div found at *url*.

    Title and location are taken from the first and third <div> of each
    link's 'row' div. One shared Job object is refilled per link.

    Returns the sum of the job_insert() return values.
    """
    grid = get_soup(url).select('div[class*="JobGrid-"]')[0]
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    insert_count = 0
    for link in grid.find_all('a'):
        job_class.info_link = 'https://path.catsone.com' + link['href']
        cells = link.find('div', {'class': 'row'}).find_all('div')
        job_class.title = cells[0].text.strip()
        job_class.location = clean_location(cells[2].text.strip())
        job_class.zip_code = city_to_zip(job_class.location)
        insert_count += job_insert(job_class)
    # NOTE: job type and salary could be read from each job page (the
    # <strong> elements of its 'Job__StyledDescription-s1h17u0t-0' div), but
    # the listings are too inconsistently written: an earlier attempt worked
    # for most listings and broke on a few.
    return insert_count
def run(url):
    """Scrape the 'list-data' job entries at *url*.

    Everything is read from the listing page itself: title, link, job type,
    location, relative posting date and the first paragraph of the
    description.

    Returns the sum of the job_insert() return values.
    """
    def div_text(entry, css_class):
        # Stripped text of the entry's first <div> with the given class.
        return entry.find('div', {'class': css_class}).text.strip()

    entries = get_soup(url).find_all('div', {'class': 'list-data'})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    insert_count = 0
    for entry in entries:
        info = entry.find('div', {'class': 'job-info'})
        title_span = info.find('span', {'class': 'job-title'})
        job_class.title = title_span.text.strip()
        job_class.info_link = info.h4.a['href']
        job_class.full_or_part = div_text(entry, 'job-type')
        job_class.location = clean_location(div_text(entry, 'job-location'))
        job_class.zip_code = city_to_zip(job_class.location)
        # Tokens 1 and 2 of the date text are passed to date_ago() as the
        # amount and the unit.
        age_tokens = div_text(entry, 'job-date').split(' ')
        job_class.post_date = date_ago(int(age_tokens[1]), age_tokens[2])
        description = entry.find('div', {'class': 'job-description'})
        job_class.summary = description.p.text.strip()
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape the 'job-listing-job-item' entries rendered at *url*.

    Title, link and posting date come from the listing; the location and
    zip code come from the 'preview-location' block of each job's own page.

    Returns the sum of the job_insert() return values.
    """
    soup = get_javascript_soup(url)
    job_listings = soup.find_all('div', {'class': 'job-listing-job-item'})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    insert_count = 0
    for job_listing in job_listings:
        title_link = job_listing.find('span', {'class': 'job-item-title'}).a
        job_class.title = title_link.text.strip()
        job_class.info_link = ('https://recruiting.paylocity.com' +
                               title_link['href'])

        details = get_soup(job_class.info_link)
        location = details.find('div', {'class': 'preview-location'})
        # A page without the location block is handled like one whose block
        # has no link, instead of raising AttributeError on ``None.a``.
        if location is not None and location.a:
            job_class.location = location.a.text
            # The zip code is the last '+'-separated token of the link href.
            zipcode = location.a['href'].split('+')[-1]
            try:
                job_class.zip_code = int(zipcode)
            except ValueError:
                # generate a zip code if one is not available
                job_class.zip_code = city_to_zip(job_class.location)
        else:
            job_class.location = ''
            job_class.zip_code = ''

        # The second span of the title column holds the date, followed by
        # ' - ' and further text.
        date_span = job_listing.find('div', {
            'class': 'job-title-column'
        }).find_all('span')[1]
        job_class.post_date = string_to_date(date_span.text.split(' - ')[0])
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape the 'wpjb-job-list' job grid at *url*.

    The listing shows only month and day of posting, so the year is
    inferred: the current year unless that would place the date in the
    future, in which case the previous year is used.

    Returns the sum of the job_insert() return values.
    """
    soup = get_soup(url)
    job_grid = soup.find('div', {'class': 'wpjb-job-list'})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    insert_count = 0
    # Read the clock once so every listing is compared against the same day.
    today = datetime.now()
    for job_div in job_grid.find_all('div', {'class': 'wpjb-col-main'}):
        major_line = job_div.find('div', {'class': 'wpjb-line-major'})
        job_class.title = major_line.a.text
        job_class.info_link = major_line.a['href']
        job_class.full_or_part = major_line.find('span', {
            'class': 'wpjb-sub-title'
        }).text.strip()

        minor_line = job_div.find('div', {'class': 'wpjb-line-minor'})
        job_class.location = minor_line.find('span', {
            'class': 'wpjb-job_location'
        }).text.strip()
        date = minor_line.find('span', {
            'class': 'wpjb-job_created_at'
        }).text.strip().split(', ')
        month = month_to_num(date[0])
        day = int(date[1])
        # Compare month AND day: comparing the month alone dated a posting
        # from a later day of the current month into the future.
        if (month, day) <= (today.month, today.day):
            year = today.year
        else:
            year = today.year - 1
        job_class.post_date = datetime(year, month, day)
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape the 'job-result-table' table at *url* into the globals module.

    For each 'job-result' row the title, link, summary (set to the link),
    location and zip code are stored on ``globals``; the employment type and
    posting date are read from the job's own page. update_db(organization)
    and reset_vars() are called after every row.
    """
    globals.job_post_date = ''
    results_table = get_soup(url).find('table', {'id': 'job-result-table'})

    def detail_text(page, item_class):
        # Text of the 'secondary-text-color' div inside the given <li>.
        item = page.find('li', {'class': item_class})
        return item.find('div', {'class': 'secondary-text-color'}).text

    for row in results_table.find_all('tr', {'class': 'job-result'}):
        title_link = row.find('td', {'class': 'job-result-title-cell'}).a
        globals.job_title = title_link.text.strip()
        globals.info_link = (
            'https://pennylanecenters.jobs.net' + title_link['href'])
        globals.job_summary = globals.info_link
        location_line = row.find('div', {'class': 'job-location-line'})
        globals.job_location = clean_location(location_line.text)
        globals.job_zip_code = city_to_zip(globals.job_location)

        # Remaining details live on the individual job page.
        job_page = get_soup(globals.info_link)
        globals.full_or_part = detail_text(job_page, 'job-employee-type')
        globals.job_post_date = string_to_date(
            detail_text(job_page, 'job-date-posted'))
        update_db(organization)
        reset_vars()
def run(url):
    """Scrape the 'jobBtn' links of the 211 LA careers page into globals.

    Every link inside a 'jobBtn' div supplies a title and a link, after
    which update_db(organization) is called.

    NOTE: the *url* argument is not used; the page address is hard-coded.
    """
    careers_page = get_soup("https://www.211la.org/careers")
    for button in careers_page.find_all("div", {"class": "jobBtn"}):
        for anchor in button.find_all("a"):
            globals.job_title = anchor.text
            globals.info_link = anchor.get('href')
            update_db(organization)
def run(url):
    """Scrape the 'lcp_catlist' job list at *url*.

    Title and link come from each list item; the summary is the text around
    the "Position Purpose:" marker on the job's own page.

    Returns the sum of the job_insert() return values.

    Raises:
        globals.ParseError: if a job page has no "Position Purpose:" text.
    """
    catalog = get_soup(url).find('ul', {'class': 'lcp_catlist'})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    insert_count = 0
    for item in catalog.find_all('li'):
        anchor = item.a
        job_class.title = anchor.text.strip()
        job_class.info_link = anchor['href']

        detail_page = get_soup(job_class.info_link)
        purpose_node = detail_page.find(text=re.compile("Position Purpose:"))
        if purpose_node is None:
            raise globals.ParseError(job_class.info_link,
                                     'Cannot find job summary')
        # The summary is the text of the marker's grandparent element.
        job_class.summary = purpose_node.parent.parent.text
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape job links from the first 'sqs-block-content' div at *url*.

    The first four and the last three paragraphs of that div are skipped;
    every remaining paragraph's link supplies a title and a link on
    ``globals`` before update_db(organization) is called.
    """
    content_block = get_soup(url).find('div', {'class':'sqs-block-content'})
    paragraphs = content_block.find_all('p')
    for paragraph in paragraphs[4:-3]:
        anchor = paragraph.a
        globals.job_title = anchor.text.strip()
        globals.info_link = 'https://lafh.org' + anchor['href']
        update_db(organization)
def run(url):
    """Scrape the 'js-job-container' entries at *url*.

    Title, link and location come from the listing; the employment type is
    the third ' | '-separated field of the 'js-subtitle' heading on each
    job's own page.

    Returns the sum of the job_insert() return values.
    """
    soup = get_soup(url)
    jobs_list = soup.find_all("div", {"class": "js-job-container"})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    insert_count = 0
    for job_entry in jobs_list:
        # The attrs filters were written as set literals ({"class", "..."}),
        # which only worked because a non-dict attrs value is treated as a
        # list of class names to match. Use proper {"class": ...} dicts.
        title_link = job_entry.find("span", {"class": "js-job-title"}).a
        job_class.title = title_link.text
        job_class.info_link = ('https://careers.jobscore.com' +
                               title_link['href'])
        job_class.location = job_entry.find(
            "span", {"class": "js-job-location"}).text.strip()

        job_soup = get_soup(job_class.info_link)
        job_class.full_or_part = job_soup.find(
            "h2", {"class": "js-subtitle"}).text.split(' | ')[2]
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape the 'et_pb_toggle' job entries at *url*.

    Each toggle's <h5> text is the job title. The listing page itself is
    used as every job's link.

    Returns the sum of the job_insert() return values.
    """
    soup = get_soup(url)
    jobs_list = soup.find_all('div', {'class': 'et_pb_toggle'})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    insert_count = 0
    for job_entry in jobs_list:
        job_class.title = job_entry.find('h5').text.strip()
        # Store the link under 'info_link', the attribute used for job links
        # elsewhere; it was previously assigned to an attribute named 'link'.
        job_class.info_link = url
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape the <h4> job headings at *url*.

    The heading's link gives title and link; the location is the part of
    the heading's <span> text between the first and second ']'.

    Returns the sum of the job_insert() return values.
    """
    page = get_soup(url)
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    total = 0
    for heading in page.find_all('h4'):
        anchor = heading.a
        job_class.title = anchor.text
        job_class.info_link = anchor['href']
        span_parts = heading.span.text.split(']')
        job_class.location = span_parts[1]
        total += job_insert(job_class)
    return total
def run(url):
    """Scrape the links that share a parent with the 'Careers' <h1>.

    Every link inside the parent element of the <h1> whose text is
    'Careers' is inserted as a job, using its text and href.

    Returns the sum of the job_insert() return values.
    """
    careers_heading = get_soup(url).find('h1', text='Careers')
    container = careers_heading.parent
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    total = 0
    for anchor in container.find_all('a'):
        job_class.title = anchor.text
        job_class.info_link = anchor['href']
        total += job_insert(job_class)
    return total
def run(url):
    """Scrape the 'display-posts-listing' list at *url*.

    Each list item's full text is the job title and its first link's href
    is the job link.

    Returns the sum of the job_insert() return values.
    """
    posts = get_soup(url).find('ul', {'class': 'display-posts-listing'})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    total = 0
    for item in posts.find_all('li'):
        job_class.title = item.text
        job_class.info_link = item.a['href']
        total += job_insert(job_class)
    return total
def run(url):
    """Scrape the links around the 'Current Openings:' text at *url*.

    The container is the element three levels above the text node equal to
    'Current Openings:'; every link inside it is inserted as a job.

    Returns the sum of the job_insert() return values.
    """
    marker = get_soup(url).find(text='Current Openings:')
    container = marker.parent.parent.parent
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    total = 0
    for anchor in container.find_all('a'):
        job_class.title = anchor.text
        job_class.info_link = anchor['href']
        total += job_insert(job_class)
    return total
def run(url):
    """Scrape the 'jobBtn' links of the 211 LA careers page.

    Every link inside a 'jobBtn' div is inserted as a job, using its text
    and href.

    NOTE: the *url* argument is not used; the page address is hard-coded.

    Returns the sum of the job_insert() return values.
    """
    careers_page = get_soup("https://www.211la.org/careers")
    buttons = careers_page.find_all("div", {"class": "jobBtn"})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    total = 0
    for button in buttons:
        for anchor in button.find_all("a"):
            job_class.title = anchor.text
            job_class.info_link = anchor.get('href')
            total += job_insert(job_class)
    return total
def run(url):
    """Scrape the rows of the first <tbody> at *url*.

    In each row the first cell's link gives title and link, and the third
    cell's text is the location.

    Returns the sum of the job_insert() return values.
    """
    table_body = get_soup(url).find('tbody')
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    total = 0
    for row in table_body.find_all('tr'):
        cells = row.find_all('td')
        title_link = cells[0].find('a')
        job_class.title = title_link.text
        job_class.info_link = title_link['href']
        job_class.location = cells[2].text
        total += job_insert(job_class)
    return total
def run(url):
    """Scrape the first 'JobGrid-' div at *url* into the globals module.

    For each link in the grid the link, title, location and zip code are
    stored on ``globals`` and update_db(organization) is called. Title and
    location are the first and third <div> of the link's 'row' div.
    """
    grid = get_soup(url).select('div[class*="JobGrid-"]')[0]
    for link in grid.find_all('a'):
        globals.info_link = 'https://path.catsone.com' + link['href']
        cells = link.find('div', {'class':'row'}).find_all('div')
        globals.job_title = cells[0].text.strip()
        location_text = cells[2].text.strip()
        globals.job_location = clean_location(location_text)
        globals.job_zip_code = city_to_zip(globals.job_location)
        update_db(organization)
def run(url):
    """Scrape linked paragraphs from one specific div at *url*.

    The div is located by its fixed id. Each paragraph containing at least
    one link is inserted as a job, using the first link's text and href;
    paragraphs without links are skipped.

    Returns the sum of the job_insert() return values.
    """
    container = get_soup(url).find(
        'div', {'id': 'yui_3_16_0_ym19_1_1492463820306_5454'})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    total = 0
    for paragraph in container.find_all('p'):
        anchors = paragraph.find_all('a')
        if not anchors:
            continue
        first_link = anchors[0]
        job_class.title = first_link.text
        job_class.info_link = first_link['href']
        total += job_insert(job_class)
    return total
def run(url):
    """Scrape the links inside the first <article> at *url*.

    Links whose stripped text is empty are skipped. The job link is built
    by appending the link's href to *url*.

    Returns the sum of the job_insert() return values.
    """
    anchors = get_soup(url).find('article').find_all('a')
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    total = 0
    for anchor in anchors:
        label = anchor.text.strip()
        if not label:
            continue
        job_class.title = label
        job_class.info_link = url + anchor['href']
        total += job_insert(job_class)
    return total
def run(url):
    """Scrape the 'jobLink[]' openings rendered at *url*.

    Every opening's detail page supplies the title, summary, location, job
    type, optional salary and a relative posting age. Openings whose
    location ends in a state other than 'CA' are skipped.

    Returns the sum of the job_insert() return values.
    """
    soup = get_javascript_soup(url)
    current_openings = soup.find_all(attrs={"data-tn-element": "jobLink[]"})
    insert_count = 0
    for current_opening in current_openings:
        # A fresh Job per opening: with one shared instance, an opening
        # without a "Salary" entry kept the previous opening's salary.
        job_class = Job(organization, "")
        job_class.organization_id = organization_id

        detail_page_link = current_opening.find('a')['href']
        detail_page_desc = get_soup(detail_page_link).find(
            'div', {"data-tn-component": "jobDescription"})
        job_class.title = detail_page_desc.find('h1').text.strip()
        # The first and last <p>/<li> parts are left out of the summary; the
        # last one is parsed below as the posting age.
        job_summary_parts = detail_page_desc.find_all(['p', 'li'])
        job_class.summary = ' '.join(
            part.get_text() for part in job_summary_parts[1:-1]).strip()

        job_class.location = detail_page_desc.find(
            'dt', string="Location").find_next().get_text()
        location_parts = job_class.location.split(',')
        state = location_parts[-1]
        if (len(location_parts) > 1 and state
                and state.strip().lower() != 'ca'):
            # skip job if state is not CA
            print('Skip location: %s' % job_class.location)
            continue
        job_class.zip_code = city_to_zip(location_parts[0])

        posted_ago = job_summary_parts[-1].get_text().split(' ')
        length = posted_ago[1]
        if length.endswith('+'):
            # Drop only the trailing '+': the old ``length[:1]`` kept just
            # the first digit, turning "30+" into 3.
            length = length[:-1]
        job_class.post_date = date_ago(int(length), posted_ago[2])

        job_class.full_or_part = detail_page_desc.find(
            'dt', string="Job Type").find_next().get_text()
        salary_search = detail_page_desc.find('dt', string="Salary")
        if salary_search is not None:
            job_class.salary = salary_search.find_next().get_text()
        job_class.info_link = detail_page_link
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape the links inside one specific div at *url*.

    The div is located by its fixed class name; every link inside it is
    inserted as a job, using its stripped text and href.

    Returns the sum of the job_insert() return values.
    """
    container = get_soup(url).find(
        'div', {'class': 'tf-sh-78847e2ef97967b68fdec32a2997ab8f'})
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    total = 0
    for anchor in container.find_all('a'):
        job_class.title = anchor.text.strip()
        job_class.info_link = anchor['href']
        total += job_insert(job_class)
    return total
def run(url):
    """Scrape the 'job-listing-job-item' entries rendered at *url*.

    Title, link, posting date and location come from the listing. The first
    three <p> elements of each job's own page are checked for 'Location'
    (zip code), 'Status' (full/part time) and 'Salary' details.

    Returns the sum of the job_insert() return values.
    """
    soup = get_javascript_soup(url)
    job_listings = soup.find_all('div', {'class': 'job-listing-job-item'})
    insert_count = 0
    for job_listing in job_listings:
        # A fresh Job per listing: with one shared instance, the zip code,
        # job type and salary of an earlier listing leaked into later ones,
        # and the city fallback below was skipped after the first zip code.
        job_class = Job(organization, "")
        job_class.organization_id = organization_id
        job_description = job_listing.find_all('span')

        # Get job title and link
        title_link = job_description[0].a
        job_class.title = title_link.text
        job_class.info_link = ('https://recruiting.paylocity.com' +
                               title_link['href'])

        # Get date as string; drop the trailing " - " when present, then
        # split the month/day/year text and convert to a datetime object.
        date = job_description[1].text
        if len(date) >= 2 and date[-2] == '-':
            date = date[:-3]
        date_parts = date.strip().split('/')
        job_class.post_date = datetime(
            int(date_parts[2]), int(date_parts[0]), int(date_parts[1]))

        # Get Location
        job_class.location = job_listing.find('div', {
            'class': 'location-column'
        }).span.text

        # Get soup of job listing to scrape more info
        listing_soup = get_soup(job_class.info_link)
        listing_body = listing_soup.find('body').find_all('p')

        # Retrieve zip code, Full/Part-time and Salary info if available.
        # Pages with fewer than three paragraphs simply lack those details.
        zip_code = ''
        if len(listing_body) > 0 and 'Location' in listing_body[0].text:
            location_fields = listing_body[0].text.split(':')
            if len(location_fields) > 1:
                zip_code_result = re.search(
                    r'(\d{5})', location_fields[1].lstrip())
                if zip_code_result is not None:
                    zip_code = zip_code_result.group(1)
            # can't get city since there's no standard. It could be
            # "Hollywood", "Koreatown, Los angeles, California", or even
            # "Multiple Locations"
        if not zip_code:
            zip_code = globals.city_to_zip(job_class.location)
        job_class.zip_code = zip_code
        if len(listing_body) > 1 and 'Status' in listing_body[1].text:
            job_class.full_or_part = listing_body[1].text[8:]
        if len(listing_body) > 2 and 'Salary' in listing_body[2].text:
            job_class.salary = listing_body[2].text[14:]
        insert_count += job_insert(job_class)
    return insert_count
def run(url):
    """Scrape the 'reqitem' job rows at *url*, keeping Los Angeles jobs.

    A row is inserted only when the text of its (last) 'cities' cell is
    exactly "Los Angeles" and its 'posTitle' cell contains a link.

    Returns the sum of the job_insert() return values.
    """
    soup = get_soup(url)
    jobs_list = soup.find_all("tr", {"class": "reqitem"})
    insert_count = 0
    for job_entry in jobs_list:
        # Decide per row: the location variable used to survive across rows,
        # so a row without a 'cities' cell reused the previous row's city or
        # raised on the very first row.
        city_cells = job_entry.find_all("td", {"class": "cities"})
        if not city_cells or city_cells[-1].text != "Los Angeles":
            continue

        # A fresh Job per row, so a row without a title link cannot insert
        # the previous row's title and link again.
        job_class = Job(organization, "")
        job_class.organization_id = organization_id
        has_link = False
        for title_cell in job_entry.find_all("td", {"class": "posTitle"}):
            for anchor in title_cell.find_all("a"):
                # The title is the whole cell's text; the last link wins.
                job_class.title = title_cell.text
                job_class.info_link = "https://covca.hrmdirect.com/" + \
                    anchor.get('href')
                has_link = True
        if has_link:
            insert_count += job_insert(job_class)
    return insert_count