def query_for_data(driver, json_dct, job, idx):
    """Grab all info. from the job posting

    This will include the job title, the job location, the posting company,
    the date posted, and then any stars assigned. After grabbing this
    information, click and get the job posting's actual text.

    Args:
        driver: Selenium webdriver
        json_dct: dict
            Dictionary holding the current information that is being stored
            for that job posting. A shallow copy is handed to `gen_output`,
            so the caller's dictionary is not rebound here.
        job: Selenium WebElement
            The element for a single job posting; all lookups below are made
            on this element.
        idx: int
            Holds the # of the job posting the program is on (0 indexed here).

    Return: dct
        Whatever `gen_output` returns for this posting, with the posting's
        text stored under the 'posting_txt' key.
    """

    posting_title = job.find_element_by_class_name('title').text
    company_tokens = job.find_element_by_class_name(
        'companyInfo').text.split()
    # The leading './/' keeps the search scoped to this posting. An XPath
    # starting with '//' is evaluated from the document root even when it is
    # called on an element, which would return the first location on the page
    # for every posting.
    posting_location = job.find_element_by_xpath(
        ".//span[@itemprop='jobLocation']").text

    # The date is optional: fall back to an empty string when it cannot be
    # read. `Exception` (rather than a bare except) so that Ctrl-C and
    # SystemExit still propagate.
    try:
        posting_date = job.find_element_by_class_name('minor').text
    except Exception:
        posting_date = ''

    # If the posting has a star rating, it arrives as the first token of the
    # company text. Rather than search-and-replace, `parse_num` is reused to
    # detect that case (presumably it returns something truthy when a number
    # is found -- confirm against its definition), and the first token is
    # then split off as the rating.
    output_args = [json_dct.copy(), posting_title, posting_location,
                   posting_date]
    if parse_num(' '.join(company_tokens), 0):
        output_args.append(' '.join(company_tokens[1:]))
        output_args.append(company_tokens[0])
    else:
        # No rating: leave the stars argument to `gen_output`'s own default.
        output_args.append(' '.join(company_tokens))
    out_json_dct = gen_output(*output_args)

    out_json_dct['posting_txt'] = grab_posting_txt(driver, job, idx)
    return out_json_dct
raise Exception( 'Program needs a job title, job location, and radius inputted!') base_URL = 'http://www.simplyhired.com/search?' query_parameters = [ 'q={}'.format('+'.join(job_title.split())), '&l={}'.format('+'.join(job_location.split())), '&mi={}'.format(radius), '&fdb=5', '&clst=CTL' ] query_URL = format_query(base_URL, query_parameters) html = get_html(query_URL) try: num_jobs_txt = str(html.select('.result-headline')[0].text) num_jobs = int(parse_num(num_jobs_txt, 2)) except: print('No jobs for search {} in {}'.format(job_title, job_location)) sys.exit(0) current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain'))) storage_dct = { 'job_site': 'simplyhired', 'num_jobs': num_jobs, 'date': current_date, 'title': job_title, 'location': job_location } store_in_mongo([storage_dct], 'job_numbers', 'simplyhired') # All of the jobs should be available through the '.js-job-link' CSS class.
try: job_title = sys.argv[1] job_location = sys.argv[2] except IndexError: raise Exception('Program needs a job title and job location inputted!') # Issue the job query. base_URL = 'https://www.glassdoor.com/index.htm' query_params = (('KeywordSearch', job_title), ('LocationSearch', job_location)) driver = issue_driver_query(base_URL, query_params) # Find the text holding the number of jobs, and parse it. time.sleep(random.randint(7, 15)) num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text num_jobs = int(parse_num(num_jobs_txt, 0)) current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain'))) storage_dct = { 'job_site': 'glassdoor', 'num_jobs': num_jobs, 'date': current_date, 'title': job_title, 'location': job_location } store_in_mongo([storage_dct], 'job_numbers', 'glassdoor') # Find the text holding the number of pages in the job search. time.sleep(random.randint(2, 6)) try: num_pages_txt = driver.find_element_by_id('ResultsFooter').text
base_URL = 'http://jobs.monster.com/search/?' query_parameters = [ 'q={}'.format('-'.join(job_title.split())), '&where={}'.format('-'.join(job_location.split())), '&sort=dt.rv.di', '&rad={}'.format(radius) ] query_URL = format_query(base_URL, query_parameters) driver = issue_driver_query(query_URL, driver_path=driver_path) if verbose: print('<v> Successfully connected selenium') try: num_jobs = get_num_jobs_txt(driver) num_jobs = int(parse_num(num_jobs, 0)) if verbose: print('<v> {} jobs found'.format(num_jobs)) except: print('No jobs for search {} in {}'.format(job_title, job_location)) sys.exit(0) assert 0, 'halt' current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain'))) storage_dct = { 'job_site': 'monster', 'num_jobs': num_jobs, 'date': current_date, 'title': job_title, 'location': job_location }