def scrap(job_title, job_location, radius, result_nb):
    """Scrape Indeed.fr postings for a job search and pickle the results.

    Builds a date-sorted Indeed.fr query from the search terms, reads the
    total job count off the results page, then walks the result pages in
    steps of 10 (Indeed's page size), collecting postings via
    ``multiprocess_pages``. The collected jobs are pickled to
    ``db_<title>_<location>_<radius>_<result_nb>.pkl`` and also returned.

    Args:
        job_title: str, search keywords (whitespace-separated).
        job_location: str, location search terms.
        radius: search radius, passed through to the ``rq`` query parameter.
        result_nb: int, number of results to fetch (paged 10 at a time).

    Returns:
        list of scraped job dicts (also written to the pickle file).

    Exits:
        Calls ``sys.exit(0)`` when the job-count element can't be parsed
        (treated as "no results for this search").
    """
    base_URL = 'http://www.indeed.fr/emplois?'
    query_parameters = [
        'q={}'.format('+'.join(job_title.split())),
        '&l={}'.format('+'.join(job_location.split())),
        '&rq={}'.format(radius),
        '&sort=date',
        '&fromage=last'
    ]
    query_URL = format_query(base_URL, query_parameters)
    print(query_URL)
    html = get_html(query_URL)
    try:
        num_jobs_txt = str(html.select('#searchCount'))
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except Exception:
        # Narrowed from a bare `except:` so SystemExit / KeyboardInterrupt
        # are no longer swallowed; a missing or unparsable count still means
        # "no results".
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)
    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    # NOTE(review): storage_dct is built but never persisted in this function;
    # the sibling scrapers pass the same dict to store_in_mongo — confirm
    # whether that call was dropped here.
    storage_dct = {
        'job_site': 'indeed',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    # (Removed dead code: max_start_position / start_positions were computed
    # from num_jobs but never used — the loop below pages on result_nb.)
    db_path = "".join([
        "db_", job_title, "_", job_location, "_", str(radius), "_",
        str(result_nb)
    ])
    jobs = []
    for i in range(0, result_nb, 10):
        try:
            jobs.extend(
                multiprocess_pages(query_URL, job_title, job_location, i))
        except RuntimeError:
            pass  # TODO: retry this page?
    # Pickle streams are binary: open 'wb', not 'w'. The `with` block closes
    # the handle, so the old explicit f.close() was redundant and is gone.
    with open("".join([db_path, ".pkl"]), 'wb') as f:
        cPickle.dump(jobs, f)
    return jobs
def query_for_data(driver, json_dct, job, idx):
    """Grab all info. from the job posting

    This will include the job title, the job location, the posting
    company, the date posted, and then any stars assigned. After grabbing
    this information, click and get the job posting's actual text.

    Args:
        driver: Selenium webdriver
        json_dct: dict
            Dictionary holding the current information that is being stored
            for that job posting.
        job: Selenium WebElement
        idx: int
            Holds the # of the job posting the program is on (0 indexed here).

    Return:
        dct
    """
    posting_title = job.find_element_by_class_name('title').text
    split_posting_company = job.find_element_by_class_name(
        'companyInfo').text.split()
    posting_location = job.find_element_by_xpath(
        "//div//span[@itemprop='jobLocation']").text
    try:
        posting_date = job.find_element_by_class_name('minor').text
    except Exception:
        # Narrowed from a bare `except:` (which also caught SystemExit /
        # KeyboardInterrupt). Not every posting shows a date; default to ''.
        posting_date = ''
    # If the posting has a star rating, it arrives as the leading token of
    # the company text. Reusing parse_num to detect it is simpler than a
    # dedicated find-and-strip helper: when a number is present, token 0 is
    # the star count and the rest is the company name.
    if parse_num(' '.join(split_posting_company), 0):
        num_stars = split_posting_company[0]
        posting_company = ' '.join(split_posting_company[1:])
        out_json_dct = gen_output(
            json_dct.copy(), posting_title, posting_location, posting_date,
            posting_company, num_stars)
    else:
        posting_company = ' '.join(split_posting_company)
        out_json_dct = gen_output(
            json_dct.copy(), posting_title, posting_location, posting_date,
            posting_company)
    out_json_dct['posting_txt'] = grab_posting_txt(driver, job, idx)
    return out_json_dct
# NOTE(review): this chunk starts mid-statement — the leading string literal
# is the tail of a truncated `raise Exception(...)` from the argument-parsing
# `try/except IndexError` that precedes it (cf. the sibling scrapers).
        'Program needs a job title, job location, and radius inputted!')

# Build the ZipRecruiter search URL: '+'-join multiword terms, restrict to
# postings from the last 5 days, and include near-duplicate listings.
base_URL = 'https://www.ziprecruiter.com/candidate/search?'
query_parameters = [
    'search={}'.format('+'.join(job_title.split())),
    '&location={}'.format('+'.join(job_location.split())),
    '&radius={}'.format(radius),
    '&days=5',
    '&include_near_duplicates=1'
]
query_URL = format_query(base_URL, query_parameters)
#print (query_URL)
html = get_html(query_URL)
# Read the total job count from the results headline; any failure (missing
# element, unparsable number) is treated as "no results" and exits cleanly.
try:
    num_jobs_txt = str(html.select('#job_results_headline')[0].text)
    num_jobs = int(parse_num(num_jobs_txt, 0))
    print(num_jobs)
except:
    print('No jobs for search {} in {}'.format(job_title, job_location))
    sys.exit(0)
# Timestamp in US/Mountain to match the other scrapers' stored records.
current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
storage_dct = {
    'job_site': 'ziprecruiter',
    'num_jobs': num_jobs,
    'date': current_date,
    'title': job_title,
    'location': job_location
}
# Persist the search metadata (site, count, date, terms) to Mongo.
store_in_mongo([storage_dct], 'job_numbers', 'ziprecruiter')
# NOTE(review): this chunk starts mid-`try` block — the `try:` line and the
# `job_title = sys.argv[1]` assignment that precede it are truncated.
# (Python 2 syntax: `print` statements appear below.)
    job_location = sys.argv[2]
    radius = sys.argv[3]
except IndexError:
    raise Exception('Program needs a job title, job location, and radius inputted!')

# Build the Monster search URL: '-'-join multiword terms, sort by date
# (dt.rv.di), and restrict by search radius.
base_URL = 'http://jobs.monster.com/search/?'
query_parameters = ['q={}'.format('-'.join(job_title.split())),
                    '&where={}'.format('-'.join(job_location.split())),
                    '&sort=dt.rv.di',
                    '&rad={}'.format(radius)]
query_URL = format_query(base_URL, query_parameters)
# Monster is scraped via Selenium (not plain requests like some siblings).
driver = issue_driver_query(query_URL)
# Read the total job count; any failure is treated as "no results".
try:
    num_jobs_txt = get_num_jobs_txt(driver)
    num_jobs = int(parse_num(num_jobs_txt, 0))
except:
    print 'No jobs for search {} in {}'.format(job_title, job_location)
    sys.exit(0)
# Timestamp in US/Mountain to match the other scrapers' stored records.
current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
storage_dct = {'job_site': 'monster',
               'num_jobs': num_jobs,
               'date': current_date,
               'title': job_title,
               'location': job_location}
store_in_mongo([storage_dct], 'job_numbers', 'monster')
# This loop will be used to keep clicking the next button after
# scraping jobs on that page.
is_next = True
while is_next:
    scrape_job_page(driver, job_title, job_location)
    is_next = check_if_next(driver)
# Glassdoor scraper entry: job title and location come from the CLI.
try:
    job_title = sys.argv[1]
    job_location = sys.argv[2]
except IndexError:
    raise Exception('Program needs a job title and job location inputted!')

# Issue the job query.
# Glassdoor is driven through its search form (Selenium), not a raw URL.
base_URL = 'https://www.glassdoor.com/index.htm'
query_params = (('KeywordSearch', job_title),
                ('LocationSearch', job_location))
driver = issue_driver_query(base_URL, query_params)

# Find the text holding the number of jobs, and parse it.
# Randomized sleep to let the page render (and look less bot-like).
time.sleep(random.randint(7, 15))
num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text
num_jobs = int(parse_num(num_jobs_txt, 0))

# Timestamp in US/Mountain to match the other scrapers' stored records.
current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
storage_dct = {
    'job_site': 'glassdoor',
    'num_jobs': num_jobs,
    'date': current_date,
    'title': job_title,
    'location': job_location
}
store_in_mongo([storage_dct], 'job_numbers', 'glassdoor')

# Find the text holding the number of pages in the job search.
time.sleep(random.randint(2, 6))
# NOTE(review): chunk is truncated here — this `try:` body continues past
# the visible source (cf. the sibling chunk, which parses num_pages next).
try:
    num_pages_txt = driver.find_element_by_id('ResultsFooter').text
# Glassdoor scraper entry (near-duplicate of the sibling chunk, but in
# Python 2 — note the `print` statement below). CLI supplies the search terms.
try:
    job_title = sys.argv[1]
    job_location = sys.argv[2]
except IndexError:
    raise Exception('Program needs a job title and job location inputted!')

# Issue the job query.
# Glassdoor is driven through its search form (Selenium), not a raw URL.
base_URL = 'https://www.glassdoor.com/index.htm'
query_params = (('KeywordSearch', job_title),
                ('LocationSearch', job_location))
driver = issue_driver_query(base_URL, query_params)

# Find the text holding the number of jobs, and parse it.
# Randomized sleep to let the page render (and look less bot-like).
time.sleep(random.randint(7, 15))
num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text
num_jobs = int(parse_num(num_jobs_txt, 0))

# Timestamp in US/Mountain to match the other scrapers' stored records.
current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
storage_dct = {'job_site': 'glassdoor',
               'num_jobs': num_jobs,
               'date': current_date,
               'title': job_title,
               'location': job_location}
store_in_mongo([storage_dct], 'job_numbers', 'glassdoor')

# Find the text holding the number of pages in the job search.
time.sleep(random.randint(2, 6))
# A missing/unparsable footer is treated as "no results" and exits cleanly.
try:
    num_pages_txt = driver.find_element_by_id('ResultsFooter').text
    num_pages = int(parse_num(num_pages_txt, 1))
except:
    print 'No jobs for search {} in {}'.format(job_title, job_location)
    sys.exit(0)