def scrap(job_title, job_location, radius, result_nb):
    """Scrape Indeed France job postings for a search and pickle the results.

    Builds an indeed.fr query from the title/location/radius, reads the result
    count off the search page, then pulls postings page-by-page (10 results per
    page) via ``multiprocess_pages`` and dumps the accumulated list to
    ``db_<title>_<location>_<radius>_<result_nb>.pkl``.

    Parameters:
        job_title: search terms, whitespace-separated (joined with '+').
        job_location: location terms, whitespace-separated (joined with '+').
        radius: search radius passed through as the ``rq`` query parameter.
        result_nb: number of results to request; paged in steps of 10.

    Returns:
        list: postings as returned by ``multiprocess_pages`` (project helper —
        exact element type not visible from here).

    Exits:
        Calls ``sys.exit(0)`` when the result count cannot be parsed
        (interpreted as "no jobs for this search").
    """
    base_URL = 'http://www.indeed.fr/emplois?'
    query_parameters = [
        'q={}'.format('+'.join(job_title.split())),
        '&l={}'.format('+'.join(job_location.split())),
        '&rq={}'.format(radius),
        '&sort=date',
        '&fromage=last'
    ]
    query_URL = format_query(base_URL, query_parameters)
    print(query_URL)
    html = get_html(query_URL)
    # Keep the try body minimal: only the count lookup/parse can fail here.
    # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt;
    # `except Exception` preserves the "no jobs" fallback without that.
    try:
        num_jobs_txt = str(html.select('#searchCount'))
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    # NOTE(review): built but never persisted in this function — the sibling
    # scrapers store an equivalent dict via store_in_mongo; confirm intent.
    storage_dct = {
        'job_site': 'indeed',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }

    # Cycle through all of the job postings that we can and grab the url
    # pointing to it, to then query it. All of the jobs should be available
    # via the .turnstileLink class, and then the href attribute will point
    # to the URL. (Removed dead locals max_start_position/start_positions:
    # they were computed but the loop below pages by result_nb instead.)
    db_path = "".join([
        "db_", job_title, "_", job_location, "_", str(radius), "_",
        str(result_nb)
    ])
    jobs = []
    for i in range(0, result_nb, 10):
        # Best-effort: a failed page fetch is skipped rather than aborting
        # the whole scrape (original left a "retry?" note here).
        try:
            jobs.extend(
                multiprocess_pages(query_URL, job_title, job_location, i))
        except RuntimeError:
            pass  # retry ?
    # Pickle data is binary: open with 'wb' (was 'w'); `with` handles close,
    # so the redundant explicit f.close() is gone.
    with open("".join([db_path, ".pkl"]), 'wb') as f:
        cPickle.dump(jobs, f)
    return jobs
# ZipRecruiter scraper entry: the job title, location and radius are required
# CLI arguments; anything missing raises immediately.
try:
    job_title = sys.argv[1]
    job_location = sys.argv[2]
    radius = sys.argv[3]
except IndexError:
    raise Exception(
        'Program needs a job title, job location, and radius inputted!')
# Build the search URL: terms are '+'-joined, results limited to the last
# 5 days, near-duplicate postings included.
base_URL = 'https://www.ziprecruiter.com/candidate/search?'
query_parameters = [
    'search={}'.format('+'.join(job_title.split())),
    '&location={}'.format('+'.join(job_location.split())),
    '&radius={}'.format(radius),
    '&days=5',
    '&include_near_duplicates=1'
]
query_URL = format_query(base_URL, query_parameters)
#print (query_URL)
html = get_html(query_URL)
# Read the result count off the page; if the headline element is absent or
# unparseable, treat it as "no jobs" and exit cleanly.
# NOTE(review): bare `except:` also traps SystemExit/KeyboardInterrupt —
# consider narrowing to `except Exception`.
try:
    num_jobs_txt = str(html.select('#job_results_headline')[0].text)
    num_jobs = int(parse_num(num_jobs_txt, 0))
    print(num_jobs)
except:
    print('No jobs for search {} in {}'.format(job_title, job_location))
    sys.exit(0)
current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
# Record of this run's headline numbers (dict literal continues beyond the
# visible end of this chunk).
storage_dct = {
    'job_site': 'ziprecruiter',
    'num_jobs': num_jobs,
# Monster scraper entry.
# I expect that at the very least a job title, job location, and radius
# will be passed in, so I'll attempt to get both of those within
# a try except and throw an error otherwise.
try:
    job_title = sys.argv[1]
    job_location = sys.argv[2]
    radius = sys.argv[3]
except IndexError:
    raise Exception('Program needs a job title, job location, and radius inputted!')

# Monster joins multi-word terms with '-' (unlike the '+' used by the other
# scrapers); results sorted by date via sort=dt.rv.di.
base_URL = 'http://jobs.monster.com/search/?'
query_parameters = ['q={}'.format('-'.join(job_title.split())),
                    '&where={}'.format('-'.join(job_location.split())),
                    '&sort=dt.rv.di',
                    '&rad={}'.format(radius)]
query_URL = format_query(base_URL, query_parameters)
# Monster needs a browser driver (project helper) rather than a plain GET.
driver = issue_driver_query(query_URL)

# Parse the result count; failure means "no jobs" -> clean exit.
# Was a bare `except:` with a Python-2 print statement; `except Exception`
# no longer traps SystemExit/KeyboardInterrupt, and the parenthesized
# print(...) form matches the sibling scrapers and is valid on 2 and 3.
try:
    num_jobs_txt = get_num_jobs_txt(driver)
    num_jobs = int(parse_num(num_jobs_txt, 0))
except Exception:
    print('No jobs for search {} in {}'.format(job_title, job_location))
    sys.exit(0)

current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
storage_dct = {'job_site': 'monster', 'num_jobs': num_jobs,
               'date': current_date, 'title': job_title,
               'location': job_location}
store_in_mongo([storage_dct], 'job_numbers', 'monster')
# This loop will be used to keep clicking the next button after