import util
from multiprocessing import Process
# JobPosting, the project-local wrapper around a single listing, is assumed to
# be imported from elsewhere in this package.


def get_job_postings(main_link, thread_count, verbose):
    postings = []
    postings_page_dic = util.get_request_to_dic(main_link, verbose)

    # find the pagination endpoint
    end_points = util.extract_key(postings_page_dic, 'endPoints')
    base_url = main_link.split('.com')[0] + '.com'
    pagination_end_point = base_url
    pagination_key = "Pagination"
    for end_point in end_points:
        if end_point['type'] == pagination_key:
            pagination_end_point += end_point['uri'] + '/'
            break

    # paginate until we have all the postings
    if verbose:
        print("Scraping list of all job postings..\n")
    job_postings = []
    while True:
        # attempt to retrieve the list of job postings from the json response
        postings_list = util.extract_key(postings_page_dic, 'listItems')
        if postings_list is None:
            break
        paginated_urls = [JobPosting(post, base_url) for post in postings_list]
        job_postings += paginated_urls
        postings_page_dic = util.get_request_to_dic(
            pagination_end_point + str(len(job_postings)), verbose)

    if verbose:
        print("\nThere are", len(job_postings), "job postings.\n")
        print("Scraping full descriptions of each job posting..\n")

    # fetch descriptions serially, one contiguous slice at a time
    for i in range(thread_count):
        start = int(i * len(job_postings) / thread_count)
        end = int((i + 1) * len(job_postings) / thread_count)
        get_job_description(job_postings, start, end, postings, verbose)

    return postings

def get_job_description(job_postings, start, end, postings, verbose=False):
    '''
    Iterates through the [start, end) slice of job_postings, retrieves each
    posting's full description, and appends the enriched info dict to postings
    '''
    for i in range(start, end):
        job_posting = job_postings[i]
        job_page_dic = util.get_request_to_dic(job_posting.url, verbose)
        description = util.extract_key(job_page_dic, 'description')
        job_info = job_posting.info
        job_info['description'] = description
        postings.append(job_info)

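# ---------------------------------------------------------------------------
# Quick illustration (an added helper, not part of the original scraper) of
# the slicing scheme used by both get_job_postings variants in this file:
# worker i handles the half-open index range
# [i * n / thread_count, (i + 1) * n / thread_count). Adjacent ranges share
# their boundary expression, so together they cover every index exactly once
# even when n is not divisible by thread_count.
# ---------------------------------------------------------------------------
def _chunk_bounds(n, thread_count):
    '''Yield one (start, end) index pair per worker.'''
    for i in range(thread_count):
        yield int(i * n / thread_count), int((i + 1) * n / thread_count)

# e.g. list(_chunk_bounds(10, 3)) == [(0, 3), (3, 6), (6, 10)]
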
def get_job_postings(main_link, dest_dir, thread_count, verbose):
    postings_page_dic = util.get_request_to_dic(main_link, verbose)

    # find the pagination endpoint
    end_points = util.extract_key(postings_page_dic, 'endPoints')
    base_url = main_link.split('.com')[0] + '.com'
    pagination_end_point = base_url
    pagination_key = "Pagination"
    for end_point in end_points:
        if end_point['type'] == pagination_key:
            pagination_end_point += end_point['uri'] + '/'
            break

    # paginate until we have all the postings
    if verbose:
        print("Scraping list of all job postings..\n")
    job_postings = []
    while True:
        postings_list = util.extract_key(postings_page_dic, 'listItems')
        if postings_list is None:
            break
        paginated_urls = [JobPosting(post, base_url) for post in postings_list]
        job_postings += paginated_urls
        postings_page_dic = util.get_request_to_dic(
            pagination_end_point + str(len(job_postings)), verbose)

    if verbose:
        print("\nThere are", len(job_postings), "job postings.\n")
        print("Scraping full descriptions of each job posting..\n")

    # split the postings into contiguous slices and fetch descriptions in parallel
    threads = []
    for i in range(thread_count):
        start = int(i * len(job_postings) / thread_count)
        end = int((i + 1) * len(job_postings) / thread_count)
        thread = Process(target=get_job_description,
                         args=(job_postings, start, end, dest_dir, verbose))
        threads.append(thread)
        thread.start()

    for i in range(thread_count):
        threads[i].join()

    if verbose:
        print("\nDone. All files stored under", dest_dir)

def get_job_description(job_postings, start, end, dest_dir, verbose=False):
    '''
    Iterates through the [start, end) portion of the job postings, retrieves
    their full description, and writes to file

    Input:
        job_postings: list of JobPosting
        start: start index
        end: end index
        dest_dir: write path for file storage

    Returns:
        No return, writes to file
    '''
    for i in range(start, end):
        job_posting = job_postings[i]
        job_page_dic = util.get_request_to_dic(job_posting.url, verbose)
        description = util.extract_key(job_page_dic, 'description')
        job_info = job_posting.info
        job_info['link'] = job_posting.url
        job_info['description'] = description
        util.write_to_file(job_posting.ID, job_info, dest_dir)

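# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original module) of how
# the multiprocessing variant above might be driven. The careers URL and the
# output directory are placeholders. The __main__ guard matters because
# multiprocessing.Process re-imports this module in child processes on
# platforms that use the "spawn" start method.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    careers_url = 'https://company.example.com/careers'  # placeholder listing URL
    output_dir = 'postings'                              # placeholder destination directory
    get_job_postings(careers_url, output_dir, thread_count=4, verbose=True)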