def get_job_postings(main_link, thread_count, verbose):
    postings = []

    postings_page_dic = util.get_request_to_dic(main_link, verbose)

    #find the pagination end point
    end_points = util.extract_key(postings_page_dic, 'endPoints')
    base_url = main_link.split('.com')[0] + '.com'
    pagination_end_point = base_url
    pagination_key = "Pagination"
    for end_point in end_points:
        if end_point['type'] == pagination_key:
            pagination_end_point += end_point['uri'] + '/'
            break

    #paginate until we have all the postings
    if verbose:
        print("Scraping list of all job postings..\n")
    job_postings = []
    while True:

        #attempt to retrieve list of job postings from json response
        postings_list = util.extract_key(postings_page_dic, 'listItems')
        if postings_list is None:
            break

        paginated_urls = [JobPosting(post, base_url) for post in postings_list]

        job_postings += paginated_urls

        postings_page_dic = util.get_request_to_dic(
            pagination_end_point + str(len(job_postings)), verbose)

    if verbose:
        print("\nThere are", len(job_postings), "job postings.\n")
        print("Scraping full descriptions of each job posting..\n")
    for i in range(thread_count):
        start = int(i * len(job_postings) / thread_count)
        end = int((i + 1) * len(job_postings) / thread_count)
        #fetch descriptions for this chunk sequentially, accumulating results
        #in postings (the multiprocessing variant further down writes each
        #posting to dest_dir from a separate process instead)
        get_job_description(job_postings, start, end, postings, verbose)

    #print(postings)
    return postings
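This first example passes the shared postings list where the later examples' get_job_description takes dest_dir, so it evidently relies on an in-memory variant that is not shown on this page. A minimal sketch of what that variant might look like, assuming it mirrors the file-writing versions below but appends to the list instead:

def get_job_description(job_postings, start, end, postings, verbose=False):
    #hypothetical in-memory variant: appends each enriched posting dict to
    #the shared postings list instead of writing one file per posting
    for i in range(start, end):
        job_posting = job_postings[i]
        job_page_dic = util.get_request_to_dic(job_posting.url, verbose)
        description = util.extract_key(job_page_dic, 'description')
        job_info = job_posting.info
        job_info['link'] = job_posting.url
        job_info['description'] = description
        postings.append(job_info)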
Code example #2
File: crawler.py  Project: Rajat-Draup/Draup-Scrapers
def get_job_description(job_postings, start, end, dest_dir, verbose=False):
    for i in range(start, end):
        job_posting = job_postings[i]
        job_page_dic = util.get_request_to_dic(job_posting.url, verbose)
        description = util.extract_key(job_page_dic, 'description')
        job_info = job_posting.info
        job_info['description'] = description
        util.write_to_file(job_posting.ID, job_info, dest_dir)
Code example #3
File: crawler.py  Project: Rajat-Draup/Draup-Scrapers
def get_job_postings(main_link, dest_dir, thread_count, verbose):
    postings_page_dic = util.get_request_to_dic(main_link, verbose)
    #find the pagination end point
    end_points = util.extract_key(postings_page_dic, 'endPoints')
    base_url = main_link.split('.com')[0] + '.com'
    pagination_end_point = base_url
    pagination_key = "Pagination"
    for end_point in end_points:
        if end_point['type'] == pagination_key:
            pagination_end_point += end_point['uri'] + '/'
            break
    if verbose:
        print("Scraping list of all job postings..\n")
    job_postings = []
    while True:
        postings_list = util.extract_key(postings_page_dic, 'listItems')
        if postings_list is None:
            break
        paginated_urls = [JobPosting(post, base_url) for post in postings_list]
        job_postings += paginated_urls
        if verbose:
            print(pagination_end_point + str(len(job_postings)))
        postings_page_dic = util.get_request_to_dic(
            pagination_end_point + str(len(job_postings)), verbose)
    if verbose:
        print("\nThere are", len(job_postings), "job postings.\n")
        print("Scraping full descriptions of each job posting..\n")
    threads = []
    for i in range(thread_count):
        #give each process a contiguous [start, end) slice of the postings
        start = int(i * len(job_postings) / thread_count)
        end = int((i + 1) * len(job_postings) / thread_count)
        thread = Process(target=get_job_description,
                         args=(job_postings, start, end, dest_dir, verbose))
        threads.append(thread)
        thread.start()
    for i in range(thread_count):
        threads[i].join()
    if verbose:
        print("\nDone. All files stored under", dest_dir)
Code example #4
def get_job_description(job_postings, start, end, dest_dir, verbose=False):
    '''
    Iterates through [start, end) portion of the job postings, retrieves their full description, and writes to file
    Input:
        job_postings: list of JobPosting
        start: start index
        end: end index
        dest_dir: write path for file storage
        verbose: whether to print progress output
    Returns:
        No return, writes to file
    '''
    for i in range(start, end):
        job_posting = job_postings[i]
        job_page_dic = util.get_request_to_dic(job_posting.url, verbose)
        description = util.extract_key(job_page_dic, 'description')
        job_info = job_posting.info
        job_info['link'] = job_posting.url
        job_info['description'] = description
        util.write_to_file(job_posting.ID, job_info, dest_dir)
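Every example above depends on a project-local util module and a JobPosting wrapper that are not included on this page. The sketch below reconstructs the minimal interfaces the code assumes (get_request_to_dic, extract_key, write_to_file, and JobPosting); the implementations are guesses inferred from how they are used, not the project's actual code.

import json
import os
import requests


def get_request_to_dic(url, verbose=False):
    #assumed helper: GET a URL and parse the JSON body into a dict
    if verbose:
        print("GET", url)
    response = requests.get(url)
    return response.json()


def extract_key(dic, key):
    #assumed helper: depth-first search for key anywhere in a nested
    #dict/list structure; returns the first match, or None if absent
    if isinstance(dic, dict):
        if key in dic:
            return dic[key]
        children = dic.values()
    elif isinstance(dic, list):
        children = dic
    else:
        return None
    for child in children:
        found = extract_key(child, key)
        if found is not None:
            return found
    return None


def write_to_file(ID, info, dest_dir):
    #assumed helper: dump one posting's info dict to <dest_dir>/<ID>.json
    os.makedirs(dest_dir, exist_ok=True)
    with open(os.path.join(dest_dir, str(ID) + '.json'), 'w') as f:
        json.dump(info, f, indent=2)


class JobPosting:
    #assumed wrapper around one 'listItems' entry; the examples above only
    #use .url, .info, and .ID, so the field names below are guesses
    def __init__(self, post, base_url):
        self.info = post
        self.ID = post.get('id')
        self.url = base_url + post.get('uri', '')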