Example 1
def multiprocess_pages(base_URL, job_title, job_location, page_start): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Indeed URL used for job searching takes another parameter, 
    `start`, that allows you to start the job search at jobs 10-20, 
    20-30, etc. I can use this to grab job results from multiple pages at
    once. This function takes in the base_URL and then adds that
    start={page_start} parameter to the URL, and then queries it. 
    It passes the results on to a thread to grab the details from each
    job posting.

    Args: 
        base_URL: String that holds the base URL to add the page_start 
            parameter to. 
        job_title: String holding the job title used for the search
        job_location: String holding the job location used for the search 
        page_start: Integer of what the `start` parameter in the URL should
            be set to. 
    """

    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job. 
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows: 
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
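
Note that multiprocess_pages is not called directly in this example; the later `__main__` blocks in this collection drive it through a multiprocessing pool with functools.partial. A minimal sketch of that driver pattern follows; the run_search wrapper is hypothetical, while the pool/partial usage mirrors the later examples.

from functools import partial
import multiprocessing


def run_search(query_URL, job_title, job_location, num_jobs):
    # Indeed pages in steps of 10 via the `start` parameter, so build one
    # start position per page and fan the queries out across processes.
    max_start_position = 1000 if num_jobs >= 1000 else num_jobs
    start_positions = range(0, max_start_position, 10)
    execute_queries = partial(multiprocess_pages, query_URL, job_title,
                              job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, start_positions)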
Example 2
def multiprocess_pages(base_URL, job_title, job_location, page_start):
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Indeed URL used for job searching takes another parameter, `start`, that 
    allows you to start the job search at jobs 10-20, 20-30, etc. Use this to grab
    job results from multiple pages at once, passing the result from a page on to
    a thread to grab the details from each job posting. 
    
    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_start: int 
    """

    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job.
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
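
RequestInfoThread itself is not included in these examples. Judging from how it is used (constructed with a result row and the search terms, started, joined, and then read through its json_dct attribute), it is a threading.Thread subclass that parses one posting per thread. A minimal sketch under that assumption; the CSS selector and the keys built inside run() are illustrative only.

from threading import Thread


class RequestInfoThread(Thread):
    """Parse one job-posting row in its own thread (illustrative sketch)."""

    def __init__(self, row, job_title, job_location):
        super(RequestInfoThread, self).__init__()
        self.row = row
        self.job_title = job_title
        self.job_location = job_location
        self.json_dct = {}

    def run(self):
        # Hypothetical parsing: pull the posting link out of the row's HTML.
        link = self.row.select_one('a')
        self.json_dct = {'search_title': self.job_title,
                         'search_location': self.job_location,
                         'href': link.get('href') if link else None}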
Example 3
def multiprocess_pages(base_URL, job_title, job_location, page_number): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Simply Hired URL used for job searching takes another parameter, `pn`, that
    allows you to start the job search at jobs 11-20, 21-30, etc. Use this to grab
    job results from multiple pages at once, and then feed the jobs from each page
    to threads for further parsing. 

    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_number: int 
    """

    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each row corresponds to a job. 
    jobs = html.select('.js-job')
    threads = []
    mongo_update_lst = []
    for job in jobs: 
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    
    store_in_mongo(mongo_update_lst, 'job_postings', 'simplyhired')
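
get_html is likewise not defined in these examples; since its result is queried with CSS selectors via .select(), it presumably fetches the page and returns a parsed BeautifulSoup object. A minimal sketch, assuming requests and bs4:

import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch a URL and return it parsed as a BeautifulSoup object (sketch)."""
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    return BeautifulSoup(response.content, 'html.parser')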
Example 4
def multiprocess_pages(base_URL, job_title, job_location, page_start): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Indeed URL used for job searching takes another parameter, `start`, that 
    allows you to start the job search at jobs 10-20, 20-30, etc. Use this to grab
    job results from multiple pages at once, passing the result from a page on to
    a thread to grab the details from each job posting. 
    
    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_start: int 
    """

    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job. 
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows: 
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
Example 5
def multiprocess_pages(query_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info. from job postings on the page. 

    The ZipRecruiter URL used for job searching takes an additional parameter,   
    `page`, that allows you to start the job search at page 0-20 (20 is the max). 
    Use this to grab job results from multiple pages at once, and then pass jobs
    on to threads to grab relevant info. 

    Args: 
    ----
        query_URL: str
        job_title: str
        job_location: str
        page_num: int
    """
    url = query_URL + '&page=' + str(page_num)
    html = get_html(url)

    rows = html.select('.job_content')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter_final')
Example 6
def multiprocess_pages(base_URL, job_title, job_location, page_num): 
    """Grab the URLs and other relevant info. from job postings on the page. 

    The ZipRecruiter URL used for job searching takes an additional 
    parameter, `page`, that allows you to start the job search at page 
    0-20 (20 is the max). I can use this to grab job results from multiple
    pages at once. This function takes in the base_URL, and then
    adds that page={page_num} parameter to the URL, and then queries it. 
    It passes the results on to a thread to grab the details from each 
    job posting. 

    Args: 
        base_URL: String that holds the base URL to add the page_num 
            parameter to. 
        job_title: String holding the job title used for the search 
        job_location: String holding the job location used for the search
        page_num: Integer of what the `page` parameter in the URL should
            be set to. 
    """

    url = base_URL + '&page=' + str(page_num)
    html = get_html(url)
    rows = html.select('.job_result')
    threads = []
    mongo_update_lst = []
    for row in rows: 
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter')
Example 7
def multiprocess_pages(base_URL, job_title, job_location, page_number):
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Simply Hired URL used for job searching takes another parameter, `pn`, that
    allows you to start the job search at jobs 11-20, 21-30, etc. Use this to grab
    job results from multiple pages at once, and then feed the jobs from each page
    to threads for further parsing. 

    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_number: int 
    """

    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each row corresponds to a job.
    jobs = html.select('.jobs')
    threads = []
    mongo_update_lst = []
    for job in jobs:
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'monster')
Example 8
def scrape_job_page(driver, job_title, job_location):
    """Scrape a page of jobs from Glassdoor. 

    Grab everything that is possible or relevant for each of the 
    jobs posted on a given page. This will typically include the job title, 
    job location, posting company, date posted, and any stars assigned 
    (if any). Parse the relevant information, and then store it. 

    Args: 
        driver: Selenium webdriver
        job_title: str
        job_location: str
    """
    
    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title, \
            'search_location': job_location, \
            'search_date': current_date, 'job_site': 'glassdoor'}

    jobs = driver.find_elements_by_class_name('jobListing')

    mongo_update_lst = [query_for_data(driver, json_dct, job, idx) for 
            idx, job in enumerate(jobs[:-1])]

    store_in_mongo(mongo_update_lst, 'job_postings', 'glassdoor')
Example 9
def multiprocess_pages(base_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info. from job postings on the page. 

    The ZipRecruiter URL used for job searching takes an additional parameter,   
    `page`, that allows you to start the job search at page 0-20 (20 is the max). 
    Use this to grab job results from multiple pages at once, and then pass jobs
    on to threads to grab relevant info. 

    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_num: int
    """

    url = query_URL + "&page=" + str(page_num)
    html = get_html(url)
    rows = html.select(".job_result")
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, "job_postings", "ziprecruiter")
Example 10
def multiprocess_pages(base_URL, job_title, job_location, page_start):
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Indeed URL used for job searching takes another parameter, 
    `start`, that allows you to start the job search at jobs 10-20, 
    20-30, etc. I can use this to grab job results from multiple pages at
    once. This function takes in the base_URL and then adds that
    start={page_start} parameter to the URL, and then queries it. 
    It passes the results on to a thread to grab the details from each
    job posting.

    Args: 
        base_URL: String that holds the base URL to add the page_start 
            parameter to. 
        job_title: String holding the job title used for the search
        job_location: String holding the job location used for the search 
        page_start: Integer of what the `start` parameter in the URL should
            be set to. 
    """

    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job.
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
Example 11
def scrape_job_page(driver, job_title, job_location):
    """Scrape a page of jobs from Glassdoor. 

    Grab everything that is possible or relevant for each of the 
    jobs posted on a given page. This will typically include the job title, 
    job location, posting company, date posted, and any stars assigned 
    (if any). Parse the relevant information, and then store it. 

    Args: 
        driver: Selenium webdriver
        job_title: str
        job_location: str
    """

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title, \
            'search_location': job_location, \
            'search_date': current_date, 'job_site': 'glassdoor'}

    jobs = driver.find_elements_by_class_name('jobListing')

    mongo_update_lst = [
        query_for_data(driver, json_dct, job, idx)
        for idx, job in enumerate(jobs[:-1])
    ]

    store_in_mongo(mongo_update_lst, 'job_postings', 'glassdoor')
Example 12
    def __exit__(self, *args):
        """Ensure that any URLs scraped for get their text attributes updated."""

        store_in_mongo(self.articles_to_scrape,
                       self.db_name,
                       self.coll_name,
                       key='web_url')
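
This __exit__ belongs to a class that is not shown. From the attributes it touches, the enclosing object presumably collects article URLs during scraping and flushes them to Mongo when the with-block ends. A hedged sketch of that shape; the class name and the __enter__ body are assumptions.

class ArticleScraper(object):
    """Hypothetical context manager that flushes scraped articles on exit."""

    def __init__(self, db_name, coll_name):
        self.db_name = db_name
        self.coll_name = coll_name
        self.articles_to_scrape = []

    def __enter__(self):
        return self

    def __exit__(self, *args):
        """Ensure that any URLs scraped for get their text attributes updated."""
        store_in_mongo(self.articles_to_scrape, self.db_name, self.coll_name,
                       key='web_url')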
Example 13
def scrape_job_page(driver, job_title, job_location):
    """Scrape a page of jobs from Monster.

    Grab everything that is possible (or relevant) for each of the 
    jobs posted for a given page. This will typically include the job title, 
    job location, posting company, and the date posted. From the given 
    href, click the job posting itself and grab the text. Lastly, store
    all of this in Mongo. 

    Args: 
    ----
        driver: Selenium webdriver
        job_title: str
            Job title to use in the search query. 
        job_location: str
            Job location to use in the search query.
    """

    titles, locations, companies, dates, hrefs = query_for_data(driver)


    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title, \
            'search_location': job_location, \
            'search_date': current_date, 'job_site': 'monster'}

    thread_lst = []
    for href in hrefs: 
        try: 
            thread = HrefQueryThread(href.get_attribute('href'))
        except: 
            print 'Exception in href thread builder' 
            thread = HrefQueryThread('')
        thread_lst.append(thread)
        thread.start()
    mongo_update_lst = []
    for title, location, company, date, thread in \
            izip(titles, locations, companies, dates, thread_lst): 
        try: 
            mongo_dct = gen_output(json_dct.copy(), title, location, 
                    company, date, thread)
            mongo_update_lst.append(mongo_dct)
        except: 
            print 'Missed element in Monster!'


    store_in_mongo(mongo_update_lst, 'job_postings', 'monster')
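
HrefQueryThread is another helper that is not shown. From its usage (constructed with an href string, started, and later handed to gen_output along with the parsed fields), it presumably downloads the posting text in the background. A sketch under that assumption; the posting_txt attribute name is hypothetical.

from threading import Thread


class HrefQueryThread(Thread):
    """Fetch a job posting's text in the background (illustrative sketch)."""

    def __init__(self, href):
        super(HrefQueryThread, self).__init__()
        self.href = href
        self.posting_txt = ''  # Hypothetical attribute read later by gen_output.

    def run(self):
        if not self.href:
            return
        try:
            soup = get_html(self.href)
            self.posting_txt = soup.get_text()
        except Exception:
            self.posting_txt = ''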
Example 14
def scrape_job_page(driver, job_title, job_location): 
    """Scrape a page of jobs from CareerBuilder.

    Grab all relevant information possible for each of the jobs 
    posted on a given page. This will typically include the job title, 
    job location, posting company, and date posted. Parse that data, 
    and then store it in Mongo. 

    Args: 
    ----
        driver: Selenium webdriver
        job_title: str
            String holding the job title we searched for. 
        job_location: str
            String holding the job location we searched for. 
    """
    titles, locations, companies, dates, hrefs = query_for_data(driver)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title, \
            'search_location': job_location, \
            'search_date': current_date, 'job_site': 'careerbuilder'}

    thread_lst = []
    for href in hrefs:  
        try: 
            thread = HrefQueryThread(href.get_attribute('href'))
        except: 
            print 'Exception in href thread builder' 
            thread = HrefQueryThread('')
        thread_lst.append(thread)
        thread.start()
    mongo_update_lst = []
    for title, location, company, date, thread, idx in \
            izip(titles, locations, companies, dates, thread_lst, range(len(hrefs))): 
        try: 
            mongo_dct = gen_output(json_dct.copy(), title, location, 
                    company, date, thread, idx)
            mongo_update_lst.append(mongo_dct)
        except: 
            print 'Missed element in careerbuilder!'

    store_in_mongo(mongo_update_lst, 'job_postings', 'careerbuilder')
Example 15
def scrape_job_page(driver, job_title, job_location): 
    """Scape a page of jobs from CareerBuilder.

    We will grab all the relevant information that we can for each of 
    the jobs posted on a given page. This will include the job title, 
    job location, posting company, and date posted. 

    Args: 
        driver: Selenium webdriver
        job_title: str
            String holding the job title we searched for. 
        job_location: str
            String holding the job location we searched for. 
    """

    titles, locations, companies, dates, hrefs = query_for_data(driver)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title, \
            'search_location': job_location, \
            'search_date': current_date, 'job_site': 'careerbuilder'}

    thread_lst = []
    for href in hrefs:  
        try: 
            thread = HrefQueryThread(href.get_attribute('href'))
        except: 
            print 'Exception in href thread builder' 
            thread = HrefQueryThread('')
        thread_lst.append(thread)
        thread.start()
    mongo_update_lst = []
    for title, location, company, date, thread, idx in \
            izip(titles, locations, companies, dates, thread_lst, range(len(hrefs))): 
        try: 
            mongo_dct = gen_output(json_dct.copy(), title, location, 
                    company, date, thread, idx)
            mongo_update_lst.append(mongo_dct)
        except: 
            print 'Missed element in careerbuilder!'

    store_in_mongo(mongo_update_lst, 'job_postings', 'careerbuilder')
Example 16
def scrape_job_page(driver, job_title, job_location):
    """Scrape a page of jobs from Monster.

    Grab everything that is possible (or relevant) for each of the jobs posted 
    for a given page. This will typically include the job title, job location,
    posting company, the date posted, and the posting text. 

    Args: 
    ----
        driver: Selenium webdriver
        job_title: str
        job_location: str
    """

    titles, locations, companies, dates, hrefs = query_for_data(driver)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title, \
            'search_location': job_location, \
            'search_date': current_date, 'job_site': 'monster'}

    thread_lst = []
    for href in hrefs:
        try:
            thread = HrefQueryThread(href.get_attribute('href'))
        except:
            print('Exception in href thread builder')
            thread = HrefQueryThread('')
        thread_lst.append(thread)
        thread.start()
    mongo_update_lst = []
    for title, location, company, date, thread in \
            zip(titles, locations, companies, dates, thread_lst):
        try:
            mongo_dct = gen_output(json_dct.copy(), title, location, company,
                                   date, thread)
            mongo_update_lst.append(mongo_dct)
        except:
            print('Missed element in Monster!')

    store_in_mongo(mongo_update_lst, 'job_postings', 'monster')
Example 17
def multiprocess_pages(base_URL, job_title, job_location, page_number): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Simply Hired URL used for job searching takes another parameter, 
    `pn`, that allows you to start the job search at jobs 11-20, 
    21-30, etc. Use this to grab job results from multiple pages at
    once. 
    
    This function takes in the base_URL, then adds that
    pn={page_number} parameter to the URL, and then queries it. 
    It passes the results on to a thread to grab the details from each
    job posting.


    Args: 
    ----
        base_URL: str 
            Holds the base URL to add the page_number parameter to.
        job_title: str 
            Holds the job title used for the search. 
        job_location: str 
            Holds the job location used for the search. 
        page_number: int 
            Holds what the `pn` parameter in the URL should be set to.
    """

    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each row corresponds to a job. 
    jobs = html.select('.js-job')
    threads = []
    mongo_update_lst = []
    for job in jobs: 
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    
    store_in_mongo(mongo_update_lst, 'job_postings', 'simplyhired')
Example 18
    while attribute.find('Other') == -1:
        values[attribute] = value
        points_misc_idx += 1
        # The value is always the last item present, surrounded by (), and the 
        # 1+ items before that are the attributes to which those points belong. 
        split_text = sum_points_misc_lst[points_misc_idx].split()
        attribute = ' '.join(split_text[:-1])
        value = split_text[-1].replace('(', '').replace(')', '')
    values[attribute] = value
    points_misc_idx += 1

    return values, points_misc_idx 

if __name__ == '__main__':
    try: 
        year = sys.argv[1]
    except Exception as e: 
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL) 

    css_selectors = ['.artistTitle', '.albumTitle', '.summaryPoints', 
                     '.summaryPointsMisc']
    desired_contents = select_soup(soup, css_selectors)
    desired_contents_text = grab_contents_key(desired_contents, "text")
    desired_contents_renamed = rename_keys(desired_contents_text)
    final_lst = parse_contents(desired_contents_renamed)
    store_in_mongo(final_lst, 'music', 'music_lists')
Example 19
        job_location = sys.argv[2]
    except IndexError: 
        raise Exception('Program needs a job title and job location inputted!')

    
    # Navigate to the base URL
    base_URL = 'http://www.careerbuilder.com/'
    query_params = (('search-key', job_title), ('search-loc', job_location))
    driver = issue_driver_query(base_URL, query_params)

    # Grab num. jobs
    try: 
        num_jobs_txt = driver.find_element_by_id('n_pnlJobResultsCount').text
        num_jobs = int(parse_num(num_jobs_txt, 0)) 
    except: 
        print 'No jobs for search {} in {}'.format(job_title, job_location)
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'careerbuilder', 'num_jobs': num_jobs, 
            'date': current_date, 'title': job_title, 'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'careerbuilder')

    # This loop will be used to keep clicking the next button after
    # scraping jobs on that page. 
    is_next = True
    while is_next: 
        jobs = scrape_job_page(driver, job_title, job_location)
        is_next = check_if_next(driver)
    driver.close()
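
check_if_next does not appear in these fragments; given the while loop above, it presumably clicks the next-page control if one exists and returns whether it did. A minimal Selenium sketch, with a placeholder selector:

def check_if_next(driver):
    """Click the next-page button if present; report whether it existed (sketch)."""
    try:
        # '.next-page' is a placeholder; the real selector is site-specific.
        next_button = driver.find_element_by_css_selector('.next-page')
        next_button.click()
        return True
    except Exception:
        return False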
Example 20
        'q={}'.format('-'.join(job_title.split())),
        '&where={}'.format('-'.join(job_location.split())), '&sort=dt.rv.di',
        '&rad={}'.format(radius)
    ]

    query_URL = format_query(base_URL, query_parameters)
    driver = issue_driver_query(query_URL)

    try:
        num_jobs_txt = get_num_jobs_txt(driver)
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'monster',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'monster')

    is_next = True
    while is_next:
        scrape_job_page(driver, job_title, job_location)
        is_next = check_if_next(driver)
    driver.close()
Example 21
def format_output(raw_output):
    '''
    Input: Dictionary
    Output: List

    Reformat the dictionary so that we can easily insert it into our Mongo
    database. Basically, right now the dictionary consists of album titles 
    as the keys, and lists of their ratings on critics lists as the values. 
    We need it to be a list of dictionaries in the format: 
    
    {'Album Title': album_title, 'Critics Scores' : critics_scores_lst}
    '''

    output_lst = [{"Album Title": k, "Critics Scores": v} for \
            k, v in raw_output.iteritems()]
    return output_lst


if __name__ == '__main__':
    lists_url = 'http://www.albumoftheyear.org/lists.php'

    soup = get_html(lists_url)
    critics_content = select_soup(soup, '.criticListBlockTitle')
    critics_names = grab_contents_key(critics_content, "text")
    critics_links = grab_contents_key(critics_content, 'a')
    critics_hrefs = grab_contents_key(critics_links, 'href')

    raw_output = grab_critics_info(critics_names, critics_hrefs)
    formatted_output = format_output(raw_output)
    store_in_mongo(formatted_output, 'music', 'music_lists', key="Album Title")
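
store_in_mongo is called throughout with a list of dicts, a database name, a collection name, and an optional key used when documents should be updated rather than duplicated. A minimal pymongo sketch consistent with that call signature; the upsert-on-key behavior is an assumption.

from pymongo import MongoClient


def store_in_mongo(lst, db_name, coll_name, key=None):
    """Insert (or upsert on `key`) a list of dicts into Mongo (illustrative sketch)."""
    client = MongoClient()
    collection = client[db_name][coll_name]
    if key is None:
        if lst:
            collection.insert_many(lst)
    else:
        for dct in lst:
            collection.update_one({key: dct[key]}, {'$set': dct}, upsert=True)
    client.close()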
Example 22
    except IndexError: 
        raise Exception('Program needs a job title, job location, and radius inputted!')

    base_URL = 'http://jobs.monster.com/search/?'
    query_parameters = ['q={}'.format('-'.join(job_title.split())), 
            '&where={}'.format('-'.join(job_location.split())), '&sort=dt.rv.di', 
            '&rad={}'.format(radius)]

    query_URL = format_query(base_URL, query_parameters)
    driver = issue_driver_query(query_URL)
    
    try: 
        num_jobs_txt = get_num_jobs_txt(driver)
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except: 
        print 'No jobs for search {} in {}'.format(job_title, job_location)
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'monster', 'num_jobs': num_jobs, 
            'date': current_date, 'title': job_title, 'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'monster')
    
    # This loop will be used to keep clicking the next button after
    # scraping jobs on that page. 
    is_next = True
    while is_next: 
        scrape_job_page(driver, job_title, job_location)
        is_next = check_if_next(driver)
    driver.close()
Example 23
    def __exit__(self, *args):
        """Ensure that any URLs scraped for get their text attributes updated."""

        store_in_mongo(self.articles_to_scrape, self.db_name, self.coll_name,
                       key='web_url')
Example 24
    driver = issue_driver_query(base_URL, query_params)

    # Find the text holding the number of jobs, and parse it.
    time.sleep(random.randint(7, 15))
    num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text
    num_jobs = int(parse_num(num_jobs_txt, 0))

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'glassdoor',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'glassdoor')

    # Find the text holding the number of pages in the job search.
    time.sleep(random.randint(2, 6))
    try:
        num_pages_txt = driver.find_element_by_id('ResultsFooter').text
        num_pages = int(parse_num(num_pages_txt, 1))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    # Give it a little time before starting to click and parse
    time.sleep(random.randint(6, 12))

    is_next = True
    while is_next:
Example 25
    query_URL = format_query(base_URL, query_parameters)
    html = get_html(query_URL)

    try:
        num_jobs_txt = str(html.select("#job_results_headline")[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except:
        print("No jobs for search {} in {}".format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone("US/Mountain")))
    storage_dct = {
        "job_site": "ziprecruiter",
        "num_jobs": num_jobs,
        "date": current_date,
        "title": job_title,
        "location": job_location,
    }
    store_in_mongo([storage_dct], "job_numbers", "ziprecruiter")

    # Cycle through the pages of jobs to grab all of the info. that we want. Each
    # page holds 20 jobs, so the number of pages we'll cycle through will be
    # num_jobs / 20. The caveat, though, is that they only give 20 pages to look
    # through at maximum (hence the min below).
    pages = min(20, num_jobs // 20 + 1)
    page_positions = range(1, pages + 1)
    execute_queries = partial(multiprocess_pages, query_URL, job_title, job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, page_positions)
Example 26
    # Get HTML for base query.
    html = get_html(query_URL)
    try:
        num_jobs_txt = str(html.select('#searchCount'))
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except:
        print 'No jobs for search {} in {}'.format(job_title, job_location)
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'indeed',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'indeed')

    # Now we need to cycle through all of the job postings that we can and
    # grab the url pointing to it, to then query it. All of the jobs should
    # be available via the .turnstileLink class, and then the href attribute
    # will point to the URL. I'm going to multiprocess and multithread this.
    max_start_position = 1000 if num_jobs >= 1000 else num_jobs
    # I'll need to be able to pass an iterable to the multiprocessing pool.
    start_positions = range(0, max_start_position, 10)
    execute_queries = partial(multiprocess_pages, query_URL, \
            job_title, job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, start_positions)
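
parse_num shows up in nearly every __main__ block with a piece of text and an index (0 or 2 above); it evidently pulls the idx-th number out of a string such as a "Jobs 1 to 10 of 1,234" headline. A regex-based sketch under that assumption:

import re


def parse_num(txt, idx):
    """Return the idx-th number found in txt, with commas stripped (sketch)."""
    numbers = re.findall(r'\d[\d,]*', txt)
    return numbers[idx].replace(',', '')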
Example 27
    
    # Issue the job query. 
    base_URL = 'https://www.glassdoor.com/index.htm'
    query_params = (('KeywordSearch', job_title), 
            ('LocationSearch', job_location))
    driver = issue_driver_query(base_URL, query_params)

    # Find the text holding the number of jobs, and parse it. 
    time.sleep(random.randint(7, 15))
    num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text
    num_jobs = int(parse_num(num_jobs_txt, 0)) 

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'glassdoor', 'num_jobs': num_jobs, 
            'date': current_date, 'title': job_title, 'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'glassdoor')

    # Find the text holding the number of pages in the job search. 
    time.sleep(random.randint(2, 6))
    try: 
        num_pages_txt = driver.find_element_by_id('ResultsFooter').text
        num_pages = int(parse_num(num_pages_txt, 1))
    except: 
        print 'No jobs for search {} in {}'.format(job_title, job_location)
        sys.exit(0)

    # Find all the jobs. 
    time.sleep(random.randint(6, 12))

    # This loop will be used to keep clicking the next button after
    # scraping jobs on that page. 
Example 28
        rating_txt: str
            Text that potentially holds the rating. 
        idx: int
            Holds the rating if the text does not. 

    Return: int
    """

    if len(rating_txt) >= 1: 
        rating = int(rating_txt[0].replace('.', ''))
    else: 
        rating = idx

    return rating

if __name__ == '__main__':
    lists_url = 'http://www.albumoftheyear.org/lists.php'

    soup = get_html(lists_url)
    critics_content = select_soup(soup, '.criticListBlockTitle')
    critics_names = grab_contents_key(critics_content, "text")
    critics_links = grab_contents_key(critics_content, 'a')
    critics_hrefs = grab_contents_key(critics_links, 'href')

    raw_output = grab_critics_info(critics_names, critics_hrefs)
    formatted_output = [{"Album Title": k, "Critics Scores": v} for \
            k, v in raw_output.items()]
    store_in_mongo(formatted_output, 'music', 'music_lists', 
                        key="Album Title")

Example 29
            '&include_near_duplicates=1']

    query_URL = format_query(base_URL, query_parameters)

    # Get HTML for base query. 
    html = get_html(query_URL)

    try: 
        num_jobs_txt = str(html.select('#job_results_headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except: 
        print 'No jobs for search {} in {}'.format(job_title, job_location)
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'ziprecruiter', 'num_jobs': num_jobs, 
            'date': current_date, 'title': job_title, 'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'ziprecruiter')
    
    # Here we'll cycle through the pages of jobs to grab all of the 
    # info. that we want. Each page holds 20 jobs, so the number of 
    # pages we'll cycle through will be num_jobs / 20. The caveat, though,
    # is that they only give 20 pages to look through at maximum (hence
    # the min below). 
    pages = min(20, num_jobs / 20 + 1)
    page_positions = range(1, pages + 1)
    execute_queries = partial(multiprocess_pages, query_URL,
            job_title, job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, page_positions)
Example 30
    content_txt = content.text
    score_idx = content_txt.find(score_str) 
    score_str_len = len(score_str)
    beg_idx = score_idx + score_str_len
    end_idx = beg_idx + 2    
    score = content_txt[beg_idx:end_idx]

    return score

if __name__ == '__main__': 
    try: 
        year = sys.argv[1]
    except Exception as e: 
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL) 

    css_selectors = ['.albumTitle']
    album_titles_contents = select_soup(soup, css_selectors)
    album_titles_lst = list(grab_contents_key(album_titles_contents, 'text').values())
    album_titles = album_titles_lst[0]
    album_title_links = grab_contents_key(album_titles_contents, 'a')
    album_title_hrefs = grab_contents_key(album_title_links, 'href')

    final_json_lst = process_album_title_hrefs(album_title_hrefs, album_titles)
    store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")

Example 31
    html = get_html(query_URL)

    try:
        num_jobs_txt = str(html.select('#job_results_headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 0))
        print(num_jobs)
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'ziprecruiter',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'ziprecruiter')

    # Cycle through the pages of jobs to grab all of the info. that we want. Each
    # page holds 20 jobs, so the number of pages we'll cycle through will be
    # num_jobs / 20. The caveat, though, is that they only give 20 pages to look
    # through at maximum (hence the min below).
    pages = min(20, num_jobs // 20 + 1)
    page_positions = range(1, pages + 1)
    execute_queries = partial(multiprocess_pages, query_URL, job_title,
                              job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, page_positions)
Example 32
    base_URL = 'https://www.indeed.com/jobs?'
    query_parameters = ['q={}'.format('+'.join(job_title.split())),
            '&l={}'.format('+'.join(job_location.split())), 
            '&radius={}'.format(radius), '&sort=date', '&fromage=5']

    query_URL = format_query(base_URL, query_parameters)

    html = get_html(query_URL)
    try: 
        num_jobs_txt = str(html.select('#searchCount'))
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except: 
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'indeed', 'num_jobs': num_jobs, 
            'date': current_date, 'title': job_title, 'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'indeed')

    # Cycle through all of the job postings that we can and grab the url pointing to
    # it, to then query it. All of the jobs should be available via the 
    # .turnstileLink class, and then the href attribute will point to the URL. 
    max_start_position = 1000 if num_jobs >= 1000 else num_jobs
    start_positions = range(0, max_start_position, 10)
    execute_queries = partial(multiprocess_pages, query_URL, \
            job_title, job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, start_positions)
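
format_query takes the base URL plus a list of already-formatted parameter strings ('q=...', '&l=...', and so on), so a plausible sketch is simple concatenation:

def format_query(base_URL, query_parameters):
    """Join the base URL with its pre-formatted query parameters (sketch)."""
    return base_URL + ''.join(query_parameters)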
Example 33
    except IndexError: 
        raise Exception('Program needs a job title, job location, and radius inputted!')

    base_URL = 'http://www.simplyhired.com/search?'
    query_parameters = ['q={}'.format('+'.join(job_title.split())), 
            '&l={}'.format('+'.join(job_location.split())), '&mi={}'.format(radius),
            '&fdb=5', '&clst=CTL']
    
    query_URL = format_query(base_URL, query_parameters)

    html = get_html(query_URL)
    try: 
        num_jobs_txt = str(html.select('.result-headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except: 
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'simplyhired', 'num_jobs': num_jobs, 
            'date': current_date, 'title': job_title, 'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'simplyhired')

    # All of the jobs should be available through the '.js-job-link' CSS class.
    max_pages = num_jobs // 10 + 1
    page_numbers = range(1, max_pages + 1)
    execute_queries = partial(multiprocess_pages, query_URL, job_title, 
            job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, page_numbers)
Example 34
        job_location = sys.argv[2]
    except IndexError: 
        raise Exception('Program needs a job title and job location inputted!')

    
    # Navigate to the base URL
    base_URL = 'http://www.careerbuilder.com/'
    query_params = (('keywords', job_title), ('location', job_location))
    driver = issue_driver_query(base_URL, query_params)

    # Grab num. jobs
    try: 
        num_jobs_txt = driver.find_element_by_css_selector('div .count').text
        num_jobs = int(parse_num(num_jobs_txt, 0)) 
    except: 
        print 'No jobs for search {} in {}'.format(job_title, job_location)
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'careerbuilder', 'num_jobs': num_jobs, 
            'date': current_date, 'title': job_title, 'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'careerbuilder')

    # This loop will be used to keep clicking the next button after
    # scraping jobs on that page. 
    is_next = True
    while is_next: 
        jobs = scrape_job_page(driver, job_title, job_location)
        is_next = check_if_next(driver)
    driver.close()
Example 35
        xy = num_jobs_txt
        cd = int(xy[0]) * 1000 + int(xy[2]) * 100 + int(xy[3]) * 10 + int(
            xy[4])
        num_jobs = int(cd)
        print(num_jobs)
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))

    storage_dct = {
        'job_site': 'simplyhired',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'simplyhired')

    # All of the jobs should be available through the '.js-job-link' CSS class.
    max_pages = num_jobs // 10 + 1
    page_numbers = range(1, max_pages + 1)
    execute_queries = partial(multiprocess_pages, query_URL, job_title,
                              job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, page_numbers)