Example #1
0
def update_salaries(jobs=None, cities=None, df=None, skiprows=0,
                    table='salary', verbose_=True):
    """Scrape salaries for every job/city pair that is not yet in ``table``.

    Args:
        jobs: Object exposing a ``job`` column (``jobs.job.values``).  When
            ``None`` or empty it is read from ``PATH + 'jobs.txt'`` and the
            first ``skiprows`` rows are dropped.
        cities: Object whose ``.values`` yields ``(city, state)`` pairs.
            When ``None`` or empty it comes from ``db.get_cities_from_db()``.
        df: Accumulator handed to ``scrape_indeed``.  When ``None`` or empty
            a new frame with columns job, city, state, salary is created.
        skiprows: Number of leading rows of ``jobs.txt`` to skip; only used
            when ``jobs`` is loaded from the file.
        table: Table name passed to ``db.queryNotInDb`` and ``db.to_sql``.
        verbose_: When true, print the last row of ``df`` after each scrape.

    Returns:
        The accumulator returned by the last ``scrape_indeed`` call, or the
        initial ``df`` if nothing was scraped.
    """
    columns = ['job', 'city', 'state', 'salary']

    # NOTE: any() on a pandas DataFrame iterates its column labels, so a
    # frame with named columns counts as "provided" even with zero rows.
    # None is accepted as well so the defaults need not be mutable lists.
    if jobs is None or not any(jobs):
        jobs = pd.read_csv(PATH + 'jobs.txt')[skiprows:]

    if cities is None or not any(cities):
        cities = db.get_cities_from_db()

    if df is None or not any(df):
        df = pd.DataFrame(columns=columns)

    pairs = itertools.product(jobs.job.values, cities.values)
    for job, (city, state) in pairs:
        if db.queryNotInDb(job, city, state, table):
            df = scrape_indeed(job, city, state, df)
            latest = df.tail(1)
            if verbose_:
                # Single-argument print(...) behaves the same on Python 2
                # and Python 3.
                print(latest)
            # Only the last row is appended to the table after each scrape.
            db.to_sql(latest, table, 'append', null=0)

    return df
Example #2
0
def get_postings_top_cities(job, maxResults=1001, n_cities=30,
                            table='postings', save_=True):
    """Record ``job`` in jobs.txt and fetch its postings for the top cities.

    Args:
        job: Job title; appended as a new line to ``PATH + 'jobs.txt'`` and
            passed to ``db.queryNotInDb`` / ``indeed_api``.
        maxResults: Forwarded positionally to ``indeed_api``.
        n_cities: Passed to ``db.get_top_cities_from_db``.
        table: Table name used for the lookup and forwarded to
            ``indeed_api``.
        save_: Forwarded to ``indeed_api`` as the ``save_`` keyword.

    Returns:
        None.
    """
    # Append one line per call; f.write works the same on Python 2 and 3
    # (the original ``print >> f`` form is Python 2 only).
    with open(PATH + 'jobs.txt', 'a') as f:
        f.write('%s\n' % (job,))

    # get top cities only for faster search
    cities = db.get_top_cities_from_db(n_cities)

    for city, state in cities.values:
        print('%s %s' % (city, state))
        if db.queryNotInDb(job, city, state, table):
            indeed_api(job, city, state, table, maxResults, 'frontend',
                       save_=save_)
Example #3
0
def get_salaries_for_job(job, table='salary', verbose_=True, save_=True):
    """Scrape salaries for ``job`` in each city that has postings for it.

    Args:
        job: Job title passed to ``db.get_cities_for_job``,
            ``db.queryNotInDb`` and ``scrape_indeed``.
        table: Table name used for the lookup and for ``db.to_sql``.
        verbose_: When true, print each city/state pair and the last row of
            the accumulator after each scrape.
        save_: When true, append the last row to ``table`` after each scrape.

    Returns:
        The accumulator frame (columns job, city, state, salary) as returned
        by the last ``scrape_indeed`` call, or the empty frame if nothing was
        scraped.
    """
    columns = ['job', 'city', 'state', 'salary']
    df = pd.DataFrame(columns=columns)

    # get only the cities that actually have postings for the job
    cities = db.get_cities_for_job(job)

    for city, state in cities.values:
        if verbose_:
            # One pre-formatted argument keeps the output identical on
            # Python 2 and Python 3.
            print('%s %s' % (city, state))

        if not db.queryNotInDb(job, city, state, table):
            continue

        df = scrape_indeed(job, city, state, df)
        latest = df.tail(1)
        if verbose_:
            print(latest)
        if save_:
            db.to_sql(latest, table, 'append', null=0)

    return df
Example #4
0
def update_postings(jobs=None, cities=None, skiprows=0, force=False,
                    table='postings'):
    """Fetch postings for every job/city pair missing from ``table``.

    Args:
        jobs: Object exposing a ``job`` column (``jobs.job.values``).  When
            ``None`` or empty it is read from ``PATH + 'jobs.txt'`` and the
            first ``skiprows`` rows are dropped.
        cities: Object whose ``.values`` yields ``(city, state)`` pairs.
            When ``None`` or empty it comes from ``db.get_cities_from_db()``.
        skiprows: Number of leading rows of ``jobs.txt`` to skip; only used
            when ``jobs`` is loaded from the file.
        force: When true, call ``indeed_api`` for every pair regardless of
            what ``db.queryNotInDb`` would report.
        table: Table name used for the lookup and forwarded to
            ``indeed_api``.

    Returns:
        None.
    """
    # NOTE: any() on a pandas DataFrame iterates its column labels, so a
    # frame with named columns counts as "provided" even with zero rows.
    # None is accepted as well so the defaults need not be mutable lists.
    if jobs is None or not any(jobs):
        jobs = pd.read_csv(PATH + 'jobs.txt')[skiprows:]

    if cities is None or not any(cities):
        cities = db.get_cities_from_db()

    pairs = itertools.product(jobs.job.values, cities.values)
    for job, (city, state) in pairs:
        print('%s %s %s' % (job, city, state))
        # Test ``force`` first so a forced run skips the lookup entirely.
        # NOTE(review): assumes db.queryNotInDb is a read-only lookup.
        if force or db.queryNotInDb(job, city, state, table):
            indeed_api(job, city, state, table)