def update_salaries(jobs=None, cities=None, df=None, skiprows=0,
                    table='salary', verbose_=True):
    """Scrape a salary row for every (job, city, state) not yet in ``table``.

    Args:
        jobs: Object with a ``job`` column (``jobs.job.values`` is iterated).
            When ``None`` or an empty list/tuple, it is read from
            ``PATH + 'jobs.txt'`` and the first ``skiprows`` rows are dropped.
        cities: Object whose ``.values`` yields ``(city, state)`` pairs.
            When ``None`` or an empty list/tuple, ``db.get_cities_from_db()``
            is used.
        df: Frame handed to ``scrape_indeed`` and rebound to its return
            value. When ``None`` or an empty list/tuple, an empty DataFrame
            with columns ``job``, ``city``, ``state``, ``salary`` is created.
        skiprows: Number of leading rows of ``jobs.txt`` to skip; only used
            when ``jobs`` is loaded from the file.
        table: Table name passed to ``db.queryNotInDb`` and ``db.to_sql``.
        verbose_: When true, print the last row of ``df`` after each scrape.

    Returns:
        ``df`` as returned by the last ``scrape_indeed`` call, or the initial
        frame when no combination needed scraping.
    """
    columns = ['job', 'city', 'state', 'salary']
    # Columns considered but currently disabled:
    #   n_postings, state_name, relative_salary, salaries_max,
    #   salaries_median, trend_last2first, trend_median, trend_max

    # ``None`` (or a legacy empty list/tuple) means "not supplied".  The test
    # is explicit rather than ``any(x)`` because iterating a DataFrame yields
    # its column labels, not its contents.
    if jobs is None or (isinstance(jobs, (list, tuple)) and not jobs):
        # get jobs from text file
        jobs = pd.read_csv(PATH + 'jobs.txt')[skiprows:]
        # jobs.job = jobs.job.str.title()

    if cities is None or (isinstance(cities, (list, tuple)) and not cities):
        # get unique cities from postings
        cities = db.get_cities_from_db()

    if df is None or (isinstance(df, (list, tuple)) and not df):
        df = pd.DataFrame(columns=columns)

    for job, location in itertools.product(jobs.job.values, cities.values):
        city, state = location
        if db.queryNotInDb(job, city, state, table):
            df = scrape_indeed(job, city, state, df)
            if verbose_:
                # Single-argument form prints identically on Python 2 and 3.
                print(df.tail(1))
            # Persist only the newest row of df.
            db.to_sql(df.tail(1), table, 'append', null=0)
    return df
def get_postings_top_cities(job, maxResults=1001, n_cities=30,
                            table='postings', save_=True):
    """Record ``job`` in jobs.txt and fetch its postings for the top cities.

    Args:
        job: Job title; appended as a new line to ``PATH + 'jobs.txt'`` and
            passed to ``db.queryNotInDb`` and ``indeed_api``.
        maxResults: Forwarded to ``indeed_api``.
        n_cities: Forwarded to ``db.get_top_cities_from_db``.
        table: Table name used for the "already stored?" check and forwarded
            to ``indeed_api``.
        save_: Forwarded to ``indeed_api`` as the ``save_`` keyword.
    """
    with open(PATH + 'jobs.txt', 'a') as jobs_file:
        jobs_file.write('%s\n' % job)

    # Only the top cities are queried, to keep the search fast.
    top_cities = db.get_top_cities_from_db(n_cities)
    for city, state in top_cities.values:
        print('%s %s' % (city, state))
        if not db.queryNotInDb(job, city, state, table):
            continue  # this combination is already stored
        indeed_api(job, city, state, table, maxResults, 'frontend',
                   save_=save_)
def get_salaries_for_job(job, table='salary', verbose_=True, save_=True):
    """Scrape salaries for ``job`` in each city ``db.get_cities_for_job`` lists.

    Args:
        job: Job title passed to the db helpers and to ``scrape_indeed``.
        table: Table name passed to ``db.queryNotInDb`` and ``db.to_sql``.
        verbose_: When true, print each ``city state`` pair and the last row
            of the frame after each scrape.
        save_: When true, append the last row of the frame to ``table`` via
            ``db.to_sql`` after each scrape.

    Returns:
        The frame returned by the last ``scrape_indeed`` call, or an empty
        DataFrame with columns ``job``, ``city``, ``state``, ``salary`` when
        nothing was scraped.
    """
    salaries = pd.DataFrame(columns=['job', 'city', 'state', 'salary'])

    # Limit the search to the cities that actually have postings for the job.
    for city, state in db.get_cities_for_job(job).values:
        if verbose_:
            print('%s %s' % (city, state))
        if not db.queryNotInDb(job, city, state, table):
            continue  # already stored, nothing to scrape
        salaries = scrape_indeed(job, city, state, salaries)
        if verbose_:
            print(salaries.tail(1))
        if save_:
            db.to_sql(salaries.tail(1), table, 'append', null=0)
    return salaries
def update_postings(jobs=None, cities=None, skiprows=0, force=False,
                    table='postings'):
    """Fetch postings for every (job, city, state) combination.

    Args:
        jobs: Object with a ``job`` column (``jobs.job.values`` is iterated).
            When ``None`` or an empty list/tuple, it is read from
            ``PATH + 'jobs.txt'`` and the first ``skiprows`` rows are dropped.
        cities: Object whose ``.values`` yields ``(city, state)`` pairs.
            When ``None`` or an empty list/tuple, ``db.get_cities_from_db()``
            is used.
        skiprows: Number of leading rows of ``jobs.txt`` to skip; only used
            when ``jobs`` is loaded from the file.
        force: When true, call ``indeed_api`` even for combinations that
            ``db.queryNotInDb`` reports as already present.
        table: Table name passed to ``db.queryNotInDb`` and ``indeed_api``.
    """
    # ``None`` (or a legacy empty list/tuple) means "not supplied".  The test
    # is explicit rather than ``any(x)`` because iterating a DataFrame yields
    # its column labels, not its contents.
    if jobs is None or (isinstance(jobs, (list, tuple)) and not jobs):
        # get jobs from text file
        jobs = pd.read_csv(PATH + 'jobs.txt')[skiprows:]
        # jobs.job = jobs.job.str.title()

    if cities is None or (isinstance(cities, (list, tuple)) and not cities):
        # get unique cities from postings
        cities = db.get_cities_from_db()

    for job, location in itertools.product(jobs.job.values, cities.values):
        city, state = location
        # Single-argument form prints identically on Python 2 and 3.
        print('%s %s %s' % (job, city, state))
        if db.queryNotInDb(job, city, state, table) or force:
            indeed_api(job, city, state, table)