def connect_indeed(self, config_filepath='indeed_cred.yml'):
    """Attach an IndeedClient to this instance as ``self.c``.

    Keyword argument:
    config_filepath -- path to a YAML credentials file (stored in .ssh);
    currently unused because credential loading is disabled below.
    """
    # Credential loading from YAML is disabled for now; a hard-coded
    # publisher number is used instead.
    # yamload = yaml.load(open(config_filepath))
    # credentials = yamload['indeed']
    # pub_num = credentials.get('publisher_num')
    self.c = IndeedClient(publisher='4353162753214099')
    print('connect_indeed done')
def get_indeed_job_list(query, location, radius):
    """Search Indeed for ``query`` near ``location`` within ``radius``.

    Fetches four pages of 25 results each, drops bookkeeping columns,
    parses the posting date, and returns the combined results as a pandas
    DataFrame. Exits the process if the search returned nothing.
    """
    client = IndeedClient(publisher=2863621289879018)
    progress_bar = pyprind.ProgBar(4, title='Searching For Jobs')
    pages = []
    for offset in range(0, 100, 25):
        search_params = {
            'q': query,
            'radius': radius,
            'l': location,
            'userip': "1.2.3.4",
            'limit': '25',
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'start': offset
        }
        response = client.search(**search_params)
        pages.append(pd.DataFrame.from_dict(response['results']))
        progress_bar.update()
    results_pd = pd.concat(pages, axis=0)
    if len(results_pd) == 0:
        sys.exit('Search did not return any jobs')
    results_pd.reset_index(drop=True, inplace=True)
    results_pd['date'] = pd.to_datetime(results_pd.date)
    # Remove response columns that are not useful downstream.
    results_pd.drop([
        'source', 'expired', 'country', 'formattedLocation',
        'formattedLocationFull', 'onmousedown', 'stations', 'state',
        'sponsored'
    ], axis=1, inplace=True)
    # returns the search results as a pandas data frame
    return results_pd
def setup(self):
    """Create the Indeed API client and the default search parameters."""
    self.client = IndeedClient("YOUR_PUBLISHER_NUMBER")
    self.params = dict(
        q="python",
        l="austin",
        userip="1.2.3.4",
        useragent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
    )
def setup(self):
    """Create the Indeed API client, default search parameters and utils."""
    self.client = IndeedClient('8251007850639120')
    self.params = dict(
        q="python",
        l="austin",
        userip="1.2.3.4",
        useragent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
    )
    self.utils = Utils()
def indeed_urls(parameters, publisher_key=None):
    """Use Indeed publisher ID to retrieve URLs from the Indeed API.

    Positional argument:
    parameters -- dict of query parameters passed to IndeedClient.search

    Keyword argument:
    publisher_key -- Indeed publisher key; defaults to the API_KEY
    environment variable

    Raises NameError when the response has no 'results' key (the type is
    kept for backward compatibility with existing callers).
    """
    if publisher_key is None:
        publisher_key = os.environ['API_KEY']
    client = IndeedClient(publisher_key)
    response = client.search(**parameters)
    try:
        return [str(links['url']) for links in response['results']]
    except KeyError as err:
        # FIX: chain the original KeyError so the real cause is not hidden.
        raise NameError('Invalid Publisher ID') from err
def get_data():
    """Search Indeed for every (city, job title) combination.

    Returns a list whose first element is the literal string 'jobs'
    followed by one dict per non-expired job result.
    """
    client = IndeedClient('7381316591612982')
    # FIX: removed a dead initial search for "front end engineer" in austin
    # whose response was computed and immediately discarded (wasted network
    # round-trip with no effect on the output).
    cities = [
        'New York, NY', 'Austin, TX', 'San Francisco, CA', 'Boston, MA',
        'Chicago, IL', 'Miami, FL'
    ]
    jobs = [
        'Front End Engineer', 'Back End Engineer', 'Data Science',
        'Product Management', 'Director of Engineering', 'Data Engineer',
        'Data Analyst', 'Accounting', 'Marketing', 'Finance', 'Nurse',
        'Doctor', 'Lawyer', 'Paralegal', 'sales', 'customer_service',
        'human resources', 'executive assistant', 'operations', 'teacher',
        'maintenance', 'security guards'
    ]
    # NOTE(review): the leading 'jobs' string looks like a header/sentinel
    # element - confirm callers expect it before removing.
    res_list = ['jobs']
    for c in cities:
        for j in jobs:
            params = {
                'q': j,
                'l': c,
                'userip': "172.68.141.95",
                'useragent': """Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36""",
                'limit': 25
            }
            search_response = client.search(**params)
            for res in search_response['results']:
                if res['expired']:
                    continue
                res_list.append({
                    'city': res['city'],
                    'date_posted': res['date'],
                    'company': res['company'],
                    'title': res['jobtitle'],
                    'url': res['url'],
                    'job_id': res['jobkey'],
                    'state': res['state'],
                    'snippet': res['snippet'],
                })
    return res_list
def Search(query, location, limit=10, start=0):
    """Run one Indeed API search and return the raw response dict."""
    client = IndeedClient(publisher=PUBLISHER_ID)
    search_params = dict(
        q=query,
        l=location,
        limit=limit,
        start=start,
        userip="1.2.3.4",
        useragent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
    )
    return client.search(**search_params)
def access_indeed_api(parameters, publisher_key=None):
    """Access the Indeed API using the given parameters and publisher key.

    Positional argument:
    parameters -- a dictionary of the parameters to send to Indeed's API

    Keyword argument:
    publisher_key -- the publisher key for Indeed's API; defaults to the
    API_KEY environment variable
    """
    key = os.environ['API_KEY'] if publisher_key is None else publisher_key
    return IndeedClient(key).search(**parameters)
def search_with_api(self, params: dict):
    """Page through Indeed API results for ``params`` and persist each job.

    Each page of results is parsed into job objects and inserted into the
    Job table; duplicate keys (UNIQUE violations) are silently skipped.
    Prints the total hit count and the number of new jobs added.
    """
    client = IndeedClient(publisher=self.user_config.INDEED_API_KEY)
    search_response = client.search(**params)
    total_number_hits = search_response['totalResults']
    per_page = IndeedConstants.API.MAX_NUM_RESULTS_PER_REQUEST
    # FIX: int(total / per_page) floor-divided and silently dropped the final
    # partial page (e.g. 30 hits with 25 per page fetched only 25 jobs).
    # Ceiling division fetches every page.
    num_loops = (total_number_hits + per_page - 1) // per_page
    counter_start = 0
    print('Total number of hits: {0}'.format(total_number_hits))
    count_jobs_added = 0
    for _ in range(num_loops):
        # We can get around the per-request cap by advancing 'start' each loop.
        params['start'] = counter_start
        search_response = client.search(**params)
        list_jobs = IndeedParser.get_jobs_from_response(search_response)
        for job in list_jobs:
            try:
                Job.create(key=job.key,
                           website=job.website,
                           link=job.link,
                           title=job.title,
                           company=job.company,
                           city=job.city,
                           state=job.state,
                           country=job.country,
                           location=job.location,
                           posted_date=job.posted_date,
                           expired=job.expired,
                           easy_apply=job.easy_apply)
                count_jobs_added += 1
            except peewee.IntegrityError as e:
                # Duplicate job keys are expected; report anything else.
                if 'UNIQUE' not in str(e):
                    print(str(e))
        counter_start += per_page
    print('Added {0} new jobs'.format(count_jobs_added))
def get_job_description(input_skills):
    """Search Indeed for ``input_skills`` and return the cleaned words
    extracted from every job posting URL in the first 25 results."""
    client = IndeedClient('7863709885041358')
    params = {
        'q': input_skills,
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'limit': 25
    }
    search_response = client.search(**params)
    job_urls = [job['url'] for job in search_response['results']]
    bunch_of_words = []
    for each_url in job_urls:
        bunch_of_words.extend(text_cleaner(each_url))
    return bunch_of_words
def fetch_indeed_data(counties, search):
    """Run one Indeed API search per county and return the raw responses.

    Positional arguments:
    counties -- iterable of location strings to search in
    search -- the query string (e.g. a job title or skill)
    """
    from indeed import IndeedClient
    client = IndeedClient('6437444271691851')
    base_params = {
        'q': search,
        'l': "bergen county, nj",  # placeholder; replaced per county below
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'latlong': 1,
        'radius': 10,
        'fromage': 7,
        'limit': 25
    }
    return [client.search(**dict(base_params, l=county)) for county in counties]
def main():
    # Search Indeed for JOB_QUERY across all configured locations, then count
    # word frequencies in every job description (excluding stop words) and
    # save the tally to OUTPUT_FILE.
    # NOTE: Python 2 code (print statements).
    client = IndeedClient(PUB_ID)
    search_params = build_params(locations, JOB_QUERY)
    search_results = []
    count = 1
    # Fetch one API response per location, with a progress line on stdout.
    for params in search_params:
        stdout.flush()
        stdout.write("\rHtml request: {}/{}".format(count, len(locations)))
        search_response = client.search(**params)
        search_results.append(search_response)
        count += 1
    # Stop words and single letters excluded from the frequency count.
    # NOTE(review): membership tests on this list are O(n); a set would be
    # faster, left unchanged here.
    word_filter = ['and', 'to', 'the', 'of', 'a', 'in', 'with', 'you', 'on',
                   'that', 'are', 'will', 'is', 'your', 'for', 'we', 'from',
                   'an', 'be', 'have', 'or', 'just', 'can', 'also', 'how',
                   'at', 'as', 'do', 'other', 'should', 'what', 'us', 'this',
                   'it', 'if', 'get', '-', '&', 'a', 'b', 'c', 'd', 'e', 'f',
                   'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
                   's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    count = 1
    number_of_locations = len(search_results)
    word_map = Counter()
    # Tally lower-cased words from each job description page.
    for search in search_results:
        print "Currently on {}/{}".format(count, number_of_locations)
        if len(search['results']) == 0:
            print "Nothing found for: {}".format(search['location'])
        else:
            print "Attempting {}...".format(search['location'])
            for job in search['results']:
                url = job['url']
                # Fetch the posting page and extract its description text.
                html = requests.get(url)
                word_list = pull_job_description(html.content)
                for word in word_list:
                    if word.lower() not in word_filter:
                        word_map[word.lower()] += 1
        count += 1
    save_to_file(OUTPUT_FILE, word_map)
def scrape_indeed(self, api_key, ip_address, places=None):
    """Search Indeed for every (place, filter-term) pair.

    Previously-seen job keys are loaded from the 'indeed_jobs' store and
    skipped; newly seen keys are persisted back. Returns a list of
    [title, location, url, snippet] entries whose titles pass the
    per-term title filter.
    """
    indeed_client = IndeedClient(api_key)
    indeed_matched_jobs = []
    seen_jobs = self.load_titles('indeed_jobs')
    if not places:
        places = ['san francisco, ca']
    for place in places:
        for term in self.filters.keys():
            sys.stderr.write('Searching {} Indeed for {}... '.format(
                place, term))
            # time.sleep(random.randrange(1, 3))  # throttle requests
            params = {
                'q': term,
                'l': place,
                'userip': ip_address,
                'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
                'limit': 25
            }
            job_results = indeed_client.search(**params)['results']
            sys.stdout.write('returned {} items\n'.format(len(job_results)))
            for job in job_results:
                job_id = job['jobkey']
                if job_id in seen_jobs:
                    continue
                seen_jobs.add(job_id)
                job_title = job['jobtitle']
                if self.filter_title(job_title, self.filters[term]):
                    indeed_matched_jobs.append([
                        job_title, job['formattedLocationFull'],
                        job['url'], job['snippet']
                    ])
    self.save_titles('indeed_jobs', seen_jobs)
    return indeed_matched_jobs
#This is for the mail client, by which we will be able to get user base updates app.config.update( DEBUG=True, #Email settings MAIL_SERVER='smtp.gmail.com', MAIL_PORT=465, MAIL_USE_SSL=True, MAIL_USERNAME = credentials.my_email_username, MAIL_PASSWORD = credentials.my_email_password ) mail=Mail(app) #Creating the clients to interact with the APIs twilio_api = TwilioRestClient(credentials.my_twilio_account_sid, credentials.my_twilio_auth_token) #Twilio indeed_api = IndeedClient(publisher = credentials.my_indeed_publisher_id) #Indeed #Client to shorten links with TinyURL shortener = Shortener('Tinyurl', timeout=86400) #Function to find and deliver jobs for each user in the jQuery file. This function is called daily as well as whenever the "admin" user sends a text to the endpoint with the word 'override' def FindAndDeliverJobs(): #Opening up the json file with all the users for reading with open('user_info.json', "r") as load_file: user_list = json.load(load_file) #Loop to iterate through every user inside the json file for user in user_list: #Only look up jobs for the user if they have confirmed their number if user['confirmed'] == 1: #Initializing the parameters for the Indeed search using the users preferences
from indeed import IndeedClient
import pymysql
from database import addToDatabase

# NOTE(review): the publisher id was a bare ``***************`` placeholder,
# which is a syntax error; it is quoted here so the module parses. Replace it
# with a real publisher number before use.
client = IndeedClient(publisher='***************')

parameters = {
    'q': "python developer",
    'l': "India",
    'sort': "date",
    'fromage': "5",
    'limit': "25",
    'filter': "1",
    'userip': "192.186.176.550:60409",
    'useragent': "Mozilla/5.0"
}


def get_offers(params):
    """Search Indeed with ``params`` and store each offer in the database."""
    # FIX: previously searched with the undefined name `search_params`,
    # raising NameError at call time; use the `params` argument instead.
    search_results = client.search(**params)
    for elm in search_results['results']:
        offer = (elm['jobtitle'],
                 elm['formattedLocation'],
                 elm['snippet'],
                 elm['url'],
                 elm['indeedApply'],
                 elm['jobkey'],
                 elm['date'])
        addToDatabase(offer)


def searchAllCities():
    # NOTE(review): appears unfinished - only initialises a counter.
    current_city = 0
from indeed import IndeedClient
import time

client = IndeedClient(publisher='')

params = {
    'q': "internship",
    'l': "Zurich",
    'userip': "1.2.3.4",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
    'radius': 50,
    'limit': 100,
    'co': 'ch',
    'sort': 'date'
}

search_response = client.search(**params)

# Output file named jobs_YYYYMMDD.txt from the local date.
filename = 'jobs_' + str(time.localtime()[0]) + str(time.localtime()[1]) + str(
    time.localtime()[2]) + '.txt'

with open(r'export path' + filename, 'w') as textfile:
    textfile.write('acquisition time: ' + str(time.localtime()[3]) + ':' +
                   str(time.localtime()[4]) + '\n\n')
    # FIX: the loop previously ran over range(len(search_response)), i.e. the
    # number of top-level keys in the response dict, not the number of job
    # results - iterate the results list itself.
    for job in search_response['results']:
        textfile.write(job['formattedRelativeTime'] + '\t' + job['jobtitle'] +
                       '\t company: ' + job['company'] + '\n' + job['url'] +
                       '\n\n')
# Indeed Python API client module.
from indeed import IndeedClient

client = IndeedClient(publisher=12254335)  # we'll do this later

parameters = {
    'q': "python developer",
    'l': "London, GB",
    'sort': "date",
    'fromage': "5",
    'limit': "25",
    'filter': "1",
    'userip': "192.186.176.550:60409",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)"
}


def get_offers(params):
    """Run the search and unpack each offer element into a tuple."""
    search_results = client.search(**params)  # expected to be a dictionary
    for elm in search_results['results']:
        offer = tuple(elm[key] for key in ('jobtitle', 'formattedLocation',
                                           'snippet', 'url', 'indeedApply',
                                           'jobkey', 'date'))
# Author: Jichao Sun ([email protected]) # Date: April 26, 2016 # Setup: pip install indeed # pip install requests --upgrade from bs4 import BeautifulSoup from indeed import IndeedClient #import threading, urllib2 import urllib, urllib2, re jichaoID = 278720823964828 client = IndeedClient(publisher=jichaoID) # If salary is non empty, then the ordering of jobs per query is preserved. # Thus can use difference between two queries to find jobs in salary range. # Jobs with no specified salaries are estimated def getRawJobs(what, where, count, jobType, radius, salary): if jobType not in [ "fulltime", "parttime", "contract", "internship", "temporary", "" ]: return [] results = [] params = { 'q': what + "+$" + salary, # Job keywords 'l': where, # Location as a string, 'jt': jobType, # Type of job, fulltime parttime contract etc...
# verifying indeed publisher number
from indeed import IndeedClient

client = IndeedClient(publisher='publisher_number')

params = {
    'q': "software engineer",
    'l': "Chicago",
    'sort': "date",
    'fromage': "5",
    'limit': "50",
    'filter': "1",
    'userip': "ip_address",
    'useragent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
}


# main search function
def get_offers(params):
    """Run the Indeed search and parse each offer into a tuple."""
    search_results = client.search(**params)  # perform search
    for elm in search_results['results']:
        # FIX: 'formattedLocation' was duplicated in the tuple (copy-paste
        # error); each field now appears once.
        offer = (elm['jobtitle'],
                 elm['formattedLocation'],
                 elm['snippet'],
                 elm['url'],
                 elm['indeedApply'],
                 elm['jobkey'],
                 elm['date'])
def setup(self):
    """Create the Indeed client and the job keys used by the lookup."""
    self.client = IndeedClient("YOUR_PUBLISHER_NUMBER")
    self.params = dict(jobkeys=("5898e9d8f5c0593f", "c2c41f024581eae5"))
import json
import sys

from indeed import IndeedClient

client = IndeedClient(publisher=9074116252229934)

# Collect up to 40 pages of 25 results each.
results = []
start = 0
for _ in range(0, 40):
    params = {
        'q': "developer",
        'userip': "50.24.191.212",
        'useragent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'start': start,
        'limit': 25,
        'radius': 25
    }
    start += 25
    search_response = client.search(**params)
    # Round-trip through JSON to normalise the response to plain dicts/lists.
    search_response = json.loads(json.dumps(search_response))
    results += search_response['results']

json_string = json.dumps({'results': results}, sort_keys=True, indent=4)
# FIX: write via a context manager so the file is always closed (it was
# previously opened/closed by hand into a variable named `file`, shadowing
# the builtin); the unused page counter was also removed.
with open('data.json', 'w') as outfile:
    outfile.write(json_string)
def setup(self):
    """Create the utils helper, Indeed client and job keys for the lookup."""
    self.utils = Utils()
    self.client = IndeedClient('8251007850639120')
    self.params = dict(jobkeys=("7c398c74a8f22c72", "d7802e9ce3b4af7d"))
"""extracts the number of jobs posted per zip code using the Indeed API""" import datetime from indeed import IndeedClient from sqlalchemy import extract from models.db_models import Indeed, session, ZipCode from config import login_data, LOGGER client = IndeedClient(publisher=login_data['indeed_publisher_id']) def get_num_job_postings(zip_code, params=None): """Retrives the number of job postings for a zip_code, passing in additional optional parameters: { 'q': "", # query "as_phr": "", # exact phrase, "as_any": "", # at least one of these words "as_not": "", # none of these words "as_ttl": "", # all these words, "as_cmp": "", # company name, "jt": "all", # job type, "radius": 0, # distance from location "fromage": 7, # last 7 days "salary": "", # salary range 'l': '', # location 'userip': "1.2.3.4", 'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)" } """
search_indeed_api.py - Retrieve some job records from indeed.com, based on the query parameters. User must provide a valid indeed.com PUBLISHER_NUMBER and some parameters. See this page to create an account (it was free and quick for me): http://www.indeed.com/publisher/ Use a job key to find more data by running the get_details_indeed_api.py program. ''' from indeed import IndeedClient client = IndeedClient('PUBLISHER_NUMBER') params = { 'q' : "python entry", 'l' : "94063", 'userip' : "1.2.3.4", 'useragent' : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)" } search_response = client.search(**params) # Example search response, Mon2016_0418_22:51. # Entire response is a dictionary. # The dict key 'results' has a list of dicts for its value. # Each dict in the list is one job record, as partly shown here: '''
def __init__(self):
    """Create the Indeed API client used for all queries."""
    # self.jobDataFrame = pd.DataFrame()
    self.client = IndeedClient(8836246992678581)
from indeed import IndeedClient
# FIX: was `import requets` (typo -> ImportError at runtime) and appeared
# after the URL it is used with; corrected and hoisted to the top.
import requests

client = IndeedClient('1439892112925001')

params = {
    'q': "python",
    'l': "boston",
    'userip': "1.2.3.4",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)"
}

url = 'http://www.indeed.com/viewjob?jk=89b6ad7a31f7c4be&qd=Edw9zLy29tPtf_aglDLrzkea4GYpkSu9Dn9RxMjGtc-Au7bNkAhEpP8509-8oVyQct6gb9Hh9FwGl317FwNQL73cXKONUJYtCg03YtTr2S0&indpubnum=1439892112925001&atk=1b94foutl5sn398g'
# Fetch the job-view page (response currently unused).
requests.get(url)

jobkey = '89b6ad7a31f7c4be'
def get_api_results(self, desired_result_count=1):
    '''Return job json objects from the Indeed API.

    Builds a query string from the stored job profile, then pages through
    the API (25 results per request) for each configured location,
    collecting results in a queue until ``desired_result_count`` jobs have
    been gathered.

    Keyword argument:
    desired_result_count -- number of jobs wanted; when 1, a single job
    object (not a list) is returned. Returns [] if nothing was found.
    '''
    job_profile = CommonFuncs.get_job_profile()
    # GET LOCATION IN JOB PROFILE
    locations = CommonFuncs.get_locations_list(job_profile)
    # KEYWORDS CONNECTED BY OR
    query_list = CommonFuncs.build_query_string(job_profile=job_profile,
                                                or_delim='or',
                                                bracket1='(',
                                                bracket2=')',
                                                adv_supp=True)
    query_string = query_list[0]
    new_jobs_queue = queue.Queue(maxsize=0)  # unbounded collector of results
    new_jobs = None
    limit = '25'  # 25 is the max results per request
    lookback_period = '60'  # default lookback period
    client_id = {}
    api = None
    # CONNECT TO INDEED API FOR JOB QUERIES
    try:
        client_id = json.load(open(API_KEYS_PATH, 'r'))
        api = IndeedClient(publisher=client_id['publisher_id'])
    except:
        # NOTE(review): this ValueError is constructed but never raised, and
        # the bare except hides the real failure; `api` stays None and later
        # search calls will fail. Should be `raise ValueError(...)`.
        ValueError('No publisher id found. Filtering aborted.')
    filters = {
        'q': query_string,
        'l': '',  # filled in per location below
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        "raw": "False",
        "sort": "date",
        "radius": job_profile.radius,
        "limit": limit,
        "fromage": lookback_period,
    }
    # FIND NEW JOB JSON OBJECT USING INDEED API
    # GET NEW JOBS
    for location in locations:  # iterate over each location
        filters['l'] = location
        filters['q'] = query_string

        # THREAD-BRAINED APPROACH to get all results at once
        def get_results(i):
            '''Fetch one page of results starting at index ``i`` and put
            each job on the queue while more results are still wanted.'''
            filters['start'] = i
            temp_list = []
            # get 25 results, using provided filters with start index
            # NOTE(review): list comprehensions used purely for side
            # effects - plain for-loops would be clearer.
            [
                temp_list.append(x) for x in json.loads(
                    CommonFuncs.convertBytesToString(api.search(
                        **filters)))['results']
            ]
            [
                new_jobs_queue.put(x) for x in temp_list
                if new_jobs_queue.unfinished_tasks < desired_result_count
            ]

        # Total hit count for this location drives the paging below.
        result_count = int(
            json.loads(
                CommonFuncs.convertBytesToString(
                    api.search(**filters)))['totalResults'])
        # build list of start positions
        list_of_filter_starts = [
            str(i) for i in range(0, result_count, 25)
        ]
        for item in list_of_filter_starts:
            # Stop paging once enough jobs have been collected.
            if not new_jobs_queue.unfinished_tasks < desired_result_count:
                break
            get_results(item)
        new_jobs = list(
            new_jobs_queue.queue)  # append query results to list
    # RETURN JOBS
    if new_jobs:
        if desired_result_count == 1:
            # just return a single job, not in a list
            return new_jobs[0]
        elif desired_result_count <= len(
                new_jobs
        ):  # if we have more than enough new jobs, return those in a list
            return new_jobs[0:desired_result_count]
        else:  # if more than the available number of new jobs requested, return all that could be found
            return new_jobs
    else:
        return []  # if no new links found
def fullmap():
    # Render a Google map centred on the user's geolocated position with a
    # blue marker for the user and a red marker for each company address
    # found for the Indeed 'python' jobs in Kharkiv.
    # NOTE(review): hard-coded Google API key and Indeed publisher id should
    # live in configuration, not source.
    gmaps = googlemaps.Client(key="AIzaSyAx1j38VITDr2p2-VclAyX8pSOp7C_1-kM")
    lctn = gmaps.geolocate()
    #reverse = gmaps.reverse_geocode(latlng = [lctn['location']['lat'],lctn['location']['lng']] )
    client = IndeedClient('1905750874242217')
    params = {
        'q': "python",
        'l': "Kharkiv",
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'co': "UA",
        'latlong': 1,
        'start': 0,
        'limit': 25
    }
    search_response = client.search(**params)
    # Flatten the job results into a DataFrame.
    jobs = json_normalize(search_response['results'])
    # Marker list starts with the user's own location (blue dot).
    jobs_markers = [{
        'icon': '//maps.google.com/mapfiles/ms/icons/blue-dot.png',
        'lat': lctn['location']['lat'],
        'lng': lctn['location']['lng'],
        'infobox': "My Location"
    }]
    for index, row in jobs.iterrows():
        # Places lookup for the employer near the user's position.
        get_address = gmaps.places(query=row['company'] + ' ' + row['city'],
                                   location=str(lctn['location']['lat']) +
                                   ',' + str(lctn['location']['lng']))
        company = json_normalize(get_address['results'])
        # NOTE(review): `index` is reused by the inner loop, shadowing the
        # outer loop variable (harmless here since it is unused).
        for index, row_company in company.iterrows():
            # One red marker per candidate company address.
            jobs_markers.append({
                'icon': '//maps.google.com/mapfiles/ms/icons/red-dot.png',
                'lat': row_company['geometry.location.lat'],
                'lng': row_company['geometry.location.lng'],
                'infobox': row['company'] + ' - ' +
                row_company['formatted_address'] + ' snippet:' +
                row['snippet']
            })
    #ltn = location()
    fullmap = Map(
        identifier="fullmap",
        varname="fullmap",
        style=("height:70%;"
               "width:99%;"
               "top:50;"
               "left:10;"
               "position:absolute;"
               "z-index:200;"),
        lat=lctn['location']['lat'],
        lng=lctn['location']['lng'],
        markers=jobs_markers,
        # maptype = "TERRAIN",
        zoom="11",
        #cluster=True
        fit_markers_to_bounds=True)
    return render_template('example_fullmap.html',
                           fullmap=fullmap,
                           GOOGLEMAPS_KEY=request.args.get('apikey'))
counties = [cs[0] for cs in counties_states] states = [cs[1] for cs in counties_states] counties_by_state = {} for (value, key) in counties_states: counties_by_state.setdefault(key, []) # key might exist already counties_by_state[key].append(value) counties_states_2 = [ str(c) + ', ' + str(state_dict_2[s]) for c, s in zip(counties, states) ] #### GET DATA FROM INDEED #### from indeed import IndeedClient client = IndeedClient(publisher=8924341972846274) query = 'data scientist' # Only search in our domain # Indeed search is supposed to be ANDed but results prove the contrary f = open('/data/w205/W205_final_storage/indeed/txt/indeed.txt', 'w') for county_state in counties_states_2: county = county_state.split(', ')[0] state = county_state.split(', ')[1] jobkeys = [] # To avoid duplicates (in a county) params = { 'q': query, 'l': county_state, 'userip': "1.2.3.4", 'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",