def get_data():
    """Query the Indeed API for a fixed grid of job titles and cities.

    Returns:
        list: the header string 'jobs' followed by one dict per non-expired
        posting, with keys city, date_posted, company, title, url, job_id,
        state, snippet.
    """
    client = IndeedClient('7381316591612982')
    # BUG FIX: the original issued an extra search for "front end engineer"
    # in Austin here and immediately discarded the response inside the loop
    # below — a wasted API request. Removed.
    cities = [
        'New York, NY', 'Austin, TX', 'San Francisco, CA', 'Boston, MA',
        'Chicago, IL', 'Miami, FL'
    ]
    jobs = [
        'Front End Engineer', 'Back End Engineer', 'Data Science',
        'Product Management', 'Director of Engineering', 'Data Engineer',
        'Data Analyst', 'Accounting', 'Marketing', 'Finance', 'Nurse',
        'Doctor', 'Lawyer', 'Paralegal', 'sales', 'customer_service',
        'human resources', 'executive assistant', 'operations', 'teacher',
        'maintenance', 'security guards'
    ]
    res_list = ['jobs']  # header sentinel expected by callers
    for c in cities:
        for j in jobs:
            params = {
                'q': j,
                'l': c,
                'userip': "172.68.141.95",
                'useragent': """Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36""",
                'limit': 25
            }
            search_response = client.search(**params)
            for res in search_response['results']:
                # Skip postings the API marks as expired.
                if res['expired']:
                    continue
                res_list.append({
                    'city': res['city'],
                    'date_posted': res['date'],
                    'company': res['company'],
                    'title': res['jobtitle'],
                    'url': res['url'],
                    'job_id': res['jobkey'],
                    'state': res['state'],
                    'snippet': res['snippet'],
                })
    return res_list
def scrape_indeed(self, api_key, ip_address, places=None):
    """Search Indeed for every (place, filter-term) pair and return new matches.

    Job keys already seen are loaded from (and persisted back to) the
    'indeed_jobs' title store, so repeat runs only report fresh postings.
    """
    client = IndeedClient(api_key)
    matched = []
    seen = self.load_titles('indeed_jobs')
    places = places or ['san francisco, ca']
    for place in places:
        for term in self.filters.keys():
            sys.stderr.write('Searching {} Indeed for {}... '.format(place, term))
            # time.sleep(random.randrange(1, 3))  # throttle requests
            response = client.search(
                q=term,
                l=place,
                userip=ip_address,
                useragent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
                limit=25)
            results = response['results']
            sys.stdout.write('returned {} items\n'.format(len(results)))
            for posting in results:
                key = posting['jobkey']
                if key in seen:
                    continue
                seen.add(key)
                title = posting['jobtitle']
                if self.filter_title(title, self.filters[term]):
                    matched.append([title,
                                    posting['formattedLocationFull'],
                                    posting['url'],
                                    posting['snippet']])
    self.save_titles('indeed_jobs', seen)
    return matched
def generate_job_list(params,publisher_id):
    """ Returns list of jobs that match search criteria """
    # Python 2 code (uses `unicode`). The response is parsed with
    # ElementTree, so `params` presumably requests raw XML output from the
    # API — TODO confirm the format/raw settings at the call site.
    job_list = []
    #since we initiated params['start'] at 0
    total_results = 1
    # Page through results 25 at a time until 'start' passes the
    # API-reported total.
    while int(params['start']) < total_results:
        client = IndeedClient(publisher = publisher_id)
        search_response = client.search(**params)
        root = ET.fromstring(search_response)
        params['start'] = str(int(params['start'])+25)
        total_results = int(root.find('totalresults').text)
        for job in root.iter('result'):
            jobtitle = job.find('jobtitle').text
            company = job.find('company').text
            city = job.find('city').text
            #state = job.find('state').text
            #country = job.find('country').text
            date = job.find('date').text
            # NOTE(review): `snippet` is extracted but never included in the
            # output tuple below.
            snippet = job.find('snippet').text
            sponsored = job.find('sponsored').text
            url = job.find('url').text
            # NOTE(review): the loop variable `job` is rebound here from an
            # XML Element to the output tuple — works, but easy to misread.
            # date[5:16] keeps the 'DD Mon YYYY' part of the RFC-1123 date.
            job = (unicode(jobtitle),unicode(company),unicode(city),unicode(date)[5:16].replace(" ","-"),unicode(sponsored), unicode(url))
            # Deduplicate identical rows (O(n) membership test per row).
            if job not in job_list:
                job_list.append(job)
    job_list.insert(0,(unicode("jobtitle"),unicode("company"),unicode("city"),unicode("date"),unicode("sponsored"), unicode("url")))
    #add header
    return job_list
def get_indeed_job_list(query, location, radius):
    """Fetch up to 100 Indeed postings (4 pages of 25) for a query/location.

    Exits the process if no jobs are returned; otherwise returns a pandas
    DataFrame with the date column parsed and bookkeeping columns dropped.
    """
    client = IndeedClient(publisher=2863621289879018)
    progress_bar = pyprind.ProgBar(4, title='Searching For Jobs')
    pages = [pd.DataFrame()]
    for page_start in (0, 25, 50, 75):
        search_response = client.search(
            q=query,
            radius=radius,
            l=location,
            userip="1.2.3.4",
            limit='25',
            useragent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            start=page_start)
        pages.append(pd.DataFrame.from_dict(search_response['results']))
        progress_bar.update()
    results_pd = pd.concat(pages, axis=0)
    if len(results_pd) == 0:
        sys.exit('Search did not return any jobs')
    results_pd.reset_index(drop=True, inplace=True)
    results_pd['date'] = pd.to_datetime(results_pd.date)
    # Strip columns the caller does not need.
    results_pd.drop([
        'source', 'expired', 'country', 'formattedLocation',
        'formattedLocationFull', 'onmousedown', 'stations', 'state',
        'sponsored'
    ], axis=1, inplace=True)
    return results_pd  # the search results as a pandas data frame
def job_search(self, job, location):
    """Search Indeed for `job` in `location` and return the job keys found.

    Python 2 method (print statements). Side effects: prints the raw
    response and total count to stdout, and dumps the response JSON to
    'indeed_positions_json.txt'.
    """
    # publisher=5950869068484812
    client = IndeedClient('5950869068484812')
    #params = generate_advanced_query("python", "Boston", 1, 0, 25)
    # Numeric args presumably map to (pages, start, limit) — TODO confirm
    # against generate_advanced_query's signature.
    params = self.generate_advanced_query(job, location, 1, 0, 25)
    search_response = client.search(**params)
    print "Search Response: %s" % search_response
    filename = 'indeed_positions_json.txt'
    self.write_json_to_file(filename, search_response)
    # Split the response into the position list and the reported total.
    (positions, total) = self.extract_query_result(search_response)
    print total
    jobkeys = []
    for position in positions:
        # Appends each position's key(s) to jobkeys (mutated in place).
        self.extract_position_info(position, jobkeys)
    #for i in range(len(jobkeys)):
    #print "range (%d: %s)" % (i, jobkeys[i])
    #print '*' * 100
    #job_response = client.jobs(jobkeys = "ad752ce9ae3f1b5e")
    #print job_response['results']
    #print job_response
    #filename = 'indeed_positions_json.txt'
    #self.write_json_to_file(filename, job_response)
    return jobkeys
def search_with_api(self, params: dict):
    """Run an Indeed API search and insert every returned job into the DB.

    Pages through the results MAX_NUM_RESULTS_PER_REQUEST at a time by
    advancing the 'start' parameter. Jobs that violate the UNIQUE
    constraint (already stored) are silently skipped; other integrity
    errors are printed.

    Args:
        params: Indeed search parameters; 'start' is overwritten here.
    """
    client = IndeedClient(publisher=self.user_config.INDEED_API_KEY)
    search_response = client.search(**params)
    total_number_hits = search_response['totalResults']
    page_size = IndeedConstants.API.MAX_NUM_RESULTS_PER_REQUEST
    # BUG FIX: round UP instead of truncating — int(total / page_size)
    # silently dropped the trailing partial page (up to page_size - 1 jobs).
    num_loops = -(-total_number_hits // page_size)
    counter_start = 0
    print('Total number of hits: {0}'.format(total_number_hits))
    count_jobs_added = 0
    for i in range(0, num_loops):
        # We can get around MAX_NUM_RESULTS_PER_REQUEST by increasing our
        # start location on each loop.
        params['start'] = counter_start
        search_response = client.search(**params)
        list_jobs = IndeedParser.get_jobs_from_response(search_response)
        for job in list_jobs:
            try:
                Job.create(
                    key=job.key, website=job.website, link=job.link,
                    title=job.title, company=job.company, city=job.city,
                    state=job.state, country=job.country,
                    location=job.location, posted_date=job.posted_date,
                    expired=job.expired, easy_apply=job.easy_apply)
                count_jobs_added += 1
            except peewee.IntegrityError as e:
                # Duplicate rows are expected across overlapping searches;
                # only surface non-UNIQUE integrity failures.
                if 'UNIQUE' not in str(e):
                    print(str(e))
        counter_start += page_size
    print('Added {0} new jobs'.format(count_jobs_added))
def indeed_urls(parameters, publisher_key=None):
    """Use Indeed publisher ID to retrieve URLs from the Indeed API."""
    key = os.environ["API_KEY"] if publisher_key is None else publisher_key
    response = IndeedClient(key).search(**parameters)
    try:
        # A missing 'results' (or 'url') key means the API rejected the
        # publisher ID.
        return [str(entry["url"]) for entry in response["results"]]
    except KeyError:
        raise NameError("Invalid Publisher ID")
def indeed_urls(parameters, publisher_key=None):
    """Use Indeed publisher ID to retrieve URLs from the Indeed API."""
    if publisher_key is None:
        publisher_key = os.environ['API_KEY']
    search_result = IndeedClient(publisher_key).search(**parameters)
    urls = []
    try:
        for posting in search_result['results']:
            urls.append(str(posting['url']))
    except KeyError:
        # No 'results'/'url' key: the API rejected the publisher ID.
        raise NameError('Invalid Publisher ID')
    return urls
def Search(query, location, limit=10, start=0):
    """Run one Indeed job search and return the raw API response dict."""
    client = IndeedClient(publisher=PUBLISHER_ID)
    return client.search(
        q=query,
        l=location,
        limit=limit,
        start=start,
        userip="1.2.3.4",
        useragent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
    )
def access_indeed_api(parameters, publisher_key=None):
    """Access the Indeed API with the given parameters and publisher key.

    Args:
        parameters: dict of parameters to send to Indeed's API.
        publisher_key: Indeed API publisher key; falls back to the
            API_KEY environment variable when not given.

    Returns:
        The raw search response from the client.
    """
    key = publisher_key if publisher_key is not None else os.environ['API_KEY']
    return IndeedClient(key).search(**parameters)
class Threadr(object):
    """Pull Indeed search results for a keyword/location and push the raw
    response to an S3 bucket."""

    def __init__(self, keyword, location):
        # S3 plumbing: one connection, one bucket, two named upload keys.
        self.conn = boto.connect_s3()
        self.bucket = self.conn.get_bucket('bucketofindeeds')
        self.json_up = Key(self.bucket)
        self.content_up = Key(self.bucket)
        self.keyword = keyword
        self.location = location
        print('init done')

    def connect_indeed(self, config_filepath='indeed_cred.yml'):
        """Create the Indeed client (publisher id hard-coded; the YAML
        credential file path is currently unused)."""
        # yamload = yaml.load(open(config_filepath))
        # credentials = yamload['indeed']
        # pub_num = credentials.get('publisher_num')
        self.c = IndeedClient(publisher='4353162753214099')
        print('connect_indeed done')

    def parameters(self, keyword, location):
        """Build the search params: random user-agent, caller's public IP."""
        ua = UserAgent(fallback='Your favorite Browser')
        self.params = {
            'q': str(keyword),
            'l': str(location),
            'userip': requests.get("http://icanhazip.com").text,
            'useragent': ua.random,
        }
        print('parameters done')

    def job_search(self):
        """Hit the Indeed search endpoint and keep the parsed response."""
        self.response = self.c.search(**self.params)  # This will return a json file.
        print(len(self.response['results']), 'jobs returned.')

    def send_json(self):
        """Upload the stringified response to a fixed S3 key."""
        self.json_up.key = 'indeed_jsons/test'
        self.json_up.set_contents_from_string(str(self.response) + '\n')
        print('Its Working.')

    def mine_that(self):
        """End-to-end run: connect, build params, search, upload."""
        self.connect_indeed()
        self.parameters(self.keyword, self.location)
        self.job_search()
        self.send_json()
def get_job_description(input_skills):
    """Search Indeed for the given skills and return the cleaned word list
    gathered from every matching posting's page."""
    client = IndeedClient('7863709885041358')
    response = client.search(
        q=input_skills,
        userip="1.2.3.4",
        useragent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        limit=25,
    )
    posting_urls = [posting['url'] for posting in response['results']]
    bunch_of_words = []
    for url in posting_urls:
        # text_cleaner scrapes the page and returns its cleaned tokens.
        bunch_of_words.extend(text_cleaner(url))
    return bunch_of_words
def main():
    """Demo driver (Python 2): run one advanced Indeed query and print
    every extracted job key."""
    # publisher=5950869068484812
    client = IndeedClient('5950869068484812')
    params = generate_advanced_query("python", "Boston", 10, 0, 25)
    search_response = client.search(**params)
    #print search_response
    #filename = 'indeed_positions_json.txt'
    # write_json_to_file(filename, search_response)
    # Split the response into the position list and the reported total.
    (positions, total) = extract_query_result(search_response)
    print total
    jobkeys = []
    for position in positions:
        # Appends each position's key(s) to jobkeys (mutated in place).
        extract_position_info(position, jobkeys)
    for i in range(len(jobkeys)):
        print jobkeys[i]
def fetch_indeed_data(counties, search):
    """Run one Indeed search per county and return the raw responses.

    Args:
        counties: iterable of location strings to search in turn.
        search: the query string ('q' parameter).

    Returns:
        list of raw API response objects, one per county.
    """
    from indeed import IndeedClient
    client = IndeedClient('6437444271691851')
    query = {
        'q': search,
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'latlong': 1,
        'radius': 10,
        'fromage': 7,
        'limit': 25,
    }
    responses = []
    for county in counties:
        query['l'] = county
        responses.append(client.search(**query))
    return responses
def main():
    """Python 2 script: search Indeed for JOB_QUERY across `locations`,
    scrape every returned job page, and save a word-frequency map."""
    client = IndeedClient(PUB_ID)
    search_params = build_params(locations, JOB_QUERY)
    search_results = []
    count = 1
    # Phase 1: one API request per location, with a same-line progress meter.
    for params in search_params:
        stdout.flush()
        stdout.write("\rHtml request: {}/{}".format(count, len(locations)))
        search_response = client.search(**params)
        search_results.append(search_response)
        count += 1
    # Stop-words plus single letters, excluded from the frequency count.
    word_filter = ['and', 'to', 'the', 'of', 'a', 'in', 'with', 'you', 'on',
                   'that', 'are', 'will', 'is', 'your', 'for', 'we', 'from',
                   'an', 'be', 'have', 'or', 'just', 'can', 'also', 'how',
                   'at', 'as', 'do', 'other', 'should', 'what', 'us', 'this',
                   'it', 'if', 'get', '-', '&', 'a', 'b', 'c', 'd', 'e', 'f',
                   'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
                   'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    count = 1
    number_of_locations = len(search_results)
    word_map = Counter()
    # Phase 2: fetch each posting's page and tally the remaining words.
    for search in search_results:
        print "Currently on {}/{}".format(count, number_of_locations)
        if len(search['results']) == 0:
            print "Nothing found for: {}".format(search['location'])
        else:
            print "Attempting {}...".format(search['location'])
            for job in search['results']:
                url = job['url']
                html = requests.get(url)
                word_list = pull_job_description(html.content)
                for word in word_list:
                    # Case-insensitive tally, skipping filtered words.
                    if word.lower() not in word_filter:
                        word_map[word.lower()] += 1
        count += 1
    save_to_file(OUTPUT_FILE, word_map)
def scrape_indeed(self, api_key, ip_address, places=None):
    """Query Indeed for each (place, term) combination and collect
    previously-unseen postings whose titles pass the term's filter.

    Seen job keys are persisted in the 'indeed_jobs' title store so repeat
    runs only return new matches.
    """
    client = IndeedClient(api_key)
    matches = []
    known_ids = self.load_titles('indeed_jobs')
    if not places:
        places = ['san francisco, ca']
    pairs = [(p, t) for p in places for t in self.filters.keys()]
    for place, term in pairs:
        sys.stderr.write('Searching {} Indeed for {}... '.format(place, term))
        # time.sleep(random.randrange(1, 3))  # throttle requests
        query = {
            'q': term,
            'l': place,
            'userip': ip_address,
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'limit': 25,
        }
        hits = client.search(**query)['results']
        sys.stdout.write('returned {} items\n'.format(len(hits)))
        for hit in hits:
            key = hit['jobkey']
            if key not in known_ids:
                known_ids.add(key)
                title = hit['jobtitle']
                if self.filter_title(title, self.filters[term]):
                    matches.append([
                        title,
                        hit['formattedLocationFull'],
                        hit['url'],
                        hit['snippet'],
                    ])
    self.save_titles('indeed_jobs', known_ids)
    return matches
def fullmap():
    """Flask view: render a Google map of Indeed 'python' jobs (Kharkiv,
    country code UA) around the user's geolocated position."""
    # NOTE(review): hard-coded Google Maps API key — move to config/env.
    gmaps = googlemaps.Client(key="AIzaSyAx1j38VITDr2p2-VclAyX8pSOp7C_1-kM")
    lctn = gmaps.geolocate()
    #reverse = gmaps.reverse_geocode(latlng = [lctn['location']['lat'],lctn['location']['lng']] )
    client = IndeedClient('1905750874242217')
    params = {
        'q': "python",
        'l': "Kharkiv",
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'co': "UA",
        'latlong': 1,
        'start': 0,
        'limit': 25
    }
    search_response = client.search(**params)
    # Flatten the JSON results into a DataFrame for row iteration.
    jobs = json_normalize(search_response['results'])
    # First marker: the user's own location (blue dot).
    jobs_markers = [{
        'icon': '//maps.google.com/mapfiles/ms/icons/blue-dot.png',
        'lat': lctn['location']['lat'],
        'lng': lctn['location']['lng'],
        'infobox': "My Location"
    }]
    for index, row in jobs.iterrows():
        # Resolve the employer's address via Places, biased to our position.
        get_address = gmaps.places(query=row['company'] + ' ' + row['city'],
                                   location=str(lctn['location']['lat']) + ',' + str(lctn['location']['lng']))
        company = json_normalize(get_address['results'])
        # NOTE(review): inner loop reuses `index`, shadowing the outer
        # loop variable (harmless here, but easy to trip over).
        for index, row_company in company.iterrows():
            # One red marker per Places hit for this company.
            jobs_markers.append({
                'icon': '//maps.google.com/mapfiles/ms/icons/red-dot.png',
                'lat': row_company['geometry.location.lat'],
                'lng': row_company['geometry.location.lng'],
                'infobox': row['company'] + ' - ' + row_company['formatted_address'] + ' snippet:' + row['snippet']
            })
    #ltn = location()
    fullmap = Map(
        identifier="fullmap",
        varname="fullmap",
        style=("height:70%;"
               "width:99%;"
               "top:50;"
               "left:10;"
               "position:absolute;"
               "z-index:200;"),
        lat=lctn['location']['lat'],
        lng=lctn['location']['lng'],
        markers=jobs_markers,
        # maptype = "TERRAIN",
        zoom="11",
        #cluster=True
        fit_markers_to_bounds=True)
    return render_template('example_fullmap.html',
                           fullmap=fullmap,
                           GOOGLEMAPS_KEY=request.args.get('apikey'))
for loc in locations: params['l'] = loc print "Nuova Location ", loc for jt in job_titles: params['q'] = jt print "Inizio ricerca per ", loc, jt check = False for i in range( 25, 1025, 25): # il secondo parametro va impostato = al n di offerte tot params['limit'] = i params['start'] = i - 25 client.search(**params) request = client.search(**params) #storage.append(request) print "Richiesta: ", loc, jt, ". Ciclo: ", i, ", numero restituite: ", len( request['results']) if (len(request['results']) < 25): if (check): break else: check = True num_errors = 0 #for block in storage: for job in request['results']: data_job = (job['formattedRelativeTime'], job['city'],
class indeed:
    """Wraps the Indeed search API: skill-based searches cached to
    sample.csv, plus a similarity ranking over the cached postings.

    NOTE(review): uses long-removed pandas APIs (`DataFrame.ix`,
    `DataFrame.sort`) — this class only runs on very old pandas versions.
    """
    #jobDataFrame
    def __init__(self):
        # self.jobDataFrame= pd.DataFrame();
        self.client = IndeedClient(8836246992678581)

    def skill(self, l, city, jobtype):
        """Search for postings matching ALL skills in `l` (AND query);
        caches the deduplicated results to sample.csv and returns them."""
        #print l
        #print " AND ".join(l)
        print(jobtype)
        # Normalize the job-type string to the two values Indeed accepts.
        if jobtype in ['intern', 'internship', 'Internship']:
            jobtype = 'internship'
        else:
            jobtype = 'fulltime'
        params = {
            'q': " AND ".join(l),
            'l': city,
            'jt': jobtype,
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'limit': "25",
            'start': 0,
            'highlight': 1
        }
        i = 25
        search_response = self.client.search(**params)
        results = []
        if (len(search_response['results']) <= 0):
            return results
        # Page through up to 100 results.
        # NOTE(review): each iteration appends search_response['results']
        # twice (before and after advancing 'start'), producing duplicates
        # that are only removed by drop_duplicates below.
        while (i < 100 and i < search_response['totalResults']):
            results += search_response['results']
            params['start'] += 25
            search_response = self.client.search(**params)
            results += search_response['results']
            i += 25
            print(params['start'])
        self.jobDataFrame = pd.DataFrame(results).drop_duplicates('jobkey')
        self.jobDataFrame.to_csv("sample.csv", encoding='UTF-8')
        return results

    def skillOR(self, l, city, jobtype):
        """Same as skill() but matches ANY of the skills (OR query)."""
        #print l
        #print " AND ".join(l)
        print(jobtype)
        if jobtype in ['intern', 'internship', 'Internship']:
            jobtype = 'internship'
        else:
            jobtype = 'fulltime'
        # NOTE(review): unlike skill(), this params dict has no 'start'
        # key, so `params['start'] += 25` below raises KeyError on the
        # first paging iteration — confirm intended behavior.
        params = {
            'q': " OR ".join(l),
            'l': city,
            'jt': jobtype,
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'limit': "50"
        }
        i = 25
        search_response = self.client.search(**params)
        results = []
        if (len(search_response['results']) <= 0):
            return results
        while (i < 100 and i < search_response['totalResults']):
            results += search_response['results']
            params['start'] += 25
            search_response = self.client.search(**params)
            results += search_response['results']
            i += 25
            print(params['start'])
        self.jobDataFrame = pd.DataFrame(results).drop_duplicates('jobkey')
        self.jobDataFrame.to_csv("sample.csv", encoding='UTF-8')
        return results

    def similarJobs(self, job):
        """Rank the cached postings by similarity to job key `job` and
        return the top 10 as record dicts."""
        print("the job is" + job)
        sampledfo = pd.read_csv("sample.csv", encoding='UTF-8')
        sampledf = sampledfo.copy()
        # Drop columns that do not contribute to the similarity score.
        del sampledf['stations']
        del sampledf['Unnamed: 0']
        del sampledf['source']
        del sampledf['onmousedown']
        del sampledf['formattedLocation']
        del sampledf['formattedLocationFull']
        del sampledf['url']
        del sampledf['date']
        del sampledf['formattedRelativeTime']
        # Booleanize the string flags ('false' -> 0, anything else -> 1).
        sampledf['indeedApply'] = [
            0 if x == 'false' else 1 for x in sampledf['indeedApply']
        ]
        sampledf['expired'] = [
            0 if x == 'false' else 1 for x in sampledf['expired']
        ]
        sampledf['sponsored'] = [
            0 if x == 'false' else 1 for x in sampledf['sponsored']
        ]
        jobNo = job
        # Split the reference posting from the candidates.
        self.dataJob = sampledf.loc[sampledf['jobkey'] == jobNo]
        df = sampledf[sampledf["jobkey"] != jobNo]
        # df[''] = ['red' if x == 'Z' else 'green' for x in df['Set']]
        # 1 when the candidate shares the reference's city, else 0
        # (city/country/state columns are overwritten together).
        df.ix[df.city == self.dataJob.city.iloc[0],
              ['city', 'country', 'state']] = 1
        df.ix[df.city != 1, ['city', 'country', 'state']] = 0
        df.ix[df.company == self.dataJob.company.iloc[0], ['company']] = 1
        df.ix[df.company != 1, ['company']] = 0
        # df[''] = df.apply(my_test2, axis=1)
        # Text similarity of snippet/title against the reference posting.
        df['snippet'] = [
            textSim.cosine_sim(x, self.dataJob.snippet.iloc[0])
            for x in df['snippet']
        ]
        df['jobtitle'] = [
            textSim.cosine_sim(x, self.dataJob.jobtitle.iloc[0])
            for x in df['jobtitle']
        ]
        # Weighted score: snippet similarity x10, title x5, rest x1.
        df['variance'] = df['city'] + df['company'] + df['country'] + df[
            'expired'] + df[
                'indeedApply'] + 10 * df['snippet'] + 5 * df['jobtitle']
        result = df.sort(['variance'], ascending=False)
        #import pdb; pdb.set_trace()
        simList = result['jobkey'][:10].tolist()
        simDict = []
        for x in simList:
            # Return the untouched original rows, not the scored copies.
            s = sampledfo.loc[sampledfo['jobkey'] == x]
            simDict.append(s.to_dict(orient='records')[0])
        return simDict
def indeedAPI(defTask):
    """Python 2 job harvester: sweep all US states x 11 salary bands for
    'grants+management' postings, store new ones in MongoDB
    (db.grantsJobs), and mark delisted ones closed.

    `defTask` is accepted but never used — TODO confirm caller contract.
    """
    params = {}
    # NOTE(review): the trailing comma makes this value a 1-tuple
    # ("1.2.3.4",), not a string — likely a bug; confirm what the API
    # client accepts for 'userip'.
    params['userip'] = "1.2.3.4",
    params['useragent'] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0)"
    params['start'] = 1
    params['latlong'] = 1
    params['as_ttl'] = ''
    params['limit'] = 25
    params['fromage']='any'
    params['radius'] = 0
    params['q'] = "grants+management"
    params['highlight'] = 0
    params['jobtitle'] = ''
    compincr = 25       # salary band width in $K, also the page size
    complevels = 11     # number of salary bands ($0K .. $250K+)
    indeedapi = IndeedClient(publisher='7423517030312598')
    print
    print 'START:',str(time.asctime(time.localtime()))
    newJobs = 0
    expiredJobs = 0
    states=["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
    oldlist=[]
    newlist=[]
    runStart = time.time()
    # Keys already in the DB; anything not re-seen this run gets closed.
    for job in db.fluxxJobs.find():
        oldlist.append(job['jobkey'])
    for state in states:
        print state,
        params['l'] = state
        for c in range(complevels):
            # Salary band "$cK-$(c+25-1)K"; the top band is open-ended.
            params['salary'] = "$" + str(c * 25) + "K-$" + str(((c+1) * 25)-1) + "K"
            if c == (complevels-1):
                params['salary'] = "$" + str(c*compincr) + "K"
            sr = indeedapi.search(**params)
            tr = sr['totalResults']
            ps = params['salary'].replace("$","")
            # Page through all results 25 at a time (Python 2 int division).
            for apirequests in range((tr/compincr)+1):
                params['start'] = (apirequests * compincr)
                sr = indeedapi.search(**params)
                for joblisting in sr['results']:
                    # json round-trip produces a plain-dict deep copy.
                    jobListing = json.loads(json.dumps(joblisting))
                    newlist.append(jobListing['jobkey'])
                    if joblisting['jobkey'] not in oldlist:
                        newJobs += 1
                        # Re-parse the RFC-1123 date (GMT renamed to UTC so
                        # %Z accepts it) into 'YYYYmmdd HH:MM:SS'.
                        listed = joblisting['date'].replace('GMT','UTC')
                        joblisting['dateOpen'] = datetime.strftime(datetime.strptime(listed, '%a, %d %b %Y %H:%M:%S %Z'), '%Y%m%d %H:%M:%S')
                        joblisting['datesOpen'] = timeDictStamp('')
                        joblisting['_id'] = joblisting['jobkey']
                        joblisting['status'] = 'Open'
                        # NOTE(review): stores a reference to the shared
                        # params dict, which keeps mutating — every saved
                        # job ends up pointing at the same object.
                        joblisting['searchparams'] = params
                        joblisting['searchparams']['procTime']=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        joblisting['searchparams']['totalResults'] = tr
                        joblisting['compMin'] = c*25000
                        joblisting['compMax'] = (c+1) * 25000
                        joblisting['compRange'] = params['salary']
                        # Drop empty location fields rather than store "".
                        if joblisting['city'] == "":
                            del joblisting['city']
                        if joblisting['state'] == "":
                            del joblisting['state']
                        job = joblisting
                        jobID=job['_id']
                        # Sanitized display fields (quotes stripped,
                        # latin-1-safe, length-capped).
                        Title=job['jobtitle'][0:60].replace("'","")
                        Company=job['company'].replace("'","").encode('latin-1','ignore')
                        if 'source' in job:
                            Source=job['source'].replace("'","")
                        else:
                            Source = ''
                        Description=job['snippet'][0:250].replace("'","").encode('latin-1','ignore').replace("'","")
                        Description = cleanup(Description,{'<b>':'','</b>':'','<B>':'','</B>':''})
                        listed = job['date'].replace('GMT','UTC')
                        job['dateListed'] = datetime.strftime(datetime.strptime(listed, '%a, %d %b %Y %H:%M:%S %Z'), '%Y%m%d %H:%M:%S')
                        CompRange_min=job['compMin']
                        CompRange_max=job['compMax']
                        CompRange=job['compRange']
                        # Enrich with full job-description text/concepts
                        # via the alchemy helper.
                        textURL = "http://www.indeed.com/viewjob?jk=" + job['jobkey']
                        jd = {}
                        jd['jdText'] = alchemy(textURL,'URLGetText','text')
                        jd['jdConcepts'] = alchemy(textURL,'URLGetRankedConcepts','concepts')
                        jd = json.loads(json.dumps(jd))
                        job['jobDescription'] = jd
                        # Insert only if not already in fluxxJobs.
                        if not db.fluxxJobs.find_one({'jobkey':job['jobkey']}):
                            db.grantsJobs.save(job)
    # Anything previously stored but not seen this run is now delisted.
    delisted = set(oldlist).difference(set(newlist))
    for jobkey in delisted:
        expiredJobs+=1
        # NOTE(review): `datetime.datetime.now()` here vs `datetime.now()`
        # above — one of the two spellings must be wrong for whichever
        # datetime import this module uses; confirm.
        rightnow = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        upResult = db.grantsJobs.update({'jobkey':jobkey},{'$set':{'dateClosed':rightnow,'datesClosed':timeDictStamp(),'status':'Closed','expired':'true'}})
    print " "
    print 'FINISH:',str(time.asctime(time.localtime()))
    print '=================================================================================================='
import time

# Script: fetch the newest internships around Zurich and dump a dated
# summary file. NOTE: the publisher ID and export path are placeholders.
client = IndeedClient(publisher='')
params = {
    'q': "internship",
    'l': "Zurich",
    'userip': "1.2.3.4",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
    'radius': 50,
    'limit': 100,
    'co': 'ch',
    'sort': 'date'
}
search_response = client.search(**params)

# Snapshot the clock once so the filename and header agree.
now = time.localtime()
filename = 'jobs_' + str(now[0]) + str(now[1]) + str(now[2]) + '.txt'

with open(r'export path' + filename, 'w') as textfile:
    textfile.write('acquisition time: ' + str(now[3]) + ':' +
                   str(now[4]) + '\n\n')
    # BUG FIX: iterate the result list itself — the original looped
    # range(len(search_response)), i.e. the number of top-level keys in
    # the response dict, not the number of jobs returned.
    for result in search_response['results']:
        textfile.write(result['formattedRelativeTime'] + '\t' +
                       result['jobtitle'] + '\t company: ' +
                       result['company'] + '\n' + result['url'] + '\n\n')
# The 'with' block closes the file; the explicit close() was redundant.
# Top-level sweep: search every US state across `complevels` salary bands
# (params/indeedapi/compincr/complevels/db are defined earlier in this
# module). Python 2 code (print statements, integer division).
newJobs = 0
expiredJobs = 0
states=["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
oldlist=[]
newlist=[]
runStart = time.time()
# Keys already stored; used below to detect brand-new postings.
for job in db.fluxxJobs.find():
    oldlist.append(job['jobkey'])
for state in states:
    print state,
    params['l'] = state
    for c in range(complevels):
        # Salary band "$cK-$(c+25-1)K"; the top band is open-ended.
        params['salary'] = "$" + str(c * 25) + "K-$" + str(((c+1) * 25)-1) + "K"
        if c == (complevels-1):
            params['salary'] = "$" + str(c*compincr) + "K"
        sr = indeedapi.search(**params)
        tr = sr['totalResults']
        ps = params['salary'].replace("$","")
        # Page through all results compincr at a time (py2 int division).
        for apirequests in range((tr/compincr)+1):
            params['start'] = (apirequests * compincr)
            sr = indeedapi.search(**params)
            for joblisting in sr['results']:
                # json round-trip yields a plain-dict deep copy.
                jobListing = json.loads(json.dumps(joblisting))
                newlist.append(jobListing['jobkey'])
                if joblisting['jobkey'] not in oldlist:
                    newJobs += 1
                    # Re-parse the RFC-1123 date (GMT renamed UTC so %Z
                    # accepts it) into 'YYYYmmdd HH:MM:SS'.
                    listed = joblisting['date'].replace('GMT','UTC')
                    joblisting['dateOpen'] = datetime.datetime.strftime(datetime.datetime.strptime(listed, '%a, %d %b %Y %H:%M:%S %Z'), '%Y%m%d %H:%M:%S')
                    joblisting['datesOpen'] = timeDictStamp()
                    joblisting['_id'] = joblisting['jobkey']
                    joblisting['status'] = 'Open'
def indeedAPI2(defTask):
    """Python 2 variant of indeedAPI: instead of sweeping US states, it
    sweeps a fixed list of foundation/employer names (via the 'company'
    parameter) across 11 salary bands, storing new postings in MongoDB
    (db.grantsJobs) and closing delisted ones.

    `defTask` is accepted but never used — TODO confirm caller contract.
    """
    params = {}
    # NOTE(review): the trailing comma makes this value a 1-tuple
    # ("1.2.3.4",), not a string — likely a bug.
    params['userip'] = "1.2.3.4",
    params['useragent'] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0)"
    params['start'] = 1
    params['latlong'] = 1
    params['as_ttl'] = ''
    params['limit'] = 25
    params['fromage']='any'
    params['radius'] = 0
    params['q'] = ''
    params['highlight'] = 0
    params['jobtitle'] = ''
    compincr = 25       # salary band width in $K, also the page size
    complevels = 11     # number of salary bands
    indeedapi = IndeedClient(publisher='7423517030312598')
    print params
    print 'START:',str(time.asctime(time.localtime()))
    newJobs = 0
    expiredJobs = 0
    # states=["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
    # Despite the name, `states` holds employer/foundation names here.
    states = ['Climate and Land Use Alliance','John S. and John L. Knight Foundation','Cynthia and George Mitchell Foundation','Atlantic Philanthropies','Council of State Governments','Leukemia & Lymphoma Society','John D and Catherine T. MacArthur Foundation','Unbound Philanthropy','Garfield Foundation','Freedom House','Wikimedia Foundation','AFDO','Getty Foundation','Altman Foundation','Colorado Trust','Jessie Ball duPont Fund','Arthur Vining Davis Foundations','The Christensen Fund','Rita Allen Foundation','NBA Legends','Trio Foundation of St. Louis','Surdna Foundation','Kresge Foundation','Carnegie Corporation of New York','Central Valley Community Foundation','Democracy Fund','Committee to Protect Journalists','American Cancer Society','Winthrop Rockefeller Foundation','Walter and Elise Haas Fund','ClimateWorks Foundation','Zellerbach Family Foundation','Hillman Family Foundations','Bosch Community Fund','The Scan Foundation','Hogg Foundation','Unitarian Universalist Service Committee','Whole Foods Market','Open Road Foundation','Max M. & Marjorie S. Fisher Foundation','ArtPlace America','Grace and Mercy Foundation','Alliance for Early Success','The New York Womens Foundation','DentaQuest','ECMC Foundation','Great Lakes Higher Education Guaranty','The J. Willard and Alice S. Marriott Foundation','Indiana Historical Society','Wallace H. Coulter Foundation']
    oldlist=[]
    newlist=[]
    runStart = time.time()
    print params
    # Keys already in the DB; anything not re-seen this run gets closed.
    for job in db.fluxxJobs.find():
        oldlist.append(job['jobkey'])
    for state in states:
        print state
        params['company'] = state
        for c in range(complevels):
            # Salary band "$cK-$(c+25-1)K"; the top band is open-ended.
            params['salary'] = "$" + str(c * 25) + "K-$" + str(((c+1) * 25)-1) + "K"
            if c == (complevels-1):
                params['salary'] = "$" + str(c*compincr) + "K"
            sr = indeedapi.search(**params)
            tr = sr['totalResults']
            ps = params['salary'].replace("$","")
            # Page through all results 25 at a time (py2 int division).
            for apirequests in range((tr/compincr)+1):
                params['start'] = (apirequests * compincr)
                sr = indeedapi.search(**params)
                for joblisting in sr['results']:
                    # json round-trip yields a plain-dict deep copy.
                    jobListing = json.loads(json.dumps(joblisting))
                    newlist.append(jobListing['jobkey'])
                    if joblisting['jobkey'] not in oldlist:
                        newJobs += 1
                        # Re-parse the RFC-1123 date (GMT renamed UTC so
                        # %Z accepts it) into 'YYYYmmdd HH:MM:SS'.
                        listed = joblisting['date'].replace('GMT','UTC')
                        joblisting['dateOpen'] = datetime.strftime(datetime.strptime(listed, '%a, %d %b %Y %H:%M:%S %Z'), '%Y%m%d %H:%M:%S')
                        joblisting['datesOpen'] = timeDictStamp('')
                        joblisting['_id'] = joblisting['jobkey']
                        joblisting['status'] = 'Open'
                        # NOTE(review): stores a reference to the shared,
                        # still-mutating params dict.
                        joblisting['searchparams'] = params
                        joblisting['searchparams']['procTime']=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        joblisting['searchparams']['totalResults'] = tr
                        joblisting['compMin'] = c*25000
                        joblisting['compMax'] = (c+1) * 25000
                        joblisting['compRange'] = params['salary']
                        # Drop empty location fields rather than store "".
                        if joblisting['city'] == "":
                            del joblisting['city']
                        if joblisting['state'] == "":
                            del joblisting['state']
                        job = joblisting
                        jobID=job['_id']
                        # Sanitized display fields (quotes stripped,
                        # latin-1-safe, length-capped).
                        Title=job['jobtitle'][0:60].replace("'","")
                        Company=job['company'].replace("'","").encode('latin-1','ignore')
                        if 'source' in job:
                            Source=job['source'].replace("'","")
                        else:
                            Source = ''
                        Description=job['snippet'][0:250].replace("'","").encode('latin-1','ignore').replace("'","")
                        Description = cleanup(Description,{'<b>':'','</b>':'','<B>':'','</B>':''})
                        listed = job['date'].replace('GMT','UTC')
                        job['dateListed'] = datetime.strftime(datetime.strptime(listed, '%a, %d %b %Y %H:%M:%S %Z'), '%Y%m%d %H:%M:%S')
                        CompRange_min=job['compMin']
                        CompRange_max=job['compMax']
                        CompRange=job['compRange']
                        # Enrich with full description text/concepts via
                        # the alchemy helper.
                        textURL = "http://www.indeed.com/viewjob?jk=" + job['jobkey']
                        jd = {}
                        jd['jdText'] = alchemy(textURL,'URLGetText','text')
                        jd['jdConcepts'] = alchemy(textURL,'URLGetRankedConcepts','concepts')
                        jd = json.loads(json.dumps(jd))
                        job['jobDescription'] = jd
                        # Insert only if not already in fluxxJobs.
                        if not db.fluxxJobs.find_one({'jobkey':job['jobkey']}):
                            db.grantsJobs.save(job)
    # Anything previously stored but not seen this run is now delisted.
    delisted = set(oldlist).difference(set(newlist))
    for jobkey in delisted:
        expiredJobs+=1
        # NOTE(review): `datetime.datetime.now()` here vs `datetime.now()`
        # above — one spelling must be wrong for this module's datetime
        # import; confirm.
        rightnow = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        upResult = db.grantsJobs.update({'jobkey':jobkey},{'$set':{'dateClosed':rightnow,'datesClosed':timeDictStamp(),'status':'Closed','expired':'true'}})
    print " "
    print 'FINISH:',str(time.asctime(time.localtime()))
    print '=================================================================================================='
def search(params):
    """Forward the given parameter dict to the Indeed search API and
    return the raw response."""
    indeed = IndeedClient(publisher=8201417039877332)
    return indeed.search(**params)
class TestSearch:
    """Integration tests for IndeedClient.search (nose-style).

    setup() builds a baseline parameter fixture; individual tests mutate
    or delete entries to exercise the client's parameter validation and
    the raw JSON/XML response modes.
    """

    def setup(self):
        self.client = IndeedClient('8251007850639120')
        self.params = {
            'q': "python",
            'l': "austin",
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        }
        self.utils = Utils()

    def teardown(self):
        self.client = None
        self.params = None

    @with_setup(setup, teardown)
    def test_search(self):
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict
        self.utils.output_to_file('sample', search_response)

    @with_setup(setup, teardown)
    def test_missing_one_required(self):
        # Only one of q/l is required; dropping 'l' alone should still succeed.
        del self.params['l']
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_both_required(self):
        del self.params['q']
        del self.params['l']
        # FIX: variable was misspelled 'search_esponse'.
        search_response = self.client.search(**self.params)

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_userip(self):
        del self.params['userip']
        search_response = self.client.search(**self.params)

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_useragent(self):
        del self.params['useragent']
        search_response = self.client.search(**self.params)

    @with_setup(setup, teardown)
    def test_raw_json(self):
        self.params['raw'] = True
        search_response = self.client.search(**self.params)
        assert isinstance(search_response, basestring)
        assert type(json.loads(search_response)) is dict

    @with_setup(setup, teardown)
    def test_raw_xml_with_paramter(self):
        self.params['format'] = "xml"
        self.params['raw'] = True
        search_response = self.client.search(**self.params)
        assert isinstance(search_response, basestring)
        assert parseString(search_response)

    @with_setup(setup, teardown)
    def test_raw_xml_without_paramter(self):
        # format=xml implies a raw string response even without raw=True.
        self.params['format'] = "xml"
        search_response = self.client.search(**self.params)
        assert isinstance(search_response, basestring)
        assert parseString(search_response)

    # Few Tests written by me

    @with_setup(setup, teardown)
    def test_search_extra(self):
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict
        assert len(self.utils.find_all_jobs_not_contains_job_parameter(
            search_response, 'city', 'austin')) == 0
        assert len(self.utils.find_all_jobs_not_contains_job_parameter(
            search_response, 'country', 'US')) == 0
        assert len(self.utils.find_all_jobs_not_contains_job_parameter(
            search_response, 'language', 'en')) == 0
        assert self.utils.get_num_jobs(search_response) == 10

    @with_setup(setup, teardown)
    def test_sort(self):
        self.params['sort'] = "date"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_start(self):
        self.params['start'] = "2"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_limit(self):
        # FIX: this test was defined twice; the identical second definition
        # shadowed this one and silently reduced coverage. Duplicate removed.
        self.params['limit'] = "25"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict
        assert self.utils.get_num_jobs(search_response) == 25

    @with_setup(setup, teardown)
    def test_fromage(self):
        self.params['fromage'] = "2"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_highlight(self):
        self.params['highlight'] = "1"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_duplicate(self):
        self.params['duplicate'] = "1"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_co(self):
        self.params['co'] = "ca"
        self.params['l'] = "toronto"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_invalid_limit(self):
        self.params['limit'] = '-100'
        search_response = self.client.search(**self.params)
        assert self.utils.get_num_jobs(search_response) == 0

    # trying a bunch of invalid parameters, I noticed that no error is
    # thrown. Instead it seems to ignore. Is this correct?
    # ie. negative fromage, string instead of ints and vice versa

    @with_setup(setup, teardown)
    def test_several_params(self):
        self.params['co'] = "ca"
        self.params['l'] = "toronto"
        self.params['duplicate'] = "1"
        self.params['highlight'] = "1"
        self.params['limit'] = "25"
        self.params['fromage'] = "10"
        self.params['start'] = "2"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict
        assert self.utils.get_num_jobs(search_response) == 25
# create a session Session = sessionmaker(bind=engine) session = Session() # publisher=5950869068484812 client = IndeedClient('5950869068484812') params = { 'q': "python", 'l': "Palo Alto", 'userip': "168.159.213.210", 'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4)", 'limit': "50", 'sort': "date", 'start': "0" } search_response = client.search(**params) print search_response # print search_response['results'] # use JSON editor online to view result # http://www.jsoneditoronline.org/ with open('indeed_positions_json.txt', 'w') as outfile: jobs = json.dump(search_response, outfile) #jobs = json.load(search_response) for key, value in search_response.iteritems(): #print "%s: %s" % (key, value) if key == "results": res = value for index in value: #print i
def get_api_results(self, desired_result_count=1):
    """Return job JSON objects fetched from the Indeed API.

    Searches every location in the job profile, 25 results per request
    (the API maximum), queuing results until ``desired_result_count``
    jobs have been collected.

    Args:
        desired_result_count: how many jobs the caller wants.

    Returns:
        A single job dict when desired_result_count == 1; otherwise a
        list of up to desired_result_count job dicts ([] if none found).

    Raises:
        ValueError: when the publisher id cannot be loaded.
    """
    job_profile = CommonFuncs.get_job_profile()

    # GET LOCATIONS IN JOB PROFILE — searched one at a time below.
    locations = CommonFuncs.get_locations_list(job_profile)

    # KEYWORDS CONNECTED BY OR into a single Indeed query string.
    query_list = CommonFuncs.build_query_string(job_profile=job_profile,
                                                or_delim='or',
                                                bracket1='(',
                                                bracket2=')',
                                                adv_supp=True)
    query_string = query_list[0]

    new_jobs_queue = queue.Queue(maxsize=0)
    new_jobs = None
    limit = '25'            # 25 is the max results per request
    lookback_period = '60'  # default lookback period

    # CONNECT TO INDEED API FOR JOB QUERIES
    try:
        # FIX: use a context manager instead of json.load(open(...)),
        # which leaked the file handle.
        with open(API_KEYS_PATH, 'r') as key_file:
            client_id = json.load(key_file)
        api = IndeedClient(publisher=client_id['publisher_id'])
    except Exception as err:
        # FIX: the original built this ValueError but never raised it
        # (bare `except: ValueError(...)`), leaving api=None and
        # guaranteeing an AttributeError on api.search() later.
        raise ValueError('No publisher id found. Filtering aborted.') from err

    filters = {
        'q': query_string,
        'l': '',
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        "raw": "False",
        "sort": "date",
        "radius": job_profile.radius,
        "limit": limit,
        "fromage": lookback_period,
    }

    # FIND NEW JOB JSON OBJECTS USING INDEED API
    for location in locations:  # iterate over each location
        filters['l'] = location
        filters['q'] = query_string

        def get_results(start_index):
            """Fetch one page at start_index and queue jobs while under quota."""
            filters['start'] = start_index
            page = json.loads(CommonFuncs.convertBytesToString(
                api.search(**filters)))['results']
            for job in page:
                # unfinished_tasks counts queued-but-unprocessed items;
                # stop adding once the caller's quota is met.
                if new_jobs_queue.unfinished_tasks < desired_result_count:
                    new_jobs_queue.put(job)

        result_count = int(json.loads(CommonFuncs.convertBytesToString(
            api.search(**filters)))['totalResults'])

        # Start offsets for each 25-result page.
        for start in (str(i) for i in range(0, result_count, 25)):
            if new_jobs_queue.unfinished_tasks >= desired_result_count:
                break
            get_results(start)

        new_jobs = list(new_jobs_queue.queue)  # snapshot queued results

    # RETURN JOBS
    if not new_jobs:
        return []  # no new jobs found
    if desired_result_count == 1:
        # just return a single job, not in a list
        return new_jobs[0]
    # Slicing returns at most desired_result_count jobs, or everything
    # available when fewer were found — covers both original branches.
    return new_jobs[0:desired_result_count]
class indeed:
    """Thin wrapper over the Indeed publisher API for skill-based job searches.

    Results are accumulated across pages, de-duplicated by jobkey into
    ``self.jobDataFrame`` and cached to sample.csv.
    """
    #jobDataFrame

    def __init__(self):
        # self.jobDataFrame = pd.DataFrame()
        self.client = IndeedClient(8836246992678581)

    @staticmethod
    def _normalize_jobtype(jobtype):
        """Map loose user input onto Indeed 'jt' values; echoes the input."""
        print(jobtype)
        if jobtype in ['intern', 'internship', 'Internship']:
            return 'internship'
        return 'fulltime'

    def _collect(self, params):
        """Page through results (capped near 100) for the given params.

        Mutates params['start'] in 25-result steps; writes the
        de-duplicated frame to sample.csv and returns the raw result list.
        """
        i = 25
        search_response = self.client.search(**params)
        results = []
        if len(search_response['results']) <= 0:
            return results
        # NOTE(review): when totalResults <= 25 the loop never runs and []
        # is returned even though the first page had results — preserved
        # from the original logic; confirm whether that is intended.
        while i < 100 and i < search_response['totalResults']:
            results += search_response['results']
            params['start'] += 25
            search_response = self.client.search(**params)
            results += search_response['results']
            i += 25
            print(params['start'])
        self.jobDataFrame = pd.DataFrame(results).drop_duplicates('jobkey')
        self.jobDataFrame.to_csv("sample.csv", encoding='UTF-8')
        return results

    def skill(self, l, city, jobtype):
        """Search jobs matching ALL skills in l within city."""
        params = {
            'q': " AND ".join(l),
            'l': city,
            'jt': self._normalize_jobtype(jobtype),
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'limit': "25",
            'start': 0,
            'highlight': 1
        }
        return self._collect(params)

    def skillOR(self, l, city, jobtype):
        """Search jobs matching ANY skill in l within city."""
        params = {
            'q': " OR ".join(l),
            'l': city,
            'jt': self._normalize_jobtype(jobtype),
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'limit': "50",
            # FIX: 'start' was never initialized here, so the pagination
            # step params['start'] += 25 raised KeyError on any
            # multi-page result set.
            'start': 0
        }
        return self._collect(params)
from indeed import IndeedClient
import csv

client = IndeedClient(publisher=2186395790213512)

# Pull up to 8 pages x 25 results of "marketing" postings.
tot = []
for page in range(8):
    params = {
        'q': "marketing",
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'format': 'json',
        'limit': 25,
        'start': page * 25  # page offset into the result set
    }
    sr = client.search(**params)
    # FIX: iterate the results directly instead of indexing with
    # range(0, len(...)) — same order, idiomatic.
    tot.extend(sr['results'])

# Flatten each job dict into an ASCII-safe row; field order matches the
# original column layout exactly.
_FIELDS = ('jobtitle', 'url', 'city', 'date', 'company', 'snippet',
           'source', 'jobkey')
allJobs = []
for job in tot:
    allJobs.append([job[field].encode('ascii', 'ignore') for field in _FIELDS])
class TestSearch:
    """nose-style tests covering IndeedClient.search parameter handling.

    setup() provides a baseline query; each test removes or adds entries
    to exercise required-parameter validation and raw response modes.
    """

    def setup(self):
        # Fresh client and baseline parameters before every test.
        self.client = IndeedClient("YOUR_PUBLISHER_NUMBER")
        self.params = dict(
            q="python",
            l="austin",
            userip="1.2.3.4",
            useragent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        )

    def teardown(self):
        self.client = None
        self.params = None

    @with_setup(setup, teardown)
    def test_search(self):
        response = self.client.search(**self.params)
        assert type(response) is dict

    @with_setup(setup, teardown)
    def test_missing_one_required(self):
        # Only one of q/l is mandatory; dropping 'l' alone still succeeds.
        self.params.pop('l')
        response = self.client.search(**self.params)
        assert type(response) is dict

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_both_required(self):
        self.params.pop('q')
        self.params.pop('l')
        self.client.search(**self.params)

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_userip(self):
        self.params.pop('userip')
        self.client.search(**self.params)

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_useragent(self):
        self.params.pop('useragent')
        self.client.search(**self.params)

    @with_setup(setup, teardown)
    def test_raw_json(self):
        self.params['raw'] = True
        response = self.client.search(**self.params)
        assert isinstance(response, basestring)
        assert type(json.loads(response)) is dict

    @with_setup(setup, teardown)
    def test_raw_xml_with_paramter(self):
        self.params.update(format="xml", raw=True)
        response = self.client.search(**self.params)
        assert isinstance(response, basestring)
        assert parseString(response)

    @with_setup(setup, teardown)
    def test_raw_xml_without_paramter(self):
        # format=xml alone yields a raw XML string, even without raw=True.
        self.params['format'] = "xml"
        response = self.client.search(**self.params)
        assert isinstance(response, basestring)
        assert parseString(response)