def CareerJet(locale, loc, key):
    """Search Careerjet for `key` jobs near `loc` and crawl each posting.

    A first small query fetches the page count, then page 1 is re-fetched
    with a large pagesize; every job's redirect URL is rebuilt from its id
    and handed to trade_spider() along with the job metadata.  Finally the
    module-level `data` accumulator is written to careerjet1.json.
    """
    client = CareerjetAPIClient(locale)
    base_query = {
        'location': loc,
        'keywords': key,
        'affid': 'c3fd1d19a754927bc31347c56f397b0f',
        'user_ip': '192.168.0.102',
        'url': 'http://www.google.com/',
        'user_agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0',
    }

    probe = client.search(dict(base_query, pagesize='10', page=1))
    max_pages = probe['pages']
    print(max_pages)

    # NOTE(review): despite computing max_pages above, only page 1 is
    # fetched here (range(1, 2)) — confirm whether this cap is intentional.
    for page_no in range(1, 2):
        page_result = client.search(dict(base_query, pagesize='99', page=page_no))

        # Follow each job's redirect link, which holds the full description.
        for job in page_result['jobs']:
            slug = job['url'].split('/')[-1]
            job_view_url = LOCALES[locale] + "/jobview/" + slug
            job_id = slug.split('.')[0]
            posted_date = job['date']
            mini_description = job['description']
            site = job['site']
            salary = job['salary']
            trade_spider(job_view_url, locale, job_id, posted_date,
                         mini_description, site, salary)

    # `data` is presumably a module-level accumulator filled by
    # trade_spider() — TODO confirm; it is not defined in this function.
    with open("careerjet1.json", "w") as outf:
        json.dump(data, outf)
def csv_tech():
    """Fetch 'remodeling' jobs in the USA from Careerjet and split the fields.

    Returns eight parallel lists: salary, description, title, url, company,
    locations, site, date.  On failure prints 'Failure!' and returns whatever
    was collected so far (possibly all-empty lists) instead of raising a
    NameError as the original did when the API call failed before the lists
    were bound.
    """
    # Bind the result lists up front so the final return is always valid,
    # even when the try-block fails on the very first statement.
    joblist1 = []
    joblist2 = []
    joblist3 = []
    joblist4 = []
    joblist5 = []
    joblist6 = []
    joblist7 = []
    joblist8 = []
    try:
        cj = CareerjetAPIClient("en_US")  # locale the API searches in
        result_json = cj.search({
            'location': 'USA',
            'keywords': 'remodeling',
            'affid': '213e213hd12344552',
            'user_ip': '129.114.19.4',
            'url': 'https://www.careerjet.com/search/jobs?s=remodeling&l=',
            'user_agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
        })
        # Round-trip through json/ast to strip the u'' prefixes from the
        # dictionaries returned by the client (Python 2 unicode artifacts).
        result_json_1 = ast.literal_eval(json.dumps(result_json))
        jobs = result_json_1['jobs']
        for job in jobs:
            joblist1.append(job.get('salary'))
            # NOTE(review): fields 2-8 are only collected when the
            # description is non-empty, so those lists can drift out of
            # step with joblist1 — behavior preserved from the original;
            # confirm this asymmetry is intended.
            if job.get('description') != '':
                joblist2.append(job.get('description'))
                joblist3.append(job.get('title'))
                joblist4.append(job.get('url'))
                joblist5.append(job.get('company'))
                joblist6.append(job.get('locations'))
                joblist7.append(job.get('site'))
                joblist8.append(job.get('date'))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; call form of print works on py2 and py3.
        print('Failure!')
    return joblist1, joblist2, joblist3, joblist4, joblist5, joblist6, joblist7, joblist8
def crawl():
    """Fetch London jobs from Careerjet and rank O*NET occupations per job.

    Every occupation in onet.occupation_data is scored against each job by
    string similarity between the job title/description and the occupation
    title/description/alternate titles.  Returns a list of
    (job_title, [(onetsoc_code, onet_title, score), ...]) tuples with the
    candidates sorted by score descending; the top 10 per job are printed.
    """
    cj = CareerjetAPIClient("en_GB")
    result_json = cj.search({
        'location': 'london',
        'affid': '15dab4607182f9977616ac6edc0ddd75',
        'user_ip': '192.168.1.64',
        'user_agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0',
        'url': 'http://www.example.com/jobsearch?q=python&l=london'})
    jobs = result_json['jobs']

    db = mysql.connect(host="localhost", user="******", passwd="iiits_12345", db="onet")
    cur = db.cursor()
    cur.execute("""SELECT title, description ,onetsoc_code FROM occupation_data """)
    db_results = cur.fetchall()

    RI = []
    for j in range(len(jobs)):
        RI.append([])
        title = str(jobs[j]['title'].encode('utf8'))
        description = str(jobs[j]['description'].encode('utf8'))
        for onet_title, onet_desc, onet_code in db_results:
            # Parameterized query (DB-API placeholders) instead of string
            # concatenation — the original built the WHERE clause by hand.
            cur.execute(
                "SELECT alternate_title, short_title FROM alternate_titles "
                "WHERE onetsoc_code = %s",
                (str(onet_code),))
            alternates = cur.fetchall()
            # Best title-similarity bias over all alternate titles.
            bias = 0.0
            for alt in alternates:
                tbias = similarity(title, alt[0])
                if alt[1] is not None:  # `is not None`, not `!= None`
                    tbias += similarity(title, alt[1])
                bias = max(tbias, bias)
            # Weighted score: title-title matches dominate, the
            # description-description term contributes least (0.1).
            score = (bias
                     + similarity(title, onet_title)
                     + 0.5 * similarity(title, onet_desc)
                     + 0.5 * similarity(description, onet_title)
                     + 0.1 * similarity(description, onet_desc))
            RI[j].append((onet_code, onet_title, score))
        RI[j] = (title, sorted(RI[j], key=lambda x: x[2], reverse=True))

    for r in RI:
        print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
        print(r[0])
        for x in r[1][:10]:
            print(x)
        print("---------------------------------------------------------------")
    return RI
import requests
import numpy as np
from requests.exceptions import HTTPError

# Careerjet API client docs:
# https://github.com/careerjet/careerjet-api-client-python
projectId = "nth-honor-259919"
key = "5f414fcedee8f11f5c8116653559c13daa2dbe03"
affId = "8acafed2c2c1c95fdd17ea85633d394a"

from careerjet_api_client import CareerjetAPIClient

cj = CareerjetAPIClient("en_US")

# One search: Java/Python/AWS roles in Seattle, five results per page.
search_params = {
    'location': 'seattle',
    'keywords': 'java&python&aws',
    'pagesize': '5',
    'affid': "8acafed2c2c1c95fdd17ea85633d394a",
    'user_ip': '209.141.193.102',
    'url': 'https://www.seekrlabs.com/jobsearch?q=python&l=london',
    'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}
result_json = cj.search(search_params)
print(result_json)
from careerjet_api_client import CareerjetAPIClient import json, ast, csv try: cj = CareerjetAPIClient("en_US") #This tells the api what country to look for result_json = cj.search({ 'location': 'USA', 'keywords': 'remodeling', 'affid': '213e213hd12344552', 'user_ip': '129.114.19.4', 'url': 'https://www.careerjet.com/search/jobs?s=remodeling&l=', 'user_agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0' }) #This is the information is used to do a get the information result_json_1 = ast.literal_eval(json.dumps(result_json)) #This is used to strip u' from all the dictionaries jobs = (result_json_1['jobs']) #This is used to get the jobs from the results of the api cal dictOfWords = {i: jobs[i] for i in range(0, len(jobs))} #This is used to give a number for each job and its information joblist = [] #A list used to get the information of each job for i in range(0, len(dictOfWords)):
from careerjet_api_client import CareerjetAPIClient

# Minimal Careerjet example: search python jobs in London and report the
# number of top-level keys in the API response.
cj = CareerjetAPIClient("en_US")

query = {
    'location': 'london',
    'keywords': 'python',
    'affid': '213e213hd12344552',
    'user_ip': '11.22.33.44',
    'url': 'http://www.example.com/jobsearch?q=python&l=london',
    'user_agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
}
result_json = cj.search(query)

# Call-form print emits the same output on Python 2 and 3.
print(len(result_json))
import json import os import urllib2 from urllib import urlencode from careerjet_api_client import CareerjetAPIClient from django.http import JsonResponse, HttpResponseBadRequest, HttpResponse from models import SearchResult, SavedJob, Job, User careerjet_key = os.environ.get('CAREERJET_KEY') cj = CareerjetAPIClient("en_US") indeed_key = os.environ.get('INDEED_KEY') def careerjet_query(job, location, url, user_ip, user_agent): cj_query = { 'location': location, 'keywords': job, 'affid': careerjet_key, 'user_ip': user_ip, 'url': url, 'user_agent': user_agent, } # Get the first results page result_json = cj.search(cj_query) # Get the other pages all_jobs = result_json['jobs'] for page in range(2, result_json['pages']):
from careerjet_api_client import CareerjetAPIClient
import time, json
from config import affid, user_ip, url

# Run one salary-sorted 'data scientist' search per continent and dump the
# raw response to '<location>.json', pausing between requests so we do not
# hammer the API.
cj = CareerjetAPIClient("en_GB")

locations = ['africa', 'asia', 'australia', 'europe', 'north america', 'south america']

for loc in locations:
    query = {
        'location': loc,
        'keywords': 'data scientist',
        'sort': 'salary',
        'affid': affid,
        'user_ip': user_ip,
        'url': url,
        'user_agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0',
    }
    result_json = cj.search(query)
    with open(loc + '.json', 'w') as outfile:
        json.dump(result_json, outfile)
    time.sleep(20)
# Second Argument: 'USER_AGENT' - The User Agent of the user's browser # e.x.: 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0' # IMPORTANT ----- Due to how command line works, we replace '(' and ')' with '*' # and ';' with '^'. Need to change that in this program. # Third Argument: 'URL' - The URL that this information gained will be displaying on. # I'm guessing during test it will be 'http://localhost/search?example=true' # ---------------------NOT REQUIRED # Fourth Argument: 'Keywords' - Keywords to match the title, content, or company name of a job # # Returns: This will return an object to stdout. from careerjet_api_client import CareerjetAPIClient import sys import re import json cj = CareerjetAPIClient("en_US") IP = sys.argv[1] USER_AGENT = (" ").join(sys.argv[2].replace('*', '(').replace('+', ')').replace( '^', ';').split('-')) AFFILIATE_ID = sys.argv[5] #'12675a3a68f21b112085faa208e88b9d' URL = sys.argv[3] KEYWORDS = sys.argv[4] # print IP, USER_AGENT, URL, keywords result = cj.search({ 'keywords': KEYWORDS, 'affid': AFFILIATE_ID, 'user_ip': IP, 'url': URL,
return ' '.join(resultwords) comp["Companies"] = comp["Companies"].apply(lambda x: x.lower()) comp["Companies"] = comp["Companies"].apply(lambda x: removewords(x)) comp["Companies"] = comp["Companies"].apply(lambda x: x.replace(" ", "+")) #function to request data from Careerjets API. You will need to sign up for an account to use this: http://www.careerjet.co.uk/partners #Add in you affid (top right of screen), IP address (open terminal, type ipconfig if you don't know this) # and put in your user_agent (web browser in) #More informatino can be found here: https://github.com/careerjet/careerjet-api-client-python #this will return the count of jobs for each keyword search however, if you remove the ["hits"] there is a lot more information available from careerjet_api_client import CareerjetAPIClient cj = CareerjetAPIClient("en_GB") def careerjetAPI(keywords): result_json = cj.search({ 'keywords': keywords, 'affid': '1fda10a87c250ec936a0082f7244ec97', 'user_ip': '172.28.3.16', 'url': 'http://www.careerjet.co.uk/jobsearch?q=' + keywords, 'user_agent': 'Firefox/31.0' }) time.sleep(2) return result_json["hits"] #Cycle through the URL's changing the keywords