Example 1
from careerjet_api_client import CareerjetAPIClient
import json

# LOCALES (a locale -> site-URL mapping) and trade_spider() are defined
# elsewhere in this project; a sketch of both follows the example.
def CareerJet(locale, loc, key):
    cj = CareerjetAPIClient(locale)

    result_json = cj.search({
        'location'   : loc,
        'keywords'   : key,
        'affid'      : 'c3fd1d19a754927bc31347c56f397b0f',
        'pagesize'   : '10',
        'page'       : 1,
        'user_ip'    : '192.168.0.102',
        'url'        : 'http://www.google.com/',
        'user_agent' : 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
    })

    max_pages = result_json['pages']
    print(max_pages)

    # Walk every results page (the original hard-coded range(1, 2), which
    # fetched only page 1 despite computing max_pages above).
    for value in range(1, max_pages + 1):
        result_json = cj.search({
            'location'   : loc,
            'keywords'   : key,
            'affid'      : 'c3fd1d19a754927bc31347c56f397b0f',
            'pagesize'   : '99',
            'page'       : value,
            'user_ip'    : '192.168.0.102',
            'url'        : 'http://www.google.com/',
            'user_agent' : 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
        })

        jobs = result_json['jobs']

        # Visit each job's redirect link, which leads to the page with the
        # full description.
        for job in jobs:
            slug = job['url'].split('/')[-1]
            view_url = LOCALES[locale] + "/jobview/" + slug
            job_id = slug.split('.')[0]

            date = job['date']
            mini_description = job['description']
            website = job['site']
            salary = job['salary']

            trade_spider(view_url, locale, job_id, date, mini_description, website, salary)

    with open("careerjet1.json", "w") as outf:
            json.dump(data, outf)
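Example 1 leans on two names defined elsewhere in its project: the LOCALES mapping and the trade_spider() helper. A minimal sketch of the shapes they might take, plus a call to the function above (everything here is an assumption for illustration, not the original project's code):

# Hypothetical locale -> site-URL mapping; only en_US shown.
LOCALES = {'en_US': 'http://www.careerjet.com'}

def trade_spider(view_url, locale, job_id, date, mini_description, website, salary):
    # Placeholder: the real helper presumably fetches view_url and scrapes
    # the full job description; here it just echoes what it was given.
    print(job_id, view_url)

CareerJet('en_US', 'Boston', 'python')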
Example 2
from careerjet_api_client import CareerjetAPIClient
import json, ast


def csv_tech():
    # Define the result lists up front so the return at the bottom cannot
    # hit a NameError if the API call fails early.
    joblist1 = []
    joblist2 = []
    joblist3 = []
    joblist4 = []
    joblist5 = []
    joblist6 = []
    joblist7 = []
    joblist8 = []
    try:
        cj = CareerjetAPIClient("en_US")
        # "en_US" tells the API which country to search.
        result_json = cj.search({
            'location': 'USA',
            'keywords': 'remodeling',
            'affid': '213e213hd12344552',
            'user_ip': '129.114.19.4',
            'url': 'https://www.careerjet.com/search/jobs?s=remodeling&l=',
            'user_agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
        })
        # Round-trip the response through json to strip the u'' prefixes
        # from all the dictionaries.
        result_json_1 = ast.literal_eval(json.dumps(result_json))
        # The list of jobs from the results of the API call.
        jobs = result_json_1['jobs']
        # Number each job so its information can be looked up by index.
        dictOfWords = {i: jobs[i] for i in range(0, len(jobs))}
        for i in range(0, len(dictOfWords)):
            job_dict = dictOfWords.get(i)
            # Split each job's information out into the per-field lists.
            # NOTE: salary is appended unconditionally while the other fields
            # are skipped when the description is empty, so the lists can fall
            # out of step with one another.
            joblist1.append(job_dict.get('salary'))
            if job_dict.get('description') != '':
                joblist2.append(job_dict.get('description'))
                joblist3.append(job_dict.get('title'))
                joblist4.append(job_dict.get('url'))
                joblist5.append(job_dict.get('company'))
                joblist6.append(job_dict.get('locations'))
                joblist7.append(job_dict.get('site'))
                joblist8.append(job_dict.get('date'))
    except Exception:
        print('Failure!')

    return joblist1, joblist2, joblist3, joblist4, joblist5, joblist6, joblist7, joblist8
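csv_tech() returns eight parallel lists of job fields. Given the function's name (and the csv import in Example 5), a plausible next step is writing them out as rows; a sketch, assuming the alignment caveat noted above has been handled:

import csv

salaries, descs, titles, urls, companies, locations, sites, dates = csv_tech()

with open('jobs.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['salary', 'description', 'title', 'url',
                     'company', 'locations', 'site', 'date'])
    # zip() stops at the shortest list, which papers over any misalignment.
    writer.writerows(zip(salaries, descs, titles, urls,
                         companies, locations, sites, dates))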
Example 3
# The `mysql` name below is assumed to be MySQLdb (the mysqlclient package);
# similarity() is an external helper defined elsewhere in the project -- a
# possible stand-in is sketched after the example.
from careerjet_api_client import CareerjetAPIClient
import MySQLdb as mysql


def crawl():
	cj = CareerjetAPIClient("en_GB")
	result_json = cj.search({'location':'london', 
		'affid':'15dab4607182f9977616ac6edc0ddd75',
		'user_ip':'192.168.1.64', 
		'user_agent':'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0', 
		'url':'http://www.example.com/jobsearch?q=python&l=london'})
	jobs = result_json['jobs']
	db = mysql.connect(host="localhost", user="******", passwd="iiits_12345", db="onet")        
	cur = db.cursor()
	cur.execute("""SELECT title, description ,onetsoc_code FROM occupation_data """)	

	RI = []
	db_results = cur.fetchall()
	for j in range(len(jobs)):
		RI.append([]) 
		# (the original's str(...encode('utf8')) is a Python 2 idiom; under
		# Python 3 these values are already str)
		title = jobs[j]['title']
		description = jobs[j]['description']

		for _c in range(len(db_results)):
			onet_title = db_results[_c][0]  
			onet_desc = db_results[_c][1]
			onet_code = db_results[_c][2]
			cur.execute("""SELECT alternate_title, short_title from alternate_titles WHERE onetsoc_code = """ + "'"+str(onet_code)+"'")
			alter = cur.fetchall();
			bias = 0.0
			for a in alter:
				tbias = similarity(title, a[0]) 
				if a[1] is not None:
					tbias += similarity(title, a[1])
				bias = max(tbias, bias)
								
			score = (bias
				+ similarity(title, onet_title)
				+ 0.5 * similarity(title, onet_desc)
				+ 0.5 * similarity(description, onet_title)
				+ 0.1 * similarity(description, onet_desc))
			RI[j].append((onet_code, onet_title, score))
	
		RI[j] = (title, sorted(RI[j], key=lambda x: x[2], reverse=True))

	for r in RI:
		print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
		print(r[0])
		for x in r[1][:10]:
			print(x)
			print("---------------------------------------------------------------")
	return RI		
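crawl() scores each CareerJet posting against O*NET occupation titles with an external similarity() function the snippet never defines. One simple stand-in, assuming it takes two strings and returns a score (difflib is in the standard library):

from difflib import SequenceMatcher

def similarity(a, b):
    # Ratio of matching characters between the two strings, in [0, 1].
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()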
Example 4
from careerjet_api_client import CareerjetAPIClient

# GitHub link for more description on this API:
# https://github.com/careerjet/careerjet-api-client-python
projectId = "nth-honor-259919"  # unused in this snippet
key = "5f414fcedee8f11f5c8116653559c13daa2dbe03"  # unused in this snippet
affId = "8acafed2c2c1c95fdd17ea85633d394a"

cj = CareerjetAPIClient("en_US")
result_json = cj.search({
    'location': 'seattle',
    'keywords': 'java&python&aws',
    'pagesize': '5',
    'affid': affId,  # defined above
    'user_ip': '209.141.193.102',
    'url': 'https://www.seekrlabs.com/jobsearch?q=python&l=london',
    'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
})

print(result_json)
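The response printed above is a dict; the fields the other examples rely on ('jobs', 'hits', 'pages') can be read out directly, e.g.:

for job in result_json['jobs']:
    print(job['title'], '|', job['site'], '|', job['date'])
print('total hits:', result_json['hits'], 'pages:', result_json['pages'])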
Example 5
from careerjet_api_client import CareerjetAPIClient
import json, ast, csv

try:
    cj = CareerjetAPIClient("en_US")
    # "en_US" tells the API which country to search.

    result_json = cj.search({
        'location': 'USA',
        'keywords': 'remodeling',
        'affid': '213e213hd12344552',
        'user_ip': '129.114.19.4',
        'url': 'https://www.careerjet.com/search/jobs?s=remodeling&l=',
        'user_agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
    })
    # Round-trip the response through json to strip the u'' prefixes
    # from all the dictionaries.
    result_json_1 = ast.literal_eval(json.dumps(result_json))
    # The list of jobs from the results of the API call.
    jobs = result_json_1['jobs']
    # Number each job so its information can be looked up by index.
    dictOfWords = {i: jobs[i] for i in range(0, len(jobs))}
    # A list that will collect each job's information.
    joblist = []
    for i in range(0, len(dictOfWords)):
        # The snippet is cut off here; collecting each numbered job dict is a
        # minimal completion (Example 2 shows a fuller per-field version).
        joblist.append(dictOfWords.get(i))
except Exception:
    # The original's except clause was also cut off; Example 2 prints a
    # failure message at the same point.
    print('Failure!')
Example 6
from careerjet_api_client import CareerjetAPIClient

cj = CareerjetAPIClient("en_US")

result_json = cj.search({
    'location': 'london',
    'keywords': 'python',
    'affid': '213e213hd12344552',
    'user_ip': '11.22.33.44',
    'url': 'http://www.example.com/jobsearch?q=python&l=london',
    'user_agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
})

# NOTE: result_json is a dict, so this counts its top-level keys
# ('jobs', 'hits', 'pages', ...), not the number of jobs returned.
print(len(result_json))
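To count results rather than response keys, index into the fields the other examples use:

print(len(result_json['jobs']))  # jobs on this page
print(result_json['hits'])       # total matches reported by the API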
Example 7
import json
import os
import urllib2
from urllib import urlencode

from careerjet_api_client import CareerjetAPIClient
from django.http import JsonResponse, HttpResponseBadRequest, HttpResponse

from models import SearchResult, SavedJob, Job, User

careerjet_key = os.environ.get('CAREERJET_KEY')
cj = CareerjetAPIClient("en_US")
indeed_key = os.environ.get('INDEED_KEY')


def careerjet_query(job, location, url, user_ip, user_agent):
    cj_query = {
        'location': location,
        'keywords': job,
        'affid': careerjet_key,
        'user_ip': user_ip,
        'url': url,
        'user_agent': user_agent,
    }

    # Get the first results page
    result_json = cj.search(cj_query)

    # Get the other pages
    all_jobs = result_json['jobs']
    for page in range(2, result_json['pages'] + 1):
        # The loop body was cut off in the original; requesting each page via
        # the 'page' parameter (as in Example 1) and accumulating its jobs is
        # a minimal completion. (The original range also stopped one page short.)
        cj_query['page'] = page
        result_json = cj.search(cj_query)
        all_jobs += result_json['jobs']

    return all_jobs

Example 8
from careerjet_api_client import CareerjetAPIClient
import time, json
from config import affid, user_ip, url

cj  =  CareerjetAPIClient("en_GB");

locations = ['africa', 'asia', 'australia', 'europe', 'north america', 'south america']

for loc in locations:
	result_json = cj.search({
		'location'   : loc,
		'keywords'   : 'data scientist',
		'sort'       : 'salary',
		'affid'      : affid,
		'user_ip'    : user_ip,
		'url'        : url,
		'user_agent' : 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
	})
	
	with open(loc + '.json', 'w') as outfile:
		json.dump(result_json, outfile)
	
	time.sleep(20)  # pause between location queries to stay polite to the API
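This example pulls affid, user_ip, and url from a local config module. A minimal config.py along these lines would make it runnable (all values are placeholders, not real credentials):

# config.py
affid = 'your-careerjet-affiliate-id'
user_ip = '11.22.33.44'
url = 'http://www.example.com/jobsearch?q=data+scientist'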
Example 9
# 	Second Argument: 'USER_AGENT' - The User Agent of the user's browser
# 						e.g.: 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0'
# 						IMPORTANT ----- Because of how the command line treats these characters,
#							'(' is passed as '*', ')' as '+', and ';' as '^'; this program
#							converts them back below.
# 	Third Argument: 'URL' - The URL that this information gained will be displaying on.
# 						I'm guessing during test it will be 'http://localhost/search?example=true'
# ---------------------NOT REQUIRED
#	Fourth Argument: 'Keywords' - Keywords to match the title, content, or company name of a job
#
# Returns: This will return an object to stdout.
from careerjet_api_client import CareerjetAPIClient
import sys
import re
import json

cj = CareerjetAPIClient("en_US")
IP = sys.argv[1]
USER_AGENT = (" ").join(sys.argv[2].replace('*',
                                            '(').replace('+', ')').replace(
                                                '^', ';').split('-'))
AFFILIATE_ID = sys.argv[5]  #'12675a3a68f21b112085faa208e88b9d'
URL = sys.argv[3]
KEYWORDS = sys.argv[4]

# print(IP, USER_AGENT, URL, KEYWORDS)

result = cj.search({
    'keywords': KEYWORDS,
    'affid': AFFILIATE_ID,
    'user_ip': IP,
    'url': URL,
    'user_agent': USER_AGENT,
})

# The snippet is cut off mid-call; closing it and, as the header comment says,
# writing the resulting object to stdout is a minimal completion.
print(json.dumps(result))
Example 10
import time

import pandas as pd

# The opening of this snippet is cut off: comp is evidently a pandas
# DataFrame with a "Companies" column, and removewords() strips unwanted
# tokens from a company name. A minimal reconstruction of the function
# (the stop-word list is a placeholder):
def removewords(querywords):
    removelist = ['ltd', 'plc', 'limited']  # placeholder stop-words
    resultwords = [word for word in querywords.split() if word not in removelist]
    return ' '.join(resultwords)


comp["Companies"] = comp["Companies"].apply(lambda x: x.lower())
comp["Companies"] = comp["Companies"].apply(lambda x: removewords(x))
comp["Companies"] = comp["Companies"].apply(lambda x: x.replace(" ", "+"))

# Function to request data from Careerjet's API. You will need to sign up for
# an account to use this: http://www.careerjet.co.uk/partners
# Add in your affid (top right of the screen), your IP address (run ipconfig
# in a terminal if you don't know it), and your user_agent (your web browser).
# More information can be found here:
# https://github.com/careerjet/careerjet-api-client-python

# This returns the count of jobs for each keyword search; if you remove
# ["hits"] below, there is a lot more information available.
from careerjet_api_client import CareerjetAPIClient

cj = CareerjetAPIClient("en_GB")


def careerjetAPI(keywords):
    result_json = cj.search({
        'keywords': keywords,
        'affid': '1fda10a87c250ec936a0082f7244ec97',
        'user_ip': '172.28.3.16',
        'url': 'http://www.careerjet.co.uk/jobsearch?q=' + keywords,
        'user_agent': 'Firefox/31.0'
    })
    time.sleep(2)
    return result_json["hits"]


# Cycle through the queries, changing the keywords; a sketch follows.
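The loop itself is cut off in the original. A sketch of what it plausibly does, applying careerjetAPI() to each prepared company name and storing the hit counts (the "Hits" column name is an assumption):

comp["Hits"] = comp["Companies"].apply(careerjetAPI)
print(comp.head())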