def scrape_indeed(self, api_key, ip_address, places=None):
        indeed_client = IndeedClient(api_key)
        indeed_matched_jobs = []
        seen_jobs = self.load_titles('indeed_jobs')

        if not places:
            places = ['san francisco, ca']

        for place, term in [(place, term)
                            for place in places 
                            for term in self.filters.keys()]:
            sys.stderr.write('Searching {} Indeed for {}... '.format(place, term))
            # time.sleep(random.randrange(1, 3))  # throttle requests
            params = {
                'q': term,
                'l': place,
                'userip': ip_address,
                'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
                'limit': 25}
            search_response = indeed_client.search(**params)
            job_results = search_response['results']
            sys.stdout.write('returned {} items\n'.format(len(job_results)))

            for job in job_results:
                job_id = job['jobkey']
                if job_id not in seen_jobs:
                    seen_jobs.add(job_id)
                    job_title = job['jobtitle']
                    if self.filter_title(job_title, self.filters[term]):
                        indeed_matched_jobs.append([
                            job_title, job['formattedLocationFull'], job['url'], job['snippet']])

        self.save_titles('indeed_jobs', seen_jobs)
        return indeed_matched_jobs
    def get(self):
        """Interact with `get` request from front-end
        Currently only does Indeed, other APIs to take into consideration:
            USAjobs.gov, key:  EwesKi7XhFETegcAroJCod5jeP9wwBkzanA1qatBMRY=
            AuthenticJobs.com, key: de1d14f970eaf280a271b1d5beffafe9
        """
        indeed_key = '4970113146490412'  # not secure, since this key is committed to the repo
        client = IndeedClient(indeed_key)
        query = self.request.get_all("q")
        location = self.request.get_all("l")
        jobids = self.request.get_all("jobids")
        all = self.request.get_all("all")
        output = ""

        if query and not jobids:
            output = self.jobs(client, query, location)
        elif jobids and not query:
            output = client.jobs(tuple(jobids.split(',')))
        elif all != '':
            jobs = self.jobs(client, query, location)
            output = client.jobs(tuple([job.jobkey for job in jobs.results]))

            # for each job in jobs, get the jobkey and use it to fetch the
            # full record, then collate the data on jobkey (see the sketch
            # after this method)


        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json.dumps(output))
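
# The TODO above describes a two-step flow: collect jobkeys from a search
# response, fetch the full records with client.jobs(), and collate them by
# jobkey. A minimal sketch of that idea (a hypothetical helper, not part of
# the original handler):
def collate_by_jobkey(client, search_response):
    jobkeys = tuple(job['jobkey'] for job in search_response['results'])
    full = client.jobs(jobkeys=jobkeys)  # one jobs() call covers every key
    return {job['jobkey']: job for job in full['results']}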
Example #3
def generate_job_list(params,publisher_id):
    """
    Returns list of jobs that match search criteria
    """
    job_list = []
    # params['start'] is expected to begin at 0; seed total_results so the
    # while loop below runs at least once before the real total is known
    total_results = 1
    while int(params['start']) < total_results:
        client = IndeedClient(publisher = publisher_id)
        search_response = client.search(**params)
        root = ET.fromstring(search_response)
        params['start']  = str(int(params['start'])+25) 
        total_results = int(root.find('totalresults').text)         
        for job in root.iter('result'):
            jobtitle = job.find('jobtitle').text 
            company = job.find('company').text
            city = job.find('city').text
            #state = job.find('state').text
            #country = job.find('country').text
            date = job.find('date').text
            snippet = job.find('snippet').text
            sponsored = job.find('sponsored').text
            url = job.find('url').text
            job = (unicode(jobtitle), unicode(company), unicode(city),
                   unicode(date)[5:16].replace(" ", "-"),
                   unicode(sponsored), unicode(url))
            if job not in job_list:
                job_list.append(job)         
            
    job_list.insert(0, (unicode("jobtitle"), unicode("company"), unicode("city"),
                        unicode("date"), unicode("sponsored"), unicode("url")))  # add header row
    return job_list
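
# A minimal usage sketch for generate_job_list (an assumption, not part of the
# original file): the loop above expects params['start'] to begin at '0' and
# client.search() to return raw XML for ElementTree, so 'format' and 'raw' are
# set accordingly. 'YOUR_PUBLISHER_NUMBER' is a placeholder.
example_params = {
    'q': "data analyst",
    'l': "chicago, il",
    'userip': "1.2.3.4",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
    'format': 'xml',
    'raw': True,
    'limit': 25,
    'start': '0',
}
jobs = generate_job_list(example_params, 'YOUR_PUBLISHER_NUMBER')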
 def connect_indeed(self,
                    config_filepath='indeed_cred.yml'):  # Store in .ssh
     # yamload = yaml.load(open(config_filepath))
     # credentials = yamload['indeed']
     # pub_num = credentials.get('publisher_num')
     self.c = IndeedClient(publisher='4353162753214099')
     print('connect_indeed done')
def get_indeed_job_list(query, location, radius):
    client = IndeedClient(publisher=2863621289879018)
    progress_bar = pyprind.ProgBar(4, title='Searching For Jobs')
    results_pd = pd.DataFrame()
    for numb_results in range(0, 100, 25):
        params = {
            'q': query,
            'radius': radius,
            'l': location,
            'userip': "1.2.3.4",
            'limit': '25',
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'start': numb_results
        }
        search_response = client.search(**params)
        results_pd = pd.concat(
            [results_pd,
             pd.DataFrame.from_dict(search_response['results'])],
            axis=0)
        progress_bar.update()
    if len(results_pd) == 0:
        sys.exit('Search did not return any jobs')
    results_pd.reset_index(drop=True, inplace=True)
    results_pd['date'] = pd.to_datetime(results_pd.date)
    results_pd.drop([
        'source', 'expired', 'country', 'formattedLocation',
        'formattedLocationFull', 'onmousedown', 'stations', 'state',
        'sponsored'
    ],
                    axis=1,
                    inplace=True)
    return results_pd  # returns the search results as a pandas data frame
Example #6
    def job_search(self, job, location):
        # publisher=5950869068484812
        client = IndeedClient('5950869068484812')

        #params = generate_advanced_query("python", "Boston", 1, 0, 25)
        params = self.generate_advanced_query(job, location, 1, 0, 25)
        search_response = client.search(**params)
        print "Search Response: %s" % search_response

        filename = 'indeed_positions_json.txt'
        self.write_json_to_file(filename, search_response)


        (positions, total) = self.extract_query_result(search_response)
        print total

        jobkeys = []
        for position in positions:
            self.extract_position_info(position, jobkeys)

        #for i in range(len(jobkeys)):
            #print "range (%d: %s)" % (i, jobkeys[i])

            #print '*' * 100
            #job_response = client.jobs(jobkeys = "ad752ce9ae3f1b5e")
            #print job_response['results']
            #print job_response
            #filename = 'indeed_positions_json.txt'
            #self.write_json_to_file(filename, job_response)
        return jobkeys
Example #7
 def setup(self):
     self.client = IndeedClient("YOUR_PUBLISHER_NUMBER")
     self.params = {
         'q': "python",
         'l': "austin",
         'userip': "1.2.3.4",
         'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
     }
Example #8
 def setup(self):
     self.client = IndeedClient('8251007850639120')
     self.params = {
         'q': "python",
         'l': "austin",
         'userip': "1.2.3.4",
         'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
     }
     self.utils = Utils()
Example #9
class TestJobs():


    def setup(self):
        self.utils = Utils()
        self.client = IndeedClient('8251007850639120')
        self.params = {
            'jobkeys' : ("7c398c74a8f22c72", "d7802e9ce3b4af7d"),
        }

    def teardown(self):
        self.client = None
        self.params = None

    @with_setup(setup, teardown)
    def test_jobs(self):
        jobs_response = self.client.jobs(**self.params)
        assert type(jobs_response) is dict
        print jobs_response
        self.utils.output_to_file('output2', jobs_response)
        self.utils.open_with_subl('output2')
        self.utils.find_all_jobs_that_contains_job_parameter()
        # self.utils.output_to_file('sample.json', str(j))
        # self.utils.open_with_subl('sample.json')

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_jobkeys(self):
        del self.params['jobkeys']
        jobs_response = self.client.jobs(**self.params)

    @with_setup(setup, teardown)
    def test_raw_json(self):
        self.params['raw'] = True
        jobs_response = self.client.jobs(**self.params)
        assert isinstance(jobs_response, basestring)
        assert type(json.loads(jobs_response)) is dict

    @with_setup(setup, teardown)
    def test_raw_xml_with_parameter(self):
        self.params['format'] = "xml"
        self.params['raw'] = True
        jobs_response = self.client.jobs(**self.params)
        assert isinstance(jobs_response, basestring)
        assert parseString(jobs_response)

    @with_setup(setup, teardown)
    def test_raw_xml_without_parameter(self):
        self.params['format'] = "xml"
        jobs_response = self.client.jobs(**self.params)
        assert isinstance(jobs_response, basestring)
        assert parseString(jobs_response)

    '''New test cases not included in GIT'''
    # @with_setup(setup, teardown)
    # def test_invalid_jobkey
Example #10
def indeed_urls(parameters, publisher_key=None):
    """Use Indeed publisher ID to retrieve URLs from the Indeed API."""
    if publisher_key is None:
        publisher_key = os.environ['API_KEY']
    client = IndeedClient(publisher_key)
    response = client.search(**parameters)
    try:
        urls = [str(links['url']) for links in response['results']]
        return urls
    except KeyError:
        raise NameError('Invalid Publisher ID')
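
# A hedged usage sketch for indeed_urls (not part of the original file): the
# parameters mirror the other search examples here, and the publisher key falls
# back to the API_KEY environment variable when none is passed.
example_parameters = {
    'q': "python",
    'l': "Seattle, WA",
    'userip': "1.2.3.4",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
}
urls = indeed_urls(example_parameters)  # assumes os.environ['API_KEY'] is set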
def Search(query, location, limit=10, start=0):
    client = IndeedClient(publisher=PUBLISHER_ID)
    params = {
        'q': query,
        'l': location,
        'limit': limit,
        'start': start,
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)"
    }
    search_response = client.search(**params)
    return search_response
Example #13
def get_data():
    client = IndeedClient('7381316591612982')
    params = {
        'q': "front end engineer",
        'l': "austin",
        'userip': "172.68.141.95",
        'useragent':
        """Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36
                        (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36""",
        'limit': 25
    }
    search_response = client.search(**params)

    cities = [
        'New York, NY', 'Austin, TX', 'San Francisco, CA', 'Boston, MA',
        'Chicago, IL', 'Miami, FL'
    ]
    jobs = [
        'Front End Engineer', 'Back End Engineer', 'Data Science',
        'Product Management', 'Director of Engineering', 'Data Engineer',
        'Data Analyst', 'Accounting', 'Marketing', 'Finance', 'Nurse',
        'Doctor', 'Lawyer', 'Paralegal', 'sales', 'customer_service',
        'human resources', 'executive assistant', 'operations', 'teacher',
        'maintenance', 'security guards'
    ]

    res_list = ['jobs']
    for c in cities:
        for j in jobs:
            params = {
                'q': j,
                'l': c,
                'userip': "172.68.141.95",
                'useragent':
                """Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36
                                (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36""",
                'limit': 25
            }
            search_response = client.search(**params)
            for res in search_response['results']:
                job_dict = {}
                if not res['expired']:
                    job_dict['city'] = res['city']
                    job_dict['date_posted'] = res['date']
                    job_dict['company'] = res['company']
                    job_dict['title'] = res['jobtitle']
                    job_dict['url'] = res['url']
                    job_dict['job_id'] = res['jobkey']
                    job_dict['state'] = res['state']
                    job_dict['snippet'] = res['snippet']
                    res_list.append(job_dict)  # only keep listings that are not expired

    return res_list
Example #14
def Search(query, location, limit=10, start=0):
    client = IndeedClient(publisher=PUBLISHER_ID)
    params = {
        'q': query,
        'l': location,
        'limit': limit,
        'start': start,
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)"
    }
    search_response = client.search(**params)
    return search_response
Example #15
def access_indeed_api(parameters, publisher_key=None):
    """Access the Indeed API using the given parameters and publisher key.

    Positional argument:
    parameters -- a dictionary of the parameters to send to Indeed's API

    Keyword argument:
    publisher_key -- the publisher key for Indeed's API, defaults to environment variable
    """
    if publisher_key is None:
        publisher_key = os.environ['API_KEY']
    client = IndeedClient(publisher_key)
    response = client.search(**parameters)
    return response
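
# A minimal, hedged example of calling access_indeed_api with an explicit
# publisher key (the parameter names follow the search examples elsewhere in
# this file; 'YOUR_PUBLISHER_NUMBER' is a placeholder):
response = access_indeed_api(
    {
        'q': "data scientist",
        'l': "Boston, MA",
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
    },
    publisher_key='YOUR_PUBLISHER_NUMBER')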
Example #16
    def search_with_api(self, params: dict):
        client = IndeedClient(publisher=self.user_config.INDEED_API_KEY)
        search_response = client.search(**params)

        total_number_hits = search_response['totalResults']
        num_loops = int(total_number_hits /
                        IndeedConstants.API.MAX_NUM_RESULTS_PER_REQUEST)
        counter_start = 0

        print('Total number of hits: {0}'.format(total_number_hits))
        count_jobs_added = 0

        for i in range(0, num_loops):
            # We can get around MAX_NUM_RESULTS_PER_REQUEST by increasing our start location on each loop!
            params['start'] = counter_start

            search_response = client.search(**params)
            list_jobs = IndeedParser.get_jobs_from_response(search_response)
            for job in list_jobs:
                try:
                    # TODO: This sucks, I'm just repeating myself...
                    Job.create(key=job.key,
                               website=job.website,
                               link=job.link,
                               title=job.title,
                               company=job.company,
                               city=job.city,
                               state=job.state,
                               country=job.country,
                               location=job.location,
                               posted_date=job.posted_date,
                               expired=job.expired,
                               easy_apply=job.easy_apply)
                    count_jobs_added += 1

                except peewee.IntegrityError as e:
                    # TODO: Can I write a custom exception that catches UNIQUE Errors but not others?
                    if 'UNIQUE' in str(e):
                        pass
                    else:
                        print(str(e))

            # Increment start
            counter_start += IndeedConstants.API.MAX_NUM_RESULTS_PER_REQUEST

        print('Added {0} new jobs'.format(count_jobs_added))
Example #17
def get_job_description(input_skills):
    client = IndeedClient('7863709885041358')

    params = {
        'q': input_skills,
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'limit': 25
    }
    job_urls = []
    search_response = client.search(**params)
    for job in search_response['results']:
        job_urls.append(job['url'])
    bunch_of_words = []
    for each_url in job_urls:
        bunch_of_words.extend(text_cleaner(each_url))

    return bunch_of_words
Example #18
class TestJobs:

    def setup(self):
        self.client = IndeedClient("YOUR_PUBLISHER_NUMBER")
        self.params = {
            'jobkeys' : ("5898e9d8f5c0593f", "c2c41f024581eae5"),
        }

    def teardown(self):
        self.client = None
        self.params = None

    @with_setup(setup, teardown)
    def test_jobs(self):
        jobs_response = self.client.jobs(**self.params)
        assert type(jobs_response) is dict

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_jobkeys(self):
        del self.params['jobkeys']
        jobs_response = self.client.jobs(**self.params)

    @with_setup(setup, teardown)
    def test_raw_json(self):
        self.params['raw'] = True
        jobs_response = self.client.jobs(**self.params)
        assert isinstance(jobs_response, basestring)
        assert type(json.loads(jobs_response)) is dict

    @with_setup(setup, teardown)
    def test_raw_xml_with_parameter(self):
        self.params['format'] = "xml"
        self.params['raw'] = True
        jobs_response = self.client.jobs(**self.params)
        assert isinstance(jobs_response, basestring)
        assert parseString(jobs_response)

    @with_setup(setup, teardown)
    def test_raw_xml_without_parameter(self):
        self.params['format'] = "xml"
        jobs_response = self.client.jobs(**self.params)
        assert isinstance(jobs_response, basestring)
        assert parseString(jobs_response)
Example #19
class TestJobs:
    def setup(self):
        self.client = IndeedClient("YOUR_PUBLISHER_NUMBER")
        self.params = {
            'jobkeys': ("5898e9d8f5c0593f", "c2c41f024581eae5"),
        }

    def teardown(self):
        self.client = None
        self.params = None

    @with_setup(setup, teardown)
    def test_jobs(self):
        jobs_response = self.client.jobs(**self.params)
        assert type(jobs_response) is dict

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_jobkeys(self):
        del self.params['jobkeys']
        jobs_response = self.client.jobs(**self.params)

    @with_setup(setup, teardown)
    def test_raw_json(self):
        self.params['raw'] = True
        jobs_response = self.client.jobs(**self.params)
        assert isinstance(jobs_response, basestring)
        assert type(json.loads(jobs_response)) is dict

    @with_setup(setup, teardown)
    def test_raw_xml_with_parameter(self):
        self.params['format'] = "xml"
        self.params['raw'] = True
        jobs_response = self.client.jobs(**self.params)
        assert isinstance(jobs_response, basestring)
        assert parseString(jobs_response)

    @with_setup(setup, teardown)
    def test_raw_xml_without_parameter(self):
        self.params['format'] = "xml"
        jobs_response = self.client.jobs(**self.params)
        assert isinstance(jobs_response, basestring)
        assert parseString(jobs_response)
Example #20
def main():
    # publisher=5950869068484812
    client = IndeedClient('5950869068484812')

    params = generate_advanced_query("python", "Boston", 10, 0, 25)
    search_response = client.search(**params)
    #print search_response

    #filename = 'indeed_positions_json.txt'
    # write_json_to_file(filename, search_response)

    (positions, total) = extract_query_result(search_response)
    print total

    jobkeys = []
    for position in positions:
        extract_position_info(position, jobkeys)

    for i in range(len(jobkeys)):
        print jobkeys[i]
Example #21
def fetch_indeed_data(counties,search): 
    from indeed import IndeedClient
    client = IndeedClient('6437444271691851')
    params = {
        'q' : "analytics",
        'l' : "bergen county, nj",
        'userip' : "1.2.3.4",
        'useragent' : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'latlong' : 1,
        'radius' : 10,
        'fromage' : 7,
        'limit' : 25
    }
    params['q'] = search
    
    results = []
    for county in counties:
        params['l'] = county
        results.append(client.search(**params))
        
    return(results)
def main():
    client = IndeedClient(PUB_ID)
    search_params = build_params(locations, JOB_QUERY)

    search_results = []
    count = 1
    for params in search_params:
        stdout.flush()
        stdout.write("\rHtml request: {}/{}".format(count, len(locations)))
        search_response = client.search(**params)
        search_results.append(search_response)
        count += 1

    word_filter = ['and', 'to', 'the', 'of', 'a', 'in', 'with', 'you', 'on', 'that', 'are', 'will', 'is', 'your', 'for',
                   'we', 'from', 'an', 'be', 'have', 'or', 'just', 'can', 'also', 'how', 'at', 'as', 'do', 'other',
                   'should', 'what', 'us', 'this', 'it', 'if', 'get', '-', '&', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                   'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

    count = 1
    number_of_locations = len(search_results)
    word_map = Counter()

    for search in search_results:
        print "Currently on {}/{}".format(count, number_of_locations)
        if len(search['results']) == 0:
            print "Nothing found for: {}".format(search['location'])
        else:
            print "Attempting {}...".format(search['location'])
        for job in search['results']:
            url = job['url']
            html = requests.get(url)
            word_list = pull_job_description(html.content)

            for word in word_list:
                if word.lower() not in word_filter:
                    word_map[word.lower()] += 1
        count += 1

        save_to_file(OUTPUT_FILE, word_map)
class Threadr(object):
    def __init__(self, keyword, location):
        self.conn = boto.connect_s3()  # Connecting to S3
        self.bucket = self.conn.get_bucket(
            'bucketofindeeds')  # Accessing the correct bucket
        self.json_up = Key(self.bucket)  # Make sure to name it.
        self.content_up = Key(self.bucket)  # Make sure to name it.
        self.keyword = keyword
        self.location = location
        print('init done')

    def connect_indeed(self,
                       config_filepath='indeed_cred.yml'):  # Store in .ssh
        # yamload = yaml.load(open(config_filepath))
        # credentials = yamload['indeed']
        # pub_num = credentials.get('publisher_num')
        self.c = IndeedClient(publisher='4353162753214099')
        print('connect_indeed done')

    def parameters(self, keyword,
                   location):  # Make sure to try using multiple keywords
        ua = UserAgent(fallback='Your favorite Browser')
        self.params = {
            'q': str(keyword),
            'l': str(location),
            'userip': requests.get("http://icanhazip.com").text,
            'useragent': ua.random
        }
        print('parameters done')

    def job_search(self):
        self.response = self.c.search(**self.params)
        # This will return a json file.
        print(len(self.response['results']), 'jobs returned.')

    def send_json(self):
        self.json_up.key = 'indeed_jsons/test'
        self.json_up.set_contents_from_string(str(self.response) + '\n')
        print('Its Working.')

    def mine_that(self):
        self.connect_indeed()
        self.parameters(self.keyword, self.location)
        self.job_search()
        self.send_json()
Example #25
from indeed import IndeedClient
import time

client = IndeedClient(publisher='')

params = {
    'q': "internship",
    'l': "Zurich",
    'userip': "1.2.3.4",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
    'radius': 50,
    'limit': 100,
    'co': 'ch',
    'sort': 'date'
}

search_response = client.search(**params)

filename = 'jobs_' + str(time.localtime()[0]) + str(time.localtime()[1]) + str(
    time.localtime()[2]) + '.txt'

with open(r'export path' + filename, 'w') as textfile:
    textfile.write('acquisition time: ' + str(time.localtime()[3]) + ':' +
                   str(time.localtime()[4]) + '\n\n')
    for i in range(len(search_response['results'])):  # iterate over the returned jobs
        reltime = search_response['results'][i]['formattedRelativeTime']
        jobtitle = search_response['results'][i]['jobtitle']
        company = search_response['results'][i]['company']
        url = search_response['results'][i]['url']
        textfile.write(reltime + '\t' + jobtitle + '\t company: ' + company +
                       '\n' + url + '\n\n')
# Author: Jichao Sun ([email protected])
# Date: April 26, 2016

# Setup: pip install indeed
#        pip install requests --upgrade

from bs4 import BeautifulSoup
from indeed import IndeedClient

#import threading, urllib2
import urllib, urllib2, re

jichaoID = 278720823964828
client = IndeedClient(publisher=jichaoID)


# If salary is non-empty, the ordering of jobs per query is preserved, so the
# difference between two queries can be used to find jobs in a salary range
# (see the sketch after this snippet). Jobs with no specified salary are estimated.
def getRawJobs(what, where, count, jobType, radius, salary):
    if jobType not in [
            "fulltime", "parttime", "contract", "internship", "temporary", ""
    ]:
        return []

    results = []

    params = {
        'q': what + "+$" + salary,  # Job keywords
        'l': where,  # Location as a string,
        'jt': jobType,  # Type of job, fulltime parttime contract etc...
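
# A hedged sketch of the salary-difference idea described above getRawJobs:
# run the same search with two salary floors and keep the jobs whose jobkey
# appears only under the lower floor, i.e. the jobs estimated to fall between
# the two floors. It assumes each result is a dict with a 'jobkey' field, as in
# the other examples in this file; it is an illustration, not part of the
# original (truncated) program.
def jobs_in_salary_band(what, where, count, jobType, radius, low, high):
    lower = getRawJobs(what, where, count, jobType, radius, low)   # e.g. "60,000"
    upper = getRawJobs(what, where, count, jobType, radius, high)  # e.g. "80,000"
    upper_keys = set(job['jobkey'] for job in upper)
    return [job for job in lower if job['jobkey'] not in upper_keys]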
Example #27
# importing the Indeed Python API module
from indeed import IndeedClient

client = IndeedClient(publisher=12254335)  # we'll do this later

parameters = {
    'q': "python developer",
    'l': "London, GB",
    'sort': "date",
    'fromage': "5",
    'limit': "25",
    'filter': "1",
    'userip': "192.186.176.550:60409",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)"
}

# our main search function


def get_offers(params):
    # perform search
    search_results = client.search(**params)  # we want this to be a dictionary

    # loop through each offer element
    for elm in search_results['results']:
        offer = (elm['jobtitle'], elm['formattedLocation'], elm['snippet'],
                 elm['url'], elm['indeedApply'], elm['jobkey'], elm['date'])
Example #28
def search(params):
    client = IndeedClient(publisher=8201417039877332)
    res = client.search(**params)
    return res
# verifying indeed publisher number 
from indeed import IndeedClient 
client = IndeedClient(publisher = 'publisher_number')

params = {
        'q' : "software engineer",
        'l' : "Chicago",
        'sort' : "date",
        'fromage' : "5",
        'limit' : "50",
        'filter' : "1",
        'userip' : "ip_address",
        'useragent' : "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
        }

# main search function 
def get_offers(params):
    search_results = client.search(**params) #perform search 
    for elm in search_results['results']:
         offer = (elm['jobtitle'], #parsing the offer 
                 elm['formattedLocation'],
                 elm['snippet'], 
                 elm['url'],
                 elm['indeedApply'],
                 elm['jobkey'],
                 elm['date'])
        
                 
User must provide a valid indeed.com PUBLISHER_NUMBER and one or
more job keys.
See this page to create an account (it was free and
quick for me):
  http://www.indeed.com/publisher/


Get each job key by running the search_indeed_api.py program.
'''

from indeed import IndeedClient
import time

site = 'indeed.com'

client = IndeedClient('PUBLISHER_NUMBER')

# Job keys copied from the response to the search program:
jk_l = ["e8930d8d162c4b70", "6bb8f41ea97bd6f8"]

job_response = client.jobs(jobkeys=jk_l)

# Example job response, Mon2016_0418.
#   Entire response is a dictionary.
#   The key 'results' has value of a list of dicts, eg:
#   {u'version': 2, u'results': [{u'formattedRelativeTime': u'30+ days ago', ...
#
'''
Each dict in the list is one job record, as shown here:

{u'formattedRelativeTime': u'30+ days ago', u'city': u'San Francisco',
Example #31
 def __init__(self):
     #        self.jobDataFrame= pd.DataFrame();
     self.client = IndeedClient(8836246992678581)
Example #32
from indeed import IndeedClient

client = IndeedClient('1439892112925001')

params = {
    'q': "python",
    'l': "boston",
    'userip': "1.2.3.4",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)"
}

url = 'http://www.indeed.com/viewjob?jk=89b6ad7a31f7c4be&qd=Edw9zLy29tPtf_aglDLrzkea4GYpkSu9Dn9RxMjGtc-Au7bNkAhEpP8509-8oVyQct6gb9Hh9FwGl317FwNQL73cXKONUJYtCg03YtTr2S0&indpubnum=1439892112925001&atk=1b94foutl5sn398g'

import requests
requests.get(url)

jobkey = '89b6ad7a31f7c4be'
Example #33
counties = [cs[0] for cs in counties_states]
states = [cs[1] for cs in counties_states]

counties_by_state = {}
for (value, key) in counties_states:
    counties_by_state.setdefault(key, [])  # key might exist already
    counties_by_state[key].append(value)

counties_states_2 = [
    str(c) + ', ' + str(state_dict_2[s]) for c, s in zip(counties, states)
]

#### GET DATA FROM INDEED ####
from indeed import IndeedClient

client = IndeedClient(publisher=8924341972846274)

query = 'data scientist'  # Only search in our domain
# Indeed search is supposed to be ANDed but results prove the contrary

f = open('/data/w205/W205_final_storage/indeed/txt/indeed.txt', 'w')

for county_state in counties_states_2:
    county = county_state.split(', ')[0]
    state = county_state.split(', ')[1]
    jobkeys = []  # To avoid duplicates (in a county)
    params = {
        'q': query,
        'l': county_state,
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
Example #34
class indeed:

    #jobDataFrame

    def __init__(self):
        #        self.jobDataFrame= pd.DataFrame();
        self.client = IndeedClient(8836246992678581)

    def skill(self, l, city, jobtype):
        #print l
        #print " AND ".join(l)
        print(jobtype)
        if jobtype in ['intern', 'internship', 'Internship']:
            jobtype = 'internship'
        else:
            jobtype = 'fulltime'
        params = {
            'q': " AND ".join(l),
            'l': city,
            'jt': jobtype,
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'limit': "25",
            'start': 0,
            'highlight': 1
        }
        i = 25
        search_response = self.client.search(**params)
        results = []
        if (len(search_response['results']) <= 0):
            return results

        while (i < 100 and i < search_response['totalResults']):
            results += search_response['results']
            params['start'] += 25
            search_response = self.client.search(**params)
            results += search_response['results']
            i += 25
            print(params['start'])

        self.jobDataFrame = pd.DataFrame(results).drop_duplicates('jobkey')
        self.jobDataFrame.to_csv("sample.csv", encoding='UTF-8')
        return results

    def skillOR(self, l, city, jobtype):
        #print l
        #print " AND ".join(l)
        print(jobtype)
        if jobtype in ['intern', 'internship', 'Internship']:
            jobtype = 'internship'
        else:
            jobtype = 'fulltime'
        params = {
            'q': " OR ".join(l),
            'l': city,
            'jt': jobtype,
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'limit': "50",
            'start': 0  # seeded here because the paging loop below does params['start'] += 25
        }
        i = 25
        search_response = self.client.search(**params)
        results = []
        if (len(search_response['results']) <= 0):
            return results

        while (i < 100 and i < search_response['totalResults']):
            results += search_response['results']
            params['start'] += 25
            search_response = self.client.search(**params)
            results += search_response['results']
            i += 25
            print(params['start'])

        self.jobDataFrame = pd.DataFrame(results).drop_duplicates('jobkey')
        self.jobDataFrame.to_csv("sample.csv", encoding='UTF-8')
        return results
Example #35
 def setup(self):
     self.client = IndeedClient("YOUR_PUBLISHER_NUMBER")
     self.params = {
         'jobkeys' : ("5898e9d8f5c0593f", "c2c41f024581eae5"),
     }
from indeed import IndeedClient
import csv

client = IndeedClient(publisher = 2186395790213512)

tot = []
for i in range(0, 8):
	params = {
	    'q' : "marketing",
	    'userip' : "1.2.3.4",
	    'useragent' : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
	    'format' : 'json',
	    'limit' : 25,
	    'start' : i*25
	}
	sr = client.search(**params)
	for j in range(0, len(sr['results'])):
		tot.append(sr['results'][j])

allJobs = []
for i in range(0, len(tot)):
	currJob = []
	currJob.append(tot[i]['jobtitle'].encode('ascii', 'ignore'))
	currJob.append(tot[i]['url'].encode('ascii', 'ignore'))
	currJob.append(tot[i]['city'].encode('ascii', 'ignore'))
	currJob.append(tot[i]['date'].encode('ascii', 'ignore'))
	currJob.append(tot[i]['company'].encode('ascii', 'ignore'))
	currJob.append(tot[i]['snippet'].encode('ascii', 'ignore'))
	currJob.append(tot[i]['source'].encode('ascii', 'ignore'))
	currJob.append(tot[i]['jobkey'].encode('ascii', 'ignore'))
	allJobs.append(currJob)
Example #37
    return results

#  ========================   Q U E R Y    P A R A M E T E R S     ==================================
params['fromage']='any'
params['radius'] = 0
params['q'] =  ""
params['highlight'] = 0
params['jobtitle'] = ''
#Alteryx OR Cassandra OR Clojure OR Cloudera OR D3 OR Elasticsearch# OR GraphLab OR Dato OR Hadoop OR PureData OR SPSS  OR Julia OR Kafka OR Looker OR Medidata OR MongoDB OR Neo4j OR NLTK OR NumPy OR Orange OR pandas OR Pentaho OR Pig OR PostGIS OR PostgreSQL OR Python OR R OR edis OR Redshift OR BusinessObjects OR SAS OR Scala OR scikit-learn OR SciPy OR Spark OR SPSS OR SQL OR Stata OR Storm OR Tableau OR Vertica OR Vowpal"

compincr = 25
complevels = 11


##  ==================================      A P Is      ==============================================
indeedapi = IndeedClient(publisher='7423517030312598')
mongoclient = MongoClient()
db=mongoclient.fluxx

def alchemy(subjectURL,endpoint,subdict):
   # ENDPOINTS:
   # URLGetRankedNamedEntities ---- status, usage, url, language, entities, ['entities'] in results iterable
   # URLGetPubDate --- ['publicationDate']['date'] in results single value
   # URLGetRankedTaxonomy -- ['taxonomy'] in results iterable
   # URLGetRankedConcepts -- ['concepts'] in results iterable
   # URLGetRelations -- ['relations'] in results iterable --- sentiment and entities = 1 (does it apply to the calls above?)
   # URLGetText -- ['text'] in results single value
   key= '86bc3f87cb329d5be230ebb58d3b6c05f52e2417'
   query = urllib.quote(subjectURL)
   user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; FDM; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322)'
   url = "http://access.alchemyapi.com/calls/url/" + endpoint + "?url=" + subjectURL + '&apikey=' + key + '&outputMode=json'
Example #38
class TestSearch:
    def setup(self):
        self.client = IndeedClient('8251007850639120')
        self.params = {
            'q': "python",
            'l': "austin",
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        }
        self.utils = Utils()

    def teardown(self):
        self.client = None
        self.params = None

    @with_setup(setup, teardown)
    def test_search(self):
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict
        self.utils.output_to_file('sample', search_response)

    @with_setup(setup, teardown)
    def test_missing_one_required(self):
        del self.params['l']
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_both_required(self):
        del self.params['q']
        del self.params['l']
        search_response = self.client.search(**self.params)

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_userip(self):
        del self.params['userip']
        search_response = self.client.search(**self.params)

    @with_setup(setup, teardown)
    @raises(IndeedClientException)
    def test_missing_useragent(self):
        del self.params['useragent']
        search_response = self.client.search(**self.params)

    @with_setup(setup, teardown)
    def test_raw_json(self):
        self.params['raw'] = True
        search_response = self.client.search(**self.params)
        assert isinstance(search_response, basestring)
        assert type(json.loads(search_response)) is dict

    @with_setup(setup, teardown)
    def test_raw_xml_with_parameter(self):
        self.params['format'] = "xml"
        self.params['raw'] = True
        search_response = self.client.search(**self.params)
        assert isinstance(search_response, basestring)
        assert parseString(search_response)

    @with_setup(setup, teardown)
    def test_raw_xml_without_parameter(self):
        self.params['format'] = "xml"
        search_response = self.client.search(**self.params)
        assert isinstance(search_response, basestring)
        assert parseString(search_response)

    ''' Few Tests written by me '''

    @with_setup(setup, teardown)
    def test_search_extra(self):
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict
        assert len(
            self.utils.find_all_jobs_not_contains_job_parameter(
                search_response, 'city', 'austin')) == 0
        assert len(self.utils.find_all_jobs_not_contains_job_parameter(search_response, 'country', 'US'))\
               == 0
        assert len(self.utils.find_all_jobs_not_contains_job_parameter(search_response, 'language', 'en')) \
               == 0
        assert self.utils.get_num_jobs(search_response) == 10

    @with_setup(setup, teardown)
    def test_sort(self):
        self.params['sort'] = "date"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_start(self):
        self.params['start'] = "2"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_limit(self):
        self.params['limit'] = "25"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict
        assert self.utils.get_num_jobs(search_response) == 25

    @with_setup(setup, teardown)
    def test_fromage(self):
        self.params['fromage'] = "2"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_limit(self):
        self.params['limit'] = "25"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict
        assert self.utils.get_num_jobs(search_response) == 25

    @with_setup(setup, teardown)
    def test_highlight(self):
        self.params['highlight'] = "1"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_duplicate(self):
        self.params['duplicate'] = "1"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_co(self):
        self.params['co'] = "ca"
        self.params['l'] = "toronto"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict

    @with_setup(setup, teardown)
    def test_invalid_limit(self):
        self.params['limit'] = '-100'
        search_response = self.client.search(**self.params)
        assert self.utils.get_num_jobs(search_response) == 0

    # Trying a bunch of invalid parameters, I noticed that no error is thrown;
    # they seem to be ignored instead. Is this correct?
    # e.g. negative fromage, strings instead of ints and vice versa

    @with_setup(setup, teardown)
    def test_several_params(self):
        self.params['co'] = "ca"
        self.params['l'] = "toronto"
        self.params['duplicate'] = "1"
        self.params['highlight'] = "1"
        self.params['limit'] = "25"
        self.params['fromage'] = "10"
        self.params['start'] = "2"
        search_response = self.client.search(**self.params)
        assert type(search_response) is dict
        assert self.utils.get_num_jobs(search_response) == 25
from indeed import IndeedClient
import pymysql
from database import addToDatabase

client = IndeedClient(publisher = ***************)

parameters = {'q' : "python developer",
			  'l' : "India",
			  'sort' : "date",
			  'fromage' : "5",
			  'limit' : "25",
			  'filter' : "1",
			  'userip' : "192.186.176.550:60409",
			  'useragent' : "Mozilla/5.0"
			 }

def get_offers(params):    
	search_results = client.search(**params)  # perform search
	for elm in search_results['results']:
				
		offer = (elm['jobtitle'], 
				 elm['formattedLocation'], 
				 elm['snippet'], 
				 elm['url'], 
				 elm['indeedApply'], 
				 elm['jobkey'], 
				 elm['date'])
		addToDatabase(offer)

def searchAllCities():
	current_city = 0
Example #40
 def setup(self):
     self.client = IndeedClient("YOUR_PUBLISHER_NUMBER")
     self.params = {
         'jobkeys': ("5898e9d8f5c0593f", "c2c41f024581eae5"),
     }
Example #41
#This is for the mail client, by which we will be able to get user base updates
app.config.update(
	DEBUG=True,
	#Email settings
	MAIL_SERVER='smtp.gmail.com',
	MAIL_PORT=465,
	MAIL_USE_SSL=True,
	MAIL_USERNAME = credentials.my_email_username,
	MAIL_PASSWORD = credentials.my_email_password
	)
mail=Mail(app)

#Creating the clients to interact with the APIs
twilio_api = TwilioRestClient(credentials.my_twilio_account_sid, credentials.my_twilio_auth_token) #Twilio
indeed_api = IndeedClient(publisher = credentials.my_indeed_publisher_id) #Indeed

#Client to shorten links with TinyURL
shortener = Shortener('Tinyurl', timeout=86400)

#Function to find and deliver jobs for each user in the jQuery file. This function is called daily as well as whenever the "admin" user sends a text to the endpoint with the word 'override'
def FindAndDeliverJobs():
	#Opening up the json file with all the users for reading
    with open('user_info.json', "r") as load_file:
        user_list = json.load(load_file)

	#Loop to iterate through every user inside the json file
    for user in user_list:
		#Only look up jobs for the user if they have confirmed their number
        if user['confirmed'] == 1:
			#Initializing the parameters for the Indeed search using the users preferences
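
# A hedged sketch of the step the comments above describe: build the Indeed
# search parameters from a user's stored preferences and run the search via the
# indeed_api client created earlier. The field names on the user dict
# ('keywords' and 'location') are hypothetical placeholders, not the actual
# user_info.json schema.
def search_jobs_for_user(user):
    params = {
        'q': user['keywords'],
        'l': user['location'],
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'limit': 10,
    }
    return indeed_api.search(**params)['results']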
Example #42
def indeedAPI2(defTask):
    params = {}
    params['userip'] = "1.2.3.4"
    params['useragent'] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0)"
    params['start'] = 1
    params['latlong'] = 1
    params['as_ttl'] = ''
    params['limit'] = 25
    params['fromage']='any'
    params['radius'] = 0
    params['q'] =  ''
    params['highlight'] = 0
    params['jobtitle'] = ''
    compincr = 25
    complevels = 11
    indeedapi = IndeedClient(publisher='7423517030312598')
    print params
    print 'START:',str(time.asctime(time.localtime()))
    newJobs = 0
    expiredJobs = 0
    # states=["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
    states = ['Climate and Land Use Alliance','John S. and John L. Knight Foundation','Cynthia and George Mitchell Foundation','Atlantic Philanthropies','Council of State Governments','Leukemia & Lymphoma Society','John D and Catherine T. MacArthur Foundation','Unbound Philanthropy','Garfield Foundation','Freedom House','Wikimedia Foundation','AFDO','Getty Foundation','Altman Foundation','Colorado Trust','Jessie Ball duPont Fund','Arthur Vining Davis Foundations','The Christensen Fund','Rita Allen Foundation','NBA Legends','Trio Foundation of St. Louis','Surdna Foundation','Kresge Foundation','Carnegie Corporation of New York','Central Valley Community Foundation','Democracy Fund','Committee to Protect Journalists','American Cancer Society','Winthrop Rockefeller Foundation','Walter and Elise Haas Fund','ClimateWorks Foundation','Zellerbach Family Foundation','Hillman Family Foundations','Bosch Community Fund','The Scan Foundation','Hogg Foundation','Unitarian Universalist Service Committee','Whole Foods Market','Open Road Foundation','Max M. & Marjorie S. Fisher Foundation','ArtPlace America','Grace and Mercy Foundation','Alliance for Early Success','The New York Womens Foundation','DentaQuest','ECMC Foundation','Great Lakes Higher Education Guaranty','The J. Willard and Alice S. Marriott Foundation','Indiana Historical Society','Wallace H. Coulter Foundation']
    oldlist=[]
    newlist=[]
    runStart = time.time()
    print params
    for job in db.fluxxJobs.find():
        oldlist.append(job['jobkey'])
    for state in states:
        print state
        params['company'] = state
        for c in range(complevels):
            params['salary'] = "$" + str(c * 25) + "K-$" + str(((c+1) * 25)-1) + "K"
            if c == (complevels-1):
                params['salary'] = "$" + str(c*compincr) + "K"
            sr = indeedapi.search(**params)
            tr = sr['totalResults']
            ps = params['salary'].replace("$","")
            for apirequests in range((tr/compincr)+1):
                params['start'] = (apirequests * compincr)
                sr = indeedapi.search(**params)
                for joblisting in sr['results']:
                    jobListing = json.loads(json.dumps(joblisting))
                    newlist.append(jobListing['jobkey'])
                    if joblisting['jobkey'] not in oldlist:
                        newJobs += 1
                        listed = joblisting['date'].replace('GMT','UTC')
                        joblisting['dateOpen'] = datetime.strftime(datetime.strptime(listed, '%a, %d %b %Y %H:%M:%S %Z'), '%Y%m%d %H:%M:%S')
                        joblisting['datesOpen'] = timeDictStamp('')
                        joblisting['_id'] = joblisting['jobkey']
                        joblisting['status'] = 'Open'
                        joblisting['searchparams'] = params
                        joblisting['searchparams']['procTime']=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        joblisting['searchparams']['totalResults'] = tr
                        joblisting['compMin'] = c*25000
                        joblisting['compMax'] = (c+1) * 25000
                        joblisting['compRange'] = params['salary']
                        if joblisting['city'] == "":
                            del joblisting['city']
                        if joblisting['state'] == "":
                            del joblisting['state']
                        job = joblisting
                        jobID=job['_id']
                        Title=job['jobtitle'][0:60].replace("'","")
                        Company=job['company'].replace("'","").encode('latin-1','ignore')
                        if 'source' in job:
                            Source=job['source'].replace("'","")
                        else:
                            Source = ''
                        Description=job['snippet'][0:250].replace("'","").encode('latin-1','ignore').replace("'","")
                        Description = cleanup(Description,{'<b>':'','</b>':'','<B>':'','</B>':''})
                        listed = job['date'].replace('GMT','UTC')
                        job['dateListed'] = datetime.strftime(datetime.strptime(listed, '%a, %d %b %Y %H:%M:%S %Z'), '%Y%m%d %H:%M:%S')
                        CompRange_min=job['compMin']
                        CompRange_max=job['compMax']
                        CompRange=job['compRange']
                        textURL = "http://www.indeed.com/viewjob?jk=" + job['jobkey']
                        jd = {}
                        jd['jdText'] = alchemy(textURL,'URLGetText','text')
                        jd['jdConcepts'] = alchemy(textURL,'URLGetRankedConcepts','concepts')
                        jd = json.loads(json.dumps(jd))
                        job['jobDescription'] = jd
                        if not db.fluxxJobs.find_one({'jobkey':job['jobkey']}):
                            db.grantsJobs.save(job)
    delisted = set(oldlist).difference(set(newlist))
    for jobkey in delisted:
        expiredJobs+=1
        rightnow = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        upResult = db.grantsJobs.update({'jobkey':jobkey},{'$set':{'dateClosed':rightnow,'datesClosed':timeDictStamp(),'status':'Closed','expired':'true'}})
    print " "
    print 'FINISH:',str(time.asctime(time.localtime()))
    print '=================================================================================================='
Example #43
class indeed:

    #jobDataFrame

    def __init__(self):
        #        self.jobDataFrame= pd.DataFrame();
        self.client = IndeedClient(8836246992678581)

    def skill(self, l, city, jobtype):
        #print l
        #print " AND ".join(l)
        print(jobtype)
        if jobtype in ['intern', 'internship', 'Internship']:
            jobtype = 'internship'
        else:
            jobtype = 'fulltime'
        params = {
            'q': " AND ".join(l),
            'l': city,
            'jt': jobtype,
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'limit': "25",
            'start': 0,
            'highlight': 1
        }
        i = 25
        search_response = self.client.search(**params)
        results = []
        if (len(search_response['results']) <= 0):
            return results

        while (i < 100 and i < search_response['totalResults']):
            results += search_response['results']
            params['start'] += 25
            search_response = self.client.search(**params)
            results += search_response['results']
            i += 25
            print(params['start'])

        self.jobDataFrame = pd.DataFrame(results).drop_duplicates('jobkey')
        self.jobDataFrame.to_csv("sample.csv", encoding='UTF-8')
        return results

    def skillOR(self, l, city, jobtype):
        #print l
        #print " AND ".join(l)
        print(jobtype)
        if jobtype in ['intern', 'internship', 'Internship']:
            jobtype = 'internship'
        else:
            jobtype = 'fulltime'
        params = {
            'q': " OR ".join(l),
            'l': city,
            'jt': jobtype,
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            'limit': "50",
            'start': 0  # seeded here because the paging loop below does params['start'] += 25
        }
        i = 25
        search_response = self.client.search(**params)
        results = []
        if (len(search_response['results']) <= 0):
            return results

        while (i < 100 and i < search_response['totalResults']):
            results += search_response['results']
            params['start'] += 25
            search_response = self.client.search(**params)
            results += search_response['results']
            i += 25
            print(params['start'])

        self.jobDataFrame = pd.DataFrame(results).drop_duplicates('jobkey')
        self.jobDataFrame.to_csv("sample.csv", encoding='UTF-8')
        return results

    def similarJobs(self, job):
        print("the job is" + job)
        sampledfo = pd.read_csv("sample.csv", encoding='UTF-8')
        sampledf = sampledfo.copy()
        del sampledf['stations']
        del sampledf['Unnamed: 0']
        del sampledf['source']
        del sampledf['onmousedown']
        del sampledf['formattedLocation']
        del sampledf['formattedLocationFull']
        del sampledf['url']
        del sampledf['date']
        del sampledf['formattedRelativeTime']
        sampledf['indeedApply'] = [
            0 if x == 'false' else 1 for x in sampledf['indeedApply']
        ]
        sampledf['expired'] = [
            0 if x == 'false' else 1 for x in sampledf['expired']
        ]
        sampledf['sponsored'] = [
            0 if x == 'false' else 1 for x in sampledf['sponsored']
        ]
        jobNo = job
        self.dataJob = sampledf.loc[sampledf['jobkey'] == jobNo]
        df = sampledf[sampledf["jobkey"] != jobNo]
        #        df[''] = ['red' if x == 'Z' else 'green' for x in df['Set']]
        df.ix[df.city == self.dataJob.city.iloc[0],
              ['city', 'country', 'state']] = 1
        df.ix[df.city != 1, ['city', 'country', 'state']] = 0
        df.ix[df.company == self.dataJob.company.iloc[0], ['company']] = 1
        df.ix[df.company != 1, ['company']] = 0

        #        df[''] = df.apply(my_test2, axis=1)

        df['snippet'] = [
            textSim.cosine_sim(x, self.dataJob.snippet.iloc[0])
            for x in df['snippet']
        ]
        df['jobtitle'] = [
            textSim.cosine_sim(x, self.dataJob.jobtitle.iloc[0])
            for x in df['jobtitle']
        ]

        df['variance'] = df['city'] + df['company'] + df['country'] + df[
            'expired'] + df[
                'indeedApply'] + 10 * df['snippet'] + 5 * df['jobtitle']

        result = df.sort(['variance'], ascending=False)
        #import pdb; pdb.set_trace()
        simList = result['jobkey'][:10].tolist()
        simDict = []
        for x in simList:
            s = sampledfo.loc[sampledfo['jobkey'] == x]
            simDict.append(s.to_dict(orient='records')[0])
        return simDict
Example #44
def indeedAPI(defTask):
    params = {}
    params['userip'] = "1.2.3.4"
    params['useragent'] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0)"
    params['start'] = 1
    params['latlong'] = 1
    params['as_ttl'] = ''
    params['limit'] = 25
    params['fromage']='any'
    params['radius'] = 0
    params['q'] =  "grants+management"
    params['highlight'] = 0
    params['jobtitle'] = ''
    compincr = 25
    complevels = 11
    indeedapi = IndeedClient(publisher='7423517030312598')
    print
    print 'START:',str(time.asctime(time.localtime()))
    newJobs = 0
    expiredJobs = 0
    states=["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
    oldlist=[]
    newlist=[]
    runStart = time.time()
    for job in db.fluxxJobs.find():
        oldlist.append(job['jobkey'])
    for state in states:
        print state,
        params['l'] = state
        for c in range(complevels):
            # Search one $25K (compincr) salary band at a time; the top band
            # is left open-ended ("$250K" and up).
            params['salary'] = "$" + str(c * compincr) + "K-$" + str(((c+1) * compincr)-1) + "K"
            if c == (complevels-1):
                params['salary'] = "$" + str(c*compincr) + "K"
            sr = indeedapi.search(**params)
            tr = sr['totalResults']
            ps = params['salary'].replace("$","")
            # Page through this salary band's results, compincr (25) at a time.
            for apirequests in range((tr/compincr)+1):
                params['start'] = (apirequests * compincr)
                sr = indeedapi.search(**params)
                for joblisting in sr['results']:
                    newlist.append(joblisting['jobkey'])
                    if joblisting['jobkey'] not in oldlist:
                        newJobs += 1
                        listed = joblisting['date'].replace('GMT','UTC')
                        joblisting['dateOpen'] = datetime.strftime(datetime.strptime(listed, '%a, %d %b %Y %H:%M:%S %Z'), '%Y%m%d %H:%M:%S')
                        joblisting['datesOpen'] = timeDictStamp('')
                        joblisting['_id'] = joblisting['jobkey']
                        joblisting['status'] = 'Open'
                        # Store a copy of params so procTime/totalResults are
                        # not added to the dict reused for later API calls.
                        joblisting['searchparams'] = dict(params)
                        joblisting['searchparams']['procTime'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        joblisting['searchparams']['totalResults'] = tr
                        joblisting['compMin'] = c*25000
                        joblisting['compMax'] = (c+1) * 25000
                        joblisting['compRange'] = params['salary']
                        if joblisting['city'] == "":
                            del joblisting['city']
                        if joblisting['state'] == "":
                            del joblisting['state']
                        job = joblisting
                        jobID=job['_id']
                        Title=job['jobtitle'][0:60].replace("'","")
                        Company=job['company'].replace("'","").encode('latin-1','ignore')
                        if 'source' in job:
                            Source=job['source'].replace("'","")
                        else:
                            Source = ''
                        Description=job['snippet'][0:250].replace("'","").encode('latin-1','ignore').replace("'","")
                        Description = cleanup(Description,{'<b>':'','</b>':'','<B>':'','</B>':''})
                        listed = job['date'].replace('GMT','UTC')
                        job['dateListed'] = datetime.strftime(datetime.strptime(listed, '%a, %d %b %Y %H:%M:%S %Z'), '%Y%m%d %H:%M:%S')
                        CompRange_min=job['compMin']
                        CompRange_max=job['compMax']
                        CompRange=job['compRange']
                        textURL = "http://www.indeed.com/viewjob?jk=" + job['jobkey']
                        jd = {}
                        jd['jdText'] = alchemy(textURL,'URLGetText','text')
                        jd['jdConcepts'] = alchemy(textURL,'URLGetRankedConcepts','concepts')
                        jd = json.loads(json.dumps(jd))
                        job['jobDescription'] = jd
                        if not db.fluxxJobs.find_one({'jobkey':job['jobkey']}):
                            db.grantsJobs.save(job)
    delisted = set(oldlist).difference(set(newlist))
    for jobkey in delisted:
        expiredJobs+=1
        rightnow = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        upResult = db.grantsJobs.update({'jobkey':jobkey},{'$set':{'dateClosed':rightnow,'datesClosed':timeDictStamp(),'status':'Closed','expired':'true'}})
    print " "
    print 'FINISH:',str(time.asctime(time.localtime()))
    print '=================================================================================================='
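
# Note: `cleanup`, `timeDictStamp` and `alchemy` are called above but not
# shown in this example; the latter two depend on project-specific services
# and are not sketched here. A minimal sketch of a compatible `cleanup`
# helper, matching how it is called above:
def cleanup(text, replacements):
    """Apply each old->new substitution in `replacements` to `text`."""
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text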
Beispiel #45
0
def fullmap():
    gmaps = googlemaps.Client(key="AIzaSyAx1j38VITDr2p2-VclAyX8pSOp7C_1-kM")
    lctn = gmaps.geolocate()
    #reverse = gmaps.reverse_geocode(latlng = [lctn['location']['lat'],lctn['location']['lng']] )

    client = IndeedClient('1905750874242217')
    params = {
        'q': "python",
        'l': "Kharkiv",
        'userip': "1.2.3.4",
        'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
        'co': "UA",
        'latlong': 1,
        'start': 0,
        'limit': 25
    }
    search_response = client.search(**params)
    jobs = json_normalize(search_response['results'])

    jobs_markers = [{
        'icon': '//maps.google.com/mapfiles/ms/icons/blue-dot.png',
        'lat': lctn['location']['lat'],
        'lng': lctn['location']['lng'],
        'infobox': "My Location"
    }]

    for index, row in jobs.iterrows():
        get_address = gmaps.places(query=row['company'] + ' ' + row['city'],
                                   location=str(lctn['location']['lat']) +
                                   ',' + str(lctn['location']['lng']))
        company = json_normalize(get_address['results'])
        for _, row_company in company.iterrows():
            jobs_markers.append({
                'icon': '//maps.google.com/mapfiles/ms/icons/red-dot.png',
                'lat': row_company['geometry.location.lat'],
                'lng': row_company['geometry.location.lng'],
                'infobox': row['company'] + ' - ' +
                           row_company['formatted_address'] +
                           ' snippet:' + row['snippet']
            })

    fullmap = Map(
        identifier="fullmap",
        varname="fullmap",
        style=("height:70%;"
               "width:99%;"
               "top:50;"
               "left:10;"
               "position:absolute;"
               "z-index:200;"),
        lat=lctn['location']['lat'],
        lng=lctn['location']['lng'],
        markers=jobs_markers,
        # maptype = "TERRAIN",
        zoom="11",
        #cluster=True
        fit_markers_to_bounds=True)
    return render_template('example_fullmap.html',
                           fullmap=fullmap,
                           GOOGLEMAPS_KEY=request.args.get('apikey'))
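
# Note: the view above assumes a Flask app with the flask_googlemaps
# extension already initialised. A minimal sketch of that setup; the route
# path, view name, and key placeholder are assumptions, not part of the
# original example:
from flask import Flask
from flask_googlemaps import GoogleMaps

app = Flask(__name__)
GoogleMaps(app, key="YOUR_GOOGLE_MAPS_KEY")  # hypothetical key placeholder


@app.route('/jobs-map')
def jobs_map():
    return fullmap()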
Beispiel #46
0
import json
from indeed import IndeedClient
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from database import Positions

engine = create_engine('sqlite:///jobs.db', echo=True)

# create a session
Session = sessionmaker(bind=engine)
session = Session()

# publisher=5950869068484812
client = IndeedClient('5950869068484812')
params = {
    'q': "python",
    'l': "Palo Alto",
    'userip': "168.159.213.210",
    'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4)",
    'limit': "50",
    'sort': "date",
    'start': "0"
}
search_response = client.search(**params)
print(search_response)
# print search_response['results']
# use JSON editor online to view result
# http://www.jsoneditoronline.org/

with open('indeed_positions_json.txt', 'w') as outfile:
    # json.dump writes to the file and returns None, so no assignment is needed.
    json.dump(search_response, outfile)
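
# Note: the SQLAlchemy session above is created but never used in this
# example. A hypothetical sketch of persisting the results through the
# imported Positions model follows; the column names (jobkey, jobtitle,
# company) are assumptions, since database.py is not shown.
for job in search_response['results']:
    session.add(Positions(jobkey=job['jobkey'],
                          jobtitle=job['jobtitle'],
                          company=job['company']))
session.commit()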