def start_requests(self):
        '''return iterable of job links'''

        with CommonFuncs.get_db() as db:
            todoforsite = db.query(UnprocessedJob).filter(UnprocessedJob.bot_type == 'Indeed_Bot').all()
        if len(todoforsite) >= 100:  # enough unprocessed jobs are already queued; skip crawling for now
            return

        start_time = datetime.now()

        job_profile = CommonFuncs.get_job_profile()
        locations = CommonFuncs.get_locations_list(job_profile)
        query_list = CommonFuncs.build_query_string(job_profile=job_profile, or_delim='or', bracket1='(', bracket2=')', adv_supp=True)
        if not query_list or not query_list[0]:  # no keywords to build a query from
            return
        query_string = query_list[0]

        ##########
        # URL ENCODE EACH QUERY
        ##########
        start_urls = []
        for location in locations:
            query_dict = {'q':query_string, 'l':location}
            encoded_query = urllib.parse.urlencode(query_dict, safe='')
            job_url = JOB_SITE_LINKS['Indeed']['query'] + '&' + encoded_query
            start_urls.append(job_url)

        # CommonFuncs.log('time spent building start_urls for Indeed: ' + str(datetime.now() - start_time))

        ##########
        # GET URL RESPONSES AND CALL PARSE FUNCTION TO ITERATE OVER PAGES
        ##########
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)
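
A minimal standalone sketch of the URL-encoding step above; the base URL is a hypothetical stand-in for JOB_SITE_LINKS['Indeed']['query']:

    import urllib.parse

    # hypothetical base URL; the spider reads the real one from JOB_SITE_LINKS['Indeed']['query']
    BASE_QUERY = 'https://www.indeed.com/jobs?sort=date'

    query_dict = {'q': '(python) or (scrapy)', 'l': 'Boston, MA'}
    encoded_query = urllib.parse.urlencode(query_dict, safe='')  # safe='' percent-encodes brackets and commas too
    job_url = BASE_QUERY + '&' + encoded_query
    # job_url == 'https://www.indeed.com/jobs?sort=date&q=%28python%29+or+%28scrapy%29&l=Boston%2C+MA'
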
Example #2
    def start_requests(self):
        '''return iterable of job links'''

        with CommonFuncs.get_db() as db:
            todoforsite = db.query(UnprocessedJob).filter(
                UnprocessedJob.bot_type == 'Ziprecruiter_Bot').all()
        if len(todoforsite) >= 100:  # enough unprocessed jobs are already queued; skip crawling for now
            return

        start_time = datetime.now()

        job_profile = CommonFuncs.get_job_profile()
        locations = CommonFuncs.get_locations_list(job_profile)
        query_list = CommonFuncs.build_query_string(job_profile=job_profile,
                                                    or_delim='',
                                                    bracket1='',
                                                    bracket2='',
                                                    adv_supp=False)

        if len(query_list) == 0: return

        ##########
        # URL ENCODE EACH QUERY
        ##########
        start_urls = []
        for location in locations:
            for query_string in query_list:
                bot = CommonFuncs.get_bot('Ziprecruiter_Bot')
                if bot.is_running:  # verify that the bot is running before continuing to the next page
                    query_dict = {'search': query_string, 'location': location}
                    encoded_query = urllib.parse.urlencode(query_dict, safe='')
                    job_url = JOB_SITE_LINKS['Ziprecruiter']['query'] + '&' + encoded_query
                    start_urls.append(job_url)

                    response = html.fromstring(requests.get(job_url).content)
                    base = JOB_SITE_LINKS['Ziprecruiter']['job_site_base']
                    # append all of the links from filtering by job title
                    title_links = response.xpath(
                        "//menu[@class='select-menu-submenu t_filter_dropdown_titles']/a/@href")
                    start_urls += [base + link for link in title_links]
                    # append all of the links from filtering by company
                    company_links = response.xpath(
                        "//menu[@class='select-menu-submenu t_filter_dropdown_companies']/a/@href")
                    start_urls += [base + link for link in company_links]
                else:
                    return

        msg = 'time spent building start_urls for Ziprecruiter: ' + str(
            datetime.now() - start_time)
        # CommonFuncs.log( msg )
        print(msg)

        ##########
        # GET URL RESPONSES AND CALL PARSE FUNCTION TO ITERATE OVER PAGES
        ##########
        print('TOTAL START URLs: ' + str(len(start_urls)))
        for i, url in enumerate(start_urls, start=1):
            print('LINK#: ' + str(i) + ' WORKING ON NEW START URL: ' + url)
            yield scrapy.Request(url=url, callback=self.parse)
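
The filter-link harvesting in the loop above can be exercised on its own; a rough sketch, assuming a hypothetical search URL and that the XPath class names still match Ziprecruiter's markup:

    import requests
    from lxml import html

    # hypothetical search URL; the spider builds it from JOB_SITE_LINKS['Ziprecruiter']
    search_url = 'https://www.ziprecruiter.com/candidate/search?search=python&location=Boston%2C+MA'
    base = 'https://www.ziprecruiter.com'  # stand-in for JOB_SITE_LINKS['Ziprecruiter']['job_site_base']

    tree = html.fromstring(requests.get(search_url).content)
    # relative hrefs from the "filter by job title" dropdown, made absolute
    title_links = [base + href for href in tree.xpath(
        "//menu[@class='select-menu-submenu t_filter_dropdown_titles']/a/@href")]
    # same idea for the "filter by company" dropdown
    company_links = [base + href for href in tree.xpath(
        "//menu[@class='select-menu-submenu t_filter_dropdown_companies']/a/@href")]
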
Example #3
    def get_api_results(self, desired_result_count=1):
        '''return job json objects from the indeed api.'''

        job_profile = CommonFuncs.get_job_profile()

        # GET LOCATION IN JOB PROFILE
        locations = CommonFuncs.get_locations_list(job_profile)

        # KEYWORDS CONNECTED BY OR
        query_list = CommonFuncs.build_query_string(job_profile=job_profile,
                                                    or_delim='or',
                                                    bracket1='(',
                                                    bracket2=')',
                                                    adv_supp=True)
        if not query_list:  # guard against an empty keyword list before indexing
            return []
        query_string = query_list[0]

        new_jobs_queue = queue.Queue(maxsize=0)
        new_jobs = None

        limit = '25'  # 25 is the max results per request
        lookback_period = '60'  # default lookback period
        client_id = {}
        api = None

        # CONNECT TO INDEED API FOR JOB QUERIES
        try:
            client_id = json.load(open(API_KEYS_PATH, 'r'))
            api = IndeedClient(publisher=client_id['publisher_id'])
        except Exception:
            # the original swallowed the error and left api as None; raise instead
            raise ValueError('No publisher id found. Filtering aborted.')

        filters = {
            'q': query_string,
            'l': '',
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            "raw": "False",
            "sort": "date",
            "radius": job_profile.radius,
            "limit": limit,
            "fromage": lookback_period,
        }

        # FIND NEW JOB JSON OBJECT USING INDEED API
        # GET NEW JOBS

        for location in locations:  # iterate over each location
            filters['l'] = location
            filters['q'] = query_string

            # THREAD-BRAINED APPROACH to get all results at once
            def get_results(i):
                '''fetch one page of results and queue them until enough jobs have been collected'''
                filters['start'] = i
                # get up to 25 results, using provided filters with start index
                page = json.loads(
                    CommonFuncs.convertBytesToString(api.search(**filters)))
                for job in page['results']:
                    # unfinished_tasks counts queued items (task_done is never called),
                    # so stop adding once the desired number of jobs is reached
                    if new_jobs_queue.unfinished_tasks < desired_result_count:
                        new_jobs_queue.put(job)

            result_count = int(json.loads(
                CommonFuncs.convertBytesToString(api.search(**filters)))['totalResults'])

            # build list of start positions, one request per page of 25 results
            list_of_filter_starts = [str(i) for i in range(0, result_count, 25)]

            for item in list_of_filter_starts:
                if not new_jobs_queue.unfinished_tasks < desired_result_count:
                    break
                get_results(item)

            new_jobs = list(new_jobs_queue.queue)  # append query results to list

        # RETURN JOBS
        if new_jobs:
            if desired_result_count == 1:  # just return a single job, not in a list
                return new_jobs[0]
            elif desired_result_count <= len(new_jobs):  # if we have more than enough new jobs, return those in a list
                return new_jobs[0:desired_result_count]
            else:  # if more than the available number of new jobs requested, return all that could be found
                return new_jobs
        else:
            return []  # if no new links found
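
A standalone sketch of the paging-and-capping logic used above, with api.search stubbed out so it runs on its own; the page contents and totals are hypothetical:

    import queue

    desired_result_count = 30
    new_jobs_queue = queue.Queue(maxsize=0)

    def fetch_page(start):
        # stand-in for api.search(**filters); the API returns at most 25 results per request
        return [{'jobkey': f'job-{start + n}'} for n in range(25)]

    result_count = 80  # hypothetical 'totalResults' value from the first API call
    for start in range(0, result_count, 25):
        # unfinished_tasks grows by one per put() and task_done() is never called,
        # so it doubles as a running count of the jobs queued so far
        if not new_jobs_queue.unfinished_tasks < desired_result_count:
            break
        for job in fetch_page(start):
            if new_jobs_queue.unfinished_tasks < desired_result_count:
                new_jobs_queue.put(job)

    new_jobs = list(new_jobs_queue.queue)  # 30 jobs collected across two pages
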