Beispiel #1
0
 def spider_opened(self, spider):
     """Log that *spider* was opened and initialise the shared URL state.

     After ``UrlsState.init()`` runs, ``UrlsState.parsed_urls`` and a
     banner line are printed to stdout.

     Args:
         spider: the spider that was opened; its ``logger`` and ``name``
             attributes are used for the log record.
     """
     # Pass the name as a logger argument so the message is only formatted
     # when the record is actually emitted.
     spider.logger.info('Spider opened: %s', spider.name)
     UrlsState.init()
     print(UrlsState.parsed_urls)
     print(
         '##########################################################################################'
     )
Beispiel #2
0
 def __is_first_request(self, url):
     """Tell whether *url* is being requested for the first time.

     If ``UrlsState`` holds no state for *url*, a new state entry is
     created and ``True`` is returned. Otherwise the value stored under
     ``UrlsState.KEY_IS_FIRST_REQUEST`` for *url* is returned.
     """
     print('#_is_first_request')
     if not UrlsState.exist_url_state(url):
         # Unknown URL: register it and report this as the first request.
         UrlsState._create_parsed_url_state(url)
         return True
     return UrlsState.get_url_data(url, UrlsState.KEY_IS_FIRST_REQUEST)
Beispiel #3
0
 def __create_a_new_tab(self, url):
     """Assign a tab index to *url* and load *url* in the browser.

     A new index is registered for *url* in ``UrlsState``. When that
     index is not 0, an empty browser window is opened and selected
     before the page is loaded with ``self.driver.get``.
     """
     UrlsState.set_new_index_tab(url)
     tab_position = UrlsState.get_index_tab(url)
     needs_new_window = tab_position != 0
     if needs_new_window:
         # Index 0 reuses the window the driver already has; any other
         # index gets a freshly opened blank window.
         self.driver.execute_script("window.open('');")
         self.__select_tab(url)
     self.driver.get(url)
     # NOTE(review): this writes to self.parsed_urls while the rest of the
     # visible code keeps per-URL state in UrlsState -- confirm that
     # self.parsed_urls exists and is the intended store.
     self.parsed_urls[url]['is_first_request'] = False
Beispiel #4
0
    def process_spider_input(self, response, spider):
        """Update the shared URL state from a response headed to the spider.

        Called for each response that goes through the spider middleware
        and into the spider.

        * If the response URL contains ``IeSpiderMiddleware.JOBS_URL_PATH``,
          ``UrlsState.update_url_state`` is called with the start URL and
          the total results taken from ``response.meta``.
        * Otherwise, if the URL contains ``'?page=0'``, the state of the
          start URL is reset through ``UrlsState.reset_url_state``.

        Returns:
            Always ``None``; exceptions raised by the lookups propagate.
        """
        url = response.url
        print("#IeSpiderMiddleware.process_spider_input: {}".format(url))

        if IeSpiderMiddleware.JOBS_URL_PATH in url:
            meta = response.meta
            UrlsState.update_url_state(meta[UrlsState.KEY_START_URL],
                                       meta[UrlsState.KEY_TOTAL_RESULTS])
            return None

        if '?page=0' not in url:
            return None

        print('# page not found')
        try:
            print(response.meta[UrlsState.KEY_START_URL])
        except Exception as err:
            print(f'Error!!! {err}')
        # NOTE(review): the try above only protects the debug print; the
        # same lookup is repeated unguarded here, so a failing lookup still
        # raises out of this method. Confirm that is intended.
        UrlsState.reset_url_state(response.meta[UrlsState.KEY_START_URL])
        return None
Beispiel #5
0
    def process_start_requests(self, start_requests, spider):
        """Re-point every start request at its pending URL.

        Called with the start requests of the spider; works like
        ``process_spider_output()`` except that there is no associated
        response. Must yield only requests (not items).

        Args:
            start_requests: iterable of the spider's start requests.
            spider: the spider the requests belong to (unused here).

        Yields:
            For each start request, a copy whose URL is the value returned
            by ``UrlsState.get_pending_url`` for the original URL.
        """
        print('# ###process_start_requests')

        for r in start_requests:
            # The init url is swapped for the url carrying the pending page
            # query. Request.replace() is the public way to obtain a request
            # with a different URL; the private _set_url() is not used.
            pending_request = r.replace(url=UrlsState.get_pending_url(r.url))
            print(pending_request)
            print('end')
            yield pending_request
Beispiel #6
0
 def __select_tab(self, url):
     """Switch the driver to the browser window registered for *url*.

     The window is picked from ``self.driver.window_handles`` using the
     index returned by ``UrlsState.get_index_tab(url)``.
     """
     switch_to_window = self.driver.switch_to.window
     handles = self.driver.window_handles
     tab_index = UrlsState.get_index_tab(url)
     switch_to_window(handles[tab_index])
Beispiel #7
0
    def process_request(self, request, spider):
        """Serve job-results pages through the Selenium driver.

        Called for each request that goes through the downloader
        middleware. Requests whose URL is rejected by
        ``__is_a_valid_job_results_url`` are handed back to Scrapy
        (``None`` is returned) after some debug output. For every other
        request the page is reached in a browser tab and the driver's page
        source is returned as an ``HtmlResponse``.

        Args:
            request: the request being processed; its URL is expected to be
                either a job URL or a job results page URL.
            spider: the spider the request belongs to (unused here).

        Returns:
            ``None`` to let Scrapy continue processing the request, or an
            ``HtmlResponse`` built from ``self.driver.page_source``.
        """
        url = request.url
        url_to_parse = request.url
        print('url: %s' % url)
        print('url_to_parse: %s' % url_to_parse)
        if not self.__is_a_valid_job_results_url(url):
            print('#This url will not process by selenium: %s' % url)
            # Everything in this try is best-effort debug output, so
            # failures are deliberately ignored. ``except Exception`` is
            # used instead of a bare ``except`` so KeyboardInterrupt and
            # SystemExit still propagate.
            try:
                print(request)
                try:
                    print(request.meta)
                    print(request.meta[UrlsState.KEY_START_URL])
                except Exception:
                    print('1')
                clean_url = self.__clean_url(
                    request.meta[UrlsState.KEY_START_URL])
                print('#parsing job from: %s' % clean_url)
                print()
                print()
                print('# request.meta: %s' %
                      request.meta[UrlsState.KEY_START_URL])
                print()
                print()
            except Exception:
                pass
            return None

        print('IM SELENIUM PROCESSING THE REQUEST: %s' % request.url)  ##
        if not self.driver:
            # The browser is created lazily on the first Selenium request.
            self.driver = self.__get_chrome_browser(request)

        clean_url = self.__clean_url(url)
        page = self.__get_number_page_from_the_url(url)
        print('clean url: %s' % clean_url)
        print('page: %i' % page)

        try:
            if self.__is_first_request(clean_url):
                print(
                    'Selenium is going to get the page %i of %s for first time'
                    % (page, url))
                self.__create_a_new_tab(url)
                if page == 0:
                    page = UrlsState.get_pending_page(clean_url)
                if page > 1 and page < 500:
                    self.__go_to_the_page(page)
                    url_to_parse = '%s%s/' % (clean_url, str(page))
                elif page >= 500:
                    # Jump to page 499 first, then advance one page at a
                    # time until the requested page is reached.
                    self.__go_to_the_page(499)
                    next_page = 500
                    while next_page <= page:
                        actual_page = self._get_actual_page_from_driver()
                        print('actual_page: %i, next_page: %i' %
                              (actual_page, next_page))
                        if actual_page and ((actual_page + 1) == next_page):
                            self.__go_to_the_next_page(next_page)
                            next_page += 1
                        else:
                            print('#something wrong')
                            break
            else:
                print('Selenium is going to get the next page %i of %s' %
                      (page, url))
                if page == 0:
                    inexistent_page = UrlsState.get_pending_page(
                        clean_url) + 555
                    # NOTE(review): the other UrlsState lookups in this
                    # method are keyed by clean_url; confirm the raw url is
                    # the intended key here.
                    UrlsState.reset_url_state(url)
                    print('# Im selenium, all the pages have been parsed')
                    self.__go_to_the_page(inexistent_page)
                else:
                    self.__select_tab(clean_url)
                    actual_page = self._get_actual_page_from_driver()
                    print('#actual_page: %i' % actual_page)
                    if actual_page and ((actual_page + 1) == page):
                        self.__go_to_the_next_page(page)
                    else:
                        self.__reboot_tab(clean_url, page)
        except Exception as e:
            # Any navigation failure falls back to rebooting the tab.
            print('#Exception __go_to_the_page: %s %i' % (clean_url, page))
            print('#Exception raised: %s' % e)
            self.__reboot_tab(clean_url, page)

        body = self.driver.page_source
        return HtmlResponse(url_to_parse,
                            body=body,
                            encoding='utf-8',
                            request=request)
Beispiel #8
0
 def spider_closed(self, spider):
     """Log that *spider* was closed and close the shared URL state.

     Args:
         spider: the spider that was closed; its ``logger`` and ``name``
             attributes are used for the log record.
     """
     log = spider.logger
     log.info('y Spider closed: %s', spider.name)
     UrlsState.close()