Example #1
    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        #"""

        print()
        print()
        print(
            "########################################################################"
        )
        print("#@ IeSpiderMiddleware.process_spider_input: {}".format(
            response.url))
        print(
            "######################################################################"
        )
        write_in_a_file('IeSpiderMiddleware.process_spider_input',
                        {'response': response}, 'spider.txt')
        print('IeSpiderMiddleware.process_spider_input')
        time.sleep(10)
        print(
            "#######IeSpiderMiddleware.process_spider_input############################################"
        )
        print()
        print()
        # Should return None or raise an exception.
        return None
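Note: these examples log through a write_in_a_file(tag, data, filename) helper whose implementation is not included here. A minimal sketch of what such a helper could look like, purely as an assumption (the real one may format entries differently):

from datetime import datetime

def write_in_a_file(tag, data, filename):
    # Append a timestamped entry plus the given key/value pairs to a text log.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(f'{datetime.now().isoformat()} | {tag}\n')
        for key, value in data.items():
            f.write(f'    {key}: {value!r}\n')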
Example #2
 def spider_closed(self, spider):
     spider.logger.info('Spider.Middleware closed: %s', spider.name)
     #"""
     write_in_a_file('IeSpiderMiddleware.spider_closed', {}, 'spider.txt')
     print(f'job_requests_count: {self.job_requests_count}')
     print(f'job_requests_count2: {self.job_requests_count2}')
     print(f'time: {now() - IeSpiderMiddleware.START}')  # elapsed time since START
Example #3
 def _reset_process(self, state=Task.STATE_FINISHED):
     self._is_resetting = True
     number_of_processed_items = self.qitems.qsize()
     try:
         self._empty_and_close_queues()
         self.process.terminate()
         write_in_a_file('_reset_process terminated (from stop)', {'is_running': self.process.is_alive()}, 'tasks.txt')
         data = {
             'state': state,
             'result': number_of_processed_items,
             'finished_at': datetime.now(),
         }
         self._update_task(self._id_task, data)
          write_in_a_file('_reset_process before join (from stop)', {}, 'tasks.txt')
          self.process.join(120)  # ! IMPORTANT after .terminate -> .join
          try:
              os.kill(self.process.pid, signal.SIGTERM)
          except Exception as e:
              write_in_a_file(f'_reset_process - Error trying to kill the process {self.process.pid}', {}, 'tasks.txt')
          write_in_a_file('_reset_process after join (from stop)', {}, 'tasks.txt')
          write_in_a_file('_reset_process joined (from stop)', {'is_running': self.process.is_alive()}, 'tasks.txt')
     except Exception as e:
         pass
     finally:
         self.process = None
         self._id_task = None
         self._is_resetting = False
     self._count = 0
Example #4
 def _spider_closed(self, spider, reason):
     write_in_a_file('CrawlerProcess.signal.close', {'reason': reason}, 'task.txt')
     now = datetime.now()
     data = {
         'description': f'spider closed with count: {self._count} at {str(now)}',
         'result': self._count,
         'finished_at': now
     }
     self._update_task(self._id_task, data)
Example #5
    def spider_opened(self, spider):
        spider.logger.info('Spider.Middleware opened: %s' % spider.name)
        #"""
        write_in_a_file('IeSpiderMiddleware.spider_opened', {}, 'spider.txt')
        print(
            '##########################################################################################'
        )

        spider.logger.info('Spider.Middleware opened: %s' % 'end')
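For handlers such as spider_opened, spider_closed (Examples #2 and #5) and item_scraped (Example #15) to be invoked, the middleware is normally connected to Scrapy's signals in from_crawler. A sketch of the usual wiring (the project's actual from_crawler is not shown in these examples):

from scrapy import signals

class IeSpiderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # Connect the handler methods shown in these examples to Scrapy's signals.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(middleware.item_scraped, signal=signals.item_scraped)
        return middleware

The middleware itself would then be activated through the SPIDER_MIDDLEWARES setting of the Scrapy project.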
Example #6
 def _update_process(self):
     """
     The process is reset if it is not alive and it hasn't already been reset
     """
     write_in_a_file(f'_update_process - process == {self.process.is_alive() if self.process else None}', {}, 'tasks.txt')
     # If the process has finished
     try:
         if not self.process.is_alive() and not self._is_resetting:
             self._reset_process()
     except Exception:
         # self.process may still be None here
         pass
Example #7
    def process_response(self, request, response, spider):
        write_in_a_file('IeSpiderDownloaderMiddleware.process_response', {
            'request': request,
            'response': response
        }, 'spider.txt')
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response
Example #8
 def _item_scraped(self, item, response, spider):
     self._items.append(item)
     self.qitems.put(item)
     self._count = self._count + 1
     n = self._count
     write_in_a_file('CrawlerProcess.signal.scraped_item', {'response': response, 'item': item, 'count': n}, 'task.txt')
     try:
         self.q.get_nowait()
     except:
         pass
     finally:
         self.q.put(n)
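Here q is used as a single-slot mailbox: the previous count is discarded with get_nowait() and the new total is put back, so the parent process always reads the latest figure. A sketch of what the consumer side (the get_scraped_items_number used in Example #26) might look like; the real method is not shown and may differ:

def get_scraped_items_number(self):
    # Peek at the single-slot count queue: take the latest value and put it back
    # so later polls still see it.
    try:
        n = self.q.get_nowait()
        self.q.put(n)
        return n
    except Exception:
        return 0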
Example #9
    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        print()
        print()
        print(
            "######################################################################"
        )
        print("#@ IeSpiderMiddleware.process_spider_output: {}".format(
            response.url))
        print(
            "######################################################################"
        )
        print(f'response.meta: {response.meta}')
        print(response.meta)

        print("result: {}".format(result))
        print(f'job_requests_count: {self.job_requests_count}')

        for i in result:
            #"""
            print()
            print('-----------------------------------------------------')
            print('RESULT:')
            print(f'type: {type(i)}')
            if type(i) == JobItem:
                self.job_requests_count += 1
            print(f'{i}')
            print('-----------------------------------------------------')
            print()
            print(f'job_requests_count: {self.job_requests_count}')
            print('IeSpiderMiddleware.process_spider_output & yield i')
            time.sleep(10)
            yield i
            print('IeSpiderMiddleware.process_spider_output  yield i ...')
            time.sleep(10)
        write_in_a_file(
            'IeSpiderMiddleware.process_spider_output', {
                'response': response,
                'result': result,
                'count': self.job_requests_count,
                'count2': self.job_requests_count2
            }, 'spider.txt')
        print(
            "#######IeSpiderMiddleware.process_spider_output############################################"
        )
        print()
        print()
Example #10
 def _empty_and_close_queues(self):
     write_in_a_file(f'SpiderProcess._clear_queques: q', {}, 'tasks.txt')
     self._empty_and_close_queue(self.q)
     write_in_a_file(f'SpiderProcess._clear_queques: qitems', {}, 'tasks.txt')
     self._empty_and_close_queue(self.qitems)
     write_in_a_file(f'SpiderProcess._clear_queques: qis_scrapping', {}, 'tasks.txt')
     self._empty_and_close_queue(self.qis_scrapping)
     write_in_a_file(f'SpiderProcess._clear_queques: all queues have been cleaned', {}, 'tasks.txt')
Example #11
    def _start_process(self):
        """
        Starts the process and updates the Task instance with the process pid and the running state

        :return: None
        """
        self.init_datetime = timezone.now()  # Before creating the task
        self.process.start()
        write_in_a_file('SpiderProcess._start_process: process started', {'pid': self.process.pid}, 'tasks.txt')
        data = {
            'pid': self.process.pid,
            'state': Task.STATE_RUNNING,
            'started_at': datetime.now(),  # note: naive datetime here, unlike timezone.now() above
        }
        self._update_task(self._id_task, data)
Example #12
 def _clean_location(self, string):
     try:
         write_in_a_file(f'CleanPipeline._clean_location({string})', {}, 'pipeline.txt')
         city = get_text_before_parenthesis(string) or string
         parenthesis = get_text_between_parenthesis(string)
         if parenthesis and len(parenthesis[0]) < 4:
             # Bercial (el) -> el Bercial
             city = parenthesis[0].capitalize() + " " + city
         if city and city.isupper():
             city = city.title()
         alrededores = re.compile(r'(a|A)ldedores (de )?|(a|A)lredores (de )?|(a|A)lredor (de )?|(a|A)lrededores (de )?|(a|A)ldedor (de )?|(a|A)lrededor (de )?')
         city = alrededores.sub('', city)
         city = city.replace('.', "").replace('-', "").replace('etc', "")
         return city.strip()
     except Exception as e:
         return ''
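_clean_location relies on two small helpers, get_text_before_parenthesis and get_text_between_parenthesis, that are not shown here. Minimal regex-based sketches consistent with how they are used above (assumptions about the real implementations):

import re

def get_text_before_parenthesis(string):
    # 'Bercial (el)' -> 'Bercial'; '' when there is no parenthesis.
    match = re.match(r'^([^(]+)\(', string)
    return match.group(1).strip() if match else ''

def get_text_between_parenthesis(string):
    # 'Bercial (el)' -> ['el']; every parenthesised chunk, in order.
    return re.findall(r'\(([^)]*)\)', string)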
Example #13
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        write_in_a_file('IeSpiderDownloaderMiddleware.process_request',
                        {'request': request}, 'spider.txt')
        print(f'IeSpiderDownloaderMiddleware.process_request: {request.url}')
        #body = self.driver.page_source
        #return HtmlResponse(request.url, body=body, encoding='utf-8', request=request)
        return None
Example #14
 def _empty_queue(self, q):
     write_in_a_file(f'SpiderProcess._clear_queque:', {}, 'tasks.txt')
     while True:
         try:
             q.get(block=False)  # drain the queue that was passed in
         except Exception as e:
             write_in_a_file(f'SpiderProcess._clear_queque: error - {e}', {}, 'tasks.txt')
             write_in_a_file(f'SpiderProcess._clear_queque: error - queue.qsize: {q.qsize()}', {}, 'tasks.txt')
             break
     write_in_a_file(f'SpiderProcess._clear_queque: queue empty?', {}, 'tasks.txt')
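A note on the except branch: a multiprocessing.Queue raises queue.Empty once it is drained, so a tighter version of the same loop could catch that specific exception (a sketch; the version above deliberately logs whatever exception stops the loop):

import queue

def _empty_queue(self, q):
    # Same drain loop, stopping only on the "queue is empty" condition.
    while True:
        try:
            q.get(block=False)
        except queue.Empty:
            break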
Example #15
    def item_scraped(self, response, item, spider, signal, sender):

        print(
            '-------------------------------------------------------------------------------------------'
        )
        print('IeSpiderMiddleware.item_scraped')
        self.job_requests_count2 += 1
        write_in_a_file(
            'IeSpiderMiddleware.item_scraped', {
                'response': response,
                'count': self.job_requests_count,
                'count2': self.job_requests_count2
            }, 'spider.txt')
        print('IeSpiderMiddleware.process_spider_item_scraped')
        time.sleep(10)
        print(
            '-------------------------------------------------------------------------------------------'
        )
Example #16
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        write_in_a_file('IeSpiderMiddleware.process_start_requests', {},
                        'spider.txt')
        print('# ###process_start_requests')

        for r in start_requests:
            # The initial url could be replaced here with the url carrying the pending-page query
            #start_url = r.url
            #r._set_url(start_url)
            print(r.url)
            #print(UrlsState.get_url_data(start_url, RESULTS_PARSED))
            write_in_a_file('IeSpiderMiddleware.process_start_requests',
                            {'req': r}, 'spider.txt')
            print('###end process_start_requests')
            yield r
Example #17
 def is_scrapping(self):
     # quick hack
     if self.qis_scrapping and self.process:
         __d = {'qis_scrapping.qsize': self.qis_scrapping.qsize(), 'process.is_alive': self.process.is_alive()}
     else:
         __d = {}
     write_in_a_file(f'SpiderProcess.is_scrapping', __d, 'tasks.txt')
     print(f'is_scraping - {__d}')
     if self.process:
         write_in_a_file(f'is_scraping - process != None', __d, 'tasks.txt')
         if self.qis_scrapping.qsize() > 0:
             write_in_a_file(f'is_scraping - there is something in qis_scrapping ({self.qis_scrapping.qsize()}) -> True - process.is_alive -> {self.process.is_alive()}', {}, 'tasks.txt')
             # return self.process.is_alive()
             return True
         else:
             write_in_a_file(f'is_scraping - there is nothing in qis_scrapping -> False', __d, 'tasks.txt')
             self._stop(state=Task.STATE_FINISHED)
             return False
     else:
         write_in_a_file('is_scraping - process == None -> False', {}, 'tasks.txt')
         return False
Example #18
 def _close_queue(self, q):
     try:
         write_in_a_file(f'SpiderProcess._clear_queque: close...', {}, 'tasks.txt')
         q.close()
         q.join_thread()
         write_in_a_file(f'SpiderProcess._clear_queque: queue closed', {}, 'tasks.txt')
     except Exception as e:
         write_in_a_file(f'SpiderProcess._clear_queque: close -> {e}', {}, 'tasks.txt')
Example #19
 def _spider_opened(self, *args, **kwargs):
     write_in_a_file('CrawlerProcess.signal.open', {'args': args, 'kwargs': kwargs, 'process': self.process}, 'task.txt')
     self._count = 0
Example #20
 def _get_city(self, city_name, province=None, country=None):
     write_in_a_file('StorePipeline._get_city', {'city_name': city_name + '.', 'province':province, 'country': country}, 'pipeline.txt')
     if not city_name:
         return None
     city = None
     if country and country.name == 'España':
         print('The country is España')
         # first search with iexact
         if province:
             print('Province <> None (iexact)')
             cities_qs = City.objects.filter(country=country, province=province, name__iexact=city_name)
         else:
             print('Province == None (iexact)')
             cities_qs = City.objects.filter(country=country, name__iexact=city_name)
         # second search with icontains
         if cities_qs:
             city = cities_qs[0]
         else: # not iexact city found
             if province:
                 cities_qs = City.objects.filter(country=country, province=province, name__icontains=city_name)
             else:
                 cities_qs = City.objects.filter(country=country, name__icontains=city_name)
             if cities_qs.count() > 1:
                 cities_qs = cities_qs.filter(name__icontains='/')
                 for city in cities_qs:
                     cities = [city for name in city.name.split('/') if city_name.lower() == name.lower()]
                 cities_qs = cities
             if cities_qs:
                 city = cities_qs[0]
                 if province:
                     city.province = province
                     city.save()
             elif province and (city_name != province.name):
                 city = City.objects.create(name=city_name, province=province, country=country)
     elif country and (city_name.lower() == country.name.lower() or city_name.lower() == get_acronym(country.name).lower()):
         return None
     elif country:  # a foreign city
         print('a foreign city')
         cities_qs = City.objects.filter(name__iexact=city_name,  country=country)
         if cities_qs:
             city = cities_qs[0]
         else:
             city, is_a_new_city = City.objects.get_or_create(name=city_name, country=country)
     else:
         write_in_a_file('StorePipeline._get_city',
                         {'if':'not country'}, 'pipeline.txt')
         try:
             cities_qs = City.objects.filter(name__iexact=city_name)
         except Exception as e:
             write_in_a_file('StorePipeline._get_city',
                             {'city_name': city_name + '.', 'error': e}, 'pipeline.txt')
         write_in_a_file('StorePipeline._get_city',
                         {'cities_qs': str(cities_qs)}, 'pipeline.txt')
         if not cities_qs:
             write_in_a_file('StorePipeline._get_city',
                             {'if': 'not cities_qs'}, 'pipeline.txt')
             cities_qs = City.objects.filter(name__contains=city_name) # contains to avoid coincidence in the middle of the string
             if cities_qs and cities_qs.count() > 1:
                 cities_qs = cities_qs.filter(name__icontains='/')
                 cities = None
                 for city in cities_qs:
                     cities = [city for name in city.name.split('/') if city_name.lower() == name.lower()]
                 cities_qs = cities
             if cities_qs:
                 city = cities_qs[0]
         elif cities_qs.count() == 1:
             write_in_a_file('StorePipeline._get_city',
                             {'if': 'cities_qs.count() == 1'}, 'pipeline.txt')
             city = cities_qs[0]
     write_in_a_file('StorePipeline._get_city',
                     {'city': city}, 'pipeline.txt')
     return city
Example #21
 def _crawl(self, spider, qis_running):
     write_in_a_file('CrawlerProcess.signal.error', {'signals': dir(signals)}, 't.txt')
     qis_running.put(spider)
     crawler = CrawlerProcess(get_project_settings())
     crawler.crawl(spider)
     # To prevent the infamous error: django.db.utils.InterfaceError: (0, '')
     db.connection.close()
     crawler.start()
     write_in_a_file('SpiderProcess.start: process started', {}, 'debug.txt')
     crawler.join()
     write_in_a_file('SpiderProcess.crawl: process joined', {}, 'task.txt')
     write_in_a_file('SpiderProcess.crawl: process joined', {}, 'tasks.txt')
     write_in_a_file('SpiderProcess.crawl: process joined', {}, 'spider.txt')
     write_in_a_file(f'Crawler Process - before: qis_running.qsize: {qis_running.qsize()}', {}, 'tasks.txt')
     try:
         qis_running.get()
     except Exception as e:
         write_in_a_file(f'Crawler Process - error in qis_running.get: {e}', {}, 'tasks.txt')
     write_in_a_file(f'Crawler Process - after: qis_running.qsize: {qis_running.qsize()}', {}, 'tasks.txt')
     write_in_a_file('===========================================================================================', {}, 'tasks.txt')
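_crawl blocks until the Scrapy engine finishes (crawler.start() runs the reactor), so SpiderProcess apparently runs it in a child process that _start_process (Example #11) then starts. A sketch of how that process could be built, using the attribute names seen elsewhere in these examples and the sp.start(spider, user) call from Example #26; _create_task is a hypothetical helper, and the real start() method may differ:

import multiprocessing

def start(self, spider, user):
    # Queues shared with the crawl process (names taken from the other examples).
    self.q = multiprocessing.Queue()              # latest scraped-item count
    self.qitems = multiprocessing.Queue()         # scraped items
    self.qis_scrapping = multiprocessing.Queue()  # "still scraping" marker
    self.process = multiprocessing.Process(target=self._crawl,
                                           args=(spider, self.qis_scrapping))
    self._id_task = self._create_task(spider, user)  # hypothetical helper
    self._start_process()  # see Example #11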
Example #22
 def _spider_error(self, *args, **kwargs):
     write_in_a_file('CrawlerProcess.signal.error', {'args': args, 'kwargs': kwargs}, 'task.txt')
Example #23
 def item_scraped(self, *args, **kwargs):
     write_in_a_file('IeSpiderDownloaderMiddleware.item_scraped', {},
                     'spider.txt')
     print(
         '-------------------------------------------------------------------------------------------'
     )
Example #24
 def spider_closed(self, spider):
     write_in_a_file('IeSpiderDownloaderMiddleware.spider_closed', {},
                     'spider.txt')
     spider.logger.info('Spider closed: %s' % spider.name)
Example #25
 def _engine_stopped(self, *args, **kwargs):
     write_in_a_file('CrawlerProcess.signal.stopped', {'args': args, 'kwargs': kwargs}, 'task.txt')
Example #26
def run_crawler_view(request, model=None):
    sp = SpiderProcess.get_instance()
    req = ''
    write_in_a_file('view request - start', {}, 'tasks_view.txt')
    try:
        spider = InfoempleoCompaniesSpider if 'company' in model.lower() else InfoempleoSpider
    except Exception:
        spider = None
    if request.GET.get('crawl', None):
        write_in_a_file('view request - start get request', {},
                        'tasks_view.txt')
        user = User.objects.get(username=request.user)
        sp.start(spider, user)
        write_in_a_file('view request - end get request', {}, 'tasks_view.txt')
        req = 'crawl'
    if request.GET.get('crawl', None) or request.is_ajax():  # get at most one task
        last_task = sp.get_actual_task() or sp.get_latest_task()
        last_tasks = [last_task] if last_task else []
    else:  # can return several tasks
        actual_task = sp.get_actual_task()
        last_tasks = (actual_task and [actual_task]) or sp.get_latest_tasks()
        last_tasks = [task for task in last_tasks
                      if task.name == spider.name] if spider else last_tasks
    write_in_a_file('view request - continue', {}, 'tasks_view.txt')
    is_running = sp.is_scrapping()
    write_in_a_file('view request - continue after call is_scrapping', {},
                    'tasks_view.txt')
    write_in_a_file(
        'view request - continue after call get_actual_task and get_latest_task',
        {}, 'tasks_view.txt')
    context = {
        'model': model,
        'tasks': last_tasks,
        'is_running': is_running,
        'p': sp.process and sp.process.pid,
        'running_state': Task.STATE_RUNNING,
        'ajax': False,
        'req': req,
        'scraped_items_number': -100,
    }
    write_in_a_file('view request - continue2', {
        'is_running': is_running,
        'context': context
    }, 'tasks_view.txt')
    if is_running:
        context = {
            **context,
            **{
                'scraped_items_number': sp.get_scraped_items_number(),
            }
        }
    write_in_a_file('view request - continue3', {
        'is_running': is_running,
        'context': context
    }, 'tasks_view.txt')
    if request.is_ajax():
        write_in_a_file('view request - ajax request', {
            'is_running': is_running,
            'context': context
        }, 'tasks_view.txt')
        context['ajax'] = True
        if (last_task.state == Task.STATE_RUNNING):
            template_name = 'task/info_crawler_task.html'
            return render(request, template_name, context)
        elif (last_task.state == Task.STATE_PENDING):
            return HttpResponse('not running')
        elif (last_task.state != Task.STATE_RUNNING):
            template_name = 'task/init_ajax.html'
            return render(request, template_name, context)
        else:
            return HttpResponse('not running')
    else:
        write_in_a_file('view request - not ajax request end', {
            'is_running': is_running,
            'context': context
        }, 'tasks_view.txt')
        template_name = 'task/main.html'
        return render(request, template_name, context)
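run_crawler_view takes an optional model path parameter and answers both normal and AJAX requests. A possible urls.py entry (an assumption; the project's actual URL configuration and route names are not shown):

from django.urls import path
from . import views

urlpatterns = [
    path('crawler/', views.run_crawler_view, name='run_crawler'),
    path('crawler/<str:model>/', views.run_crawler_view, name='run_crawler_model'),
]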