def crawl(self, spider):
    # Use a single shared queue: the child process writes to it and this
    # process inspects it, so both must reference the same object (the
    # original created two separate Queue instances here).
    self.queue = Queue()
    self.process = Process(target=self._crawl, args=(self.queue, spider))
    self.process.start()
    write_in_a_file('.crawl 1', {'process': self.process, 'process-pid': self.process and self.process.pid, 'queue': self.queue.qsize()}, 't.txt')
    # Caution: joining before draining the queue can deadlock if the child
    # queued more data than the pipe buffer holds (see multiprocessing docs).
    self.process.join()
    write_in_a_file('.crawl 2', {'process': self.process, 'process-pid': self.process and self.process.pid, 'queue': self.queue.qsize()}, 't.txt')
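# --- Minimal caller sketch for the blocking crawl() above (hypothetical glue
# code: the CrawlProcess instantiation is an assumption; InfoempleoSpider is
# the spider used elsewhere in these snippets). crawl() blocks on join(), so
# by the time it returns the child has already queued its item list.
cp = CrawlProcess()                # assumed wrapper class owning crawl()/_crawl()
cp.crawl(InfoempleoSpider)
try:
    items = cp.queue.get_nowait()  # the list _crawl() put on the queue
except Exception:
    items = []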
def _crawl(self, queue, spider):
    self.crawler = CrawlerProcess(get_project_settings())
    self.crawler.crawl(spider)
    # Connect the signal handlers before the crawl starts.
    dispatcher.connect(self._item_scraped, signals.item_scraped)
    dispatcher.connect(self._so, signals.spider_opened)
    dispatcher.connect(self._so2, signals.spider_opened)
    dispatcher.connect(self._sc, signals.spider_closed)
    write_in_a_file('crawler start', {'db': dir(db), 'db.connection': dir(db.connection)}, 't.txt')
    print(dir(db.connection))
    # Close the Django DB connection inherited from the parent process so
    # this child opens a fresh one instead of sharing the parent's socket.
    db.connection.close()
    self.crawler.start()
    self.crawler.stop()
    write_in_a_file('crawler ended', {'qsize': queue.qsize()}, 't.txt')
    queue.put(self.items)
def _reset_process(self, state=Task.STATE_FINISHED):
    print(f'CrawlerProcess._reset_process({state})')
    try:
        self.process.terminate()
        write_in_a_file('_reset_process terminated (from stop)', {'is_running': self.process.is_alive()}, 'debug.txt')
        self.task.result = CrawlProcess.count
        self.task.state = state
        self.task.save()
        self.process.join()  # Important: always join() after terminate()
        write_in_a_file('_reset_process joined (from stop)', {'is_running': self.process.is_alive()}, 'debug.txt')
    except Exception:
        pass
    try:
        self.result = self.q.get_nowait()
    except Exception:
        pass
    self._clear_queue()
def run_crawler():
    write_in_a_file('run crawler start', {}, 'celery.txt')
    Job.objects.filter(area=Job.AREA_BANKING_AND_INSURANCE).delete()
    try:
        crawler_settings = get_project_settings()
        runner = CrawlerRunner(crawler_settings)
        # Signal handlers are called with keyword arguments, so accept **kwargs
        # instead of a single positional parameter.
        dispatcher.connect(lambda *args, **kwargs: print('finish'), signal=signals.spider_closed)
        dispatcher.connect(lambda *args, **kwargs: print('item scraped'), signal=signals.item_scraped)
        deferred = runner.crawl(InfoempleoSpider)
        # Stop the reactor when the crawl finishes, on success or failure.
        deferred.addBoth(lambda _: reactor.stop())
        print('reactor...')
        reactor.run()
        write_in_a_file('run crawler end', {}, 'celery.txt')
        print('end!!!!!')
    except Exception as e:
        print(e)
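# --- Hedged sketch: run_crawler() logs to 'celery.txt', which suggests it is
# meant to run inside a Celery task. The @shared_task wrapper below is an
# assumption, not part of the original code.
from celery import shared_task

@shared_task
def run_crawler_task():
    # Caveat: Twisted's reactor cannot be restarted once stopped, so a
    # long-lived Celery worker process can only run this task once; this is
    # one reason the other snippets spawn a fresh multiprocessing.Process.
    run_crawler()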
def _crawl(self, queue, spider):
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(spider)
    write_in_a_file('signals', {'signals': dir(signals)}, 'task.txt')
    write_in_a_file('._crawl start', {'process': self.process, 'process-pid': self.process and self.process.pid, 'db': dir(db), 'db.connection': dir(db.connection)}, 't.txt')
    print(dir(db.connection))
    # Close the inherited Django DB connection before crawling.
    db.connection.close()
    crawler.start()
    crawler.stop()
    # Use the queue passed in as an argument, not self.queue: this code runs
    # in the child process, and only the passed-in queue is the shared one.
    write_in_a_file('._crawl ended 1', {'qsize': queue.qsize()}, 't.txt')
    queue.put_nowait(self.items)
    write_in_a_file('._crawl ended after q 2', {'qsize': queue.qsize()}, 't.txt')
def _start_process(self):
    print('CrawlerProcess._start_process')
    self.init_datetime = timezone.now()  # set before the task is created
    self.process.start()
    self.task.pid = self.process.pid
    write_in_a_file('CrawlProcess._start_process: process started', {'pid': self.process.pid}, 'debug.txt')
    self.task.state = Task.STATE_RUNNING
    self.task.save()
    # Receive the CrawlerProcess object the child sent through the pipe.
    self.crawler_process = self.parent_conn.recv()
    write_in_a_file('CrawlProcess._start_process: conn.recv', {'crawler_process': str(self.crawler_process), 'dir crawler_process': dir(self.crawler_process)}, 'debug.txt')
    write_in_a_file('CrawlProcess._start_process', {'CrawlerProcess': str(self.crawler_process), 'dir(CrawlerProcess)': dir(self.crawler_process)}, 'task.txt')
def crawl(cls, q, conn):
    print('***************************************************************************************')
    print('crawl')

    def close(spider, reason):
        print(f'{multiprocessing.current_process().name}: *!!CLOSE')
        write_in_a_file('CrawlerProcess.signal.close', {'reason': reason}, 'task.txt')
        t = Task.objects.get_latest_crawler_task()
        d = datetime.today()
        t.description = f'spider closed with count: {CrawlProcess.count} at {str(d)}'
        t.result = CrawlProcess.count
        t.save()

    def open(spider):
        print(f'{multiprocessing.current_process().name}: *!!OPEN')
        try:
            name = spider.name
        except AttributeError:
            name = str(spider)
        write_in_a_file('CrawlerProcess.signal.open', {'spider': name}, 'task.txt')
        CrawlProcess.count = 0
        try:
            t = Task.objects.get_latest_crawler_task()
            # Record the pid of the process actually running the spider
            # (the original referenced an undefined `process` here).
            t.name = str(multiprocessing.current_process().pid)
            t.save()
        except Exception as e:
            # If the task lookup itself failed there is no row to update.
            write_in_a_file('CrawlerProcess.signal.open: error', {'error': e}, 'task.txt')

    def scraped(item, response, spider):
        print(f'{multiprocessing.current_process().name}: *!!SCRAPED')
        CrawlProcess.count += 1
        n = CrawlProcess.count
        write_in_a_file('CrawlerProcess.signal.scraped_item', {'response': response, 'count': n}, 'task.txt')
        # Keep only the most recent count in the queue.
        try:
            q.get_nowait()
        except Empty:  # requires: from queue import Empty
            pass
        q.put_nowait(n)

    def stopped(*args, **kwargs):
        write_in_a_file('CrawlerProcess.signal.stopped', {'args': args, 'kwargs': kwargs}, 'task.txt')

    def error(*args, **kwargs):
        write_in_a_file('CrawlerProcess.signal.error', {'args': args, 'kwargs': kwargs}, 'task.txt')

    def send_by_pipe(item):
        # conn.send() pickles the item; complex objects such as a
        # CrawlerProcess may not survive the round-trip intact.
        try:
            conn.send(item)
        except Exception as e:
            write_in_a_file('CrawlProcess.crawl: error conn.send', {'conn error': e}, 'debug.txt')

    process = CrawlerProcess(get_project_settings())
    write_in_a_file('CrawlProcess.crawl: first', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')
    send_by_pipe(process)

    # Build the Crawler explicitly so the signal handlers can be attached to
    # it, and schedule only this crawler: the original also scheduled the bare
    # spider beforehand, which would run the crawl twice.
    crawler = Crawler(InfoempleoSpider, get_project_settings())
    crawler.signals.connect(open, signal=signals.spider_opened)
    crawler.signals.connect(scraped, signal=signals.item_scraped)
    crawler.signals.connect(close, signal=signals.spider_closed)
    crawler.signals.connect(stopped, signal=signals.engine_stopped)
    crawler.signals.connect(error, signal=signals.spider_error)
    write_in_a_file('CrawlProcess.crawl: before', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')
    process.crawl(crawler)
    write_in_a_file('CrawlProcess.crawl: after', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

    process.start()
    write_in_a_file('CrawlProcess.crawl: process started', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')
    print('***************************************************************************************')
    print(f'CrawlerProcess: {process}')
    print('***************************************************************************************')
    write_in_a_file('CrawlProcess.crawl', {'CrawlerProcess': str(process), 'dir(CrawlerProcess)': dir(process)}, 'task.txt')
    process.join()
    write_in_a_file('CrawlProcess.crawl: process.join', {}, 'task.txt')
    write_in_a_file('CrawlProcess.crawl: process.join', {}, 'spider.txt')
    print('Crawler Process has finished!')
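# --- Parent-side wiring sketch for crawl(cls, q, conn) above (hypothetical
# glue code, matching what _start_process expects): the Queue carries the
# live item count, the Pipe returns the CrawlerProcess object to the parent.
from multiprocessing import Pipe, Process, Queue

q = Queue()
parent_conn, child_conn = Pipe()
p = Process(target=CrawlProcess.crawl, args=(q, child_conn))
p.start()
crawler_process = parent_conn.recv()  # whatever send_by_pipe() delivered
p.join()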
def _item_scraped(self, item, **kwargs):
    self._count += 1
    write_in_a_file('item scraped', {'count': self._count, 'item': item, 'kwargs': kwargs, 'process': self.process, 'process-pid': self.process and self.process.pid}, 't.txt')
    self.items.append(item)
    self.queue.put_nowait(item)
def _sc(self):
    write_in_a_file('spider_closed', {'scraped items': len(self.items)}, 't.txt')
def _so(self):
    write_in_a_file('spider_opened 1', {'open': 'open!', 'x': self.x, 'process': self.process, 'process-pid': self.process and self.process.pid}, 't.txt')
def crawl(self, spider):
    queue = Queue()
    self.process = Process(target=self._crawl, args=(queue, spider))
    self.process.start()
    # self.crawler is only set inside the child process, so guard the lookup
    # here in the parent (the original dir(self.crawler) could raise).
    write_in_a_file('crawler started', {'crawler': dir(getattr(self, 'crawler', None))}, 't.txt')
    # Return immediately (no join) so the caller is not blocked while the
    # spider runs in the child process.
    return self.process, queue
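# --- Usage sketch for the non-blocking variant above (hypothetical caller:
# crawler_wrapper is an assumed instance of the owning class). Drain the
# queue before join(): joining a child that still has unflushed queue data
# can deadlock (see the multiprocessing docs).
process, queue = crawler_wrapper.crawl(InfoempleoSpider)
items = queue.get()  # blocks until _crawl() puts the item list
process.join()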
def _item_scraped(self, item):
    write_in_a_file('item scraped', {'item': item}, 't.txt')
    self.items.append(item)
def _so2(self):
    write_in_a_file('spider_opened 2', {}, 't.txt')
def print_something5():
    x = 'print_something5'
    write_in_a_file('print_something5', {}, 'celery.txt')
    print(x)
    return x