# Standard-library, Scrapy and Twisted imports used by the functions below.
# Project-specific names (InfoempleoSpider, Job, Task, CrawlProcess,
# CrawlerScript, CP, write_in_a_file) are assumed to be provided by the project.
import multiprocessing
import os
from datetime import datetime

from pydispatch import dispatcher
from scrapy import signals
from scrapy.crawler import Crawler, CrawlerProcess, CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor


def crawl(cls):
    # Signal handlers that log every lifecycle event of the spider.
    def close(*args, **kwargs):
        print(f'{multiprocessing.current_process().name}: *!!CLOSE')
        print(f'args: {args}')
        print(f'kwargs: {kwargs}')
        print()

    def open(*args, **kwargs):
        print(f'{multiprocessing.current_process().name}: *!!OPEN')
        print(f'args: {args}')
        print(f'kwargs: {kwargs}')
        cls.q.put_nowait(0)  # initialise the shared item counter
        print()

    def idle(*args, **kwargs):
        print(f'{multiprocessing.current_process().name}: *!!IDLE')
        print(f'args: {args}')
        print(f'kwargs: {kwargs}')
        print()

    def scraped(*args, **kwargs):
        print(f'{multiprocessing.current_process().name}: *!!SCRAPED')
        print(f'args: {args}')
        print(f'kwargs: {kwargs}')
        print()
        # Increment the item counter kept in the shared queue.
        try:
            count = cls.q.get_nowait()
            cls.q.put_nowait(count + 1)
        except Exception:
            cls.q.put_nowait(1)

    process = CrawlerProcess(get_project_settings())
    # Connect the handlers through a Crawler instance so they only fire for
    # this crawl (the global dispatcher-based variant is shown in crawl_ below).
    crawler = Crawler(InfoempleoSpider, get_project_settings())
    crawler.signals.connect(open, signal=signals.spider_opened)
    crawler.signals.connect(scraped, signal=signals.item_scraped)
    crawler.signals.connect(close, signal=signals.spider_closed)
    crawler.signals.connect(idle, signal=signals.spider_idle)
    process.crawl(crawler)
    process.start()
def run_crawler(cls):
    print('run_crawler')

    def spider_opened(**kwargs):
        print(f'{multiprocessing.current_process().name}: *qOPENED')
        cls.q.put(0)  # reset the shared item counter
        cls.init_date = datetime.today()

    def spider_closed(**kwargs):
        print(f'{multiprocessing.current_process().name}: *qCLOSED')
        cls.finish_date = datetime.today()

    def item_scraped(**kwargs):
        print(f'{multiprocessing.current_process().name}: *qSCRAPED')
        try:
            items_scraped = cls.q.get_nowait()
            cls.q.put_nowait(items_scraped + 1)
        except Exception:
            cls.q.put_nowait(1)

    try:
        print(f'{multiprocessing.current_process().name}: run_crawler')
        crawler_settings = get_project_settings()
        runner = CrawlerRunner(crawler_settings)
        # Connect the handlers directly so they receive the keyword arguments
        # that each Scrapy signal sends.
        crawler = Crawler(InfoempleoSpider, crawler_settings)
        crawler.signals.connect(spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(item_scraped, signal=signals.item_scraped)
        deferred = runner.crawl(crawler)
        deferred.addBoth(lambda _: reactor.stop())  # stop the reactor when the crawl ends
        print('reactor...')
        reactor.run()
        print('run!!!!!')
    except Exception as e:
        print(e)
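# Minimal launcher sketch (an assumption, not part of the original code): the
# Twisted reactor can only be started once per process, so run_crawler is best
# executed in a child multiprocessing.Process, which also matches the
# multiprocessing.current_process().name logging above. CrawlProcess is assumed
# to be the class that owns run_crawler and the shared queue cls.q; with a
# fork-based start method the parent can poll CrawlProcess.q for progress.
def launch_run_crawler():
    p = multiprocessing.Process(target=CrawlProcess.run_crawler,
                                name='run-crawler-process')
    p.start()
    return p  # the caller can p.join() or read CrawlProcess.q for the item count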
def crawl_():
    def close(*args, **kwargs):
        print(f'{multiprocessing.current_process().name}: *!!CLOSE')
        print(f'args: {args}')
        print(f'kwargs: {kwargs}')
        print()

    def open(*args, **kwargs):
        print(f'{multiprocessing.current_process().name}: *!!OPEN')
        print(f'args: {args}')
        print(f'kwargs: {kwargs}')
        print()

    def scraped(*args, **kwargs):
        print(f'{multiprocessing.current_process().name}: *!!SCRAPED')
        print(f'args: {args}')
        print(f'kwargs: {kwargs}')
        print()

    process = CrawlerProcess(get_project_settings())
    # Connect the handlers globally through the pydispatch dispatcher
    # (the per-crawler alternative via Crawler.signals is shown in crawl above).
    dispatcher.connect(open, signal=signals.spider_opened)
    dispatcher.connect(close, signal=signals.spider_closed)
    dispatcher.connect(scraped, signal=signals.item_scraped)
    process.crawl(InfoempleoSpider)
    process.start()
def crawl(cls, q, conn):
    print()
    print()
    print('***************************************************************************************')
    print('crawl')

    def close(spider, reason):
        print(f'{multiprocessing.current_process().name}: *!!CLOSE')
        write_in_a_file('CrawlerProcess.signal.close', {'reason': reason}, 'task.txt')
        # Store the final item count on the latest crawler task.
        t = Task.objects.get_latest_crawler_task()
        d = datetime.today()
        t.description = f'spider closed with count: {CrawlProcess.count} at {str(d)}'
        t.result = CrawlProcess.count
        t.save()

    def open(spider):
        print(f'{multiprocessing.current_process().name}: *!!OPEN')
        try:
            name = spider.name
        except Exception:
            name = str(spider)
        write_in_a_file('CrawlerProcess.signal.open', {'spider': name}, 'task.txt')
        CrawlProcess.count = 0
        try:
            t = Task.objects.get_latest_crawler_task()
            t.name = str(os.getpid())  # record the PID of the crawling process
            t.save()
        except Exception as e:
            write_in_a_file('CrawlerProcess.signal.open: could not update task', {'error': e}, 'task.txt')
        print()

    def scraped(item, response, spider):
        print(f'{multiprocessing.current_process().name}: *!!SCRAPED')
        print()
        CrawlProcess.count += 1
        n = CrawlProcess.count
        write_in_a_file('CrawlerProcess.signal.scraped_item', {'response': response, 'count': n}, 'task.txt')
        # Keep only the most recent count in the shared queue.
        try:
            q.get_nowait()
            q.put_nowait(n)
        except Exception:
            q.put_nowait(n)

    def stopped(*args, **kwargs):
        write_in_a_file('CrawlerProcess.signal.stopped', {'args': args, 'kwargs': kwargs}, 'task.txt')

    def error(*args, **kwargs):
        write_in_a_file('CrawlerProcess.signal.error', {'args': args, 'kwargs': kwargs}, 'task.txt')

    def send_by_pipe(item):
        # A CrawlerProcess is generally not picklable, so this send is expected
        # to fail and only leave a trace in debug.txt.
        try:
            conn.send(item)
        except Exception as e:
            write_in_a_file('CrawlProcess._crawl: error conn.send', {'conn error': e}, 'debug.txt')

    process = CrawlerProcess(get_project_settings())
    write_in_a_file('CrawlProcess.crawl: first', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')
    send_by_pipe(process)
    write_in_a_file('CrawlProcess.crawl: second', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

    crawler = Crawler(InfoempleoSpider, get_project_settings())
    crawler.signals.connect(open, signal=signals.spider_opened)
    crawler.signals.connect(scraped, signal=signals.item_scraped)
    crawler.signals.connect(close, signal=signals.spider_closed)
    crawler.signals.connect(stopped, signal=signals.engine_stopped)
    crawler.signals.connect(error, signal=signals.spider_error)

    write_in_a_file('CrawlProcess.crawl: before', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')
    process.crawl(crawler)
    write_in_a_file('CrawlProcess.crawl: after', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

    process.start()  # blocks until the crawl finishes
    write_in_a_file('CrawlProcess._crawl: process started', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

    print('***************************************************************************************')
    print(f'CrawlerProcess: {process}')
    print(dir(process))
    print('***************************************************************************************')
    print()
    print()
    write_in_a_file('CrawlProcess.crawl', {'CrawlerProcess': str(process), 'dir(CrawlerProcess)': dir(process)}, 'task.txt')

    process.join()  # CrawlerProcess.join(); already resolved once start() returns
    write_in_a_file('CrawlProcess.crawl: process.join', {}, 'task.txt')
    write_in_a_file('CrawlProcess.crawl: process.join', {}, 'spider.txt')
    print('Crawler Process has Finished!!!!!')
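# Hedged usage sketch (assumed, not shown in the original code): how
# crawl(cls, q, conn) is presumably launched from the parent process. The
# Queue carries the live item count and the parent end of the Pipe receives
# whatever send_by_pipe() manages to transmit. CrawlProcess is assumed to be
# the class that owns crawl as a classmethod.
def launch_crawl():
    q = multiprocessing.Queue()
    parent_conn, child_conn = multiprocessing.Pipe()
    p = multiprocessing.Process(target=CrawlProcess.crawl,
                                args=(q, child_conn),
                                name='scrapy-crawl-process')
    p.start()
    return p, q, parent_conn  # e.g. q.get_nowait() yields the latest item count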
def run_crawler_script():
    Job.objects.filter(area=Job.AREA_BANKING_AND_INSURANCE).delete()
    crawler = CrawlerScript()
    spider = InfoempleoSpider()
    p, q = crawler.crawl(spider)
def run_cp():
    print('run_cp')
    crawler = Crawler(InfoempleoSpider, get_project_settings())
    cp = CP(crawler)
    cp.start()
def crawl_async():
    print('crawl_async')
    spider = InfoempleoSpider()
    crawler = CrawlerScript(spider)
    crawler.start()
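# The CrawlerScript and CP classes used above are not shown here. As a rough
# orientation only, a wrapper of this kind is commonly implemented as a
# multiprocessing.Process subclass that starts a fresh CrawlerProcess (and
# therefore a fresh reactor) in its run() method; this sketch is an assumption,
# not the project's actual implementation.
class CrawlerScriptSketch(multiprocessing.Process):
    def __init__(self, spider_cls):
        super().__init__(name='crawler-script')
        self.spider_cls = spider_cls

    def run(self):
        # Runs in the child process, so the Twisted reactor starts cleanly.
        process = CrawlerProcess(get_project_settings())
        process.crawl(self.spider_cls)
        process.start()  # blocks until the crawl finishes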