def clone_with_timeout(src: str, dest: str, clone_func: Callable[[], None], timeout: float) -> None:
    """Clone a repository with timeout.

    Args:
        src: clone source
        dest: clone destination
        clone_func: callable that does the actual cloning
        timeout: timeout in seconds
    """
    errors: Queue = Queue()
    process = Process(target=_clone_task, args=(clone_func, errors))
    process.start()
    process.join(timeout)

    if process.is_alive():
        process.terminate()
        # Give it literally a second (in successive steps of 0.1 second),
        # then kill it.
        # Can't use `process.join(1)` here, billiard appears to be bugged
        # https://github.com/celery/billiard/issues/270
        killed = False
        for _ in range(10):
            time.sleep(0.1)
            if not process.is_alive():
                break
        else:
            killed = True
            os.kill(process.pid, signal.SIGKILL)
        raise CloneTimeout(src, timeout, killed)

    if not errors.empty():
        raise CloneFailure(src, dest, errors.get())
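# A hedged sketch of the `_clone_task` helper referenced above (not shown in the
# snippet): it is assumed to run the clone in the child process and report any
# failure back to the parent through the `errors` queue.
def _clone_task(clone_func, errors):
    try:
        clone_func()
    except Exception as exc:
        # Only a string crosses the process boundary; the parent wraps it
        # in a CloneFailure exception.
        errors.put(str(exc))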
def _reflash(self, path):
    """
    this will execute the upgrade operation in another process
    because the SSH connection may hang indefinitely while reflashing
    and would block the program; setting a timeout to `exec_command`
    doesn't seem to take effect so at least we can stop the process
    using `subprocess.join(timeout=self.UPGRADE_TIMEOUT)`
    """
    def upgrade(conn, path, timeout):
        conn.connect()
        conn.exec_command('sysupgrade -v -c {0}'.format(path), timeout=timeout)
        conn.close()

    subprocess = Process(
        target=upgrade, args=[self, path, self.UPGRADE_TIMEOUT]
    )
    subprocess.start()
    self.log('Upgrade operation in progress...')
    subprocess.join(timeout=self.UPGRADE_TIMEOUT)
    self.log('SSH connection closed, will wait {0} seconds before '
             'attempting to reconnect...'.format(self.SLEEP_TIME))
    sleep(self.SLEEP_TIME)
    # kill the subprocess if it has hung
    if subprocess.is_alive():
        subprocess.terminate()
        subprocess.join()
def _reflash(self, path):
    """
    this will execute the upgrade operation in another process
    because the SSH connection may hang indefinitely while reflashing
    and would block the program; setting a timeout to `exec_command`
    doesn't seem to take effect on some OpenWRT versions, so at least
    we can stop the process using
    `subprocess.join(timeout=self.UPGRADE_TIMEOUT)`
    """
    self.disconnect()
    command = self.get_upgrade_command(path)

    def upgrade(conn, path, timeout):
        conn.connect()
        conn.exec_command(command, timeout=timeout)
        conn.disconnect()

    subprocess = Process(target=upgrade, args=[self, path, self.UPGRADE_TIMEOUT])
    subprocess.start()
    self.log('Upgrade operation in progress...')
    subprocess.join(timeout=self.UPGRADE_TIMEOUT)
    self.log(
        f'SSH connection closed, will wait {self.RECONNECT_DELAY} seconds before '
        'attempting to reconnect...')
    sleep(self.RECONNECT_DELAY)
    # kill the subprocess if it has hung
    if subprocess.is_alive():
        subprocess.terminate()
        subprocess.join()
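# A hedged sketch of the `get_upgrade_command` hook used by the second variant
# of `_reflash`; the earlier variant hard-codes the same sysupgrade invocation,
# so this is presumably what the hook returns by default.
def get_upgrade_command(self, path):
    # -c attempts to preserve changed files in /etc/, -v is verbose
    return 'sysupgrade -v -c {0}'.format(path)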
def test_set_pdeathsig(self):
    return_pid = Value('i')
    p = Process(target=parent_task, args=(return_pid,))
    p.start()
    sleep(3)  # wait for setting pdeathsig
    p.terminate()
    sleep(3)  # wait for process termination
    with pytest.raises(psutil.NoSuchProcess):
        psutil.Process(return_pid.value)
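# A hedged sketch of the `parent_task` target assumed by the Value-based test
# above: it spawns a child that registers PR_SET_PDEATHSIG via prctl, reports
# the child's pid back through the shared Value, then waits to be terminated.
import ctypes
import signal
from multiprocessing import Process
from time import sleep

PR_SET_PDEATHSIG = 1  # constant from <linux/prctl.h>

def _child_with_pdeathsig():
    # Ask the kernel to send SIGKILL to this process when its parent dies.
    libc = ctypes.CDLL('libc.so.6', use_errno=True)
    libc.prctl(PR_SET_PDEATHSIG, signal.SIGKILL)
    sleep(60)

def parent_task(return_pid):
    child = Process(target=_child_with_pdeathsig)
    child.start()
    return_pid.value = child.pid  # let the test locate the child
    child.join()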
def test_set_pdeathsig(self):
    success = "done"
    q = Queue()
    p = Process(target=parent_task, args=(q, success))
    p.start()
    child_proc = psutil.Process(q.get(timeout=3))
    try:
        p.terminate()
        assert q.get(timeout=3) == success
    finally:
        child_proc.terminate()
def crawl_endpoint_to_file(
    self,
    ip_address=None,
    port=None,
    hostname=None,
    use_ssl=False,
    use_sni=False,
    start_urls=[],
    in_separate_process=True,
):
    """
    Start crawling the given endpoint using the given list of URLs and write the
    results to a local file.

    :param ip_address: The IP address to crawl.
    :param port: The port where the application resides.
    :param hostname: The hostname to submit alongside all requests to the remote endpoint.
    :param use_ssl: Whether or not to use SSL to connect to the remote web service.
    :param use_sni: Whether or not to use SNI to connect to the remote web service.
    :param start_urls: A list of URLs to start crawling from.
    :param in_separate_process: Whether or not to spawn off a separate process for the crawl.
        This enables us to call this method multiple times in the same process, as a Twisted
        reactor can only be started and stopped once per process.
    :return: A tuple containing (1) the string containing the local file path where crawling
        results are stored and (2) a ScrapyResultWrapper configured to process the contents
        of the file.
    """
    temp_file_path = FilesystemHelper.get_temporary_file_path()
    local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
    spider_kwargs = {
        "input_ip_address": ip_address,
        "input_start_urls": start_urls,
        "input_file_path": local_file_path,
        "input_hostname": hostname,
        "input_use_ssl": use_ssl,
        "input_use_sni": use_sni,
        "input_port": port,
    }
    pipeline_settings = self.__get_local_storage_item_pipeline()
    requested_hostname = hostname if hostname is not None else ip_address
    settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
    crawling_config = {
        "spider_kwargs": spider_kwargs,
        "settings": settings,
    }
    if in_separate_process:
        process = Process(target=self.__crawl, kwargs=crawling_config)
        process.start()
        process.join()
        process.terminate()
    else:
        self.__crawl(**crawling_config)
    return local_file_path, ScrapyResultWrapper.from_file(local_file_path)
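# A minimal, hedged sketch of what the private `__crawl` target above might do.
# The assumption is that it runs Scrapy's CrawlerProcess inside the child
# process, which is exactly why a separate process is needed: the Twisted
# reactor underneath can only be started and stopped once per process.
from scrapy.crawler import CrawlerProcess

def _crawl(spider_kwargs=None, settings=None, spider_cls=None):
    # `spider_cls` stands in for whatever Spider subclass the bot uses;
    # the real method presumably knows it already.
    crawler = CrawlerProcess(settings)
    crawler.crawl(spider_cls, **(spider_kwargs or {}))
    crawler.start()  # blocks until the crawl (and the reactor) has finished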
class LongCalculation(QProgressDialog):
    """
    Multiprocessing based worker for mesh and eigenvalue calculations.

    This is necessary to make sure the GUI is not blocked while the mesh is
    built, or when eigenvalue calculations are performed. Transformations do
    not need as much time, unless there is one implemented without numpy
    vectorized coordinate calculations.
    """
    res = None

    def __init__(self, fun, args, postprocess, job):
        """
        Build multiprocessing queues and start worker.
        """
        super(LongCalculation, self).__init__(job, "Cancel", 0, 0)
        self.setModal(True)
        self.input = Queue()
        self.output = Queue()
        self.input.put((fun, args, postprocess))
        self.proc = Process(target=worker, args=(self.input, self.output))
        self.proc.start()
        self.timer = QTimer()
        self.timer.timeout.connect(self.update)
        self.timer.start(10)

    def update(self):
        """
        Check if worker is done, and close dialog.
        """
        try:
            out = self.output.get(block=False)
            if isinstance(out, str):
                # A plain string is a progress message, not a result.
                self.setLabelText(out)
                return
            if out is None:
                self.done(0)
                return
            self.res = out
            self.timer.stop()
            self.proc.join()
            del self.proc
            self.done(1)
        except Exception:
            # Nothing on the output queue yet (queue.Empty); try again on the next tick.
            pass

    def cleanUp(self):
        """
        Kill the running processes if cancelled/failed.
        """
        if self.proc:
            while self.proc.is_alive():
                self.proc.terminate()
            del self.proc
        self.timer.stop()
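# A hedged sketch of the `worker` target referenced by LongCalculation: it takes
# one (fun, args, postprocess) job from the input queue, runs it, and puts the
# post-processed result on the output queue. The real worker may additionally
# put progress strings, which update() displays in the dialog label, and None
# on failure, which update() treats as a cancelled/failed job.
def worker(input_queue, output_queue):
    fun, args, postprocess = input_queue.get()
    try:
        result = fun(*args)
        output_queue.put(postprocess(result) if postprocess else result)
    except Exception:
        output_queue.put(None)  # update() interprets None as failure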
def main(param):
    KILLING = False
    KILLING2 = False
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    data = setParam(param=param)

    if __name__ == "__main__":
        printParam(data)

    logger.critical(
        f"######################### NEW JOB: {data.folder} #########################"
    )
    # file_handler_job = logging.FileHandler(os.path.join(data.outputDir, 'log'))
    # file_handler_job.setLevel(logging.INFO)
    # file_handler_job.setFormatter(formatter)
    # logger.addHandler(file_handler_job)

    processes = []
    p = Process(target=processMaster, args=(data,), name="MASTER")
    p.start()
    processes.append(p)
    logger.info('[i] MASTER started')

    for n in range(data.workers):
        p = Process(target=processWorker, args=(data,), name="WORKER")
        p.start()
        processes.append(p)
        logger.info(f'[i] WORKER {n+1}/{data.workers} started')

    p = Process(target=processSink, args=(data,), name="SINK")
    p.start()
    processes.append(p)
    logger.info('[i] Sink started')

    ksObj = KillSwitch(data)

    def killswitch(a, b):
        ksObj()
        ksObj()

    signal.signal(signal.SIGINT, killswitch)

    emails, nmbPgsScraped = Producer(data).workYouBastard()

    logger.info("[i] Finalising, \t")
    for p in processes:
        p.terminate()
    logger.info('[i] Done')
    return emails, nmbPgsScraped
def run(self, jobs):
    '''Start the Scrapy engine, and execute all jobs.  Return consolidated results
    in a single list.

    Params:
      jobs ([Job]) - one or more Job objects to be processed.

    Returns:
      List of objects yielded by the spiders after all jobs have run.
    '''
    if not isinstance(jobs, collections.abc.Iterable):
        jobs = [jobs]
    self.validate(jobs)

    p = Process(target=self._crawl, args=[jobs])
    p.start()
    p.join()
    p.terminate()

    return self.results.get()
class CrawlProcess():
    _model = Task
    __instance = None
    count = 0

    @staticmethod
    def get_instance():
        """ Static access method. """
        print(CrawlProcess.__instance)
        if CrawlProcess.__instance is None:
            CrawlProcess()
        return CrawlProcess.__instance

    def __init__(self):
        if CrawlProcess.__instance is not None:
            raise Exception("This class is a singleton!")
        else:
            CrawlProcess.__instance = self
            #self.process = Process(target=CrawlProcess.crawl)
            self.process = None
            self.crawler_process = None
            self.task = None
            self.q = Queue()
            self.parent_conn, self.child_conn = Pipe()

    #@classmethod
    #def crawl(cls, q):
    #@classmethod
    #def crawl(cls, process, q):
    @classmethod
    def crawl(cls, q, conn):
        print()
        print()
        print('***************************************************************************************')
        print('crawl')

        def close(spider, reason):
            print(f'{multiprocessing.current_process().name}: *!!CLOSE')
            write_in_a_file('CrawlerProcess.signal.close', {'reason': reason}, 'task.txt')
            t = Task.objects.get_latest_crawler_task()
            d = datetime.today()
            t.description = f'spider closed with count: {CrawlProcess.count} at {str(d)}'
            t.result = CrawlProcess.count
            t.save()

        def open(spider):
            print(f'{multiprocessing.current_process().name}: *!!OPEN')
            try:
                name = spider.name
            except:
                name = str(spider)
            write_in_a_file('CrawlerProcess.signal.open', {'spider': name}, 'task.txt')
            CrawlProcess.count = 0
            try:
                t = Task.objects.get_latest_crawler_task()
                t.name = str(process.pid)
                t.save()
            except Exception as e:
                t.name = e
                t.save()
            #q.put_nowait()
            print()

        def scraped(item, response, spider):
            print(f'{multiprocessing.current_process().name}: *!!SCRAPED')
            print()
            CrawlProcess.count = CrawlProcess.count + 1
            n = CrawlProcess.count
            write_in_a_file('CrawlerProcess.signal.scraped_item', {'response': response, 'count': n}, 'task.txt')
            try:
                q.get_nowait()
                q.put_nowait(n)
            except:
                q.put_nowait(n)

        def stopped(*args, **kwargs):
            write_in_a_file('CrawlerProcess.signal.stopped', {'args': args, 'kwargs': kwargs}, 'task.txt')

        def error(*args, **kwargs):
            write_in_a_file('CrawlerProcess.signal.error', {'args': args, 'kwargs': kwargs}, 'task.txt')

        def send_by_pipe(item):
            try:
                conn.send(item)
                #conn.close()
            except Exception as e:
                write_in_a_file('CrawlProcess._crawl: error conn.send', {'conn error': e}, 'debug.txt')

        process = CrawlerProcess(get_project_settings())
        write_in_a_file('CrawlProcess.crawl: first', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')
        send_by_pipe(process)
        write_in_a_file('CrawlProcess.crawl: second', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')
        process.crawl(InfoempleoSpider())
        write_in_a_file('CrawlProcess.crawl: third', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

        crawler = Crawler(InfoempleoSpider())
        crawler.signals.connect(open, signal=signals.spider_opened)
        crawler.signals.connect(scraped, signal=signals.item_scraped)
        crawler.signals.connect(close, signal=signals.spider_closed)
        crawler.signals.connect(stopped, signal=signals.engine_stopped)
        crawler.signals.connect(error, signal=signals.spider_error)

        write_in_a_file('CrawlProcess.crawl: before', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')
        process.crawl(crawler)
        write_in_a_file('CrawlProcess.crawl: after', {'crawler_process': str(process), 'dir process': dir(process)}, 'task.txt')
        process.start()
        write_in_a_file('CrawlProcess._crawl: process started', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')
        print('***************************************************************************************')
        print(f'CrawlerProcess: {process}')
        print(dir(process))
        print('***************************************************************************************')
        print()
        print()
        write_in_a_file('CrawlProcess.crawl', {'CrawlerProcess': str(process), 'dir(CrawlerProcess)': dir(process)}, 'task.txt')
        process.join()
        write_in_a_file('CrawlProcess.crawl: process.join', {}, 'task.txt')
        write_in_a_file('CrawlProcess.crawl: process.join', {}, 'spider.txt')
        print('Crawler Process has Finished!!!!!')

    @classmethod
    def crawl2(cls, q):
        while CrawlProcess.count < 15:
            # print(f'doing something: {CrawlProcess.count}')
            CrawlProcess.count = CrawlProcess.count + 1
            n = CrawlProcess.count
            try:
                q.get_nowait()
            except:
                pass
            q.put(n)
            if CrawlProcess.count % 5 == 0:
                # print(f'qsize: {q.qsize()}')
                time.sleep(5)

    def _clear_queue(self):
        while not self.q.empty():
            self.q.get_nowait()

    def _init_process(self, user):
        print(f'CrawlerProcess.init_process')
        self.q.put_nowait(0)
        self.process = Process(target=CrawlProcess.crawl, args=(self.q, self.child_conn,))
        self.task = Task(user=user, state=Task.STATE_PENDING, type=Task.TYPE_CRAWLER)

    def _start_process(self):
        print(f'CrawlerProcess._start_process')
        self.init_datetime = timezone.now()  # before creating the task
        self.process.start()
        self.task.pid = self.process.pid
        write_in_a_file('CrawlProcess._start_process: process started', {'pid': self.process.pid}, 'debug.txt')
        self.task.state = Task.STATE_RUNNING
        self.task.save()
        self.crawler_process = self.parent_conn.recv()
        write_in_a_file('CrawlProcess._start_process: conn.recv', {'crawler_process': str(self.crawler_process), 'dir crawler_process': dir(self.crawler_process)}, 'debug.txt')
        write_in_a_file('CrawlProcess._start_process', {'CrawlerProcess': str(self.crawler_process), 'dir(CrawlerProcess)': dir(self.crawler_process)}, 'task.txt')

    def _reset_process(self, state=Task.STATE_FINISHED):
        print(f'CrawlerProcess._reset_process({state})')
        try:
            self.process.terminate()
            write_in_a_file('_reset_process terminated (from stop)', {'is_running': self.process.is_alive()}, 'debug.txt')
            self.task.result = CrawlProcess.count
            self.task.state = state
            self.task.save()
            self.process.join()  # ! IMPORTANT after .terminate -> .join
            write_in_a_file('_reset_process joined (from stop)', {'is_running': self.process.is_alive()}, 'debug.txt')
        except:
            pass
        try:
            self.result = self.q.get_nowait()
        except Exception as e:
            pass
        self._clear_queue()

    def _update_process(self):
        print('CrawlerProcess._update_process')
        print(f'process is alive: {self.process and self.process.is_alive()}')
        if self.process and not self.process.is_alive():
            self._reset_process()

    def start(self, user=None, **kwargs):
        """
        If the process is not alive, it either has not started yet or has already
        finished, so the stored data is saved and the process is started.
        If the process is alive, nothing is done.
        :param user: The user that makes the request
        :param kwargs:
        :return:
        """
        print(f'self.q.empty(): {self.q.empty()}')
        print(f'self.q.qsize(): {self.q.qsize()}')
        if not self.is_scrapping():
            if self.task and (self.task.state == Task.STATE_RUNNING):
                self._reset_process()
            self._init_process(user)
            self._start_process()

    def stop(self):
        print(f'CrawlProcess.stop')
        self._reset_process(Task.STATE_INCOMPLETE)
        # self.crawler_process.stop()
        #self.crawler_process.join()

    def join(self):
        self.process.join()

    def get_actual_task(self):
        self._update_process()
        return self.task

    def get_latest_task(self):
        last_task = Task.objects.get_latest_crawler_task()
        # If the latest task from the db has state STATE_RUNNING and is not the actual task,
        # it is an incomplete task and its state has to be updated.
        is_an_incomplete_task = (
            last_task and
            last_task.state == Task.STATE_RUNNING and
            (not self.task or self.task.pk != last_task.pk)
        )
        if is_an_incomplete_task:
            last_task.state = Task.STATE_INCOMPLETE
            last_task.save()
        return last_task

    def is_scrapping(self):
        print(CrawlProcess.is_scrapping)
        if self.process:
            return self.process.is_alive()
        else:
            return False

    def _get_scraped_jobs(self):
        latest_task = Task.objects.get_latest_crawler_task()
        return Job.objects.filter(Q(created_at__gte=latest_task.created_at) | Q(updated_at__gte=latest_task.created_at))

    def get_scraped_items_number(self):
        print()
        print('!!!! CrawlProcess.get_scraped_items_number')
        print()
        count = CrawlProcess.count
        try:
            print(self.q)
            #print(f'CrawlProcess.count: {CrawlProcess.count}')
            #print(f'qsize: {self.q.qsize()}')
            count = self.q.get(block=True, timeout=5)
            CrawlProcess.count = count
            print(f'q.count: {count}')
        except Exception as e:
            print(f'get_scraped_items_number')
            # save_error(e, {'count': count})
        return count

    def get_scraped_items_percentage(self):
        # Compute the total from the items scraped by the previous task
        count = self.get_scraped_items_number()
        task = Task.objects.get_latest_finished_crawler_task()
        if task:
            old_result = task.result or 20000
        else:
            old_result = 20000
        if count < old_result:
            total = old_result
        else:
            total = count
        db_count = self._get_scraped_jobs().count()
        try:
            percentage = round(db_count / total, 2)
        except:
            percentage = 0
        if percentage >= 0.95 and self.is_scrapping():
            percentage = 0.95
        return percentage
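# A hedged usage sketch of the singleton above, e.g. from a Django view
# (the class already relies on the Django ORM and timezone utilities):
# start a crawl for the requesting user and poll its progress.
def crawl_status_example(request):
    cp = CrawlProcess.get_instance()
    if not cp.is_scrapping():
        cp.start(user=request.user)
    return {
        'running': cp.is_scrapping(),
        'items': cp.get_scraped_items_number(),
        'progress': cp.get_scraped_items_percentage(),
    }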