class Restarter(FileSystemEventHandler):
    """Watch a set of paths and raise a flag when a ``.py`` file changes.

    Poll the flag via :meth:`restart_required`; reading it resets it, so
    each change burst is reported once.
    """

    __slots__ = ("_observer", "_changed")

    # Dependency-injection token for the watched paths (project convention).
    PATHS = Token("RESTARTER_PATHS")

    def __init__(self, paths: PATHS = None):
        """Start a polling observer over *paths* (iterable of path-likes).

        If *paths* is empty or None, no watching is started at all.
        """
        self._changed = False
        self._observer = PollingObserver()
        if paths:
            for p in paths:  # type: ignore
                print("watching for changes %r" % p)
                self._observer.schedule(self, str(p), recursive=True)
            self._observer.start()

    def restart_required(self) -> bool:
        """Return True if a relevant change occurred since the last call.

        Reading the flag clears it (one-shot semantics).
        """
        changed = self._changed
        self._changed = False
        return changed

    def on_any_event(self, event: FileSystemEvent):
        # Only non-directory events on .py files count as a change;
        # once set, the flag stays set until restart_required() reads it.
        self._changed = self._changed or (
            not event.is_directory and event.src_path.endswith(".py")
        )

    def stop(self):
        """Stop watching and shut down the observer thread.

        BUG FIX: the original only called unschedule_all(), leaving the
        polling observer thread running forever after stop().
        """
        self._observer.unschedule_all()
        self._observer.stop()
        # join() raises if the thread was never started (empty paths),
        # so only join a live observer.
        if self._observer.is_alive():
            self._observer.join()
class AutoOcrScheduler(object):
    """Schedule OCR tasks for files appearing in an input directory.

    Optionally watches ``input_dir`` with a watchdog observer and/or walks
    the files already present, submitting an ``OcrTask`` to a small thread
    pool for each matching file.  Output locations are derived from the
    configured output mode; per-folder ``ocr.config`` files override the
    global one in ``config_dir``.
    """

    # Output layout modes: flatten everything into one folder, or mirror
    # the input directory tree under the output directory.
    SINGLE_FOLDER = 'single_folder'
    MIRROR_TREE = 'mirror_tree'
    OUTPUT_MODES = [SINGLE_FOLDER, MIRROR_TREE]

    def __init__(
        self,
        config_dir,
        input_dir,
        output_dir,
        output_mode,
        success_action=OcrTask.ON_SUCCESS_DO_NOTHING,
        archive_dir=None,
        notify_url='',
        process_existing_files=False,
        run_scheduler=True,
        polling_observer=False,
    ):
        """Validate configuration and start watching/processing.

        :param config_dir: directory holding the fallback ``ocr.config``
        :param input_dir: directory watched for incoming files
        :param output_dir: directory OCR results are written to
        :param output_mode: one of :data:`OUTPUT_MODES` (case-insensitive)
        :param success_action: what to do with the input on success; must be
            in ``OcrTask.SUCCESS_ACTIONS``
        :param archive_dir: destination tree for archived inputs; required
            when ``success_action`` is ``ON_SUCCESS_ARCHIVE``
        :param notify_url: optional URL notified on task completion
        :param process_existing_files: also queue files already present
        :param run_scheduler: start the filesystem watchdog
        :param polling_observer: use a polling observer (for filesystems
            without native change notification, e.g. network mounts)
        :raises AutoOcrSchedulerError: on any inconsistent configuration
        """
        self.logger = logger.getChild('scheduler')

        self.config_dir = local.path(config_dir)
        self.input_dir = local.path(input_dir)
        self.output_dir = local.path(output_dir)
        if self.input_dir == self.output_dir:
            raise AutoOcrSchedulerError('Invalid configuration. Input and output directories must not be the same to avoid recursive OCR invocation!')

        self.output_mode = output_mode.lower()
        if self.output_mode not in AutoOcrScheduler.OUTPUT_MODES:
            raise AutoOcrSchedulerError('Invalid output mode: {}. Must be one of: {}'.format(self.output_mode, ', '.join(AutoOcrScheduler.OUTPUT_MODES)))

        self.success_action = success_action.lower()
        if self.success_action not in OcrTask.SUCCESS_ACTIONS:
            raise AutoOcrSchedulerError('Invalid success action: {}. Must be one of {}'.format(self.success_action, ', '.join(OcrTask.SUCCESS_ACTIONS)))

        self.archive_dir = local.path(archive_dir) if archive_dir else None
        if self.success_action == OcrTask.ON_SUCCESS_ARCHIVE and not self.archive_dir:
            raise AutoOcrSchedulerError('Archive directory required for success action {}'.format(self.success_action))

        self.notify_url = notify_url

        # input path -> OcrTask for every in-flight task; output paths are
        # tracked separately so single-folder collision checks also see
        # outputs that don't exist on disk yet.
        self.current_tasks = {}
        self.walk_existing_task = None
        self.current_outputs = set()

        # Create a Threadpool to run OCR tasks on
        self.threadpool = ThreadPoolExecutor(max_workers=3)

        # Wire up an AutoOcrWatchdogHandler
        watchdog_handler = AutoOcrWatchdogHandler(self.on_file_touched, self.on_file_deleted)

        # Schedule watchdog to observe the input directory
        if run_scheduler:
            self.observer = PollingObserver() if polling_observer else Observer()
            self.observer.schedule(watchdog_handler, self.input_dir, recursive=True)
            self.observer.start()
            self.logger.warning('Watching %s', self.input_dir)
        else:
            self.observer = None
            self.logger.warning('Not watching %s', self.input_dir)

        # Process existing files in input directory, if requested
        if process_existing_files:
            self.walk_existing_task = self.threadpool.submit(self.walk_existing_files)

    def shutdown(self):
        """Stop watching, cancel outstanding tasks, and drain the pool."""
        # Shut down the feed of incoming watchdog events
        if self.observer:
            self.logger.debug('Shutting down filesystem watchdog...')
            self.observer.unschedule_all()
            self.observer.stop()

        # Cancel all outstanding cancelable tasks
        if self.walk_existing_task:
            self.logger.debug('Canceling walk existing files task...')
            self.walk_existing_task.cancel()
        self.logger.debug('Canceling all %d in-flight tasks...', len(self.current_tasks))
        # Snapshot first: task.cancel() triggers on_task_done, which
        # mutates current_tasks while we iterate.
        tasks = list(self.current_tasks.values())
        for task in tasks:
            task.cancel()

        # Wait for the threadpool to clean up
        if self.threadpool:
            self.logger.debug('Shutting down threadpool...')
            self.threadpool.shutdown()
            self.threadpool = None

        # Wait for the watchdog to clean up
        if self.observer:
            self.logger.debug('Cleaning up filesystem watchdog...')
            self.observer.join()
            self.observer = None

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.shutdown()
        return False

    def _map_output_path(self, input_path):
        """Derive the output path for *input_path*.

        MIRROR_TREE mirrors the input-relative path under ``output_dir``;
        SINGLE_FOLDER flattens into ``output_dir``, disambiguating name
        collisions with a ``.YYYYMMDD.N`` suffix before the extension.
        """
        if self.output_mode == AutoOcrScheduler.MIRROR_TREE:
            # plumbum path subtraction yields the input-relative part.
            return self.output_dir / (input_path - self.input_dir)

        assert self.output_mode == AutoOcrScheduler.SINGLE_FOLDER
        output_path = self.output_dir / (input_path.name)
        unique = 1
        if output_path.exists() or output_path in self.current_outputs:
            suffix = '.{}.{}{}'.format(datetime.now().strftime('%Y%m%d'), unique, output_path.suffix)
            output_path = output_path.with_suffix(suffix)
            while output_path.exists() or output_path in self.current_outputs:
                unique = unique + 1
                # depth=2 replaces both the counter and the extension.
                output_path = output_path.with_suffix('.{}{}'.format(unique, output_path.suffix), depth=2)
        return output_path

    def _map_archive_path(self, input_path):
        """Mirror *input_path*'s input-relative location under archive_dir."""
        return self.archive_dir / (input_path - self.input_dir)

    def _get_config_path(self, input_path):
        """Find the nearest ``ocr.config`` for *input_path*.

        Searches from the file's folder up to ``input_dir``, then falls back
        to the global config in ``config_dir``; returns None if none exists.
        """
        assert (input_path - self.input_dir)[0] != '..'
        config_path = input_path.parent / 'ocr.config'
        while True:
            if config_path.exists():
                return config_path
            if config_path.parent == self.input_dir:
                break
            config_path = config_path.parent.parent / 'ocr.config'
        config_path = self.config_dir / 'ocr.config'
        if config_path.exists():
            return config_path
        return None

    def queue_path(self, path):
        """Create an OcrTask for *path* and register its output."""
        output_path = self._map_output_path(path)
        config_file = self._get_config_path(path)
        # BUG FIX: only compute an archive destination when an archive dir
        # is configured -- _map_archive_path would fail on archive_dir=None
        # (the default for non-archive success actions).
        archive_file = self._map_archive_path(path) if self.archive_dir else None
        task = OcrTask(path, output_path, self.threadpool.submit, self.on_task_done, config_file=config_file, success_action=self.success_action, archive_path=archive_file, notify_url=self.notify_url)
        self.current_tasks[path] = task
        self.current_outputs.add(output_path)

    def walk_existing_files(self):
        """Queue every existing input file matching the watchdog patterns."""
        self.logger.debug('Enumerating existing input files...')

        def keep_file(file):
            return any(fnmatch.fnmatch(file, pattern) for pattern in AutoOcrWatchdogHandler.MATCH_PATTERNS)

        for file in self.input_dir.walk(filter=keep_file):
            self.on_file_touched(file)
        self.walk_existing_task = None

    def on_file_touched(self, path):
        """Watchdog callback: refresh an in-flight task or queue a new one."""
        if path in self.current_tasks:
            self.current_tasks[path].touch()
        else:
            self.queue_path(path)

    def on_file_deleted(self, path):
        """Watchdog callback: cancel the task for a removed input file."""
        if path in self.current_tasks:
            self.current_tasks[path].cancel()

    def on_task_done(self, task):
        """Task completion callback: release the task's bookkeeping."""
        self.current_outputs.remove(task.output_path)
        del self.current_tasks[task.input_path]

    def wait_for_idle(self):
        """Block until the existing-files walk and all tasks complete."""
        if self.walk_existing_task:
            self.logger.debug('Waiting for walk existing files to complete...')
            concurrent.futures.wait([self.walk_existing_task])
        while self.current_tasks:
            self.logger.debug('Waiting for %d tasks to complete...', len(self.current_tasks))
            concurrent.futures.wait([task.future for task in self.current_tasks.values()])
def start_minknow_and_basecalled_monitoring(
    sequencing_statistics, args, header, minotour_api, stdscr, screen, log_win
):
    """
    Start the minKnow monitoring and basecalled data monitoring in accordance
    with arguments passed by user.

    Parameters
    ----------
    sequencing_statistics: minFQ.utils.SequencingStatistics
        Tracker class for files being monitored, and the metrics about upload
    args: argparse.Namespace
        The command line arguments that were passed to the script
    header: dict
        The dictionary with headers for the requests, including authentication
    minotour_api: minFQ.minotourapi.MinotourAPI
        The minotourAPI class
    stdscr: _curses.window
        Curses window for printing out
    screen: _curses.window
        The main curses screen
    log_win: _curses.window
        The logging window we write to

    Returns
    -------
    None
        Exits the process via sys.exit(0) on error or interrupt.
    """
    sequencing_statistics.read_count = 0
    runs_being_monitored_dict = {}
    if not args.no_fastq and args.watch_dir is not None:
        # This block handles the fastq: add our watchdir to our WATCHLIST
        sequencing_statistics.to_watch_directory_list.append(args.watch_dir)
    # if we are connecting to minKNOW
    if not args.no_minknow:
        # this block is going to handle the running of minknow monitoring by the client.
        stdscr.addstr("Connecting to minknow instance at {}".format(args.ip))
        refresh_pad(screen, stdscr)
        minknow_connection = MinionManager(
            args=args, header=header, sequencing_statistics=sequencing_statistics
        )
        curses.napms(2000)
        stdscr.clear()
    event_handler = FastqHandler(
        args, header, runs_being_monitored_dict, sequencing_statistics, minotour_api
    )
    observer = Observer()
    observer.start()
    stats = True
    try:
        while True:
            # Toggle between the stats view ('s') and the log view ('l').
            try:
                c = stdscr.getch()
                if c == ord("l"):
                    stats = False
                elif c == ord("s"):
                    stats = True
            except curses.error:
                # BUG FIX: was `except curses.ERR` -- curses.ERR is the int
                # constant -1, not an exception class; curses raises
                # curses.error, so the original except clause never matched.
                stats = True
            # todo these should be abstracted into one function as they are verrrry similar
            if stats:
                stdscr.addstr(0, 0, "To stop minFQ use CTRL-C. To see the logs, Press l. To return to stats, Press s.", curses.color_pair(4))
                stdscr.addstr(1, 0, "Fetching Data...")
                write_out_minfq_info(stdscr, sequencing_statistics)
                ascii_minotour(stdscr)
                write_out_minknow_info(stdscr, sequencing_statistics)
                write_out_fastq_info(stdscr, sequencing_statistics)
                refresh_pad(screen, stdscr)
                stdscr.overwrite(screen)
            else:
                log_win.overwrite(screen)
                refresh_pad(screen, log_win)
            screen.refresh()
            if not args.no_fastq and sequencing_statistics.to_watch_directory_list:
                # BUG FIX: iterate a snapshot -- the body removes entries from
                # to_watch_directory_list, and removing from a list while
                # iterating it skips the following element.
                for folder in list(sequencing_statistics.to_watch_directory_list):
                    # directory watchlist has the new run dir
                    log.warning(
                        "Checking folder {} that is in our to watch directory".format(folder)
                    )
                    if folder and folder not in sequencing_statistics.watched_directory_set:
                        # check that the folder exists, before adding it to be scheduled
                        if os.path.exists(folder):
                            # We have a new folder that hasn't been added.
                            # We need to add this to our list to schedule and catalogue the files.
                            # TODO bug here where we never remove directories from the already watching set - eventually this would lead to a massive set of strings in this set if minFQ is never quit
                            sequencing_statistics.watched_directory_set.add(folder)
                            event_handler.addfolder(folder)
                            log.info("FastQ Monitoring added for {}".format(folder))
                            sequencing_statistics.to_watch_directory_list.remove(folder)
                            sequencing_statistics.update = True
                        else:
                            log.warning(
                                "Waiting for minKNOW to create folder {} before updating watchdog.".format(folder)
                            )
            log.warning("Updating observers is {}".format(sequencing_statistics.update))
            if sequencing_statistics.update:
                # Re-schedule the observer over every watched folder that exists.
                observer.unschedule_all()
                for folder in sequencing_statistics.watched_directory_set:
                    if folder and os.path.exists(folder):
                        log.info("Scheduling observer for {}, which does exist".format(folder))
                        observer.schedule(event_handler, path=folder, recursive=True)
                    else:
                        log.warning("Tried to add {}, but folder does not exist".format(folder))
                sequencing_statistics.update = False
            # check if we need to remove any fastq info
            sequencing_statistics.check_fastq_info()
            time.sleep(1)
            if sequencing_statistics.errored:
                log.error("Errored - Will take a few seconds to clean clear_lines!")
                log.error(sequencing_statistics.error_message)
                if not args.no_minknow:
                    minknow_connection.stop_monitoring()
                # BUG FIX: the original stopped/joined the observer twice
                # (once under `if not args.no_fastq`, then unconditionally);
                # one unconditional stop/join is sufficient.
                observer.stop()
                observer.join()
                event_handler.stopt()
                curses.nocbreak()
                stdscr.keypad(False)
                curses.echo()
                curses.endwin()
                print(repr(sequencing_statistics.error_message))
                print(
                    "Exiting after error - Will take a few seconds to close threads."
                )
                sys.exit(0)
    except (KeyboardInterrupt, Exception) as e:
        stdscr.addstr("Exiting - Will take a few seconds to close threads.")
        if not args.no_minknow:
            minknow_connection.stop_monitoring()
        observer.stop()
        observer.join()
        # NOTE(review): 'stopt' looks like a typo for 'stop' -- confirm
        # against FastqHandler before renaming; it is used consistently here.
        event_handler.stopt()
        curses.nocbreak()
        stdscr.keypad(False)
        curses.echo()
        curses.endwin()
        print(repr(e))
        print("Exiting - Will take a few seconds to close threads.")
        sys.exit(0)