Example #1
import sys
from profiling.sampling import SamplingProfiler

def test_sample_1_depth():
    # walk up to the bottom-most (root) frame of the stack.
    frame = sys._getframe()
    while frame.f_back is not None:
        frame = frame.f_back
    assert frame.f_back is None
    # sampling the root frame records a call stack of depth 1.
    profiler = SamplingProfiler()
    profiler.sample(frame)
Example #2
import sys
from profiling.sampling import SamplingProfiler

def _test_sampling_profiler(sampler):
    profiler = SamplingProfiler(top_frame=sys._getframe(), sampler=sampler)
    with profiler:
        spin_100ms()
        spin_500ms()
    stat1 = find_stats(profiler.stats, 'spin_100ms')
    stat2 = find_stats(profiler.stats, 'spin_500ms')
    ratio = stat1.deep_hits / stat2.deep_hits
    # 1:5 expected, but tolerate (0.8~1.2):5
    assert 0.8 <= ratio * 5 <= 1.2
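
The test depends on helpers not shown here. A minimal sketch of what they might look like (spin_100ms and spin_500ms as busy-wait loops; find_stats is assumed to look up a function's stats by name in profiler.stats):

import time

def spin_100ms():
    # busy-wait for ~100 ms so samples accumulate in this frame.
    end = time.perf_counter() + 0.1
    while time.perf_counter() < end:
        pass

def spin_500ms():
    # busy-wait for ~500 ms; should collect ~5x the hits of spin_100ms.
    end = time.perf_counter() + 0.5
    while time.perf_counter() < end:
        pass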
Example #3
import functools
import imghdr
import json
import logging
import multiprocessing
import os
import re
import sys
import traceback
from contextlib import nullcontext
from pathlib import Path

import click
import portalocker
from tqdm import tqdm

# project-local helpers (DatabaseMutex, FileMutex, DummyMutex, Watchdog,
# SharedMemoryWorkSet, the file writers, qt_app, find_data_path, chunks,
# elapsed_timer, Spinner, Artifact, global_stop_watch) are assumed to be
# imported from the surrounding package; they are not part of this excerpt.


class Processor:
    def __init__(self, options, needs_qt=False):
        self._overwrite = options.get("overwrite", False)
        self._processes = options.get("processes", 1)
        self._timeout = options.get("alive", 600)
        self._name = options.get("name", "")
        self._verbose = False

        self._lock_strategy = options.get("lock_strategy", "DB")
        self._lock_level = options.get("lock_level", "PAGE")
        self._lock_timeout = options.get("lock_timeout", 60)
        self._max_lock_age = options.get("max_lock_age")
        self._lock_chunk_size = 25
        self._mutex = None

        if self._lock_strategy == "DB":
            self._lock_database = options.get("lock_database")
        elif self._lock_strategy in ("FILE", "NONE"):
            pass
        else:
            raise ValueError(self._lock_strategy)

        if needs_qt:
            self._qt_app = qt_app()
            if self._processes > 1:
                logging.warning(
                    "this batch does not support multiple processes.")
                self._processes = 1  # cannot safely fork here.
        else:
            self._qt_app = None

        if options.get("profile"):
            from profiling.sampling import SamplingProfiler
            self._profiler = SamplingProfiler()
            self._overwrite = True  # profile implies overwrite
        else:
            self._profiler = None

        self._print_paths = False
        self._plain = options.get("plain")
        if self._plain:
            self._print_paths = True

        self._debug_write = options.get("debug_write", False)
        self._track_changes = options.get("track_changes", "")

    @staticmethod
    def options(f):
        options = [
            click.option('--processes',
                         type=int,
                         default=1,
                         help="Number of parallel processes to employ."),
            click.option(
                '--alive',
                type=int,
                default=600,
                help="Seconds to wait after inactive process is killed."),
            click.option(
                '--name',
                type=str,
                default="",
                help="Only process paths that conform to the given pattern."),
            click.option('--lock-strategy',
                         type=click.Choice(['FILE', 'DB', 'NONE'],
                                           case_sensitive=False),
                         default="DB",
                         help="How to implement locking for concurrency."),
            click.option('--lock-level',
                         type=click.Choice(['PAGE', 'TASK'],
                                           case_sensitive=False),
                         default="PAGE",
                         help="Lock granularity."),
            click.option(
                '--lock-database',
                type=click.Path(),
                required=False,
                help="Mutex database path used for concurrent processing"),
            click.option(
                '--lock-timeout',
                type=int,
                default=60,
                required=False,
                help="Seconds to wait when acquiring a lock. NFS volumes "
                     "might need higher values."),
            click.option(
                '--max-lock-age',
                type=int,
                default=600,
                required=False,
                help="Maximum age of a lock in seconds until it is "
                     "considered invalid."),
            click.option(
                '--overwrite',
                is_flag=True,
                default=False,
                help="Recompute and overwrite existing result files."),
            click.option('--profile',
                         is_flag=True,
                         default=False,
                         help="Enable profiling and show results."),
            click.option(
                '--plain',
                is_flag=True,
                default=False,
                help="Print plain output that is friendly to piping."),
            click.option('--debug-write',
                         is_flag=True,
                         default=False,
                         help="Debug which files are written."),
            click.option(
                '--track-changes',
                type=str,
                default="",
                help="Recompute files and track changes with given tag.")
        ]
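        # apply each option decorator to f in turn; the result is
        # options[-1](...(options[0](f))).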
        return functools.reduce(lambda x, opt: opt(x), options, f)

    @property
    def processor_name(self):
        return self.__class__.__name__

    def is_image(self, path):
        # imghdr might seem like the right tool for this, but it fails
        # to detect some valid images, so we mostly go by extension.
        # see https://stackoverflow.com/questions/36870661/
        # imghdr-python-cant-detec-type-of-some-images-image-extension

        if path.suffix.lower() in (".jpg", ".png", ".tif", ".tiff"):
            return True

        return imghdr.what(path) is not None

    def should_process(self, page_path):
        return True

    def prepare_process(self, page_path):
        artifacts = self.artifacts()

        if self._track_changes:
            file_writer = TrackChangeWriter(self._track_changes)
        else:
            file_writer = AtomicFileWriter(overwrite=self._overwrite)
            if self._debug_write:
                file_writer = DebuggingFileWriter(file_writer)

        kwargs = dict()
        for arg, spec in artifacts:
            f = spec.instantiate(page_path=page_path,
                                 processor=self,
                                 file_writer=file_writer)

            f.fix_inconsistent()

            if not f.is_ready():
                if self._verbose:
                    print("skipping %s: missing %s" % (page_path, f.missing))
                return False

            kwargs[arg] = f

        return kwargs

    def _trigger_process1(self, p, kwargs, locked):
        work = locked

        if not locked:
            logging.warning(f"failed to obtain lock for {p}. ignoring.")

        try:
            if work:
                # a concurrent worker might already have done this.
                for f in kwargs.values():
                    if not f.is_ready():
                        work = False
                        break

            if work:
                with elapsed_timer() as elapsed:
                    data_path = find_data_path(p)
                    data_path.mkdir(exist_ok=True)

                    runtime_info = self.process(p, **kwargs)

                if runtime_info is None:
                    runtime_info = dict()
                runtime_info["status"] = "COMPLETED"
                runtime_info["elapsed"] = round(elapsed(), 2)

                self._update_runtime_info(p,
                                          {self.processor_name: runtime_info})

        except KeyboardInterrupt:
            logging.exception("Interrupted at %s." % p)
            raise

        except Exception:
            logging.exception("Failed to process %s." % p)
            runtime_info = dict(status="FAILED",
                                traceback=traceback.format_exc())
            self._update_runtime_info(p, {self.processor_name: runtime_info})

        finally:
            # free memory allocated in cached io.Reader
            # attributes. this can get substantial for
            # long runs.
            kwargs.clear()

    def _trigger_process(self, chunk):
        if self._lock_level == "PAGE":
            lock_actor_name = "page"
        elif self._lock_level == "TASK":
            lock_actor_name = self.processor_name
        else:
            raise ValueError(self._lock_level)

        with self._mutex.lock(lock_actor_name,
                              [str(p) for _, p, _ in chunk]) as locked:

            for i, p, kwargs in chunk:
                global_work_set.add(i)
                try:
                    self._trigger_process1(p, kwargs, locked)
                finally:
                    global_work_set.remove(i)
                yield i, p

    def _trigger_process_async(self, chunk):
        results = []
        for i, p in self._trigger_process(chunk):
            results.append((i, p))
            global_stop_watch.reset()
        return results

    def _process_queue(self, queued):
        global global_work_set
        global_work_set = SharedMemoryWorkSet(lambda i: queued[i][1],
                                              max(1, self._processes))

        with self._profiler or nullcontext():
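            # profile the entire queue run when enabled; nullcontext()
            # keeps this with-statement uniform when profiling is off.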
            chunked_queue_gen = chunks(queued, self._lock_chunk_size)

            def iprogress(i):
                nd = len(str(len(queued)))
                return f"[{str(i + 1).rjust(nd)} / {len(queued)}]"

            if self._processes > 1:
                with multiprocessing.Pool(self._processes,
                                          maxtasksperchild=4) as pool:
                    watchdog = Watchdog(pool=pool,
                                        stop_watch=global_stop_watch,
                                        work_set=global_work_set,
                                        timeout=self._timeout)
                    watchdog.start()

                    with tqdm(total=len(queued),
                              disable=self._print_paths) as progress:
                        for chunk in pool.imap_unordered(
                                self._trigger_process_async,
                                chunked_queue_gen):

                            if self._print_paths:
                                for i, p in chunk:
                                    print(f"{iprogress(i)} {p}", flush=True)
                            else:
                                progress.update(len(chunk))

                            global_stop_watch.reset()

                if watchdog.is_cancelled():
                    watchdog.kill()
                    sys.exit(1)
                else:
                    watchdog.set_is_done()
            else:
                with tqdm(total=len(queued),
                          disable=self._print_paths) as progress:
                    for chunk in chunked_queue_gen:
                        for i, p in self._trigger_process(chunk):
                            if self._print_paths:
                                print(f"{iprogress(i)} {p}", flush=True)
                            else:
                                progress.update(1)

    def _build_queue(self, path):
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError("%s does not exist." % path)

        queued = []
        counts = dict(images=0)

        def add_path(p):
            if not p.exists():
                print("skipping %s: path does not exist." % p)
                return

            if self._name and not re.search(self._name, str(p)):
                return

            if not self.is_image(p):
                if self._verbose:
                    print("skipping %s: not an image." % p)
                return

            counts['images'] += 1

            if not self.should_process(p):
                if self._verbose:
                    print("skipping %s: should_process is False" % p)
                return

            kwargs = self.prepare_process(p)
            if kwargs is not False:
                queued.append((len(queued), p, kwargs))

        if not path.is_dir():
            if path.suffix == ".txt":
                with open(path, "r") as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            add_path(Path(line))
            else:
                raise FileNotFoundError(
                    "%s is not a valid path or text file of paths." % path)
        else:
            print(f"scanning {path}... ", flush=True, end="")

            with Spinner(disable=self._plain):
                for folder, dirs, filenames in os.walk(path):
                    folder = Path(folder)
                    if folder.name.endswith(".out"):
                        # do not descend into .out folders.
                        dirs.clear()
                        continue

                    dirs.sort()

                    for filename in sorted(filenames):
                        add_path(folder / filename)

            print("done.", flush=True)
            print(
                f"{counts['images']} documents found, {len(queued)} ready to process."
            )

        return queued

    def traverse(self, path: Path):
        print(f"running {self.processor_name}.", flush=True)

        queued = self._build_queue(path)

        if self._lock_strategy == "DB":
            if self._lock_database:
                db_path = Path(self._lock_database)
            elif Path(path).is_dir():
                db_path = Path(path) / "origami.lock.db"
            else:
                db_path = Path(path).parent / "origami.lock.db"

            self._mutex = DatabaseMutex(db_path, timeout=self._lock_timeout)

            self._mutex.clear_locks(self._max_lock_age)

        elif self._lock_strategy == "FILE":
            self._mutex = FileMutex()

        elif self._lock_strategy == "NONE":
            self._mutex = DummyMutex()

        else:
            raise ValueError(self._lock_strategy)

        try:
            self._process_queue(queued)
        finally:
            self._mutex = None

        if self._profiler:
            self._profiler.run_viewer()

    def process(self, p: Path, **kwargs):
        # overridden by subclasses; kwargs receive the instantiated
        # artifacts prepared in prepare_process.
        pass

    def lock_or_open(self, path, mode):
        if self._lock_strategy == "FILE":
            return portalocker.Lock(path,
                                    mode,
                                    flags=portalocker.LOCK_EX,
                                    timeout=1,
                                    fail_when_locked=True)
        else:
            return open(path, mode)

    def _update_json(self, page_path, artifact, updates):
        try:
            data_path = find_data_path(page_path)
            json_path = data_path / artifact.filename()

            new_json_path = json_path.parent / (json_path.stem + ".updated" +
                                                json_path.suffix)
            if new_json_path.exists():
                os.remove(new_json_path)

            if json_path.exists():
                with open(json_path, "r") as f:
                    file_data = f.read()
                    data = json.loads(file_data)
            else:
                data = dict()

            for k, v in updates.items():
                if v is None:
                    # None means "delete this key"; tolerate missing keys.
                    data.pop(k, None)
                else:
                    data[k] = v

            with open(new_json_path, "w") as f:
                json.dump(data, f)

            if json_path.exists():
                os.remove(json_path)
            os.rename(new_json_path, json_path)

        except Exception:
            logging.error(traceback.format_exc())

    def _update_runtime_info(self, page_path, updates):
        self._update_json(page_path, Artifact.RUNTIME, updates)
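
How a concrete subclass might be wired in (a sketch: MyProcessor, its artifacts, and the cli command are hypothetical; Processor.options, traverse, and process come from the class above):

import click
from pathlib import Path

class MyProcessor(Processor):
    def artifacts(self):
        # hypothetical: declare (name, spec) pairs; prepare_process
        # instantiates them and passes them to process as kwargs.
        return []

    def process(self, p, **kwargs):
        # hypothetical work; the return value is recorded as runtime info.
        return dict(note="processed %s" % p.name)

@click.command()
@click.argument("data_path", type=click.Path(exists=True))
@MyProcessor.options
def cli(data_path, **kwargs):
    MyProcessor(kwargs).traverse(Path(data_path))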
Example #4
import pytest
from profiling.sampling import SamplingProfiler

def test_not_sampler():
    with pytest.raises(TypeError):
        SamplingProfiler(sampler=123)
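
A matching positive case might look like this (a sketch: relies only on the default construction already shown in Example #1):

def test_default_sampler():
    # default construction should pick a working sampler and not raise.
    SamplingProfiler()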