def run(self):
        """Calculate signatures for every image content that has none yet.

        Collects the ids of all contents flagged ``isimage=1`` that have no
        row in ``imagehash``, then processes them in chunks of
        ``CHUNK_SIZE * self.parallel_threads`` ids via ``create_jobs``,
        ``execute_jobs`` and ``save_results``. The database is committed
        whenever more than ``self.commit_every`` seconds have passed since
        the last commit deadline was set, and once more at the end.
        """
        show_progress = self.verbose_progress > 0
        prog = Progress(1, "Determine images without hashes",
                        do_output=show_progress)

        # Seed the "seen" set with the already indexed ids so that a single
        # membership test both skips indexed contents and drops duplicates.
        seen_ids = {x.contentid for x in self.db.imagehash.find()}
        # Filter in place instead of building a set difference: a set would
        # throw away the "first1ksha1 ASC" order requested from the database.
        ids_to_index = []
        for content_id in self.db.content.find_ids(isimage=1,
                                                   sort="first1ksha1 ASC"):
            if content_id not in seen_ids:
                seen_ids.add(content_id)
                ids_to_index.append(content_id)
        prog.work()
        prog.finish()

        if not ids_to_index:
            self.progress("INFO: Have calculated all image signatures.\n")
            return

        prog = Progress(len(ids_to_index), "Calculate image signatures",
                        do_output=show_progress)
        self.next_commit = time.time() + self.commit_every
        # NOTE(review): keys 1..5 presumably identify the signature types
        # timed by the job helpers -- confirm against report_time_used_per_type.
        self.time_per_type = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        for chunk in chunker(ids_to_index, CHUNK_SIZE * self.parallel_threads):
            # Commit periodically so a long run does not keep all results
            # in one open transaction.
            current_time = time.time()
            if current_time > self.next_commit:
                self.next_commit = current_time + self.commit_every
                self.db.commit()

            jobs = self.create_jobs(chunk)
            result = self.execute_jobs(jobs)
            self.save_results(result)

            prog.work(len(chunk))
        prog.finish()
        self.db.commit()
        self.report_time_used_per_type()
 def test_format(self):
     """Drive Progress with a very large total reported in one step.

     The whole amount (10**12) is reported in a single ``work`` call one
     second after the Progress object was created.
     """
     huge_total = 1000000000000
     with NoStderr():
         bar = Progress(huge_total, "calculating")
         # Let a second pass between creating the bar and reporting work.
         time.sleep(1)
         bar.work(huge_total)
         bar.finish()
     self.assert_no_exception()
    def _find_contents(self):
        """Return the image contents that have a usable hash of type ``self.iht``.

        Every content found with ``isimage=1`` gets a ``prepared_hash``
        attribute: the result of ``self.prepare`` on its stored hash, or
        ``None`` when no imagehash row exists or the stored hash is falsy.
        Only contents whose ``prepared_hash`` is not ``None`` are returned.
        """
        show_progress = self.verbose_progress > 0

        loading = Progress(1, "Loading images", do_output=show_progress)
        contents = self.db.content.find(isimage=1)
        loading.work()
        loading.finish()

        hashing = Progress(len(contents), "Loading image hashes",
                           do_output=show_progress)
        for content in contents:
            content.prepared_hash = None
            rows = self.db.imagehash.find(contentid=content.id, iht=self.iht)
            if rows:
                # At most one hash row is expected per content and hash type.
                assert len(rows) == 1
                row = rows[0]
                if row.hash:
                    content.prepared_hash = self.prepare(row.hash)
            hashing.work()
        usable = [item for item in contents if item.prepared_hash is not None]
        hashing.finish()
        return usable
    def test_simple(self):
        """Drive Progress through ten single steps with short pauses.

        Starts with a zero-sized ``work(0)`` call, then reports one unit of
        work ten times, sleeping 0.1 seconds after each.
        """
        steps = 10
        with NoStderr():
            bar = Progress(steps, "calculating")
            bar.work(0)
            done = 0
            while done < steps:
                bar.work()
                time.sleep(0.1)
                done += 1

            bar.finish()
        self.assert_no_exception()
    def find(self):
        """Yield a ``BitEqualBucket`` for each content shared by several files.

        This is a generator. For every content returned by
        ``_find_multiple_referenced_content`` the referencing files are
        collected, skipping any file that ``is_known_file`` reports as
        already present in the collected list. A bucket is yielded only when
        more than one file remains.

        Yields:
            ``BitEqualBucket(content.size, files, is_first)``, where
            ``is_first`` is true only for the first bucket yielded.
        """
        prog = Progress(1, "Searching for duplicated files", do_output=1)
        contents = self._find_multiple_referenced_content()
        prog.work()
        prog.finish()

        is_first = True
        prog = Progress(len(contents), "Output", do_output=1)
        for content in contents:
            unique_files = []
            for candidate in self._find_files_for_content_id(content.id):
                if not is_known_file(unique_files, candidate):
                    unique_files.append(candidate)

            # A single remaining file means there is no duplicate to report.
            if len(unique_files) > 1:
                yield BitEqualBucket(content.size, unique_files, is_first)
                is_first = False
            prog.work()
        prog.finish()