def run(self):
    """Calculate signatures for all image contents that have no hash yet.

    Collects the ids of image contents (``isimage=1``) whose id does not
    appear as ``contentid`` in ``self.db.imagehash`` and processes them in
    chunks of ``CHUNK_SIZE * self.parallel_threads`` through
    ``create_jobs``, ``execute_jobs`` and ``save_results``.

    The database is committed at the start of a chunk whenever the
    ``self.commit_every`` interval has elapsed, and once more after the
    last chunk. Finally ``report_time_used_per_type`` is called.
    """
    show_progress = self.verbose_progress > 0

    prog = Progress(1, "Determine images without hashes",
                    do_output=show_progress)
    skip_ids = {row.contentid for row in self.db.imagehash.find()}
    image_ids = self.db.content.find_ids(isimage=1, sort="first1ksha1 ASC")
    # Filter in order instead of using a set difference: a set would throw
    # away the "first1ksha1 ASC" ordering requested from the database.
    # Ids are added to skip_ids as they are taken so duplicates are still
    # dropped, exactly as the set difference did.
    ids_to_index = []
    for content_id in image_ids:
        if content_id not in skip_ids:
            skip_ids.add(content_id)
            ids_to_index.append(content_id)
    prog.work()
    prog.finish()

    if not ids_to_index:
        self.progress("INFO: Have calculated all image signatures.\n")
        return

    prog = Progress(len(ids_to_index), "Calculate image signatures",
                    do_output=show_progress)

    self.next_commit = time.time() + self.commit_every
    # NOTE(review): keys 1..5 presumably identify signature/hash types and
    # the values accumulate time spent per type - confirm against the
    # code that updates time_per_type.
    self.time_per_type = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}

    for chunk in chunker(ids_to_index, CHUNK_SIZE * self.parallel_threads):
        # Commit periodically so a long run does not keep all results in
        # one open transaction.
        current_time = time.time()
        if current_time > self.next_commit:
            self.next_commit = current_time + self.commit_every
            self.db.commit()

        jobs = self.create_jobs(chunk)
        result = self.execute_jobs(jobs)
        self.save_results(result)
        prog.work(len(chunk))

    prog.finish()
    self.db.commit()
    self.report_time_used_per_type()
def test_format(self):
    """Smoke-test Progress with a very large total (10**12 work units).

    Creates the progress object, waits one second, reports all work as done
    in a single call and finishes, with stderr suppressed by ``NoStderr``.
    The test passes as long as nothing raises.
    """
    total_units = 10 ** 12
    with NoStderr():
        progress = Progress(total_units, "calculating")
        time.sleep(1)
        progress.work(total_units)
        progress.finish()
    self.assert_no_exception()
def test_simple(self):
    """Smoke-test Progress by reporting ten units of work one at a time.

    Reports zero work first, then ten single steps with a 0.1 second pause
    after each, and finishes. stderr is suppressed by ``NoStderr``; the
    test passes as long as nothing raises.
    """
    with NoStderr():
        progress = Progress(10, "calculating")
        # An explicit zero-sized update must be accepted as well.
        progress.work(0)
        step = 0
        while step < 10:
            progress.work()
            time.sleep(0.1)
            step += 1
        progress.finish()
    self.assert_no_exception()
def _find_contents(self):
    """Load all image contents that have a usable hash of type ``self.iht``.

    Every content found with ``isimage=1`` gets a ``prepared_hash``
    attribute: the result of ``self.prepare`` applied to its stored hash,
    or ``None`` when no hash row exists or the stored hash is falsy.

    Returns:
        The list of contents whose ``prepared_hash`` is not ``None``.
    """
    loading = Progress(1, "Loading images",
                       do_output=self.verbose_progress > 0)
    images = self.db.content.find(isimage=1)
    loading.work()
    loading.finish()

    hashing = Progress(len(images), "Loading image hashes",
                       do_output=self.verbose_progress > 0)
    for image in images:
        image.prepared_hash = None
        rows = self.db.imagehash.find(contentid=image.id, iht=self.iht)
        if rows:
            # At most one hash row is expected per content and hash type.
            assert len(rows) == 1
            row = rows[0]
            if row.hash:
                image.prepared_hash = self.prepare(row.hash)
        hashing.work()

    with_hash = [
        image for image in images if image.prepared_hash is not None
    ]
    hashing.finish()
    return with_hash
def find(self):
    """Yield a ``BitEqualBucket`` for each content with several files.

    Contents come from ``_find_multiple_referenced_content``. For each one,
    its files are collected via ``_find_files_for_content_id``, skipping
    any file that ``is_known_file`` reports as already present in the
    collected list. A bucket is yielded only when more than one file
    remains.

    Yields:
        ``BitEqualBucket(content.size, files, is_first)`` where
        ``is_first`` is ``True`` only for the first bucket yielded.
    """
    prog = Progress(1, "Searching for duplicated files", do_output=1)
    contents = self._find_multiple_referenced_content()
    prog.work()
    prog.finish()

    is_first = True
    prog = Progress(len(contents), "Output", do_output=1)
    # Iterate the contents directly; the index was only ever used to fetch
    # the element.
    for content in contents:
        files = []
        for candidate in self._find_files_for_content_id(content.id):
            if not is_known_file(files, candidate):
                files.append(candidate)

        if len(files) > 1:
            yield BitEqualBucket(content.size, files, is_first)
            is_first = False

        prog.work()
    prog.finish()