def __init__(self, config: DeduplicatorConfig):
    """Open filter, database and log resources and define the pipeline steps.

    config: deduplicator settings (filter/sqlite/log file names, target
    directory, exiftool path, dry-run flag).
    """
    self.config = config
    self.filter = Filter(self.config.filter_filename)
    self.db = FileDatabase(self.config.sqlite_filename)
    # Append-mode log file; held open for the lifetime of this object.
    self.log = open(self.config.log_file, 'a')
    self.moved = dict()
    # Entries accepted by the filter, in selection order.
    self._selected_for_move: typing.List[FileEntry] = list()
    self._selected_hash = dict(
    )  # (uppercase-name, file-size, date, checksum) => true
    self._count = dict(n=0, r=0, d=0)  # new, reject, dup
    # Stats are only printed when the selection step actually ran.
    self._print_count = False
    # Unique target path => chosen FileEntry for that path.
    self._target_entries: typing.Dict[str, FileEntry] = dict()
    # Ordered pipeline; each Step pairs a bound method with its help text.
    self.steps: typing.List[Step] = [
        Step(
            self._read_from_db,
            'Read file information from database and select accepted images'
        ),
        Step(self._make_file_names_unique,
             'Prepare file information as unique target names'),
        Step(self._store_file_path_map,
             'Store (cache) file mapping information into the DB'),
        Step(self._copy_files, 'Copy files')
    ]
def __init__(self, filter_filename: str, sqlite_filename: str):
    """Open the filter and database and prepare interactive input handling.

    filter_filename: file the Filter rules are loaded from.
    sqlite_filename: path of the SQLite file database.
    """
    self.filter_filename = filter_filename
    self.filter = Filter(filter_filename)
    self.sqlite_filename = sqlite_filename
    self.db = FileDatabase(sqlite_filename)
    # Reads user decisions from input and feeds them into the filter.
    self.read_to_filter = ProcessInputToFilter(self.filter, self.__class__.__name__)
    self.moved = dict()
    self._selected_for_move = list()
    self._selected_hash = dict()  # (uppercase-name, file-size, date, checksum) => true
    self._count = dict(n=0, r=0, d=0, s=0)  # new, reject, dup, skipped
def __init__(self, source_dir: str, sqlite_filename: str, pretend: bool = False):
    """Remember the scan root and open the file database.

    source_dir: root directory to walk for media files.
    sqlite_filename: path of the SQLite file database.
    pretend: when True, callers may use this as a dry-run flag.
    """
    self.source_dir = source_dir
    self.sqlite_filename = sqlite_filename
    self.pretend = pretend
    self.db = FileDatabase(sqlite_filename)
    self.moved = dict()
    self._selected_for_move = list()
def __init__(self, config: SafeEraserConfig):
    """Open database and log resources and define the erase pipeline steps.

    config: safe-eraser settings (sqlite/log file names, exiftool path,
    dry-run flag).
    """
    self.config = config
    self.db = FileDatabase(self.config.sqlite_filename)
    # Append-mode log file; held open for the lifetime of this object.
    self.log = open(self.config.log_file, 'a')
    # Verified real target path => copied FileEntry.
    self.copied: typing.Dict[str, FileEntry] = dict()
    # Original paths that are safe to delete (copy verified on disk).
    self.safe_to_delete = list()
    # DB row ids already classified (copied or handled).
    self.known_rows = set()
    self.copied_hashes = dict()
    self.copied_hashes_without_date = dict()
    # Ordered pipeline; each Step pairs a bound method with its help text.
    self.steps: typing.List[Step] = [
        Step(self._read_from_db, 'Read copied file entries'),
        Step(self._find_duplicates, 'Find duplicates of copied files from DB'),
        # Typo fix: 'file sytem' -> 'file system' in the step help text.
        Step(self._erase_duplicates, 'Remove duplicates from file system'),
    ]
class ImageCollector:
    """Scan a directory tree for media files and record each one (path,
    name, mtime, size, md5 checksum) in the SQLite-backed file database."""

    def __init__(self, source_dir: str, sqlite_filename: str, pretend: bool = False):
        """Remember the scan root and open the database."""
        self.source_dir = source_dir
        self.sqlite_filename = sqlite_filename
        self.pretend = pretend
        self.db = FileDatabase(sqlite_filename)
        self.moved = dict()
        self._selected_for_move = list()

    def run(self):
        """Walk the tree, register every yielded entry, then close the DB."""
        for entry in self._walk():
            self._process(entry)
        self.db.close()
        return 0

    def _walk(self) -> typing.Iterable[FileEntry]:
        """Yield a FileEntry for every non-skipped file below source_dir,
        committing the database once per visited directory."""
        for root, dirs, files in os.walk(self.source_dir):
            print(f'Processing: {root}')
            for name in files:
                if self._skip_file(name):
                    continue
                full_path = os.path.join(root, name)
                mod_date, size = self._mod_date_and_file_size(full_path)
                yield FileEntry(full_path, name, name.upper(), mod_date, size)
            self.db.commit()

    def _skip_file(self, name: str) -> bool:
        """Return True when the extension is not a known photo/video type."""
        ext = os.path.splitext(name.lower())[1]
        return ext not in ('.jpg', '.jpeg', '.cr2', '.mov', '.thm', '.mp4')

    def _mod_date_and_file_size(self, full_path: str) -> typing.Tuple[int, int]:
        """Return (mtime as whole seconds, size in bytes) for full_path."""
        stat_result = os.stat(full_path)
        return int(stat_result.st_mtime), stat_result.st_size

    def _checksum(self, filename: str) -> str:
        """Return the file's MD5 hex digest via the external md5sum tool."""
        output = subprocess.check_output(['md5sum', filename])
        return output.decode('UTF-8').split(' ')[0]

    def _process(self, entry: FileEntry):
        """Insert entry into the DB unless its path is already recorded."""
        if entry.orig_path in self.db:
            return
        try:
            self.db.insert(entry, self._checksum(entry.orig_path))
        except sqlite3.IntegrityError as e:
            print('File {} is already processed. Error: {}'.format(
                entry.orig_path, str(e)))
class ImageDeduplicator:
    """Select accepted photos from the database, give each a unique target
    file name, persist the mapping in the DB, and copy the files into a
    year/year-month/year-month-day directory tree under the target dir.
    """

    # Max tolerated difference (seconds) between file mtime and the EXIF
    # Date/Time Original entry; writing to the memory card takes time, so
    # 1-2 seconds of skew is common and 5 seconds is a safe threshold.
    MOD_DATE_MAX_DIFF = 5

    def __init__(self, config: 'DeduplicatorConfig'):
        """Open filter/DB/log resources and define the pipeline steps."""
        self.config = config
        self.filter = Filter(self.config.filter_filename)
        self.db = FileDatabase(self.config.sqlite_filename)
        # Append-mode log file; held open for the lifetime of this object.
        self.log = open(self.config.log_file, 'a')
        self.moved = dict()
        self._selected_for_move: typing.List['FileEntry'] = list()
        # (uppercase-name, file-size, date, checksum) => true
        self._selected_hash = dict()
        self._count = dict(n=0, r=0, d=0)  # new, reject, dup
        # Stats are only printed when the selection step actually ran.
        self._print_count = False
        # Unique target path => chosen FileEntry for that path.
        self._target_entries: typing.Dict[str, 'FileEntry'] = dict()
        self.steps: typing.List['Step'] = [
            Step(
                self._read_from_db,
                'Read file information from database and select accepted images'
            ),
            Step(self._make_file_names_unique,
                 'Prepare file information as unique target names'),
            Step(self._store_file_path_map,
                 'Store (cache) file mapping information into the DB'),
            Step(self._copy_files, 'Copy files')
        ]

    def run(self):
        """Execute every pipeline step, then print selection statistics."""
        self._run_steps()
        self._print_stats()

    def _run_steps(self):
        """Run the steps in order, announcing each on stdout and the log."""
        count = len(self.steps)
        current = 0
        for step in self.steps:
            current += 1
            print(f' * Step {current} of {count}: {step.help}')
            self.log.write(f' * Step {current} of {count}: {step.help}\n')
            step.func()

    def _read_from_db(self):
        """Select filter-accepted photo entries, skipping key duplicates.

        No-op when the DB already contains cached target path info.
        """
        if self.db.has_new_path_info():
            return
        self._print_count = True
        for entry in self.db.iterate_photo_entries():
            if entry.key in self._selected_hash:
                self._count['d'] += 1
                continue
            result = self.filter.decide(entry)
            if result == FilterResult.ACCEPT:
                self._select_entry(entry)
            else:
                self._count['r'] += 1

    def _select_entry(self, entry: 'FileEntry') -> bool:
        """Record *entry* for moving and remember its key; returns True."""
        self._selected_for_move.append(entry)
        self._selected_hash[entry.key] = True
        self._count['n'] += 1
        return True

    def _make_file_names_unique(self):
        """Assign a unique target path to every selected entry."""
        if self.db.has_new_path_info():
            return
        for entry in self._selected_for_move:
            self._add_entry_to_fs(entry)

    def _exif_date_time_original(self, path: str) -> typing.List[str]:
        """Return the 'Date/Time Original' values exiftool reports for *path*.

        Returns an empty list when exiftool fails or the tag is absent.
        (Shared helper: this parse was previously duplicated inline.)
        """
        try:
            date_line = subprocess.check_output(
                [self.config.exiftool, '-t', path]).decode('UTF-8')
        except subprocess.CalledProcessError:
            return []
        return [x.split('\t')[1]
                for x in date_line.splitlines(keepends=False)
                if x.startswith('Date/Time Original')]

    def _add_entry_to_fs(self, entry: 'FileEntry'):
        """Place *entry* into the target-path map, resolving name clashes.

        Same checksum: keep whichever of old/new best matches its EXIF
        timestamp. Different checksum: store the new entry under a
        '<base>_<i><ext>' suffixed name.
        """
        full_path = os.path.join(self.config.main_target_dir, entry.new_name)
        if full_path not in self._target_entries:
            self._target_entries[full_path] = entry
        elif entry.checksum == self._target_entries[full_path].checksum:
            # This is likely the same file, even if the time stamp differs.
            # This can occur when importing into iPhoto (eg. before 2010!),
            # or when copying to different machines. I saw such issue on Mac
            # using Midnight Commander as FTP machine: the server's file
            # timestamp depends on the current time zone (more precisely on
            # daylight saving time enabled or not): CET vs. CEST
            # (Typo fix: 'CHEKSUM' -> 'CHECKSUM' in the log line.)
            self.log.write('\n** CHECKSUM DUP: (new):' + entry.orig_path +
                           ' (old):' +
                           self._target_entries[full_path].orig_path + ' ' +
                           full_path + '\n')
            entry.print(self.log)
            self._target_entries[full_path].print(self.log)
            if not os.path.exists(entry.orig_path):
                self._target_entries[full_path] = entry
                print('Choosing NEW entry (missing original file)',
                      file=self.log)
            elif os.path.exists(entry.orig_path):
                # The modification date of the entry can be exactly the same
                # as in the EXIF data, or it has to be within a few seconds
                # (see MOD_DATE_MAX_DIFF).
                date_old = self._exif_date_time_original(
                    self._target_entries[full_path].orig_path)
                if not date_old:
                    print(
                        'Keeping original as date and time information is not available for OLD',
                        file=self.log)
                    return
                date_old = date_old[0]
                date_old_ts = time.mktime(
                    time.strptime(date_old, '%Y:%m:%d %H:%M:%S'))
                print(
                    f'(old) date in exif data of path \'{self._target_entries[full_path].orig_path}\' is {date_old}',
                    file=self.log)
                if date_old != self._target_entries[
                        full_path].date_as_exif_data and abs(
                            date_old_ts - self._target_entries[full_path].
                            mod_date) > self.MOD_DATE_MAX_DIFF:
                    date_new = self._exif_date_time_original(entry.orig_path)
                    if not date_new:
                        print(
                            'Keeping original as date and time information is not available for NEW',
                            file=self.log)
                        return
                    date_new = date_new[0]
                    date_new_ts = time.mktime(
                        time.strptime(date_new, '%Y:%m:%d %H:%M:%S'))
                    # BUG FIX: this log line used to interpolate date_old.
                    print(
                        f'(new) date in exif data of path \'{entry.orig_path}\' is {date_new}',
                        file=self.log)
                    if date_new == entry.date_as_exif_data or abs(
                            date_new_ts -
                            entry.mod_date) < self.MOD_DATE_MAX_DIFF:
                        self._target_entries[full_path] = entry
                        # Typo fix: 'almostmatching' -> 'almost matching'.
                        print(
                            'Choosing NEW entry (almost matching date and time)',
                            file=self.log)
                    else:
                        print('Keeping original / OLD', file=self.log)
                else:
                    print(
                        'Keeping original / OLD as EXIF timestamp and file mtime (almost) matches',
                        file=self.log)
        else:
            print(f"Duplicated file entry detected on path: {full_path}")
            self.log.write(
                f"\n*** Duplicated file entry detected on path: {full_path}\n")
            entry.print(self.log)
            self._target_entries[full_path].print(self.log)
            base, ext = os.path.splitext(full_path)
            i = 1
            new_name = f'{base}_{i}{ext}'
            while new_name in self._target_entries:
                i += 1
                new_name = f'{base}_{i}{ext}'
            self._target_entries[new_name] = entry
            # BUG FIX: log the NEW entry's original path - it is the one
            # stored under new_name (the OLD entry keeps full_path).
            self.log.write(f'{entry.orig_path} became {new_name}\n')

    def _store_file_path_map(self):
        """Persist the target-path mapping into the database."""
        if self.db.has_new_path_info():
            return
        for path, entry in self._target_entries.items():
            self.db.insert_new_path(entry.id, path)
        self.db.commit()

    def _copy_files(self):
        """Copy each mapped file into <target>/<Y>/<Y-m>/<Y-m-d>/<rest>.

        Honors config.dry_run; skips missing sources and existing targets.
        """
        count = len(self.config.main_target_dir)
        for path, entry in self.db.iterate_target_path_entries():
            suffix = path[count:]
            if suffix[0] == os.path.sep:
                suffix = suffix[1:]
            (year, month, day, remaining) = suffix.split(os.path.sep, 3)
            path = os.path.join(self.config.main_target_dir, year,
                                f'{year}-{month}', f'{year}-{month}-{day}',
                                remaining)
            mode = 'SKIP' if os.path.exists(path) else 'COPY'
            print(f'{mode}: {entry.orig_path} -> {path}', file=self.log)
            if not self.config.dry_run:
                if not os.path.exists(entry.orig_path):
                    print(f'SKIP MISSING SOURCE FILE: {entry.orig_path}',
                          file=self.log)
                elif not os.path.exists(path):
                    os.makedirs(os.path.dirname(path), 0o755, exist_ok=True)
                    shutil.copy2(entry.orig_path, path)
                else:
                    print(
                        f'SKIP EXISTING TARGET {path} - SOURCE FILE: {entry.orig_path}',
                        file=self.log)

    def _print_stats(self):
        """Print new/dup/rejected counters (only when selection ran)."""
        if not self._print_count:
            return
        print('Stats: ')
        print(f'* NEW : {self._count["n"]}')
        print(f'* DUP : {self._count["d"]}')
        print(f'* REJECTED : {self._count["r"]}')
        print('# DONE : ' +
              str(self._count['n'] + self._count['d'] + self._count['r']))
class ImageSelector:
    """Walk photo entries from the DB and classify each one via the Filter,
    asking the user interactively whenever the filter cannot decide."""

    # When False, undecidable entries are counted as skipped instead of
    # prompting the user.
    INTERACTIVE = True

    def __init__(self, filter_filename: str, sqlite_filename: str):
        """Open the filter and database and prepare interactive input."""
        self.filter_filename = filter_filename
        self.filter = Filter(filter_filename)
        self.sqlite_filename = sqlite_filename
        self.db = FileDatabase(sqlite_filename)
        self.read_to_filter = ProcessInputToFilter(self.filter,
                                                   self.__class__.__name__)
        self.moved = dict()
        self._selected_for_move = list()
        self._selected_hash = dict()  # (uppercase-name, file-size, date, checksum) => true
        self._count = dict(n=0, r=0, d=0, s=0)  # new, reject, dup, skipped

    def run(self):
        """Classify every DB entry until done or the user quits."""
        quit_requested = False
        for entry in self.db.iterate_photo_entries():
            if entry.key in self._selected_hash:
                self._count['d'] += 1
                continue
            self.read_to_filter.set_entry(entry)
            while True:
                result = self.filter.decide(entry)
                if result != FilterResult.UNSPECIFIED:
                    # The filter reached a verdict: record it and move on.
                    if result == FilterResult.ACCEPT:
                        self._select_entry(entry)
                    else:
                        self._count['r'] += 1
                    break
                if not self.INTERACTIVE:
                    # Undecidable and no user to ask: count as skipped.
                    self._count['s'] += 1
                    break
                if not self.read_to_filter.read_and_process_line():
                    # User asked to stop; abort the whole run.
                    quit_requested = True
                    break
            if quit_requested:
                break
        self._print_stats()

    def _select_entry(self, entry: FileEntry) -> bool:
        """Record *entry* for moving and remember its key; returns True."""
        self._selected_for_move.append(entry)
        self._selected_hash[entry.key] = True
        self._count['n'] += 1
        return True

    def _print_stats(self):
        """Print counters for selected/duplicate/rejected/skipped entries."""
        done = self._count['n'] + self._count['d'] + self._count['r']
        print('Stats: ')
        print(f'* NEW : {self._count["n"]}')
        print(f'* DUP : {self._count["d"]}')
        print(f'* REJECTED : {self._count["r"]}')
        print(f'* SKIP : {self._count["s"]}')
        print('# DONE : ' + str(done))
        print('# TOTAL : ' + str(done + self._count['s']))
class SafeEraser:
    """Delete original files whose copies are verified to exist in the
    target tree, plus any further DB entries that duplicate those copies."""

    # Max tolerated difference (seconds) between file mtime and the EXIF
    # Date/Time Original entry when matching duplicates.
    MOD_DATE_MAX_DIFF = 5

    def __init__(self, config: 'SafeEraserConfig'):
        """Open database and log resources and define the erase steps."""
        self.config = config
        self.db = FileDatabase(self.config.sqlite_filename)
        # Append-mode log file; held open for the lifetime of this object.
        self.log = open(self.config.log_file, 'a')
        # Verified real target path => copied FileEntry.
        self.copied: typing.Dict[str, 'FileEntry'] = dict()
        # Original paths that are safe to delete (copy verified on disk).
        self.safe_to_delete = list()
        # DB row ids already classified.
        self.known_rows = set()
        self.copied_hashes = dict()
        self.copied_hashes_without_date = dict()
        self.steps: typing.List['Step'] = [
            Step(self._read_from_db, 'Read copied file entries'),
            Step(self._find_duplicates,
                 'Find duplicates of copied files from DB'),
            # Typo fix: 'file sytem' -> 'file system'.
            Step(self._erase_duplicates,
                 'Remove duplicates from file system'),
        ]

    def run(self):
        """Execute every pipeline step."""
        self._run_steps()

    def _run_steps(self):
        """Run the steps in order, announcing each on stdout and the log."""
        count = len(self.steps)
        current = 0
        for step in self.steps:
            current += 1
            print(f' * Step {current} of {count}: {step.help}')
            self.log.write(f' * Step {current} of {count}: {step.help}\n')
            step.func()

    def _read_from_db(self):
        """Collect entries whose copies exist in the Y/Y-m/Y-m-d layout.

        The stored target path ends with .../year/month/day/filename; the
        copier laid the real file out as year/year-month/year-month-day/
        filename, so the path is re-derived before the existence check.
        """
        for path, entry in self.db.iterate_target_path_entries():
            parts = path.split(os.path.sep)
            # parts is like (a, directory, name, to, somewhere, year, month,
            # day, filename); only the last four components are re-mapped.
            prefix = os.path.sep.join(parts[:-4])
            year, month, day, filename = parts[-4:]
            real_path = os.path.join(prefix, year, f'{year}-{month}',
                                     f'{year}-{month}-{day}', filename)
            if os.path.exists(real_path):
                print(f'Found {real_path}, original can be deleted from {entry.orig_path}',
                      file=self.log)
                self.copied[real_path] = entry
                self.safe_to_delete.append(entry.orig_path)
                self.known_rows.add(entry.id)
                self.copied_hashes[entry.key] = entry
                self.copied_hashes_without_date[entry.key_without_date] = entry
            else:
                # Typo fix: 'CANNNOT' -> 'CANNOT'.
                print(f'*** NOT Found {real_path}, original (and dups) CANNOT be deleted from {entry.orig_path}',
                      file=self.log)
        print(f'Found {len(self.safe_to_delete)} files for deletion')
        print(f'Found {len(self.safe_to_delete)} files for deletion',
              file=self.log)

    def _find_duplicates(self):
        """Mark further DB entries that duplicate already-copied files.

        Exact key matches are taken directly; date-less key matches are
        confirmed via the EXIF Date/Time Original value, tolerating a
        one-hour offset (daylight-saving shift).
        """
        count = 0
        for entry in self.db.iterate_photo_entries():
            if entry.id in self.known_rows:
                continue
            if not os.path.exists(entry.orig_path):
                self.known_rows.add(entry.id)
                continue
            if entry.key in self.copied_hashes:
                print(f'! DUPLICATE found at {entry.orig_path}, original: {self.copied_hashes[entry.key].orig_path}',
                      file=self.log)
                self.known_rows.add(entry.id)
                self.safe_to_delete.append(entry.orig_path)
                count += 1
            elif entry.key_without_date in self.copied_hashes_without_date:
                try:
                    date_line = subprocess.check_output(
                        [self.config.exiftool, '-t',
                         entry.orig_path]).decode('UTF-8')
                    date_new = \
                        [x.split('\t')[1]
                         for x in date_line.splitlines(keepends=False)
                         if x.startswith('Date/Time Original')]
                except subprocess.CalledProcessError:
                    date_new = []
                if not date_new:
                    print(f'** Missing exif date info in file {entry.orig_path}',
                          file=self.log)
                    continue
                date_new = date_new[0]
                date_new_ts = time.mktime(
                    time.strptime(date_new, '%Y:%m:%d %H:%M:%S'))
                if abs(date_new_ts - entry.mod_date) < self.MOD_DATE_MAX_DIFF or \
                        abs(abs(date_new_ts - entry.mod_date) - 3600) < self.MOD_DATE_MAX_DIFF:
                    print(f'. DUPLICATE found at {entry.orig_path}, original: ' +
                          self.copied_hashes_without_date[entry.key_without_date].orig_path +
                          f' date diff is {entry.mod_date - date_new_ts}',
                          file=self.log)
                    self.known_rows.add(entry.id)
                    self.safe_to_delete.append(entry.orig_path)
                    count += 1
        print(f'Found {count} more files for deletion')
        print(f'Found {count} more files for deletion', file=self.log)

    def _erase_duplicates(self):
        """Unlink every safe-to-delete path (logged only under dry_run)."""
        print(f'Deleting {len(self.safe_to_delete)} files')
        print(f'Deleting {len(self.safe_to_delete)} files', file=self.log)
        count = 0
        missing = 0
        for path in self.safe_to_delete:
            if self.config.dry_run or os.path.exists(path):
                print(f'DELETE {path}', file=self.log)
                if not self.config.dry_run:
                    try:
                        os.unlink(path)
                    except Exception as e:
                        # Best effort: report and keep deleting the rest.
                        print(f'Exception occurred; class={e.__class__.__name__}, msg={e}, path={path}')
                        print(f'Exception occurred; class={e.__class__.__name__}, msg={e}, path={path}',
                              file=self.log)
                count += 1
            else:
                print(f'ALREADY DELETED {path}', file=self.log)
                missing += 1
        print(f'Deleted {count} of {len(self.safe_to_delete)} files; already removed (missing) {missing} files')
        print(f'Deleted {count} of {len(self.safe_to_delete)} files; already removed (missing) {missing} files',
              file=self.log)