Example #1
0
    def __init__(self, config: DeduplicatorConfig):
        """Create the collaborators and the ordered list of processing steps.

        Args:
            config: Settings object. The constructor reads its
                ``filter_filename``, ``sqlite_filename`` and ``log_file``
                attributes.
        """
        self.config = config
        self.filter = Filter(config.filter_filename)
        self.db = FileDatabase(config.sqlite_filename)
        # Opened in append mode; nothing in this constructor closes it.
        self.log = open(config.log_file, 'a')
        self.moved = {}
        self._selected_for_move: typing.List[FileEntry] = []
        # (uppercase-name, file-size, date, checksum) => true
        self._selected_hash = {}
        # Counters keyed by: n = new, r = reject, d = dup
        self._count = {'n': 0, 'r': 0, 'd': 0}
        self._print_count = False

        self._target_entries: typing.Dict[str, FileEntry] = {}

        # (callable, description) pairs, in execution order.
        pipeline = (
            (self._read_from_db,
             'Read file information from database and select accepted images'),
            (self._make_file_names_unique,
             'Prepare file information as unique target names'),
            (self._store_file_path_map,
             'Store (cache) file mapping information into the DB'),
            (self._copy_files, 'Copy files'),
        )
        self.steps: typing.List[Step] = [
            Step(func, description) for func, description in pipeline
        ]
Example #2
0
 def __init__(self, filter_filename: str, sqlite_filename: str):
     """Set up the filter, the database and the input reader.

     Args:
         filter_filename: Passed to ``Filter`` and kept on the instance.
         sqlite_filename: Passed to ``FileDatabase`` and kept on the instance.
     """
     self.filter_filename = filter_filename
     self.sqlite_filename = sqlite_filename
     self.filter = Filter(filter_filename)
     self.db = FileDatabase(sqlite_filename)
     # The reader receives the filter plus this class' name.
     self.read_to_filter = ProcessInputToFilter(self.filter, type(self).__name__)
     self.moved = {}
     self._selected_for_move = []
     # (uppercase-name, file-size, date, checksum) => true
     self._selected_hash = {}
     # Counters keyed by: n = new, r = reject, d = dup, s = skipped
     self._count = {'n': 0, 'r': 0, 'd': 0, 's': 0}
Example #3
0
 def __init__(self,
              source_dir: str,
              sqlite_filename: str,
              pretend: bool = False):
     """Remember the scan settings and open the file database.

     Args:
         source_dir: Kept on the instance as ``source_dir``.
         sqlite_filename: Passed to ``FileDatabase`` and kept on the instance.
         pretend: Kept on the instance as ``pretend``.
     """
     self.pretend = pretend
     self.source_dir = source_dir
     self.sqlite_filename = sqlite_filename
     self.db = FileDatabase(sqlite_filename)
     self.moved = {}
     self._selected_for_move = []
Example #4
0
    def __init__(self, config: SafeEraserConfig):
        """Create the collaborators, bookkeeping containers and step list.

        Args:
            config: Settings object. The constructor reads its
                ``sqlite_filename`` and ``log_file`` attributes.
        """
        self.config = config
        self.db = FileDatabase(config.sqlite_filename)
        # Opened in append mode; nothing in this constructor closes it.
        self.log = open(config.log_file, 'a')
        self.copied: typing.Dict[str, FileEntry] = {}
        self.safe_to_delete = []
        self.known_rows = set()
        self.copied_hashes = {}
        self.copied_hashes_without_date = {}

        # (callable, description) pairs, in execution order.
        pipeline = (
            (self._read_from_db, 'Read copied file entries'),
            (self._find_duplicates, 'Find duplicates of copied files from DB'),
            (self._erase_duplicates, 'Remove duplicates from file sytem'),
        )
        self.steps: typing.List[Step] = [
            Step(func, description) for func, description in pipeline
        ]
Example #5
0
class ImageCollector:
    """Walk a directory tree and register media files in the file database.

    Files are chosen by extension. Every file whose path is not yet in the
    database is inserted together with its MD5 checksum.
    """

    # Lower-case extensions of the files that are collected.
    MEDIA_EXTENSIONS = frozenset(
        ('.jpg', '.jpeg', '.cr2', '.mov', '.thm', '.mp4'))

    # Read size used while hashing, so large files are never loaded whole.
    _CHECKSUM_CHUNK_SIZE = 1024 * 1024

    def __init__(self,
                 source_dir: str,
                 sqlite_filename: str,
                 pretend: bool = False):
        """Remember the scan settings and open the file database.

        Args:
            source_dir: Root directory handed to ``os.walk``.
            sqlite_filename: Passed to ``FileDatabase``.
            pretend: Stored on the instance; not read by any method of
                this class.
        """
        self.source_dir = source_dir
        self.sqlite_filename = sqlite_filename
        self.pretend = pretend
        self.db = FileDatabase(sqlite_filename)
        self.moved = dict()
        self._selected_for_move = list()

    def run(self):
        """Process every collected file, then close the database.

        Returns:
            Always ``0``.
        """
        try:
            for entry in self._walk():
                self._process(entry)
        finally:
            # Release the database handle even when scanning or inserting
            # raised; previously it stayed open on any error.
            self.db.close()
        return 0

    def _walk(self) -> typing.Iterable[FileEntry]:
        """Yield a ``FileEntry`` for each accepted file below ``source_dir``.

        The database is committed once per directory, after that
        directory's files have been yielded.
        """
        for root, _dirs, files in os.walk(self.source_dir):
            print('Processing: {}'.format(root))
            for name in files:
                if self._skip_file(name):
                    continue

                full_path = os.path.join(root, name)
                yield FileEntry(full_path, name, name.upper(),
                                *self._mod_date_and_file_size(full_path))
            self.db.commit()

    def _skip_file(self, name: str) -> bool:
        """Return True when *name* does not carry a collected extension."""
        _, ext = os.path.splitext(name.lower())

        return ext not in self.MEDIA_EXTENSIONS

    def _mod_date_and_file_size(self,
                                full_path: str) -> typing.Tuple[int, int]:
        """Return ``(mtime truncated to int, size in bytes)`` of *full_path*."""
        info = os.stat(full_path)

        return int(info.st_mtime), info.st_size

    def _checksum(self, filename: str) -> str:
        """Return the hexadecimal MD5 digest of the file's content.

        The digest is computed in-process with ``hashlib`` instead of
        spawning the external ``md5sum`` tool: that avoids one process per
        file and the escaped output GNU ``md5sum`` produces for unusual
        file names.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        import hashlib  # local import keeps the file's import block untouched

        digest = hashlib.md5()
        with open(filename, 'rb') as stream:
            for chunk in iter(
                    lambda: stream.read(self._CHECKSUM_CHUNK_SIZE), b''):
                digest.update(chunk)
        return digest.hexdigest()

    def _process(self, entry: FileEntry):
        """Insert *entry* with its checksum unless its path is already known."""
        if entry.orig_path in self.db:
            return
        try:
            self.db.insert(entry, self._checksum(entry.orig_path))
        except sqlite3.IntegrityError as e:
            print('File {} is already processed. Error: {}'.format(
                entry.orig_path, str(e)))
Example #6
0
class ImageDeduplicator:
    """Select accepted photo entries from the database and copy them.

    The work is split into the ordered ``steps`` built in ``__init__``.
    Entries that map to the same target path are resolved in
    ``_add_entry_to_fs``.
    """

    MOD_DATE_MAX_DIFF = 5  # seconds between mtime and EXIF date/time original entry
    # Layout used to parse the 'Date/Time Original' value printed by exiftool.
    EXIF_DATE_FORMAT = '%Y:%m:%d %H:%M:%S'

    def __init__(self, config: DeduplicatorConfig):
        """Create the collaborators and the ordered list of processing steps.

        Args:
            config: Settings object. This class reads its
                ``filter_filename``, ``sqlite_filename``, ``log_file``,
                ``main_target_dir``, ``exiftool`` and ``dry_run`` attributes.
        """
        self.config = config
        self.filter = Filter(self.config.filter_filename)
        self.db = FileDatabase(self.config.sqlite_filename)
        self.log = open(self.config.log_file, 'a')
        self.moved = dict()
        self._selected_for_move: typing.List[FileEntry] = list()
        self._selected_hash = dict(
        )  # (uppercase-name, file-size, date, checksum) => true
        self._count = dict(n=0, r=0, d=0)  # new, reject, dup
        self._print_count = False

        # target path => entry chosen for that path
        self._target_entries: typing.Dict[str, FileEntry] = dict()

        self.steps: typing.List[Step] = [
            Step(
                self._read_from_db,
                'Read file information from database and select accepted images'
            ),
            Step(self._make_file_names_unique,
                 'Prepare file information as unique target names'),
            Step(self._store_file_path_map,
                 'Store (cache) file mapping information into the DB'),
            Step(self._copy_files, 'Copy files')
        ]

    def run(self):
        """Run all steps in order, then print the statistics."""
        self._run_steps()
        self._print_stats()

    def _run_steps(self):
        """Announce each step on stdout and in the log, then execute it."""
        total = len(self.steps)

        for number, step in enumerate(self.steps, start=1):
            banner = f' * Step {number} of {total}: {step.help}'
            print(banner)
            self.log.write(banner + '\n')
            step.func()

    def _read_from_db(self):
        """Select the accepted, not yet seen entries from the database.

        Skipped entirely when ``db.has_new_path_info()`` is true.
        """
        if self.db.has_new_path_info():
            return

        self._print_count = True

        for entry in self.db.iterate_photo_entries():
            if entry.key in self._selected_hash:
                self._count['d'] += 1
                continue

            result = self.filter.decide(entry)
            if result == FilterResult.ACCEPT:
                self._select_entry(entry)
            else:
                self._count['r'] += 1

    def _select_entry(self, entry: FileEntry) -> bool:
        """Record *entry* as selected and count it as new. Returns True."""
        self._selected_for_move.append(entry)
        self._selected_hash[entry.key] = True
        self._count['n'] += 1
        return True

    def _make_file_names_unique(self):
        """Assign a target path to every selected entry.

        Skipped entirely when ``db.has_new_path_info()`` is true.
        """
        if self.db.has_new_path_info():
            return

        for entry in self._selected_for_move:
            self._add_entry_to_fs(entry)

    def _add_entry_to_fs(self, entry: FileEntry):
        """Register *entry* under its target path, resolving clashes.

        * Free path: the entry is stored as is.
        * Occupied path, same checksum: one of the two entries is kept
          (see ``_resolve_checksum_duplicate``).
        * Occupied path, different checksum: the entry is stored under a
          numbered variant of the path (see ``_store_under_unique_name``).
        """
        full_path = os.path.join(self.config.main_target_dir, entry.new_name)

        if full_path not in self._target_entries:
            self._target_entries[full_path] = entry
        elif entry.checksum == self._target_entries[full_path].checksum:
            self._resolve_checksum_duplicate(full_path, entry)
        else:
            self._store_under_unique_name(full_path, entry)

    def _exif_original_date(self, path: str) -> typing.Optional[str]:
        """Return the 'Date/Time Original' value exiftool reports for *path*.

        Returns None when exiftool exits with an error or prints no such
        line.
        """
        try:
            output = subprocess.check_output(
                [self.config.exiftool, '-t', path]).decode('UTF-8')
        except subprocess.CalledProcessError:
            return None

        # '-t' output is tab separated: "<tag name>\t<value>".
        for line in output.splitlines(keepends=False):
            if line.startswith('Date/Time Original'):
                return line.split('\t')[1]
        return None

    def _resolve_checksum_duplicate(self, full_path: str, entry: FileEntry):
        """Decide which of two same-checksum entries keeps *full_path*.

        This is likely the same file, even if the time stamp differs. This
        can occur when importing into iPhoto (eg. before 2010!), or when
        copying to different machines. I saw such issue on Mac using
        Midnight Commander as FTP machine: the server's file timestamp
        depends on the current time zone (more precisely on daylight saving
        time enabled or not): CET vs. CEST.
        """
        old = self._target_entries[full_path]

        self.log.write('\n** CHEKSUM DUP: (new):' + entry.orig_path +
                       ' (old):' + old.orig_path + ' ' + full_path + '\n')
        entry.print(self.log)
        old.print(self.log)

        if not os.path.exists(entry.orig_path):
            self._target_entries[full_path] = entry
            print('Choosing NEW entry (missing original file)',
                  file=self.log)
            return

        # The modification date of the entry can be exactly the same as in
        # the EXIF data or it has to be within a few seconds, probably
        # because it takes time to write to the memory card. I saw mostly
        # 1 second difference, sometimes 2 seconds. It seems to be a good
        # start to use 5 seconds threshold.
        date_old = self._exif_original_date(old.orig_path)
        if date_old is None:
            print(
                'Keeping original as date and time information is not available for OLD',
                file=self.log)
            return

        date_old_ts = time.mktime(
            time.strptime(date_old, self.EXIF_DATE_FORMAT))
        print(
            f"(old) date in exif data of path '{old.orig_path}' is {date_old}",
            file=self.log)

        old_looks_modified = (
            date_old != old.date_as_exif_data
            and abs(date_old_ts - old.mod_date) > self.MOD_DATE_MAX_DIFF)
        if not old_looks_modified:
            print(
                'Keeping original / OLD as EXIF timestamp and file mtime (almost) matches',
                file=self.log)
            return

        date_new = self._exif_original_date(entry.orig_path)
        if date_new is None:
            print(
                'Keeping original as date and time information is not available for NEW',
                file=self.log)
            return

        date_new_ts = time.mktime(
            time.strptime(date_new, self.EXIF_DATE_FORMAT))
        # Log the NEW file's date here (the old code printed date_old again).
        print(
            f"(new) date in exif data of path '{entry.orig_path}' is {date_new}",
            file=self.log)

        if date_new == entry.date_as_exif_data or abs(
                date_new_ts - entry.mod_date) < self.MOD_DATE_MAX_DIFF:
            self._target_entries[full_path] = entry
            print('Choosing NEW entry (almostmatching date and time)',
                  file=self.log)
        else:
            print('Keeping original / OLD', file=self.log)

    def _store_under_unique_name(self, full_path: str, entry: FileEntry):
        """Store *entry* under the first free ``<base>_<i><ext>`` path."""
        print(f"Duplicated file entry detected on path: {full_path}")
        self.log.write(
            f"\n*** Duplicated file entry detected on path: {full_path}\n")
        entry.print(self.log)
        self._target_entries[full_path].print(self.log)
        base, ext = os.path.splitext(full_path)

        i = 1
        new_name = f'{base}_{i}{ext}'
        while new_name in self._target_entries:
            i += 1
            new_name = f'{base}_{i}{ext}'

        self._target_entries[new_name] = entry
        # It is the NEW entry that got the numbered name; the entry already
        # stored under full_path keeps it.
        self.log.write(f'{entry.orig_path} became {new_name}\n')

    def _store_file_path_map(self):
        """Persist the target path of every chosen entry and commit.

        Skipped entirely when ``db.has_new_path_info()`` is true.
        """
        if self.db.has_new_path_info():
            return

        for path, entry in self._target_entries.items():
            self.db.insert_new_path(entry.id, path)

        self.db.commit()

    def _copy_files(self):
        """Copy every stored entry to its final location.

        A stored path ``<main_target_dir>/<year>/<month>/<day>/<rest>`` is
        mapped to
        ``<main_target_dir>/<year>/<year>-<month>/<year>-<month>-<day>/<rest>``.
        Each decision is logged. Nothing is written when ``config.dry_run``
        is set, and existing targets are never overwritten.
        """
        prefix_length = len(self.config.main_target_dir)
        for path, entry in self.db.iterate_target_path_entries():
            suffix = path[prefix_length:]
            if suffix.startswith(os.path.sep):
                suffix = suffix[1:]

            (year, month, day, remaining) = suffix.split(os.path.sep, 3)
            path = os.path.join(self.config.main_target_dir, year,
                                f'{year}-{month}', f'{year}-{month}-{day}',
                                remaining)
            mode = 'SKIP' if os.path.exists(path) else 'COPY'
            print(f'{mode}: {entry.orig_path} -> {path}', file=self.log)

            if self.config.dry_run:
                continue

            if not os.path.exists(entry.orig_path):
                print(f'SKIP MISSING SOURCE FILE: {entry.orig_path}',
                      file=self.log)
            elif not os.path.exists(path):
                os.makedirs(os.path.dirname(path), 0o755, exist_ok=True)
                shutil.copy2(entry.orig_path, path)
            else:
                print(
                    f'SKIP EXISTING TARGET {path} - SOURCE FILE: {entry.orig_path}',
                    file=self.log)

    def _print_stats(self):
        """Print the counters, but only if ``_read_from_db`` did real work."""
        if not self._print_count:
            return

        done = self._count['n'] + self._count['d'] + self._count['r']
        print('Stats: ')
        print(f'* NEW      : {self._count["n"]}')
        print(f'* DUP      : {self._count["d"]}')
        print(f'* REJECTED : {self._count["r"]}')
        print(f'# DONE     : {done}')
Example #7
0
class ImageSelector:
    """Run every photo entry of the database through the filter.

    Entries the filter cannot decide are either handed to the interactive
    input reader or counted as skipped, depending on ``INTERACTIVE``.
    """

    INTERACTIVE = True

    def __init__(self, filter_filename: str, sqlite_filename: str):
        """Set up the filter, the database and the input reader.

        Args:
            filter_filename: Passed to ``Filter`` and kept on the instance.
            sqlite_filename: Passed to ``FileDatabase`` and kept on the
                instance.
        """
        self.filter_filename = filter_filename
        self.filter = Filter(filter_filename)
        self.sqlite_filename = sqlite_filename
        self.db = FileDatabase(sqlite_filename)
        self.read_to_filter = ProcessInputToFilter(self.filter, self.__class__.__name__)
        self.moved = dict()
        self._selected_for_move = list()
        self._selected_hash = dict()  # (uppercase-name, file-size, date, checksum) => true
        self._count = dict(n=0, r=0, d=0, s=0)  # new, reject, dup, skipped

    def run(self):
        """Classify all photo entries, then print the statistics.

        The loop stops early when the input reader returns a false value.
        """
        for entry in self.db.iterate_photo_entries():
            if entry.key in self._selected_hash:
                self._count['d'] += 1
                continue

            self.read_to_filter.set_entry(entry)
            decision = self.filter.decide(entry)

            if decision == FilterResult.UNSPECIFIED:
                if self.INTERACTIVE:
                    # NOTE(review): the entry is asked about once and is
                    # never re-evaluated or counted afterwards. This matches
                    # the previous behaviour, where an unconditional
                    # `break` left a `while` loop after its first pass;
                    # that `break` may have been meant to run only on quit.
                    # Confirm the intent before changing it.
                    if not self.read_to_filter.read_and_process_line():
                        break
                else:
                    self._count['s'] += 1
            elif decision == FilterResult.ACCEPT:
                self._select_entry(entry)
            else:
                self._count['r'] += 1

        self._print_stats()

    def _select_entry(self, entry: FileEntry) -> bool:
        """Record *entry* as selected and count it as new. Returns True."""
        self._selected_for_move.append(entry)
        self._selected_hash[entry.key] = True
        self._count['n'] += 1
        return True

    def _print_stats(self):
        """Print all counters plus the decided ('DONE') and overall totals."""
        done = self._count['n'] + self._count['d'] + self._count['r']
        total = done + self._count['s']

        print('Stats: ')
        print(f'* NEW      : {self._count["n"]}')
        print(f'* DUP      : {self._count["d"]}')
        print(f'* REJECTED : {self._count["r"]}')
        print(f'* SKIP     : {self._count["s"]}')
        print(f'# DONE     : {done}')
        print(f'# TOTAL    : {total}')
Example #8
0
class SafeEraser:
    """Delete source files whose copy is confirmed to exist in the target tree.

    The work is split into the ordered ``steps`` built in ``__init__``:
    collect the copied entries, look for further duplicates of them, then
    remove the collected paths.
    """

    MOD_DATE_MAX_DIFF = 5  # seconds between mtime and EXIF date/time original entry
    # Layout used to parse the 'Date/Time Original' value printed by exiftool.
    EXIF_DATE_FORMAT = '%Y:%m:%d %H:%M:%S'

    def __init__(self, config: SafeEraserConfig):
        """Create the collaborators, bookkeeping containers and step list.

        Args:
            config: Settings object. This class reads its
                ``sqlite_filename``, ``log_file``, ``exiftool`` and
                ``dry_run`` attributes.
        """
        self.config = config
        self.db = FileDatabase(self.config.sqlite_filename)
        self.log = open(self.config.log_file, 'a')
        self.copied: typing.Dict[str, FileEntry] = dict()  # target path => entry
        self.safe_to_delete = list()  # original paths scheduled for removal
        self.known_rows = set()  # entry ids that need no further inspection
        self.copied_hashes = dict()  # entry.key => copied entry
        self.copied_hashes_without_date = dict()  # entry.key_without_date => copied entry

        self.steps: typing.List[Step] = [
            Step(self._read_from_db, 'Read copied file entries'),
            Step(self._find_duplicates, 'Find duplicates of copied files from DB'),
            Step(self._erase_duplicates, 'Remove duplicates from file sytem'),
        ]

    def run(self):
        """Run all steps in order."""
        self._run_steps()

    def _report(self, message: str):
        """Write *message* to stdout and to the log file."""
        print(message)
        print(message, file=self.log)

    def _run_steps(self):
        """Announce each step on stdout and in the log, then execute it."""
        total = len(self.steps)

        for number, step in enumerate(self.steps, start=1):
            self._report(f' * Step {number} of {total}: {step.help}')
            step.func()

    def _exif_original_date(self, path: str) -> typing.Optional[str]:
        """Return the 'Date/Time Original' value exiftool reports for *path*.

        Returns None when exiftool exits with an error or prints no such
        line.
        """
        try:
            output = subprocess.check_output(
                [self.config.exiftool, '-t', path]).decode('UTF-8')
        except subprocess.CalledProcessError:
            return None

        # '-t' output is tab separated: "<tag name>\t<value>".
        for line in output.splitlines(keepends=False):
            if line.startswith('Date/Time Original'):
                return line.split('\t')[1]
        return None

    def _read_from_db(self):
        """Collect the entries whose copy exists at its final target path."""
        for path, entry in self.db.iterate_target_path_entries():
            parts = path.split(os.path.sep)
            # parts is like (a, directory, name, to, somewhere, year, month, day, filename)
            # The last four fields are rebuilt into the final directory layout.
            prefix = os.path.sep.join(parts[:-4])
            year, month, day, filename = parts[-4:]
            real_path = os.path.join(prefix, year, f'{year}-{month}', f'{year}-{month}-{day}', filename)

            if not os.path.exists(real_path):
                print(f'*** NOT Found {real_path}, original (and dups) CANNNOT be deleted from {entry.orig_path}',
                      file=self.log)
                continue

            print(f'Found {real_path}, original can be deleted from {entry.orig_path}', file=self.log)

            self.copied[real_path] = entry
            self.safe_to_delete.append(entry.orig_path)
            self.known_rows.add(entry.id)
            self.copied_hashes[entry.key] = entry
            self.copied_hashes_without_date[entry.key_without_date] = entry

        self._report(f'Found {len(self.safe_to_delete)} files for deletion')

    def _find_duplicates(self):
        """Schedule further existing files that duplicate a copied entry.

        An entry qualifies when its full key matches a copied entry, or
        when its key without date matches and its mtime is within
        ``MOD_DATE_MAX_DIFF`` seconds of its EXIF date, optionally shifted
        by 3600 seconds.
        """
        count = 0
        for entry in self.db.iterate_photo_entries():
            if entry.id in self.known_rows:
                continue

            if not os.path.exists(entry.orig_path):
                self.known_rows.add(entry.id)
                continue

            if entry.key in self.copied_hashes:
                print(f'! DUPLICATE found at {entry.orig_path}, original: {self.copied_hashes[entry.key].orig_path}',
                      file=self.log)
                self.known_rows.add(entry.id)
                self.safe_to_delete.append(entry.orig_path)
                count += 1
                continue

            if entry.key_without_date not in self.copied_hashes_without_date:
                continue

            date_new = self._exif_original_date(entry.orig_path)
            if date_new is None:
                print(f'** Missing exif date info in file {entry.orig_path}', file=self.log)
                continue

            date_new_ts = time.mktime(time.strptime(date_new, self.EXIF_DATE_FORMAT))
            drift = abs(date_new_ts - entry.mod_date)

            # Accept an (almost) exact match, or one that is off by an hour.
            if drift < self.MOD_DATE_MAX_DIFF or abs(drift - 3600) < self.MOD_DATE_MAX_DIFF:
                original = self.copied_hashes_without_date[entry.key_without_date]
                print(
                    f'. DUPLICATE found at {entry.orig_path}, original: ' + original.orig_path +
                    f' date diff is {entry.mod_date - date_new_ts}',
                    file=self.log)
                self.known_rows.add(entry.id)
                self.safe_to_delete.append(entry.orig_path)
                count += 1

        self._report(f'Found {count} more files for deletion')

    def _erase_duplicates(self):
        """Remove every scheduled path and report what happened.

        With ``config.dry_run`` set nothing is removed; every path is
        logged and counted as if it had been deleted. Otherwise a path is
        counted as deleted only when ``os.unlink`` succeeded; failures are
        reported and summarised on a separate line.
        """
        total = len(self.safe_to_delete)
        self._report(f'Deleting {total} files')

        deleted = 0
        missing = 0
        failed = 0
        for path in self.safe_to_delete:
            if not self.config.dry_run and not os.path.exists(path):
                print(f'ALREADY DELETED {path}', file=self.log)
                missing += 1
                continue

            print(f'DELETE {path}', file=self.log)
            if self.config.dry_run:
                deleted += 1
                continue

            try:
                os.unlink(path)
            except (OSError, ValueError) as e:
                # Keep going: one undeletable file must not stop the run,
                # but it must not be reported as deleted either.
                self._report(f'Exception occurred; class={e.__class__.__name__}, msg={e}, path={path}')
                failed += 1
            else:
                deleted += 1

        self._report(f'Deleted {deleted} of {total} files; already removed (missing) {missing} files')
        if failed:
            self._report(f'Failed to delete {failed} files')