import logging
from os import path, makedirs
from shutil import copy, move

from lib.fs_utils import md5, stat
from lib.image_utils import autorotate


def import_file(from_path, date, home, index, dry_run, mv):
    # Files are archived under <home>/<YYYY-MM>/<YYYY-MM-DD>/.
    ym = date[:7]
    dir = path.join(home, ym, date)
    to_path = path.join(dir, path.basename(from_path))
    rel_path = path.relpath(to_path, home)
    logging.info('importing %s to %s (date %s)' % (from_path, rel_path, date))
    if dry_run:
        return
    if not path.exists(dir):
        makedirs(dir)
    if mv:
        move(from_path, dir)
    else:
        copy(from_path, dir)
    # autorotate() returns a path to the untouched original when it rotates
    # the image, and a falsy value otherwise.
    original_path = autorotate(to_path)
    if original_path:
        logging.info('auto rotated %s. original at %s' % (to_path, original_path))
        md5_original = md5(original_path)
    else:
        md5_original = None
    mtime, size = stat(to_path)
    index.add(from_path, rel_path, md5(to_path), mtime, size, date,
              md5_original=md5_original)
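# --- Hypothetical usage sketch (not part of the original code) --------------
# Calling import_file with dry_run=True only logs where the file would land;
# the incoming path and the date (normally taken from e.g. EXIF data) are
# assumptions:
#
#   from lib.index import Index
#
#   home = '/home/jongman/data/pictures-backup'
#   with Index(path.join(home, 'pictures.db')) as index:
#       import_file('/tmp/IMG_0001.jpg', '2015-03-14', home, index,
#                   dry_run=True, mv=False)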
def update_changed_files(args, changed, by_path, index):
    for pth in changed:
        print 'changed', pth
        full_path = path.join(args.archive_dir, pth)
        hash = md5(full_path)
        mtime, size = stat(full_path)
        rowid = by_path[pth]['rowid']
        index.set(rowid, md5=hash, mtime=mtime, filesize=size)
def filter_duplicates(to_import, index):
    # Returns (new, duplicates): files whose content is not in the index yet,
    # and a map from each duplicate file to the archived path it matches.
    logging.info('Filtering duplicates ..')
    duplicates, by_md5, new = {}, {}, []
    for i, f in enumerate(to_import):
        if i % 100 == 99:
            logging.info('Checking %d of %d ..', i + 1, len(to_import))
        hash = md5(f)
        # A file is a duplicate if its md5 matches an indexed file (or the
        # pre-rotation original of one), or another file in this batch.
        existing = index.get(md5=hash) or index.get(md5_original=hash)
        if existing:
            duplicates[f] = existing[0]['path']
        elif hash in by_md5:
            duplicates[f] = by_md5[hash]
        else:
            by_md5[hash] = f
            new.append(f)
    return new, duplicates
def add_new_files(args, new, index):
    for new_path in new:
        full_path = path.join(args.archive_dir, new_path)
        hash = md5(full_path)
        mtime, size = stat(full_path)
        already = index.get(md5=hash)
        if already:
            if args.allow_duplicate:
                # Index the duplicate anyway.
                print 'duplicate-new', new_path, 'with', already[0]['path']
            else:
                # Skip files whose content is already indexed.
                print 'duplicate-ignore', new_path, 'with', already[0]['path']
                continue
        else:
            print 'new', new_path
        index.add(origin=full_path, path=new_path, md5=hash, mtime=mtime,
                  filesize=size)
def detect_moved_files(args, new, missing, by_path):
    # A file counts as moved when a missing path and a new path share the same
    # md5. Matched paths are removed from both `new` and `missing`.
    missing_md5 = {by_path[missing_path]['md5']: missing_path
                   for missing_path in missing}
    matched_new = set()
    moved = {}
    print 'matching moved files ..'
    for i, new_path in enumerate(sorted(new)):
        if i % 1000 == 999:
            print 'processing', i, '/', len(new), '..', new_path.encode('utf-8')
        hash = md5(path.join(args.archive_dir, new_path))
        if hash in missing_md5:
            from_path = missing_md5[hash]
            to_path = new_path
            moved[from_path] = to_path
            matched_new.add(to_path)
            del missing_md5[hash]
    new = {pth for pth in new if pth not in matched_new}
    missing = {pth for pth in missing if pth not in moved}
    return new, missing, moved
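# --- Hypothetical driver sketch (not part of the original code) -------------
# One way the helpers above could be combined into a single sync pass. How
# `new`, `missing`, `changed` and `by_path` are derived from walking the
# archive is an assumption here, not taken from the original scripts.
def sync_archive(args, index, new, missing, changed, by_path):
    # Resolve renames first so moved files are not treated as new + missing.
    new, missing, moved = detect_moved_files(args, new, missing, by_path)
    for from_path, to_path in sorted(moved.items()):
        print 'moved', from_path, '->', to_path
    update_changed_files(args, changed, by_path, index)
    add_new_files(args, new, index)
    for pth in sorted(missing):
        print 'missing', pth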
#!/usr/bin/python
# -*- coding: utf-8 -*-
from os import path
from sys import argv

from lib.fs_utils import flat_walk, md5, stat
from lib.index import Index
from lib.config import should_index

home = argv[1]

with Index(path.join(home, 'pictures.db')) as index:
    all = flat_walk(home)
    for file in filter(should_index, all):
        hash = md5(file)
        mtime, size = stat(file)
        pth = path.relpath(file, home)
        print 'indexing', pth, mtime, size
        index.add(origin=file, path=pth, mtime=mtime, filesize=size, md5=hash)
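# Hypothetical invocation of the indexing script above (the script's file name
# is an assumption; the archive root is its single command-line argument):
#
#   python build_index.py /home/jongman/data/pictures-backup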
from os import path

from lib.index import Index
from lib.image_utils import autorotate
from lib.fs_utils import stat, md5

# Backfill md5_original for rows whose pre-rotation '.original' file exists on
# disk but has not been hashed yet.
with Index('/home/jongman/data/pictures-backup/pictures.db', autocommit=True) as index:
    all = index.get()
    for i, a in enumerate(sorted(all, key=lambda p: p['path'])):
        if i % 100 == 99:
            print i, '/', len(all), '..', a['path']
        pth = '/home/jongman/data/pictures-backup/' + a['path'] + '.original'
        if path.exists(pth) and a['md5_original'] is None:
            original_md5 = md5(pth)
            index.set(a['rowid'], md5_original=original_md5)
            print 'updated', pth
from lib.index import Index
from lib.image_utils import autorotate
from lib.fs_utils import stat, md5

# Auto-rotate every archived picture in place and refresh its index entry;
# paths that fail are written to errors.txt.
with open('errors.txt', 'w') as errors:
    with Index('/Volumes/Passport/pictures-backup/pictures.db', autocommit=True) as index:
        all = index.get()
        for i, a in enumerate(sorted(all, key=lambda p: p['path'])):
            if i % 100 == 99:
                print i, '/', len(all), '..', a['path']
            pth = '/Volumes/Passport/pictures-backup/' + a['path']
            try:
                if autorotate(pth):
                    mtime, size = stat(pth)
                    hash = md5(pth)
                    index.set(a['rowid'], mtime=mtime, filesize=size, md5=hash)
                    print 'updated', pth
            except Exception:
                errors.write('%s\n' % pth.encode('utf-8'))