def index_local(paths):
    """Index all local files into a TSV file.

    Indexing files enables analyses on them such as statistics and dupe
    detection. The given paths must be configured paths in the user's
    .backuper/config.json. If no paths are given, then all configured
    paths are indexed.

    The index is written to 'local.index.tsv' (relative to the config
    directory) with one header row and one (config_path, rel_path) row
    per file found.

    :param paths: iterable of configured path strings to index; falsy/empty
        means "index every configured path".
    """
    path_set = set(paths)
    tsv_file_path = cfg.get_relative_to_config_path('local.index.tsv')
    # Report where the index is being written so the user can find it.
    print(tsv_file_path)
    config = cfg.load()
    base_excludes = config['excludes']
    # newline='' is required by the csv module so it controls line endings.
    with open(tsv_file_path, 'w', newline='') as fp:
        writer = csv.writer(fp, delimiter='\t')
        writer.writerow(['config_path', 'rel_path'])
        for (path_spec, _, dir_path, _) in iter_sync_dirs():
            cur_cfg_path = path_spec['path']
            # Empty `paths` means include all configured paths.
            if not paths or cur_cfg_path in path_set:
                for abs_path in files.scan_files(dir_path, excludes=base_excludes):
                    # Store paths relative to the sync dir so the index is
                    # portable across machines with different base paths.
                    writer.writerow([cur_cfg_path, os.path.relpath(abs_path, start=dir_path)])
def scan(dir_, min_size):
    """Scan `dir_` and record the MD5 hash of every sufficiently large file.

    For each file of at least `min_size` bytes that has not already been
    hashed, compute its MD5 and store it in redis under two keys:
    'path:<abspath>' -> md5 (for the already-hashed check) and
    'hash:<md5>' -> list of paths (for dupe detection).

    Files that disappear or become unreadable mid-scan are logged and
    skipped rather than aborting the whole scan.

    :param dir_: directory to scan recursively.
    :param min_size: minimum file size in bytes; smaller files are skipped.
    """
    for abspath in files.scan_files(dir_, progress_interval_sec=PROGRESS_INTERVAL_SEC):
        try:
            size = os.stat(abspath).st_size
        except OSError as e:
            # File vanished (or became unreadable) between listing and stat.
            log.err(' ', abspath, e)
            continue
        if size < min_size:
            # Too small
            continue
        elif redis.get('path:' + abspath):
            # Already hashed
            continue
        log.err('Hashing', abspath)
        with open(abspath, 'rb') as file:
            md5er = hashlib.md5()
            try:
                # Buffering is required for large files because a single read
                # crashes python (maybe only on OSX). I think that it should
                # not and is a bug, but there is no consensus nor
                # documentation to indicate intended behavior.
                for buf in iter(partial(file.read, BUFFER_THRESH), b''):
                    md5er.update(buf)
                md5 = md5er.hexdigest()
            except OSError as e:
                # Log the path, not the file object (whose repr is useless).
                log.err(' ', abspath, e)
                continue
        log.err(' ', os.path.relpath(abspath, start=os.getcwd()), md5)
        redis.set('path:' + abspath, md5)
        redis.rpush('hash:' + md5, abspath)
        print(abspath)