Example #1
0
def index_local(paths):
    """
    Index all local files.

    Indexing files enables analyses on them such as statistics and dupe detection.
    The given paths must be configured paths in the user's .backuper/config.json
    If no paths are given, then all configured paths are indexed.
    """
    wanted = set(paths)

    index_path = cfg.get_relative_to_config_path('local.index.tsv')
    print(index_path)

    config = cfg.load()
    base_path = config['base_path']
    base_excludes = config['excludes']

    with open(index_path, 'w', newline='') as out:
        tsv = csv.writer(out, delimiter='\t')
        tsv.writerow(['config_path', 'rel_path'])

        for path_spec, _, dir_path, _ in iter_sync_dirs():
            cfg_path = path_spec['path']

            # An empty selection means "index every configured path";
            # otherwise skip any configured path the caller did not ask for.
            if wanted and cfg_path not in wanted:
                continue

            for abs_path in files.scan_files(dir_path, excludes=base_excludes):
                tsv.writerow([cfg_path, os.path.relpath(abs_path, start=dir_path)])
Example #2
0
def scan(dir_, min_size):
    """
    Hash every file under *dir_* that is at least *min_size* bytes and is not
    already recorded in redis.

    For each newly hashed file the MD5 digest is stored two ways:
      path:<abspath> -> md5          (lookup by path)
      hash:<md5>     -> [abspath...] (lookup by digest, e.g. for dupe detection)
    The absolute path of each newly hashed file is printed to stdout; progress
    and error messages go through log.err.

    Fixes relative to the previous revision:
      - removed the dead ``global last_progress`` declaration (the name was
        never read or written in this function);
      - guarded ``os.stat`` so a file that disappears between the directory
        scan and the stat call is skipped instead of aborting the whole scan,
        matching the best-effort OSError handling already used for reads.
    """
    for abspath in files.scan_files(dir_, progress_interval_sec=PROGRESS_INTERVAL_SEC):

        try:
            size = os.stat(abspath).st_size
        except OSError as e:
            # File vanished or became unreadable mid-scan; skip it.
            log.err('  ', abspath, e)
            continue

        if size < min_size:
            # Too small
            continue

        if redis.get('path:' + abspath):
            # Already hashed
            continue

        log.err('Hashing', abspath)

        with open(abspath, 'rb') as file:
            md5er = hashlib.md5()
            try:
                # Buffering is required for large files because a single read crashes python (maybe only on OSX).
                # I think that it should not and is a bug, but there is no consensus nor documentation to indicate
                # intended behavior.
                for buf in iter(partial(file.read, BUFFER_THRESH), b''):
                    md5er.update(buf)
                md5 = md5er.hexdigest()

            except OSError as e:
                log.err('  ', file, e)
                continue

        log.err('  ', os.path.relpath(abspath, start=os.getcwd()), md5)
        redis.set('path:' + abspath, md5)
        redis.rpush('hash:' + md5, abspath)
        print(abspath)