Beispiel #1
0
def write_file_groups(testdir, sizes, group_size=None):
    """Write one directory of equally sized files per entry in `sizes`.

    For each file size (bytes) a dir ``{testdir}/filesize_{size_str}`` is
    created and filled with ``group_size // size`` files named
    ``file_0, file_1, ...``, so each dir holds roughly `group_size` bytes.

    Parameters
    ----------
    testdir : str
        parent directory for all group dirs
    sizes : sequence of numbers
        file sizes in bytes, each is converted with ``int()``
    group_size : number, optional
        target size of each group dir in bytes, must be >= ``max(sizes)``.
        Default is ``max(sizes)``, i.e. the group with the largest file
        size gets exactly one file.

    Returns
    -------
    group_dirs, files
        list of all group dirs and list of the file names written by this
        call. A group dir that already exists is listed in ``group_dirs``
        but left untouched, so its files are *not* part of ``files``.
    """
    largest = max(sizes)
    if group_size is None:
        group_size = largest
    else:
        assert group_size >= largest, \
                f"{size2str(group_size)} < {size2str(largest)}"

    group_dirs = []
    files = []
    for raw_size in sizes:
        nbytes = int(raw_size)
        group_dir = pj(testdir, f'filesize_{size2str(nbytes)}')
        group_dirs.append(group_dir)

        # Don't touch groups left over from a previous run.
        if os.path.exists(group_dir):
            print(f'    dir already present: {group_dir}')
            continue

        os.makedirs(group_dir, exist_ok=True)
        nfiles = int(group_size) // nbytes
        assert nfiles >= 1
        for idx in range(nfiles):
            fn = pj(group_dir, f'file_{idx}')
            write(fn, nbytes)
            files.append(fn)
    return group_dirs, files
Beispiel #2
0
def bench_main_blocksize_filesize(tmpdir, maxsize):
    """Set up the block size / file size benchmarks of ``main.main``.

    Three studies are generated:

    ``main_blocksize_single``
        one file of size `maxsize`, block sizes from
        ``bytes_logspace(10*KiB, min(200*MiB, maxsize), 20)``
    ``main_filesize_single``
        single files with sizes from
        ``bytes_linspace(min(1*MiB, maxsize//2), maxsize, 5)``, fixed block
        size of 256 KiB
    ``main_blocksize``
        the file collection written by ``write_collection``, same block size
        scan as in the first study

    Parameters
    ----------
    tmpdir : str
        directory below which the test files are written
    maxsize : int
        upper limit for file sizes (bytes)

    Returns
    -------
    stmt, params, kwds
        statement template, parameter sets built with ``ps.pgrid`` and an
        empty dict
    """
    stmt = textwrap.dedent("""
        cfg.update(blocksize={blocksize})
        main.main({files_dirs})
        """)

    blocksize_upper = min(200*MiB, maxsize)

    # (study name, file sizes, block sizes) for the single file studies
    single_file_studies = [
        ('main_blocksize_single',
         np.array([maxsize]),
         bytes_logspace(10*KiB, blocksize_upper, 20)),
        ('main_filesize_single',
         bytes_linspace(min(1*MiB, maxsize//2), maxsize, 5),
         np.array([256*KiB])),
    ]

    params = []
    for study, filesizes, blocksizes in single_file_studies:
        testdir = mkdtemp(dir=tmpdir, prefix=study + '_')
        files = write_single_files(testdir, filesizes)
        params.extend(ps.pgrid(
            zip(ps.plist('filesize', filesizes),
                ps.plist('filesize_str', [size2str(x) for x in filesizes]),
                ps.plist('files_dirs', [[fn] for fn in files])),
            ps.plist('study', [study]),
            ps.plist('maxsize_str', [size2str(maxsize)]),
            zip(ps.plist('blocksize', blocksizes),
                ps.plist('blocksize_str',
                         [size2str(x) for x in blocksizes]))))

    # whole collection, scan block size only
    study = 'main_blocksize'
    testdir, _, _ = write_collection(maxsize, tmpdir=tmpdir, study=study)
    blocksizes = bytes_logspace(10*KiB, blocksize_upper, 20)
    params.extend(ps.pgrid(
        ps.plist('files_dirs', [[testdir]]),
        ps.plist('study', [study]),
        ps.plist('maxsize_str', [size2str(maxsize)]),
        zip(ps.plist('blocksize', blocksizes),
            ps.plist('blocksize_str', [size2str(x) for x in blocksizes]))))
    return stmt, params, {}
Beispiel #3
0
def bench_main_parallel(tmpdir, maxsize):
    """Set up the ``main_parallel`` benchmark of ``main.main``.

    For ``share_leafs`` True and False, scan the number of threads
    (``nprocs=1``, ``pool_type='thread'``) and then the number of processes
    (``nthreads=1``, ``pool_type='proc'``) over ``1...MAXWORKERS``. The
    scanned count is stored a second time as ``nworkers``. The block size
    is fixed to 256 KiB.

    Parameters
    ----------
    tmpdir : str
        directory below which ``write_collection`` writes the test files
    maxsize : int
        passed on to ``write_collection``

    Returns
    -------
    stmt, params, kwds
        statement template, parameter sets built with ``ps.pgrid`` and an
        empty dict
    """
    stmt = textwrap.dedent("""
        cfg.update(blocksize={blocksize},
                   nthreads={nthreads},
                   nprocs={nprocs},
                   share_leafs={share_leafs})
        main.main({files_dirs})
        """)

    study = 'main_parallel'
    testdir, _, _ = write_collection(maxsize, tmpdir=tmpdir, study=study)
    blocksize = np.array([256*KiB])
    worker_counts = range(1, MAXWORKERS+1)

    # (pool type, name of the scanned count, name of the count fixed to 1)
    pool_setups = (('thread', 'nthreads', 'nprocs'),
                   ('proc', 'nprocs', 'nthreads'))

    params = []
    for share_leafs in (True, False):
        for pool_type, scanned, fixed in pool_setups:
            params.extend(ps.pgrid(
                ps.plist('files_dirs', [[testdir]]),
                ps.plist('study', [study]),
                zip(ps.plist(scanned, worker_counts),
                    ps.plist('nworkers', worker_counts)),
                ps.plist(fixed, [1]),
                ps.plist('pool_type', [pool_type]),
                ps.plist('maxsize_str', [size2str(maxsize)]),
                ps.plist('share_leafs', [share_leafs]),
                zip(ps.plist('blocksize', blocksize),
                    ps.plist('blocksize_str',
                             [size2str(x) for x in blocksize]))))
    return stmt, params, {}
Beispiel #4
0
 def __init__(self, path, alias=None, tmpdir='/tmp/findsame_datadir_cache'):
     """Collect the file sizes below `path`, using an on-disk cache.

     The cache file lives in `tmpdir` and is named after `path` with each
     ``/`` replaced by ``_``, plus the extension ``.npy``. If it exists,
     the sizes are loaded from it. Otherwise they are determined with
     ``collect_file_sizes`` and written to the cache file (`tmpdir` is
     created if needed).

     Parameters
     ----------
     path : str
         directory (or file) to collect sizes for
     alias : optional
         stored as ``self.alias``, not used here
     tmpdir : str
         directory holding the cache files
     """
     self.path = path
     self.alias = alias

     cache_name = path.replace('/','_')
     cache_fn = os.path.join(tmpdir, cache_name) + '.npy'
     if not os.path.exists(cache_fn):
         # no cache yet: scan and store the result for the next run
         self.sizes = collect_file_sizes([self.path])
         os.makedirs(tmpdir, exist_ok=True)
         np.save(cache_fn, self.sizes)
     else:
         self.sizes = np.load(cache_fn)

     self.cache_fn = cache_fn
     total = self.sizes.sum()
     self.size_str = co.size2str(total)
Beispiel #5
0
def bench_hash_file_parallel(tmpdir, maxsize):
    """Set up the ``hash_file_parallel`` benchmark.

    The statement maps ``_worker_bench_hash_file_parallel`` over all files
    written by ``write_collection`` using one of several pool executors
    from ``pl``. Each pool type except ``'seq'`` is scanned over
    ``nworkers=1...MAXWORKERS``; ``'seq'`` with ``nworkers=1`` is added as
    the non-pool reference.

    Parameters
    ----------
    tmpdir : str
        directory below which ``write_collection`` writes the test files
    maxsize : int
        passed on to ``write_collection``

    Returns
    -------
    stmt, params, kwds
        statement template, parameter sets and a dict with the keys
        ``setup`` (``cache_flush_setup``) and ``globals`` (names used by
        the statement)
    """
    study = 'hash_file_parallel'
    testdir, _, files = write_collection(maxsize, tmpdir=tmpdir, study=study)

    # pool_type -> callable taking the number of workers
    pool_map = {
        'seq': pl.SequentialPoolExecutor,
        'thread': pl.ThreadPoolExecutor,
        'proc': pl.ProcessPoolExecutor,
        'proc,thread=1': lambda nw: pl.ProcessAndThreadPoolExecutor(nw, 1),
        'thread,proc=1': lambda nw: pl.ProcessAndThreadPoolExecutor(1, nw),
    }

    # names available to the statement
    ctx = {
        'pool_map': pool_map,
        'pl': pl,
        'files': files,
        'worker': _worker_bench_hash_file_parallel,
    }

    stmt = """
with pool_map['{pool_type}']({nworkers}) as pool:
    x=list(pool.map(worker, files))
    """

    pooled_types = [name for name in pool_map if name != 'seq']
    params = []
    params.extend(ps.pgrid(
        ps.plist('pool_type', pooled_types),
        ps.plist('nworkers', range(1, MAXWORKERS+1)),
        ps.plist('study', [study]),
        ps.plist('maxsize_str', [size2str(maxsize)]),
    ))
    # non-pool reference
    params.append({'study': study, 'pool_type': 'seq', 'nworkers': 1,
                   'maxsize_str': size2str(maxsize)})

    return stmt, params, {'setup': cache_flush_setup, 'globals': ctx}
Beispiel #6
0
def hist(_xlst, bins=100, norm=False, shift_fac=0.8, labels=None,
         logx=True, ax=None, logbase=10, density=False):
    """As in plt.hist, plot one histogram for each x in `_xlst`, but use
    x-axis log scale if logx=True (plt.hist(..., log=True) applies to y).
    Use step plots for each histogram, and shift them along y if
    shift_fac > 0.

    Parameters
    ----------
    _xlst : list of 1d arrays or single 1d np.ndarray
        A single array is treated as a list of length one.
    bins, norm, logx, logbase, density
        Passed on to ``histogram``, which computes counts and bin edges
        (norm: optional normalization to sum of bin areas = 1 -- as stated
        by the original docs, implemented in ``histogram``).
    shift_fac : float
        Each histogram is shifted up by ``shift_fac`` times the maxima of
        all histograms plotted before it.
    labels : sequence of str, optional
        One legend label per histogram, same length as `_xlst`.
    logx : bool
        Also switches the x-axis to log scale with base `logbase`.
    ax : matplotlib Axes, optional
        Axes to plot into. If None, a new figure and axes are created.

    Returns
    -------
    fig, ax

    Notes
    -----
    When logx=True, empty files are excluded b/c of the log scale (done in
    ``histogram`` according to the original notes).

    When len(_xlst) > 1 and shift_fac > 0, histograms are shifted along y for
    better visibility. In that case we turn off y ticks (the bin counts)
    since they make no sense in that case.
    """
    xlst = [_xlst] if isinstance(_xlst, np.ndarray) else _xlst
    if labels is not None:
        assert len(xlst) == len(labels)

    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()

    lastmax = 0.0
    for ii, xi in enumerate(xlst):
        hh, be = histogram(xi, bins=bins, logx=logx, norm=norm,
                           logbase=logbase, density=density)
        label = None if labels is None else labels[ii]
        # plot counts at the bin centers
        centers = be[:-1] + 0.5*np.diff(be)
        ax.step(centers, hh + lastmax, label=label, lw=2, where='mid')
        lastmax += hh.max() * shift_fac

    if logx:
        # Matplotlib renamed the keyword ``basex`` to ``base`` (3.3) and
        # later removed ``basex``. Try the current name first and fall back
        # to the old one for old Matplotlib versions, which reject ``base``.
        try:
            ax.set_xscale('log', base=logbase)
        except (TypeError, ValueError):
            ax.set_xscale('log', basex=logbase)
    ax.set_xticklabels([co.size2str(int(x)) for x in ax.get_xticks()])
    if len(xlst) > 1 and shift_fac > 0:
        ax.set_yticklabels([])
        ax.set_yticks([])
    if labels is not None:
        ax.legend()
    return fig, ax
Beispiel #7
0
def bench_main_parallel_2d(tmpdir, maxsize):
    """Set up the ``main_parallel_2d`` benchmark of ``main.main``.

    Scans the full 2d grid of ``nthreads`` and ``nprocs``, each over
    ``1...MAXWORKERS``, with a fixed block size of 256 KiB, on the file
    collection written by ``write_collection``.

    Parameters
    ----------
    tmpdir : str
        directory below which ``write_collection`` writes the test files
    maxsize : int
        passed on to ``write_collection``

    Returns
    -------
    stmt, params, kwds
        statement template, parameter sets built with ``ps.pgrid`` and an
        empty dict
    """
    stmt = textwrap.dedent("""
        cfg.update(blocksize={blocksize},
                   nthreads={nthreads},
                   nprocs={nprocs})
        main.main({files_dirs})
        """)

    study = 'main_parallel_2d'
    testdir, _, _ = write_collection(maxsize, tmpdir=tmpdir, study=study)
    blocksize = np.array([256*KiB])
    worker_counts = range(1, MAXWORKERS+1)

    params = []
    params.extend(ps.pgrid(
        ps.plist('files_dirs', [[testdir]]),
        ps.plist('study', [study]),
        ps.plist('nthreads', worker_counts),
        ps.plist('nprocs', worker_counts),
        ps.plist('maxsize_str', [size2str(maxsize)]),
        zip(ps.plist('blocksize', blocksize),
            ps.plist('blocksize_str', [size2str(x) for x in blocksize]))))
    return stmt, params, {}
Beispiel #8
0
def test_size_str():
    """Round trips between ``co.size2str`` and ``co.str2size``.

    A size converted to a string with high precision (``prec=30``) must
    convert back to the same number. ``None`` / ``'None'`` must survive
    the round trip in both directions.
    """
    # one fixed value just below 1024 and one random value
    for nbytes in (1023, random.randint(1000, 300000000000)):
        as_str = co.size2str(nbytes, prec=30)
        assert co.str2size(as_str) == nbytes

    none_size = co.str2size('None')
    assert co.size2str(none_size) == 'None'
    none_str = co.size2str(None)
    assert co.str2size(none_str) is None