Code example #1
File: qg_nsubs.py Project: slowmoyang/EnergyFlow
def load(num_data=-1, cache_dir=None):
    """Loads the dataset. The first time this is called, it will automatically
    download the dataset. Future calls will attempt to use the cached dataset 
    prior to redownloading.

    **Arguments**

    - **num_data** : _int_
        - The number of events to return. A value of `-1` means read in all events.
    - **cache_dir** : _str_
        - The directory in which to store or look for the file.

    **Returns**

    - _3-d numpy.ndarray_, _1-d numpy.ndarray_
        - The `X` and `y` components of the dataset as specified above.
    """

    fpath = _get_file(
        'QG_nsubs.npz',
        url='https://www.dropbox.com/s/y1l6avj5yj7jn9t/QG_nsubs.npz?dl=1',
        file_hash='a99f771147af9b207356c990430cfeba6b6aa96fe5cff8263450ff3a31ab0997',
        cache_dir=cache_dir)

    f = np.load(fpath)
    X, y = f['X'], f['y']
    f.close()

    if num_data > -1:
        X, y = X[:num_data], y[:num_data]

    return X, y
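
A minimal usage sketch for this loader (hypothetical: it assumes the function is exposed as `energyflow.qg_nsubs.load`, as in the upstream EnergyFlow package, and that the snippet's module provides `import numpy as np` and the private `_get_file` helper):

# Hypothetical usage; the module path is an assumption, not part of the snippet
from energyflow import qg_nsubs

X, y = qg_nsubs.load(num_data=10000)  # first call downloads and caches the file
print(X.shape, y.shape)  # per-event N-subjettiness observables and labels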
Code example #2
def load(num_data=-1, filename='QG_jets.npz', cache_dir=None):
    """Loads the dataset. The first time this is called, it will automatically
    download the dataset. Future calls will attempt to use the cached dataset 
    prior to redownloading.

    **Arguments**

    - **num_data** : _int_
        - The number of events to return. A value of `-1` means read in all events.
    - **filename** : _str_
        - The filename under which to store or look for the file.
    - **cache_dir** : _str_
        - The directory in which to store or look for the file.

    **Returns**

    - _3-d numpy.ndarray_, _1-d numpy.ndarray_
        - The `X` and `y` components of the dataset as specified above.
    """

    fpath = _get_file(
        filename,
        url='https://www.dropbox.com/s/fclsl7pukcpobsb/QG_jets.npz?dl=1',
        file_hash='3f27a02eab06e8b83ccc9d25638021e6e24c9361341730961f9d560dee12c257',
        cache_dir=cache_dir)

    f = np.load(fpath)
    X, y = f['X'], f['y']
    f.close()

    if num_data > -1:
        X, y = X[:num_data], y[:num_data]

    return X, y
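
A similar hypothetical usage sketch (assuming this loader is exposed as `energyflow.qg_jets.load`; the cache directory below is illustrative):

# Hypothetical usage; the cache path is an assumption
from energyflow import qg_jets

X, y = qg_jets.load(num_data=5000, cache_dir='/tmp/ef-cache')
print(X.shape)  # (5000, max_particles, num_features), events zero-padded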
Code example #3
def load(num_data=100000, cache_dir=None):
    """Loads samples from the dataset (which in total is contained in twenty files). 
    Any file that is needed that has not been cached will be automatically downloaded.
    Downloading a file causes it to be cached for later use. Basic checksums are
    performed.

    **Arguments**

    - **num_data** : _int_
        - The number of events to return. A value of `-1` means read in all events.
    - **cache_dir** : _str_
        - The directory in which to store or look for the files.

    **Returns**

    - _3-d numpy.ndarray_, _1-d numpy.ndarray_
        - The `X` and `y` components of the dataset as specified above.
    """

    if num_data > -1:
        num_files = int(np.ceil(num_data / num_per_file))
    else:
        num_files = max_num_files

    # handle request for too much data
    if num_files > max_num_files:
        warnings.warn(
            'More data requested than available. Providing the full dataset.')
        num_files = max_num_files
        num_data = -1

    Xs, ys = [], []
    for i in range(num_files):

        # preserve old first file
        filename = 'QG_jets_{}.npz'.format(i) if i > 0 else 'QG_jets.npz'

        f = np.load(
            _get_file(filename,
                      url=QG_jets_urls[i],
                      file_hash=QG_jets_hashes[i],
                      cache_dir=cache_dir))
        Xs.append(f['X'])
        ys.append(f['y'])
        f.close()

    max_len_axis1 = max([X.shape[1] for X in Xs])

    X = np.vstack([_pad_events_axis1(x, max_len_axis1) for x in Xs])
    y = np.concatenate(ys)

    if num_data > -1:
        X, y = X[:num_data], y[:num_data]

    return X, y
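
This version stitches several files together and depends on the private `_pad_events_axis1` helper to make the per-file arrays stackable. A minimal sketch of what such a padding helper plausibly does (the actual EnergyFlow implementation may differ in its details):

import numpy as np

def pad_events_axis1(events, axis1_len):
    # zero-pad a (num_events, num_particles, num_features) array along axis 1
    # so that every file's array shares the same second dimension
    num_zeros = axis1_len - events.shape[1]
    if num_zeros <= 0:
        return events
    zeros = np.zeros((events.shape[0], num_zeros, events.shape[2]))
    return np.concatenate((events, zeros), axis=1)

Padding each file's events up to the longest length found across files is what lets the final np.vstack produce a single 3-d array.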
Code example #4
def load(num_data=100000,
         generator='pythia',
         pad=True,
         with_bc=False,
         cache_dir='~/.energyflow'):
    """Loads samples from the dataset (which in total is contained in twenty 
    files). Any file that is needed that has not been cached will be 
    automatically downloaded. Downloading a file causes it to be cached for
    later use. Basic checksums are performed.

    **Arguments**

    - **num_data** : _int_
        - The number of events to return. A value of `-1` means read in all
        events.
    - **generator** : _str_
        - Specifies which Monte Carlo generator the events should come from.
        Currently, the options are `'pythia'` and `'herwig'`.
    - **pad** : _bool_
        - Whether to pad the events with zeros to make them the same length.
        Note that if set to `False`, the returned `X` array will be an object
        array and not a 3-d array of floats.
    - **with_bc** : _bool_
        - Whether to include jets coming from bottom or charm quarks. Changing
        this flag does not mask out these jets but rather accesses an entirely
        different dataset. The datasets with and without b and c quarks should
        not be combined.
    - **cache_dir** : _str_
        - The directory in which to store or look for the files. Note that 
        `'datasets'` is automatically appended to the end of this path.

    **Returns**

    - _3-d numpy.ndarray_, _1-d numpy.ndarray_
        - The `X` and `y` components of the dataset as specified above. If
        `pad` is `False` then these will be object arrays holding the events,
        each of which is a 2-d ndarray.
    """

    # check for valid options
    if generator not in GENERATORS:
        raise ValueError("'generator' must be in " + str(GENERATORS))

    # get number of files we need
    if num_data > -1:
        num_files = int(np.ceil(num_data / NUM_PER_FILE))
    else:
        num_files = MAX_NUM_FILES
    if num_files > MAX_NUM_FILES:
        warnings.warn(
            'More data requested than available. Providing the full dataset.')
        num_files = MAX_NUM_FILES
        num_data = -1

    # index into global variables
    bc = 'bc' if with_bc else 'nobc'
    urls = URLS[generator][bc]
    hashes = HASHES[generator][bc]

    # obtain files
    Xs, ys = [], []
    for i in range(num_files):
        for j, source in enumerate(SOURCES):
            try:
                url = urls[source][i]
                filename = url.split('/')[-1].split('?')[0]

                fpath = _get_file(filename,
                                  url,
                                  cache_dir,
                                  file_hash=hashes['sha256'][i])

                # we succeeded, so don't continue trying to download this file
                break

            except Exception as e:
                print(str(e))

                # if this was our last source, raise an error
                if j == len(SOURCES) - 1:
                    m = 'Failed to download {} from any source.'.format(
                        filename)
                    raise RuntimeError(m)

                # otherwise indicate we're trying again
                else:
                    print(
                        "Failed to download {} from source '{}', trying next source..."
                        .format(filename, source))

        # load file and append arrays
        f = np.load(fpath)
        Xs.append(f['X'])
        ys.append(f['y'])
        f.close()

    # get X array
    if pad:
        max_len_axis1 = max([X.shape[1] for X in Xs])
        X = np.vstack([_pad_events_axis1(x, max_len_axis1) for x in Xs])
    else:
        # object array of variable-length events with zero-padded particles
        # (pT == 0 rows) removed; dtype='O' matches the documented object-array
        # return and avoids numpy's ragged-array error
        X = np.asarray([x[x[:, 0] > 0] for events in Xs for x in events],
                       dtype='O')

    # get y array
    y = np.concatenate(ys)

    # chop down to specified amount of data
    if num_data > -1:
        X, y = X[:num_data], y[:num_data]

    return X, y
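
A usage sketch for this multi-generator variant (hypothetical; it assumes the function is exposed as `energyflow.qg_jets.load` and that `GENERATORS`, `SOURCES`, `URLS`, `HASHES`, `NUM_PER_FILE`, and `MAX_NUM_FILES` are module-level constants defined elsewhere):

# Hypothetical usage; the shapes in the comments are illustrative
import energyflow as ef

# padded Pythia jets: a single 3-d float array
X_py, y_py = ef.qg_jets.load(num_data=50000, generator='pythia', pad=True)
print(X_py.shape)  # (50000, max_particles, num_features)

# unpadded Herwig jets: an object array of variable-length 2-d events
X_hw, y_hw = ef.qg_jets.load(num_data=50000, generator='herwig', pad=False)
print(X_hw.dtype, X_hw[0].shape)  # object, (num_particles, num_features)

The inner loop over `SOURCES` implements a simple mirror fallback: each file is requested from each source in turn, and a RuntimeError is raised only after every source has failed.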