def load(num_data=-1, cache_dir=None):
    """Loads the dataset. The first time this is called, it will automatically
    download the dataset. Future calls will attempt to use the cached dataset
    prior to redownloading.

    **Arguments**

    - **num_data** : _int_
        - The number of events to return. A value of `-1` means read in all events.
    - **cache_dir** : _str_
        - The directory where to store/look for the file.

    **Returns**

    - _3-d numpy.ndarray_, _1-d numpy.ndarray_
        - The `X` and `y` components of the dataset as specified above.
    """

    # download (or locate the cached copy of) the nsubs file, verifying its hash
    fpath = _get_file('QG_nsubs.npz',
                      url='https://www.dropbox.com/s/y1l6avj5yj7jn9t/QG_nsubs.npz?dl=1',
                      file_hash='a99f771147af9b207356c990430cfeba6b6aa96fe5cff8263450ff3a31ab0997',
                      cache_dir=cache_dir)

    # NpzFile is a context manager; ensure the underlying file is closed
    with np.load(fpath) as npz:
        X, y = npz['X'], npz['y']

    # a non-negative num_data requests only the leading events
    if num_data > -1:
        return X[:num_data], y[:num_data]
    return X, y
def load(num_data=-1, filename='QG_jets.npz', cache_dir=None):
    """Loads the dataset. The first time this is called, it will automatically
    download the dataset. Future calls will attempt to use the cached dataset
    prior to redownloading.

    **Arguments**

    - **num_data** : _int_
        - The number of events to return. A value of `-1` means read in all events.
    - **filename** : _str_
        - The filename where to store/look for the file.
    - **cache_dir** : _str_
        - The directory where to store/look for the file.

    **Returns**

    - _3-d numpy.ndarray_, _1-d numpy.ndarray_
        - The `X` and `y` components of the dataset as specified above.
    """

    # fetch the (possibly cached) dataset file, checking its sha256 hash
    fpath = _get_file(filename,
                      url='https://www.dropbox.com/s/fclsl7pukcpobsb/QG_jets.npz?dl=1',
                      file_hash='3f27a02eab06e8b83ccc9d25638021e6e24c9361341730961f9d560dee12c257',
                      cache_dir=cache_dir)

    # NpzFile is a context manager; ensure the underlying file is closed
    with np.load(fpath) as npz:
        X, y = npz['X'], npz['y']

    # a non-negative num_data requests only the leading events
    if num_data > -1:
        return X[:num_data], y[:num_data]
    return X, y
def load(num_data=100000, cache_dir=None):
    """Loads samples from the dataset (which in total is contained in twenty
    files). Any file that is needed that has not been cached will be
    automatically downloaded. Downloading a file causes it to be cached for
    later use. Basic checksums are performed.

    **Arguments**

    - **num_data** : _int_
        - The number of events to return. A value of `-1` means read in all events.
    - **cache_dir** : _str_
        - The directory where to store/look for the file.

    **Returns**

    - _3-d numpy.ndarray_, _1-d numpy.ndarray_
        - The `X` and `y` components of the dataset as specified above.
    """

    # how many of the dataset files are needed to satisfy the request
    if num_data > -1:
        num_files = int(np.ceil(num_data / num_per_file))
    else:
        num_files = max_num_files

    # cap the request at the full dataset
    if num_files > max_num_files:
        warnings.warn('More data requested than available. Providing the full dataset.')
        num_files, num_data = max_num_files, -1

    Xs, ys = [], []
    for i in range(num_files):

        # the very first file keeps its historical name without an index
        filename = 'QG_jets.npz' if i == 0 else 'QG_jets_{}.format'.replace('format', 'npz') and 'QG_jets_{}.npz'.format(i)

        fpath = _get_file(filename, url=QG_jets_urls[i],
                          file_hash=QG_jets_hashes[i], cache_dir=cache_dir)

        # NpzFile is a context manager; ensure the underlying file is closed
        with np.load(fpath) as npz:
            Xs.append(npz['X'])
            ys.append(npz['y'])

    # pad each file's events to the widest axis-1 length, then stack
    pad_len = max(arr.shape[1] for arr in Xs)
    X = np.vstack([_pad_events_axis1(arr, pad_len) for arr in Xs])
    y = np.concatenate(ys)

    # a non-negative num_data requests only the leading events
    if num_data > -1:
        X, y = X[:num_data], y[:num_data]

    return X, y
def load(num_data=100000, generator='pythia', pad=True, with_bc=False,
         cache_dir='~/.energyflow'):
    """Loads samples from the dataset (which in total is contained in twenty
    files). Any file that is needed that has not been cached will be
    automatically downloaded. Downloading a file causes it to be cached for
    later use. Basic checksums are performed.

    **Arguments**

    - **num_data** : _int_
        - The number of events to return. A value of `-1` means read in all events.
    - **generator** : _str_
        - Specifies which Monte Carlo generator the events should come from.
        Currently, the options are `'pythia'` and `'herwig'`.
    - **pad** : _bool_
        - Whether to pad the events with zeros to make them the same length.
        Note that if set to `False`, the returned `X` array will be an object
        array and not a 3-d array of floats.
    - **with_bc** : _bool_
        - Whether to include jets coming from bottom or charm quarks. Changing
        this flag does not mask out these jets but rather accesses an entirely
        different dataset. The datasets with and without b and c quarks should
        not be combined.
    - **cache_dir** : _str_
        - The directory where to store/look for the files. Note that
        `'datasets'` is automatically appended to the end of this path.

    **Returns**

    - _3-d numpy.ndarray_, _1-d numpy.ndarray_
        - The `X` and `y` components of the dataset as specified above. If
        `pad` is `False` then these will be object arrays holding the events,
        each of which is a 2-d ndarray.

    **Raises**

    - _ValueError_ : if `generator` is not one of the recognized options.
    - _RuntimeError_ : if a needed file cannot be downloaded from any source.
    """

    # check for valid options
    if generator not in GENERATORS:
        raise ValueError("'generator' must be in " + str(GENERATORS))

    # get number of files we need
    num_files = int(np.ceil(num_data / NUM_PER_FILE)) if num_data > -1 else MAX_NUM_FILES
    if num_files > MAX_NUM_FILES:
        warnings.warn('More data requested than available. Providing the full dataset.')
        num_files = MAX_NUM_FILES
        num_data = -1

    # index into global variables
    bc = 'bc' if with_bc else 'nobc'
    urls = URLS[generator][bc]
    hashes = HASHES[generator][bc]

    # obtain files
    Xs, ys = [], []
    for i in range(num_files):
        for j, source in enumerate(SOURCES):
            try:
                url = urls[source][i]
                filename = url.split('/')[-1].split('?')[0]

                fpath = _get_file(filename, url, cache_dir, file_hash=hashes['sha256'][i])

                # we succeeded, so don't continue trying to download this file
                break

            except Exception as e:
                print(str(e))

                # if this was our last source, raise an error
                # NOTE(review): if the urls[source][i] lookup itself raised on the
                # first source, 'filename' would be unbound here — presumably the
                # URL tables are well-formed; verify against the module constants
                if j == len(SOURCES) - 1:
                    m = 'Failed to download {} from any source.'.format(filename)
                    raise RuntimeError(m)

                # otherwise indicate we're trying again
                else:
                    print("Failed to download {} from source '{}', trying next source..."
                          .format(filename, source))

        # load file and append arrays
        f = np.load(fpath)
        Xs.append(f['X'])
        ys.append(f['y'])
        f.close()

    # get X array
    if pad:
        max_len_axis1 = max(X.shape[1] for X in Xs)
        X = np.vstack([_pad_events_axis1(x, max_len_axis1) for x in Xs])
    else:
        # strip zero-padded particles (pt == 0) from each event; the result is
        # ragged, so build the object array explicitly — np.asarray on a ragged
        # list raises ValueError on NumPy >= 1.24 without dtype=object
        events = [x[x[:, 0] > 0] for x_file in Xs for x in x_file]
        X = np.empty(len(events), dtype='O')
        X[:] = events

    # get y array
    y = np.concatenate(ys)

    # chop down to specified amount of data
    if num_data > -1:
        X, y = X[:num_data], y[:num_data]

    return X, y