Example #1
def _check_data_shape_dask(data, input_idxs):
    """Check data shape and adjust if necessary."""
    # Handle multiple datasets
    if data.ndim > 2 and data.shape[0] * data.shape[1] == input_idxs.shape[0]:
        data = da.reshape(data, (data.shape[0] * data.shape[1], data.shape[2]))
    # Also ravel single dataset
    elif data.shape[0] != input_idxs.size:
        data = da.ravel(data)

    # Ensure two dimensions
    if data.ndim == 1:
        data = da.reshape(data, (data.size, 1))

    return data
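A minimal usage sketch (not from the source; the shapes and names below are assumptions) showing how the helper flattens a (y, x, bands) stack so its first axis matches the resampling indices:

import numpy as np
import dask.array as da

# Hypothetical inputs: a 10x20 grid with 3 bands, one index per pixel
data = da.random.random((10, 20, 3), chunks=(5, 10, 3))
input_idxs = np.arange(10 * 20).reshape(10 * 20, 1)

flat = _check_data_shape_dask(data, input_idxs)
print(flat.shape)  # expected (200, 3)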
Example #2
    def _matvec(self, x):
        if self.reshape:
            x = da.reshape(x, self.dims)
        y = da.roll(x, shift=self.shift, axis=self.dir)
        y = y.rechunk(x.chunks)
        return y.ravel()
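The surrounding class (self.dims, self.shift, self.dir) is not shown; a standalone sketch of the same reshape/roll/ravel pattern, with assumed values:

import dask.array as da

# Assumed operator attributes: a 3x4 model vector rolled by one sample along axis 1
dims, shift, axis = (3, 4), 1, 1
x = da.arange(12, chunks=4)          # flattened input vector

x2 = da.reshape(x, dims)
y = da.roll(x2, shift=shift, axis=axis)
print(y.ravel().compute())           # [ 3  0  1  2  7  4  5  6 11  8  9 10]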
Example #3
    def get_sample_from_bil_info(self, data, fill_value=np.nan,
                                 output_shape=None):
        if fill_value is None:
            fill_value = np.nan
        # FIXME: can this be made into a dask construct?
        cols, lines = np.meshgrid(np.arange(data['x'].size),
                                  np.arange(data['y'].size))
        cols = da.ravel(cols)
        lines = da.ravel(lines)
        try:
            self.valid_input_index = self.valid_input_index.compute()
        except AttributeError:
            pass
        vii = self.valid_input_index.squeeze()
        try:
            self.index_array = self.index_array.compute()
        except AttributeError:
            pass

        # ia contains reduced (valid) indices of the source array, and has the
        # shape of the destination array
        ia = self.index_array
        rlines = lines[vii][ia]
        rcols = cols[vii][ia]

        slices = []
        mask_slices = []
        mask_2d_added = False
        coords = {}
        try:
            # FIXME: Use same chunk size as input data
            coord_x, coord_y = self.target_geo_def.get_proj_vectors_dask()
        except AttributeError:
            coord_x, coord_y = None, None

        for _, dim in enumerate(data.dims):
            if dim == 'y':
                slices.append(rlines)
                if not mask_2d_added:
                    mask_slices.append(ia >= self.target_geo_def.size)
                    mask_2d_added = True
                if coord_y is not None:
                    coords[dim] = coord_y
            elif dim == 'x':
                slices.append(rcols)
                if not mask_2d_added:
                    mask_slices.append(ia >= self.target_geo_def.size)
                    mask_2d_added = True
                if coord_x is not None:
                    coords[dim] = coord_x
            else:
                slices.append(slice(None))
                mask_slices.append(slice(None))
                try:
                    coords[dim] = data.coords[dim]
                except KeyError:
                    pass

        res = data.values[tuple(slices)]
        res[tuple(mask_slices)] = fill_value

        try:
            p_1 = res[:, :, 0]
            p_2 = res[:, :, 1]
            p_3 = res[:, :, 2]
            p_4 = res[:, :, 3]
        except IndexError:
            p_1 = res[:, 0]
            p_2 = res[:, 1]
            p_3 = res[:, 2]
            p_4 = res[:, 3]

        s__, t__ = self.bilinear_s, self.bilinear_t

        res = (p_1 * (1 - s__) * (1 - t__) +
               p_2 * s__ * (1 - t__) +
               p_3 * (1 - s__) * t__ +
               p_4 * s__ * t__)

        epsilon = 1e-6
        data_min = da.nanmin(data) - epsilon
        data_max = da.nanmax(data) + epsilon

        idxs = (res > data_max) | (res < data_min)
        res = da.where(idxs, fill_value, res)
        shp = self.target_geo_def.shape
        if data.ndim == 3:
            res = da.reshape(res, (res.shape[0], shp[0], shp[1]))
        else:
            res = da.reshape(res, (shp[0], shp[1]))
        res = DataArray(res.rechunk(CHUNK_SIZE), dims=data.dims, coords=coords)

        return res
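The four neighbour values are blended with the fractional distances s__ and t__; a tiny NumPy check of that bilinear weighting, with made-up numbers:

p_1, p_2, p_3, p_4 = 10.0, 20.0, 30.0, 40.0   # neighbour values, for illustration
s__, t__ = 0.25, 0.5                          # fractional offsets, for illustration

res = (p_1 * (1 - s__) * (1 - t__) +
       p_2 * s__ * (1 - t__) +
       p_3 * (1 - s__) * t__ +
       p_4 * s__ * t__)
print(res)  # 22.5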
Example #4
    def importData(self, samplesToRun, maxNJetBin = 11):
        #variables to train
        variables = self.getList()

        f = h5py.File(samplesToRun[0], "r")
        columnHeaders = f["EventShapeVar"].attrs["column_headers"]
        f.close()

        for v in variables:
            if v not in columnHeaders:
                print("Variable not found: %s" % v)

        #load data files 
        dsets = [h5py.File(filename, mode='r')['EventShapeVar'] for filename in samplesToRun]
        arrays = [da.from_array(dset, chunks=(65536, 1024)) for dset in dsets]
        x = da.concatenate(arrays, axis=0)
         
        #setup and get data
        dataColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in variables])
        data = x[:,dataColumns]
        npyInputData = data.compute()
        #print data.shape
        
        #setup and get labels
        npyInputAnswers = np.zeros((npyInputData.shape[0], 2))
        if self.signal:
            npyInputAnswers[:,0] = 1
        else:
            npyInputAnswers[:,1] = 1
        
        #setup and get domains
        domainColumnNames = ["NGoodJets_double"]
        #maxNJetBin = 11
        domainColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in domainColumnNames])
        inputDomains = x[:,domainColumns]
        tempInputDomains = inputDomains.astype(int)
        tempInputDomains = da.reshape(tempInputDomains, [-1])
        tempInputDomains[tempInputDomains > maxNJetBin] = maxNJetBin 
        minNJetBin = tempInputDomains.min().compute()
        numDomains = maxNJetBin + 1 - minNJetBin
        tempInputDomains = tempInputDomains - tempInputDomains.min()
        d =  np.zeros((npyInputData.shape[0], numDomains))
        d[np.arange(d.shape[0]), tempInputDomains] = 1
            
        #setup and get weights
        wgtColumnNames = ["Weight"]
        wgtColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in wgtColumnNames])
        npyInputSampleWgts = x[:,wgtColumns].compute()

        #NJet
        npyNJet = np.zeros((npyInputData.shape[0], 1))
        for i in range(0, len(d)):
            nJet = minNJetBin
            for j in range(len(d[i])):
                if d[i][j] == 1:
                    break
                else:
                    nJet +=1
            npyNJet[i][0] = int(nJet)
            
        return {"data":npyInputData, "labels":npyInputAnswers, "domain":d, "Weight":npyInputSampleWgts, "nJet":npyNJet}
Example #5
                    dtype=np.int32)\
                for i in range(0,neff,chunk_size)])
    print(" >> Expected shape:", eventId.shape)

    # m0
    branches = ["m0"]
    m0 = da.concatenate([\
                da.from_delayed(\
                    load_single(tree,i,i+chunk_size, branches),\
                    shape=(chunk_size,),\
                    dtype=np.float32)\
                for i in range(0,neff,chunk_size)])
    print " >> Expected shape:", m0.shape

    # EB rescaled by m0
    m0_reshape = da.reshape(m0, [-1,1,1,1])
    X_m0 = scale*X/m0_reshape 
    print " >> Expected shape:", X_m0.shape

    # Likelihood weights
    if j == 0:
      #h, bins = da.histogram(m0, bins=31, range=[80., 390.]) 
      h, bins = da.histogram(m0, bins=152, range=[82., 386.]) 
    else:
      #h, bins = da.histogram(m0, bins=8, range=[70., 150.]) 
      h, bins = da.histogram(m0, bins=38, range=[74., 150.]) 
    h = h.compute()
    h = h*np.float32(neff)/np.float32(h.sum())
    binsLow = bins[:-1]
    lhood = 1./h
    lhood = lhood/lhood.sum()
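The da.reshape(m0, [-1, 1, 1, 1]) step above turns the per-event scalar m0 into a shape that broadcasts over the image stack; a short sketch with assumed stand-in arrays (not the original data):

import numpy as np
import dask.array as da

X = da.ones((6, 1, 32, 32), chunks=(2, 1, 32, 32))          # stand-in for the EB image stack
m0 = da.from_array(np.array([1., 2., 4., 1., 2., 4.]), chunks=2)

m0_reshape = da.reshape(m0, [-1, 1, 1, 1])                  # shape (6, 1, 1, 1)
X_m0 = X / m0_reshape                                       # one scale factor broadcast per event
print(X_m0.shape)                                           # (6, 1, 32, 32)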
Example #6
def generate4D_frms6(data_dir, bin_factor=2, workers=0):
    current_dir = os.getcwd()
    os.chdir(data_dir)
    data_class = st.util.Frms6Reader()
    tot_files = 0

    for file in glob.glob("*.frms6"):
        tot_files += 1
    filesizes = np.zeros((tot_files, 4), dtype=int)
    filenames = np.zeros(tot_files, dtype=object)

    ii = 0
    for file in glob.glob("*.frms6"):
        fname = data_dir + file
        dshape = np.asarray(data_class.getDataShape(fname), dtype=int)
        filesizes[ii, 0:3] = dshape
        filesizes[ii, -1] = fname[-7]
        filenames[ii] = fname
        ii += 1
    os.chdir(current_dir)

    if workers == 0:
        workers = int(1 + tot_files)

    cluster = dd.LocalCluster(n_workers=workers)
    client = dd.Client(cluster)

    draw_shape = (np.mean(filesizes[filesizes[:, -1] != 0, 0:3], axis=0)).astype(int)
    dref_shape = filesizes[filesizes[:, -1] == 0, 0:3][0]
    data_shape = np.copy(dref_shape)
    data_shape[-1] = (np.sum(filesizes[:, -2]) - np.amin(filesizes[:, -2])).astype(int)
    individual_shape = np.zeros(4, dtype=int)
    individual_shape[0:3] = draw_shape
    individual_shape[-1] = int(tot_files - 1)
    data3d_before = []

    ii = np.arange(tot_files)[filesizes[:, -1] == 0][0]
    dark_read = dask.delayed(data_class.readData)(
        filenames[ii],
        image_range=(0, dref_shape[-1]),
        pixels_x=dref_shape[0],
        pixels_y=dref_shape[1],
    )
    dark_data = da.from_delayed(dark_read, filesizes[ii, 0:3], np.float32)
    del ii
    mean_dark_ref = da.mean(dark_data, axis=-1)

    for jj in np.arange(1, tot_files):
        ii = np.arange(tot_files)[filesizes[:, -1] == jj][0]
        test_read = dask.delayed(data_class.readData)(
            filenames[ii],
            image_range=(0, draw_shape[-1]),
            pixels_x=draw_shape[0],
            pixels_y=draw_shape[1],
        )
        test_data = da.from_delayed(test_read, filesizes[ii, 0:3], np.float32)
        test_data = test_data.rechunk((-1, -1, 256))
        data3d_before.append(test_data)

    data3d_dask = da.concatenate(data3d_before, axis=-1)

    data_shape = data3d_dask.shape
    con_shape = tuple((np.asarray(data_shape[0:2]) * np.asarray((0.5, 2))).astype(int))
    xvals = int(data_shape[-1] ** 0.5)

    d3r = da.transpose(data3d_dask, (2, 0, 1))
    d3s = d3r - mean_dark_ref
    d3D_dref = da.transpose(d3s, (1, 2, 0))
    top_part = d3D_dref[0 : con_shape[0], :, :]
    bot_part = d3D_dref[con_shape[0] : data_shape[0], :, :]
    top_part_rs = top_part[::-1, ::-1, :]
    data3d_arranged = da.concatenate([bot_part, top_part_rs], axis=1)
    shape4d = (con_shape[0], con_shape[1], xvals, xvals)
    data4d_dask = da.reshape(data3d_arranged, shape4d)

    bin_nums = int((xvals / bin_factor) ** 2)
    xvals_bin = int(xvals / bin_factor)
    if np.logical_not((np.mod(xvals, bin_factor)).astype(bool)):
        yyb = np.arange(data4d_dask.shape[2])[::bin_factor]
        xxb = np.arange(data4d_dask.shape[3])[::bin_factor]
        data3d_binY = da.reshape(
            data4d_dask[:, :, yyb, :],
            (con_shape[0], con_shape[1], int(xvals * xvals_bin)),
        )
        for ybf in np.arange(1, bin_factor):
            data3d_binY = data3d_binY + da.reshape(
                data4d_dask[:, :, yyb + ybf, :],
                (con_shape[0], con_shape[1], int(xvals * xvals_bin)),
            )
        data4d_binY = da.reshape(
            data3d_binY, (con_shape[0], con_shape[1], xvals_bin, xvals)
        )

        data3d_binYX = da.reshape(
            data4d_binY[:, :, :, xxb], (con_shape[0], con_shape[1], bin_nums)
        )
        for xbf in np.arange(1, bin_factor):
            data3d_binYX = data3d_binYX + da.reshape(
                data4d_binY[:, :, :, xxb + xbf], (con_shape[0], con_shape[1], bin_nums)
            )
        data4d_bin = da.reshape(
            data3d_binYX, (con_shape[0], con_shape[1], xvals_bin, xvals_bin)
        )
        data4D = data4d_bin.compute()
    else:
        data4D = data4d_dask.compute()
    cluster.close()
    return data4D
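The reshape-based binning above sums bin_factor x bin_factor blocks of the scan axes; when the dimensions divide evenly, dask.array.coarsen can express the same block reduction more compactly. A sketch with made-up shapes, not the function's actual data:

import numpy as np
import dask.array as da

bin_factor = 2
data4d = da.random.random((64, 128, 100, 100), chunks=(16, 32, 100, 100))  # stand-in for data4d_dask

# Sum each bin_factor x bin_factor block of the last two axes
data4d_bin = da.coarsen(np.sum, data4d, {2: bin_factor, 3: bin_factor})
print(data4d_bin.shape)  # (64, 128, 50, 50)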
Example #7
nrg_2k.chunks

#Reading in all load files at once
files = sorted(glob.glob('Texas/*.hdf5'))
dsets = [h5py.File(f, 'r')['/load'] for f in files]
#Reshaping as a numpy array to give yearly, daily and 15 minute intervals of power readings
arrs = [np.array(d).reshape((1, 365, 96)) for d in dsets[1:]]
#Stacking into 4 years
arrs_stacked = np.concatenate(arrs, axis = 0)
#converting to yearly array, with first dimension as year
da_arrs = da.from_array(arrs_stacked)

#Alternative workflow all in dask with no intermediate NumPy
da_arrs2 = [da.from_array(d) for d in dsets[1:]]
da_arrs2_stack = da.stack(da_arrs2)
da_arrs2_rshp = da.reshape(da_arrs2_stack, (4, 365, 96))

#Working with dask dataframes
import dask.dataframe as dd

df = dd.read_csv('WDI.csv')

#Looking at all indicator filters
np.array(df['Indicator Code'].unique())
fil1 = df['Indicator Code'] == 'SP.POP.0014.TO.ZS'
fil2 = df['Region'] == 'East Asia & Pacific'

#Filtering
df1 = df.loc[fil1 & fil2]

#Basic grouping and plotting output
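A hedged sketch of what the grouping and plotting step could look like; the column names used here ('Country Name', '2010') are assumptions about the CSV layout, not taken from the source:

#Hypothetical column names -- adjust to the actual WDI.csv headers
grouped = df1.groupby('Country Name')['2010'].mean().compute()
grouped.plot(kind='bar')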
Example #8
def main(argv=None):

    #     cluster = LocalCluster(dashboard_address=None)
    #     client = Client(cluster, memory_limit='{}GB'.format(FLAGS.memory_limit),
    #                     processes=False)

    K.set_floatx('float32')

    chunk_size = FLAGS.chunk_size

    # Read data set
    hdf5_file = h5py.File(FLAGS.data_file, 'r')
    images, labels, _ = hdf52dask(hdf5_file,
                                  FLAGS.group,
                                  chunk_size,
                                  shuffle=FLAGS.shuffle,
                                  seed=FLAGS.seed,
                                  pct=FLAGS.pct)
    n_images = images.shape[0]
    n_batches = int(np.ceil(n_images / float(FLAGS.batch_size)))

    # Data augmentation parameters
    daug_params_file = get_daug_scheme_path(FLAGS.daug_params, FLAGS.data_file)
    daug_params = yaml.load(open(daug_params_file, 'r'),
                            Loader=yaml.FullLoader)
    nodaug_params_file = get_daug_scheme_path('nodaug.yml', FLAGS.data_file)
    nodaug_params = yaml.load(open(nodaug_params_file, 'r'),
                              Loader=yaml.FullLoader)

    # Initialize the network model
    model_filename = FLAGS.model
    model = load_model(model_filename)

    # Print the model summary
    model.summary()

    # Get relevant layers
    if FLAGS.store_input:
        layer_regex = '({}|.*input.*)'.format(FLAGS.layer_regex)
    else:
        layer_regex = FLAGS.layer_regex

    layers = [
        layer.name for layer in model.layers
        if re.compile(layer_regex).match(layer.name)
    ]

    # Create batch generators
    n_daug_rep = FLAGS.n_daug_rep
    n_diff_per_batch = int(FLAGS.batch_size / n_daug_rep)
    image_gen_daug = get_generator(images, **daug_params)
    batch_gen_daug = batch_generator(image_gen_daug,
                                     images,
                                     labels,
                                     batch_size=n_diff_per_batch,
                                     aug_per_im=n_daug_rep,
                                     shuffle=False)
    image_gen_nodaug = get_generator(images, **nodaug_params)
    batch_gen_nodaug = batch_generator(image_gen_nodaug,
                                       images,
                                       labels,
                                       FLAGS.batch_size,
                                       aug_per_im=1,
                                       shuffle=False)

    # Outputs
    if FLAGS.output_dir == '-1':
        FLAGS.output_dir = os.path.dirname(FLAGS.model)

    output_hdf5 = h5py.File(
        os.path.join(FLAGS.output_dir, FLAGS.output_mse_matrix_hdf5), 'w')
    output_pickle = os.path.join(FLAGS.output_dir, FLAGS.output_pickle)
    df_init_idx = 0
    df = pd.DataFrame()

    # Iterate over the layers
    for layer_idx, layer_name in enumerate(layers):

        # Reload the model
        if layer_idx > 0:
            K.clear_session()
            model = load_model(model_filename)

        layer = model.get_layer(layer_name)

        # Rename input layer
        if re.compile('.*input.*').match(layer_name):
            layer_name = 'input'

        hdf5_layer = output_hdf5.create_group(layer_name)

        activation_function = K.function(
            [model.input, K.learning_phase()], [layer.output])

        print('\nComputing pairwise similarity at layer {}'.format(layer_name))

        # Compute activations of original data (without augmentation)
        a_nodaug_da = get_activations(activation_function, batch_gen_nodaug)
        a_nodaug_da = da.squeeze(a_nodaug_da)
        a_nodaug_da = da.rechunk(a_nodaug_da,
                                 (chunk_size, ) + (a_nodaug_da.shape[1:]))
        dim_activations = a_nodaug_da.shape[1]

        # Compute matrix of similarities
        r = da.reshape(da.sum(da.square(a_nodaug_da), axis=1), (-1, 1))
        mse_matrix = (r - 2 * da.dot(a_nodaug_da,
                                     da.transpose(a_nodaug_da)) \
                     + da.transpose(r)) / dim_activations

        # Compute activations with augmentation
        a_daug_da = get_activations(activation_function, batch_gen_daug)
        a_daug_da = da.rechunk(a_daug_da, (chunk_size, dim_activations, 1))

        # Compute similarity of augmentations with respect to the
        # activations of the original data
        a_nodaug_da = da.repeat(da.reshape(a_nodaug_da,
                                           a_nodaug_da.shape + (1, )),
                                repeats=n_daug_rep,
                                axis=2)
        a_nodaug_da = da.rechunk(a_nodaug_da, (chunk_size, dim_activations, 1))
        mse_daug = da.mean(da.square(a_nodaug_da - a_daug_da), axis=1)

        # Compute invariance score
        mse_sum = da.repeat(da.reshape(da.sum(mse_matrix, axis=1),
                                       (n_images, 1)),
                            repeats=n_daug_rep,
                            axis=1)
        mse_sum = da.rechunk(mse_sum, (chunk_size, 1))
        invariance = 1 - n_images * da.divide(mse_daug, mse_sum)

        print('Dimensionality activations: {}x{}x{}'.format(
            n_images, dim_activations, n_daug_rep))

        # Store HDF5 file
        if FLAGS.output_mse_matrix_hdf5:
            mse_matrix_ds = hdf5_layer.create_dataset(
                'mse_matrix',
                shape=mse_matrix.shape,
                chunks=mse_matrix.chunksize,
                dtype=K.floatx())
            mse_daug_ds = hdf5_layer.create_dataset('mse_daug',
                                                    shape=mse_daug.shape,
                                                    chunks=mse_daug.chunksize,
                                                    dtype=K.floatx())
            invariance_ds = hdf5_layer.create_dataset(
                'invariance',
                shape=invariance.shape,
                chunks=invariance.chunksize,
                dtype=K.floatx())
            time_init = time()
            with ProgressBar(dt=1):
                da.store([mse_matrix, mse_daug, invariance],
                         [mse_matrix_ds, mse_daug_ds, invariance_ds])
            time_end = time()
            print('Elapsed time: {}'.format(time_end - time_init))

            invariance = np.ravel(
                np.asarray(output_hdf5[layer_name]['invariance']))
        else:
            time_init = time()
            invariance = da.ravel(invariance).compute()
            time_end = time()
            print('Elapsed time: {}'.format(time_end - time_init))

        # Update pandas data frame for plotting
        df_end_idx = df_init_idx + n_images * n_daug_rep
        d = pd.DataFrame(
            {
                'Layer': layer_name,
                'sample': np.repeat(np.arange(n_images), n_daug_rep),
                'n_daug': np.tile(np.arange(n_daug_rep), n_images),
                'invariance': invariance
            },
            index=np.arange(df_init_idx, df_end_idx).tolist())
        df = pd.concat([df, d])
        df_init_idx = df_end_idx

    pickle.dump(df, open(output_pickle, 'wb'))
    output_hdf5.close()
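The mse_matrix expression relies on the expansion ||a_i - a_j||^2 = ||a_i||^2 - 2 a_i.a_j + ||a_j||^2; a small NumPy check of that identity on random data:

import numpy as np

a = np.random.rand(5, 3)                                    # stand-in for the activation matrix
r = np.sum(np.square(a), axis=1).reshape(-1, 1)
mse_fast = (r - 2 * a @ a.T + r.T) / a.shape[1]             # same form as mse_matrix above
mse_slow = np.mean(np.square(a[:, None, :] - a[None, :, :]), axis=2)
print(np.allclose(mse_fast, mse_slow))                      # True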