Example #1
    def _calc_length_and_angles(self, vectors):
        """Set the three vectors that define the shape of the unit cell

        Parameters
        ----------
        vectors : array, shape=(n_frames, 3, 3)
            The semantics of this array are that the shape of the unit cell
            in frame ``i`` is given by the three vectors ``vectors[i, 0, :]``,
            ``vectors[i, 1, :]``, and ``vectors[i, 2, :]``.
        """
        if vectors is None:  # or da.all(abs(vectors) < 1e-15):
            self._unitcell_lengths = None
            self._unitcell_angles = None
            return

        if len(vectors) != len(self):
            raise TypeError("unitcell_vectors must be the same length as "
                            "the trajectory. You provided %s" % str(vectors))

        v1 = vectors[:, 0, :]
        v2 = vectors[:, 1, :]
        v3 = vectors[:, 2, :]
        a, b, c, alpha, beta, gamma = box_vectors_to_lengths_and_angles(
            v1, v2, v3)

        self._unitcell_lengths = da.vstack((a, b, c)).T
        self._unitcell_angles = da.vstack((alpha, beta, gamma)).T
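A minimal, self-contained sketch (with made-up per-frame values) of the ``da.vstack((a, b, c)).T`` pattern used above, which turns three per-frame 1-D arrays into a single ``(n_frames, 3)`` array:

import dask.array as da

n_frames = 4
a = da.ones(n_frames)          # per-frame lengths of the first cell vector
b = da.full(n_frames, 2.0)     # per-frame lengths of the second cell vector
c = da.full(n_frames, 3.0)     # per-frame lengths of the third cell vector

lengths = da.vstack((a, b, c)).T   # shape (n_frames, 3): one row per frame
print(lengths.compute())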
Example #2
    def fit(
        self,
        X: Union[ArrayLike, DataFrameType],
        y: Optional[Union[ArrayLike, SeriesType]] = None,
    ) -> "RobustScaler":
        q_min, q_max = self.quantile_range
        if not 0 <= q_min <= q_max <= 100:
            raise ValueError("Invalid quantile range: %s" %
                             str(self.quantile_range))

        if isinstance(X, dd.DataFrame):
            n_columns = len(X.columns)
            partition_lengths = X.map_partitions(len).compute()
            dtype = np.find_common_type(X.dtypes, [])
            blocks = X.to_delayed()
            X = da.vstack([
                da.from_delayed(block.values,
                                shape=(length, n_columns),
                                dtype=dtype)
                for block, length in zip(blocks, partition_lengths)
            ])

        quantiles: Any = [
            da.percentile(col, [q_min, 50.0, q_max]) for col in X.T
        ]
        quantiles = da.vstack(quantiles).compute()
        self.center_: List[float] = quantiles[:, 1]
        self.scale_: List[float] = quantiles[:, 2] - quantiles[:, 0]
        self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
        self.n_features_in_: int = X.shape[1]
        return self
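The core of the fit above is computing per-column percentiles and stacking them into one array; a small standalone sketch of just that step, assuming a plain 2-D dask array as input:

import dask.array as da

X = da.random.random((100, 3), chunks=(25, 3))
quantiles = da.vstack(
    [da.percentile(X[:, j], [25.0, 50.0, 75.0]) for j in range(X.shape[1])]
).compute()                                 # one row of quantiles per column
center = quantiles[:, 1]                    # per-column medians
scale = quantiles[:, 2] - quantiles[:, 0]   # per-column interquartile ranges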
Example #3
    def fit(self, X, y=None):
        q_min, q_max = self.quantile_range
        if not 0 <= q_min <= q_max <= 100:
            raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))

        if isinstance(X, dd.DataFrame):
            n_columns = len(X.columns)
            partition_lengths = X.map_partitions(len).compute()
            dtype = np.find_common_type(X.dtypes, [])
            blocks = X.to_delayed()
            X = da.vstack(
                [
                    da.from_delayed(
                        block.values, shape=(length, n_columns), dtype=dtype
                    )
                    for block, length in zip(blocks, partition_lengths)
                ]
            )

        quantiles = [da.percentile(col, [q_min, 50., q_max]) for col in X.T]
        quantiles = da.vstack(quantiles).compute()
        self.center_ = quantiles[:, 1]
        self.scale_ = quantiles[:, 2] - quantiles[:, 0]
        self.scale_ = skdata._handle_zeros_in_scale(self.scale_, copy=False)
        return self
Example #4
def test_vstack():
    x = np.arange(5)
    y = np.ones(5)
    a = da.arange(5, chunks=2)
    b = da.ones(5, chunks=2)

    assert_eq(np.vstack((x, y)), da.vstack((a, b)))
    assert_eq(np.vstack((x, y[None, :])), da.vstack((a, b[None, :])))
Example #5
def test_vstack():
    x = np.arange(5)
    y = np.ones(5)
    a = da.arange(5, chunks=2)
    b = da.ones(5, chunks=2)

    assert_eq(np.vstack((x, y)), da.vstack((a, b)))
    assert_eq(np.vstack((x, y[None, :])), da.vstack((a, b[None, :])))
Example #6
def process_data(X, y=None, test_size=0.2):
    if y is None:
        km = dask_ml.cluster.KMeans(n_clusters=10, init_max_iter=100)
        km.fit(X.flatten().reshape(-1, 1))
        y = km.labels_
    y_uniqs = np.unique(y[:,0])

    len_ = X.shape[0]
    X = prepare_dataset(X)

    shape_ = list(X.shape[1:])

    if test_size != 0:
        samples = list()
        samples_labels = list()
        print('Preparing samples ...')
        for _ in range(2):
            for y_uniq in y_uniqs:
                sample = list()
                label = list()
                for xa, ya in zip(chunks(X, 10), chunks(y[:, 0], 10)):
                    try:
                        sample.append([xa[ya == y_uniq][random.randint(0, len(xa[ya == y_uniq]) - 1)]])
                        label.append(y_uniq)
                        if len(sample) >= len(y_uniqs):
                            break
                    except Exception:
                        # no sample of this class in the current chunk
                        pass
                samples += sample
                samples_labels += label
        samples = da.vstack(samples)
        samples_labels = da.vstack(samples_labels)

    if test_size == 0:
        print('Training dataset shape x: ', X.shape)
        print('Training dataset shape y: ', y.shape)

        train_dataset = Dataset(X, y)
        return train_dataset

    else:
        X_train, X_test, y_train, y_test = train_test_split(X.flatten().reshape(len_, -1), y, test_size=test_size,
                                                        random_state=config.seeds)

        X_train = X_train.reshape([X_train.shape[0]] + shape_)
        X_test = X_test.reshape([X_test.shape[0]] + shape_)

        print('Training dataset shape: ', X_train.shape)
        print('Validation dataset shape: ', X_test.shape)

        train_dataset = Dataset(X_train, y_train)
        test_dataset = Dataset(X_test, y_test)

        train_dataset.samples = samples
        train_dataset.samples_labels = samples_labels

        print('Sample dataset shape: ', train_dataset.samples.shape)
        return train_dataset, test_dataset
Example #7
    def batch_function(self, func, p1):
        with tf.Session(graph=self.graph) as session:
            saver = tf.train.Saver()
            if (self.load(session, saver)):
                num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(
                    session)
                print('EPOCHS trained: ', num_epochs_trained)
            else:
                return

            output_l = list()

            start = 0
            end = self.batch_size

            with tqdm(range(p1.shape[0] // self.batch_size)) as pbar:
                while end < p1.shape[0]:
                    output = func(session, p1[start:end])
                    output = np.array(output)
                    output = output.reshape(
                        [output.shape[0] * output.shape[1]] +
                        list(output.shape[2:]))
                    output_l.append(output)

                    start = end
                    end += self.batch_size
                    pbar.update(1)
                else:

                    x1 = p1[start:]
                    xsize = len(x1)
                    p1t = da.zeros([self.batch_size - xsize] +
                                   list(x1.shape[1:]))

                    output = func(session, np.concatenate((x1, p1t), axis=0))
                    output = np.array(output)
                    output = output.reshape(
                        [output.shape[0] * output.shape[1]] +
                        list(output.shape[2:]))[0:xsize]

                    output_l.append(output)

                    pbar.update(1)

        try:
            return da.vstack(output_l)
        except ValueError:
            # shapes disagree, so stack the pieces as column vectors instead
            output_l = [out.reshape(-1, 1) for out in output_l]
        return da.vstack(output_l)
Example #8
    def generate(self):
        """
        Sub-classable method for generating a factorial design of specified 'levels' in the given domain.
        The number of generated points is levels^d.
        
        Returns
        -------
        dask.array.Array
            The generated grid of design points, one point per row.
        """
        if hasattr(self, 'random_idx'):
            del self.random_idx

        # Get grid coordinates
        grid_coords = [
            da.linspace(lb, ub, num=self.levels)
            for lb, ub in zip(self.xmin, self.xmax)
        ]

        # Generate the full grid
        x = da.meshgrid(*grid_coords)
        dim_idx = [item.ravel() for item in x]
        x = da.vstack(dim_idx).T
        x = x.rechunk(('auto', x.shape[1]))
        if self.use_logger:
            self.logger.info(
                "Factorial design: generated {0} points in {1} dimensions".
                format(len(x), len(self.xmin)))
        self.generated = x
        return x
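A stripped-down sketch of the same grid construction without the class plumbing; `xmin`, `xmax`, and `levels` are made-up values standing in for the design attributes:

import dask.array as da

xmin, xmax, levels = [0.0, -1.0], [1.0, 1.0], 3
grid_coords = [da.linspace(lb, ub, num=levels) for lb, ub in zip(xmin, xmax)]
mesh = da.meshgrid(*grid_coords)
points = da.vstack([m.ravel() for m in mesh]).T   # shape (levels**2, 2)
points = points.rechunk(('auto', points.shape[1]))
print(points.compute())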
Example #9
def prepare_dataset(X):
    
    len_ = X.shape[0]
    shape_ = X.shape

    d = int(da.sqrt(X.flatten().reshape(X.shape[0], -1).shape[1]))

    if len(shape_)==4:
        X = da.reshape(X, [-1, d, d, 3])
        
    elif d==shape_[1] and len(shape_)==3:
        X = da.reshape(X, [-1, d, d])
        X = da.array(list(map(grey2rgb, X)), dtype=np.float32)

    else:
        r = d**2 - X.shape[1]
        train_padding = da.zeros((shape_[0], r))
        X = da.vstack([X, train_padding])
        
        X = da.reshape(X, [-1, d, d])
        X = da.array(list(map(grey2rgb, X)), dtype=np.float32)
    
    print('Scaling dataset')
    if scaler is not None:
        X = scaler.transform(X.flatten().reshape(-1, 1).astype(np.float32)).reshape(X.shape)
    else:
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X.flatten().reshape(-1, 1).astype(np.float32)).reshape(X.shape)
        
    return X
Example #10
    def pad_hrv_data(self, res):
        """Add empty pixels around the HRV."""
        logger.debug('Padding HRV data to full disk')
        nlines = int(self.mda['number_of_lines'])

        segment_number = self.mda['segment_sequence_number']

        current_first_line = (segment_number
                              - self.mda['planned_start_segment_number']) * nlines
        bounds = self.epilogue['ImageProductionStats']['ActualL15CoverageHRV']

        upper_south_line = bounds[
          'LowerNorthLineActual'] - current_first_line - 1
        upper_south_line = min(max(upper_south_line, 0), nlines)

        data_list = list()
        if upper_south_line > 0:
            # we have some of the lower window
            data_lower = pad_data_horizontally(res[:upper_south_line, :].data,
                                               (upper_south_line, HRV_NUM_COLUMNS),
                                               bounds['LowerEastColumnActual'],
                                               bounds['LowerWestColumnActual'])
            data_list.append(data_lower)

        if upper_south_line < nlines:
            # we have some of the upper window
            data_upper = pad_data_horizontally(res[upper_south_line:, :].data,
                                               (nlines - upper_south_line, HRV_NUM_COLUMNS),
                                               bounds['UpperEastColumnActual'],
                                               bounds['UpperWestColumnActual'])
            data_list.append(data_upper)
        return xr.DataArray(da.vstack(data_list), dims=('y', 'x'), attrs=res.attrs.copy())
Example #11
 def _prepare_variable_for_palette(self, variable, info):
     if 'scale_offset_dataset' in info:
         so_dataset = self.nc[info['scale_offset_dataset']]
         scale = so_dataset.attrs['scale_factor']
         offset = so_dataset.attrs['add_offset']
     else:
         scale = 1
         offset = 0
     variable.attrs['palette_meanings'] = [
         int(val) for val in variable.attrs['palette_meanings'].split()
     ]
     if variable.attrs['palette_meanings'][0] == 1:
         variable.attrs['palette_meanings'] = [
             0
         ] + variable.attrs['palette_meanings']
         variable = xr.DataArray(da.vstack(
             (np.array(variable.attrs['fill_value_color']), variable.data)),
                                 coords=variable.coords,
                                 dims=variable.dims,
                                 attrs=variable.attrs)
     val, idx = np.unique(variable.attrs['palette_meanings'],
                          return_index=True)
     variable.attrs['palette_meanings'] = val * scale + offset
     variable = variable[idx]
     return variable
Example #12
    def scale_dataset(self, dsid, variable, info):
        """Scale the data set, applying the attributes from the netCDF file."""
        variable = remove_empties(variable)
        scale = variable.attrs.get('scale_factor', np.array(1))
        offset = variable.attrs.get('add_offset', np.array(0))
        if np.issubdtype((scale + offset).dtype, np.floating) or np.issubdtype(variable.dtype, np.floating):
            if '_FillValue' in variable.attrs:
                variable = variable.where(
                    variable != variable.attrs['_FillValue'])
                variable.attrs['_FillValue'] = np.nan
            if 'valid_range' in variable.attrs:
                variable = variable.where(
                    variable <= variable.attrs['valid_range'][1])
                variable = variable.where(
                    variable >= variable.attrs['valid_range'][0])
            if 'valid_max' in variable.attrs:
                variable = variable.where(
                    variable <= variable.attrs['valid_max'])
            if 'valid_min' in variable.attrs:
                variable = variable.where(
                    variable >= variable.attrs['valid_min'])
        attrs = variable.attrs
        variable = variable * scale + offset
        variable.attrs = attrs

        variable.attrs.update({'platform_name': self.platform_name,
                               'sensor': self.sensor})

        if not variable.attrs.get('standard_name', '').endswith('status_flag'):
            # TODO: do we really need to add units to everything ?
            variable.attrs.setdefault('units', '1')

        ancillary_names = variable.attrs.get('ancillary_variables', '')
        try:
            variable.attrs['ancillary_variables'] = ancillary_names.split()
        except AttributeError:
            pass

        if 'palette_meanings' in variable.attrs:
            variable.attrs['palette_meanings'] = [int(val)
                                                  for val in variable.attrs['palette_meanings'].split()]
            if variable.attrs['palette_meanings'][0] == 1:
                variable.attrs['palette_meanings'] = [0] + variable.attrs['palette_meanings']
                variable = xr.DataArray(da.vstack((np.array(variable.attrs['fill_value_color']), variable.data)),
                                        coords=variable.coords, dims=variable.dims, attrs=variable.attrs)

            val, idx = np.unique(variable.attrs['palette_meanings'], return_index=True)
            variable.attrs['palette_meanings'] = val
            variable = variable[idx]

        if 'standard_name' in info:
            variable.attrs.setdefault('standard_name', info['standard_name'])
        if self.sw_version == 'NWC/PPS version v2014' and dsid.name == 'ctth_alti':
            # pps 2014 valid range and palette don't match
            variable.attrs['valid_range'] = (0., 9000.)
        if self.sw_version == 'NWC/PPS version v2014' and dsid.name == 'ctth_alti_pal':
            # pps 2014 palette has the nodata color (black) first
            variable = variable[1:, :]

        return variable
Example #13
    def __call__(self, projectables, *args, **kwargs):
        """Generate the composite."""
        from trollimage.image import rgb2ycbcr, ycbcr2rgb
        projectables = self.match_data_arrays(projectables)
        luminance = projectables[0].copy()
        luminance /= 100.
        # Limit between min(luminance) ... 1.0
        luminance = da.where(luminance > 1., 1., luminance)

        # Get the enhanced version of the composite to be sharpened
        rgb_img = enhance2dataset(projectables[1])

        # This all will be eventually replaced with trollimage convert() method
        # ycbcr_img = rgb_img.convert('YCbCr')
        # ycbcr_img.data[0, :, :] = luminance
        # rgb_img = ycbcr_img.convert('RGB')

        # Replace luminance of the IR composite
        y__, cb_, cr_ = rgb2ycbcr(rgb_img.data[0, :, :], rgb_img.data[1, :, :],
                                  rgb_img.data[2, :, :])

        r__, g__, b__ = ycbcr2rgb(luminance, cb_, cr_)
        y_size, x_size = r__.shape
        r__ = da.reshape(r__, (1, y_size, x_size))
        g__ = da.reshape(g__, (1, y_size, x_size))
        b__ = da.reshape(b__, (1, y_size, x_size))

        rgb_img.data = da.vstack((r__, g__, b__))
        return super(LuminanceSharpeningCompositor,
                     self).__call__(rgb_img, *args, **kwargs)
Example #14
def StackColumns(*cols):
    """
    Stack the input dask arrays vertically, column by column.

    This uses :func:`dask.array.vstack`.

    Parameters
    ----------
    *cols : :class:`dask.array.Array`
        the dask arrays to stack vertically together

    Returns
    -------
    :class:`dask.array.Array` :
        the dask array where columns correspond to the input arrays

    Raises
    ------
    TypeError
        If the input columns are not dask arrays
    """
    if not all(isinstance(col, da.Array) for col in cols):
        raise TypeError("all input columns in `vstack` must be dask arrays")

    return da.vstack(cols).T
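A short usage sketch for `StackColumns` as defined above: two 1-D dask arrays become the columns of a single 2-D array.

import dask.array as da

a = da.arange(4, chunks=2)
b = da.ones(4, chunks=2)
xy = StackColumns(a, b)     # shape (4, 2); column 0 is ``a``, column 1 is ``b``
print(xy.compute())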
Example #15
def get_centroids_distance(x: np.ndarray, means: np.ndarray) -> np.ndarray:
    """Returns the distance values between x and each cluster's centroid.

    The returned values are squared Euclidean distances.

    Parameters
    ----------
    x: ndarray of shape (n_samples, n_features)
        A series of data points.
    means: ndarray of shape (n_clusters, n_features)
        The centroids.

    Returns
    -------
    distances: ndarray of shape (n_clusters, n_samples)
        For each cluster, the squared Euclidean distances to x.
    """
    x = np.atleast_2d(x)
    if isinstance(x, da.Array):
        distances = []
        for i in range(means.shape[0]):
            distances.append(np.sum((means[i] - x)**2, axis=-1))
        return da.vstack(distances)
    else:
        return scipy.spatial.distance.cdist(means, x, metric="sqeuclidean")
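A quick usage sketch of the dask branch above, with small made-up centroids and points:

import numpy as np
import dask.array as da

means = np.array([[0.0, 0.0], [1.0, 1.0]])            # 2 centroids, 2 features
x = da.from_array(np.array([[0.0, 0.0], [2.0, 2.0]]), chunks=1)
d2 = get_centroids_distance(x, means)                  # shape (n_clusters, n_samples)
print(d2.compute())                                    # [[0., 8.], [2., 2.]]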
Example #16
    def _sampling_reconst(self, std_scales, random_latent=None):
        def aux_fun(session, rand_samp):
            return self.model_graph._sampling_reconst(session=session, std_scales=std_scales, random_latent=rand_samp)

        with tf.Session(graph=self.graph) as session:
            tf.set_random_seed(self.config.seeds)
            self.session = session
            self.saver = tf.train.Saver()

            if (self.config.restore and self.load(self.session, self.saver)):
                load_config = file_utils.load_args(self.config.model_name, self.config.config_dir,
                                                   ['latent_mean', 'latent_std', 'samples', 'y_uniqs'])
                self.config.update(load_config)

                num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(self.session)
                print('EPOCHS trained: ', num_epochs_trained)
            else:
                print('Initializing Variables ...')
                tf.global_variables_initializer().run()

            samples = list()
            if random_latent is None:
                while True:
                    samples.append(self.model_graph._sampling_reconst(session=session, std_scales=std_scales)[0])
                    if len(samples) >= (100//self.config.batch_size)+1:
                        samples = da.vstack(samples)
                        samples = samples[:100]
                        break

            else:
                samples = self.batch_function(aux_fun, random_latent)

        scaler = MinMaxScaler()
        return scaler.fit_transform(samples.flatten().reshape(-1, 1).astype(np.float32)).reshape(samples.shape)
Example #17
 def _transform(self, X, inverse=False):
     X = X.copy()  # ...
     transformed = [self._transform_col(X[:, feature_idx],
                                        self.quantiles_[:, feature_idx],
                                        inverse)
                    for feature_idx in range(X.shape[1])]
     return da.vstack(transformed).T
Example #18
    def select_points(self, x, n):
        """
        Get 'n' top ranked candidates according to maximin sampling to add to current samples x
        
        Parameters
        ----------
        x : vector or array-like
            existing design to which a new point must be added
        n : integer
            number of new samples to be selected
        
        Returns
        -------
        list of dask.delayed
            the selected candidate points
        """
        x = da.from_array(x, chunks='auto')
        c = []
        for idx in range(0, n):
            c_new = self.select_point(x)
            x = da.vstack((x, c_new))
            c.append(c_new.to_delayed()[0])

        if self.use_logger:
            self.logger.info(
                "Maximin sequential design: selected {0} new samples".format(
                    n))
        return c
Example #19
def polyfit(array, deg=1, dim=None, coord=None):
    """
	Least squares polynomial fit.
	Fit a polynomial ``p(x) = p[deg] * x ** deg + ... + p[0]`` of degree `deg`
	Returns a vector of coefficients `p` that minimises the squared error.

	Parameters
	----------
	x : xarray.DataArray
		The array to fit
	deg : int, optional
		Degree of the fitting polynomial, Default is 1.
	dim : str, optional
		The dimension along which the data will be fitted. If not precised,
		the first dimension will be used
	coord : xarray.Coordinate, optional
		The coordinates used to based the fitting on.

	Returns
	-------
	output : xarray.DataArray
		Polynomial coefficients with a new dimension to sort the polynomial
		coefficients by degree
	"""
    if dim is None:
        dim = array.dims[0]
    # Re-order the array to place the fitting dimension as the first dimension
    # + stack the other dimensions
    array_stacked = _order_and_stack(array, dim)
    dim_chunk = array.chunks[array.get_axis_num(dim)][0]

    if coord is None:
        coord = array[dim]
    if pd.core.common.is_datetime64_dtype(coord.data):
        # Multiply by 1e-9 to scale nanoseconds to seconds (by default, xarray
        # stores datetimes in nanoseconds)
        t = coord.data.astype('f8') * 1e-9
    else:
        t = coord.data
    # Build coefficient matrix for the fit
    x = da.vstack([t**d for d in range(deg + 1)]).T
    x = x.rechunk((dim_chunk, deg + 1))
    # Solve the least-square system
    p, err, _, _ = da.linalg.lstsq(x, array_stacked.data)
    # TO DO: Compute and store the errors associated to the fit
    # Store the result in a DataArray object
    new_dims = ('degree', ) + array_stacked.dims[1:]
    new_coords = {
        co: array_stacked.coords[co]
        for co in array_stacked.coords if co is not dim
    }
    ds = xr.DataArray(p,
                      name='polynomial_coefficients',
                      coords=new_coords,
                      dims=new_dims)
    ds = ds.assign_coords(degree=range(deg + 1))
    coeffs = _unstack(ds)
    return coeffs
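The heart of `polyfit` is building a Vandermonde-style coefficient matrix with `da.vstack` and solving it with `da.linalg.lstsq`; a minimal sketch of just that step on synthetic data, without the xarray wrapping:

import dask.array as da

t = da.linspace(0.0, 1.0, 100, chunks=50)
y = 2.0 * t + 1.0                                 # a line: slope 2, intercept 1
deg = 1
x = da.vstack([t**d for d in range(deg + 1)]).T   # columns: t**0, t**1
x = x.rechunk((50, deg + 1))                      # keep columns in one block for lstsq
p, err, rank, sv = da.linalg.lstsq(x, y)
print(p.compute())                                # approximately [1.0, 2.0]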
Example #20
def dask_array_resolver(obj, resolver, **kw):
    def get_partition(obj_id):
        client = vineyard.connect()
        np_value = client.get(obj_id)
        return da.from_array(np_value)

    meta = obj.meta
    num = int(meta['partitions_-size'])
    dask_client = Client(kw['dask_scheduler'])
    futures = []
    indices = []
    with_index = True
    for i in range(num):
        ts = meta.get_member('partitions_-%d' % i)
        instance_id = int(ts.meta['instance_id'])

        partition_index = json.loads(ts.meta['partition_index_'])
        if partition_index:
            indices.append((partition_index[0], partition_index[1], i))
        else:
            with_index = False

        futures.append(
            # we require the 1-on-1 alignment of vineyard instances and dask workers.
            # vineyard_sockets maps vineyard instance_ids into ipc_sockets, while
            # dask_workers maps vineyard instance_ids into names of dask workers.
            dask_client.submit(get_partition,
                               ts.meta.id,
                               workers={kw['dask_workers'][instance_id]}))

    arrays = dask_client.gather(futures)
    if with_index:
        indices = list(sorted(indices))
        nx = indices[-1][0] + 1
        ny = indices[-1][1] + 1
        assert nx * ny == num
        rows = []
        for i in range(nx):
            cols = []
            for j in range(ny):
                cols.append(arrays[indices[i * ny + j][2]])
            rows.append(da.hstack(cols))
        return da.vstack(rows)

    return da.vstack(arrays)
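The row/column reassembly at the end can be exercised on its own; a toy sketch with four 2x2 numpy-backed blocks arranged in a 2x2 grid:

import numpy as np
import dask.array as da

blocks = [[da.from_array(np.full((2, 2), 2 * i + j)) for j in range(2)]
          for i in range(2)]
rows = [da.hstack(cols) for cols in blocks]   # glue the blocks of each row side by side
full = da.vstack(rows)                        # then stack the rows; shape (4, 4)
print(full.compute())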
Example #21
 def _transform(self,
                X: Union[ArrayLike, DataFrameType],
                inverse: bool = False) -> Union[ArrayLike, DataFrameType]:
     X = X.copy()  # ...
     transformed = [
         self._transform_col(X[:, feature_idx],
                             self.quantiles_[:, feature_idx], inverse)
         for feature_idx in range(X.shape[1])
     ]
     return da.vstack(transformed, allow_unknown_chunksizes=True).T
Example #22
 def _transform(self, X, inverse=False):
     X = X.copy()  # ...
     transformed = [
         self._transform_col(X[:, feature_idx],
                             self.quantiles_[:, feature_idx], inverse)
         for feature_idx in range(X.shape[1])
     ]
     if DASK_110:
         kwargs = {"allow_unknown_chunksizes": True}
     else:
         kwargs = {}
     return da.vstack(transformed, **kwargs).T
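The `allow_unknown_chunksizes` flag (only available in newer dask, hence the `DASK_110` guard above) matters when the stacked pieces come from operations whose output length dask cannot know ahead of time, such as boolean masking. A small sketch; note the masked pieces here happen to have equal, aligned lengths, so the compute succeeds:

import dask.array as da

x = da.arange(10, chunks=5)
mask = x > 3                     # data-dependent, so the result length is unknown
y1 = x[mask]                     # chunks of unknown (NaN) size
y2 = (x * 2)[mask]
stacked = da.vstack([y1, y2], allow_unknown_chunksizes=True)
print(stacked.compute())         # [[4 5 6 7 8 9], [8 10 12 14 16 18]]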
Example #23
 def _assemble_azimuth_noise_blocks(self, chunks):
     """Assemble the azimuth noise blocks into one single array."""
     # The strategy here is a bit convoluted. The job would be trivial if
     # performed on regular numpy arrays, but here we want to keep the data
     # as xarray/dask array as much as possible.
     # Using a pure xarray approach was tested (with `combine_first`,
     # `interpolate_na`, etc), but was found to be memory-hungry at the time
     # of implementation (March 2021). Hence the usage of a custom algorithm,
     # relying mostly on dask arrays.
     slices = self._create_dask_slices_from_blocks(chunks)
     populated_array = da.vstack(slices).rechunk(chunks)
     populated_array = xr.DataArray(populated_array, dims=['y', 'x'],
                                    coords={'x': np.arange(self._image_shape[1]),
                                            'y': np.arange(self._image_shape[0])})
     return populated_array
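A stripped-down sketch of the same assembly step with made-up slice shapes: vertical slices are stacked, rechunked, and wrapped in an `xarray.DataArray` with pixel coordinates.

import numpy as np
import dask.array as da
import xarray as xr

chunks = 128
slices = [da.zeros((100, 200), chunks=chunks), da.ones((156, 200), chunks=chunks)]
populated = da.vstack(slices).rechunk(chunks)
arr = xr.DataArray(populated, dims=['y', 'x'],
                   coords={'x': np.arange(200), 'y': np.arange(256)})
print(arr.shape)    # (256, 200)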
Example #24
def calibrate_posterior_predictive(post_pred, qc):
    """ Function to calibrate posterior predictive.

    This allows the calibrated model to make predictions. This function is required to compute
    mean and log likelihood of the calibrated model.

    Args:
        post_pred: posterior predictive of shape (num samples, num X values)
        qc: calibration object as defined in class QuantileCalibration

    Returns:
        calibrated posterior predictive of shape (num samples, num X values)
    """

    # Need to convert from jax array to dask array to avoid
    # out of memory error (on a 32GB machine for 8000 samples) in the next step.
    # This also helps to parallelize the task to all cpu cores.
    post_pred_shape = post_pred.shape
    res_main_post_pred = da.from_array(
        np.array(post_pred),
        chunks=(
            1000,  # reduce this value if out of memory!
            np.ceil(post_pred_shape[1] / dask.system.cpu_count()),
        ),
    )
    # expand to 3D: axis 0: num observations; axis 1: num samples; axis 2: num samples
    uncalibrated_pp_quantiles = (
        da.sum(res_main_post_pred.T[:, :, np.newaxis] <=
               res_main_post_pred.T[:, np.newaxis, :],
               axis=1).T / post_pred_shape[0])

    # calculate inverse R
    inverse_calibrated_pp_quantiles = da.apply_along_axis(
        qc.inverse_transform, 0, uncalibrated_pp_quantiles)

    # inverse CDF by looking up existing samples with np.quantile()
    da_combined = da.vstack(
        [res_main_post_pred,
         inverse_calibrated_pp_quantiles.compute()])
    calibrated_post_pred = da.apply_along_axis(
        lambda q: np.quantile(
            q[:post_pred_shape[0]], q[post_pred_shape[0]:], axis=0),
        0,
        da_combined,
    ).compute()

    return calibrated_post_pred
Example #25
def scatter_with_regression(
    x: da.Array,
    y: da.Array,
    sample_size: int,
    k: Optional[int] = None
) -> Tuple[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array],
           Optional[da.Array]]:
    """Calculate pearson correlation on 2 given arrays.

    Parameters
    ----------
    xarr : da.Array
    yarr : da.Array
    sample_size : int
    k : Optional[int] = None
        Highlight k points which influence pearson correlation most
    """
    if k == 0:
        raise ValueError("k should be larger than 0")

    xp1 = da.vstack([x, da.ones_like(x)]).T
    xp1 = xp1.rechunk((xp1.chunks[0], -1))

    mask = ~(da.isnan(x) | da.isnan(y))
    # if chunk size in the first dimension is 1, lstsq will use sfqr instead of tsqr,
    # where the former does not support nan in shape.

    if len(xp1.chunks[0]) == 1:
        xp1 = xp1.rechunk((2, -1))
        y = y.rechunk((2, -1))
        mask = mask.rechunk((2, -1))

    (coeffa, coeffb), _, _, _ = da.linalg.lstsq(xp1[mask], y[mask])

    if sample_size < x.shape[0]:
        samplesel = da.random.choice(x.shape[0],
                                     int(sample_size),
                                     chunks=x.chunksize)
        x = x[samplesel]
        y = y[samplesel]

    if k is None:
        return (coeffa, coeffb), (x, y), None

    influences = pearson_influence(x, y)
    return (coeffa, coeffb), (x, y), influences
Example #26
    def mask_seed_region(self, parts, mask_img, medoid_coords, num_workers=16):
        """
        Generate encoders such that all voxels included in the cluster of the
        medoid are assigned a value of one and zero otherwise

        :param parts: Dask array, shape (n_sliding_windows, n_voxels)
            Parcellations across sliding windows
        :param mask_img: Nifti Image, shape (x, y, z)
            Mask of the data
        :param medoid_coords: tuple, shape (x, y, z)
            Medoid of a region
        :param num_workers: int
            Number of worker processes used to build the masked parcellations
        :return: Dask array, shape (n_sliding_windows, n_voxels)
            Masked parcellations for one medoid
        """

        n_sw = parts.shape[0]

        mask_medoid = self.__get_mask_medoid(mask_img, medoid_coords)

        tmp = np.unique(mask_medoid)

        if tmp.shape[0] == 1:

            return None

        else:

            with closing(Pool(processes=num_workers)) as p:

                l_masked_parts = p.starmap(self.get_onhot_vect,
                                           [(parts, mask_medoid, sw_idx)
                                            for sw_idx in range(0, n_sw)])

            del mask_medoid

            gc.collect()

            darr_masked_parts = da.vstack(l_masked_parts)

            del l_masked_parts
            gc.collect()

            return darr_masked_parts
Example #27
def compute_importance_gbt(x, y, x_test, y_test):
    """Compute importance based on gradient boosted trees."""
    print("Computing importance based on gradient boosted trees ... ")
    num_factors = y.shape[1]
    #num_codes = x.shape[0]
    importance_matrix = list()
    train_loss = []
    test_loss = []
    for i in range(num_factors):
        model = GradientBoostingClassifier(verbose=1)
        model.fit(x, y[:, i])

        importance_matrix.append(np.abs(model.feature_importances_))
        train_loss.append(da.mean(model.predict(x) == y[:, i]))
        test_loss.append(da.mean(model.predict(x_test) == y_test[:, i]))

    return da.vstack(importance_matrix), np.mean(train_loss), np.mean(
        test_loss)
Example #28
def scatter_with_regression(
    xarr: da.Array,
    yarr: da.Array,
    sample_size: int,
    k: Optional[int] = None
) -> Tuple[Tuple[float, float], dd.DataFrame, Optional[np.ndarray]]:
    """
    Calculate the Pearson correlation of two given arrays.

    Parameters
    ----------
    xarr : da.Array
    yarr : da.Array
    sample_size : int
    k : Optional[int] = None
        Highlight the k points which influence the Pearson correlation most

    Returns
    -------
    Intermediate
    """
    if k == 0:
        raise ValueError("k should be larger than 0")

    mask = ~(da.isnan(xarr) | da.isnan(yarr))
    xarr = da.from_array(np.array(xarr)[mask])
    yarr = da.from_array(np.array(yarr)[mask])
    xarrp1 = da.vstack([xarr, da.ones_like(xarr)]).T
    xarrp1 = xarrp1.rechunk((xarrp1.chunks[0], -1))
    (coeffa, coeffb), _, _, _ = da.linalg.lstsq(xarrp1, yarr)

    if sample_size < len(xarr):
        samplesel = np.random.choice(len(xarr), int(sample_size))
        xarr = xarr[samplesel]
        yarr = yarr[samplesel]

    df = dd.concat([dd.from_dask_array(arr) for arr in [xarr, yarr]], axis=1)
    df.columns = ["x", "y"]

    if k is None:
        return (coeffa, coeffb), df, None

    influences = pearson_influence(xarr, yarr)
    return (coeffa, coeffb), df, influences
Example #29
def process_data(X, y=None, test_size=0.20, dummies=False):
    if y is None:
        y = da.ones(X.shape[0])
    y_uniqs = np.unique(y)

    len_ = X.shape[0]
    X = prepare_dataset(X)

    if dummies:
        y = dd.get_dummies(y)

    shape_ = list(X.shape[1:])

    samples = list()
    for _ in range(10):
        for y_uniq in y_uniqs:
            sample = list()
            for xa, ya in zip(chunks(X, 10), chunks(y, 10)):
                try:
                    sample.append([xa[ya == y_uniq][random.randint(0, len(xa[ya == y_uniq]) - 1)]])
                    if len(sample) >= 500:
                        break
                except Exception:
                    # no sample of this class in the current chunk
                    pass
            samples += sample
    samples = da.vstack(samples)

    X_train, X_test, y_train, y_test = train_test_split(X.flatten().reshape(len_, -1), y, test_size=test_size,
                                                        random_state=4891)

    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)

    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)

    train_dataset.samples = samples
    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset
Example #30
def SkyToUnitSphere(ra, dec, degrees=True):
    """
    Convert sky coordinates (``ra``, ``dec``) to Cartesian coordinates on
    the unit sphere.

    Parameters
    ----------
    ra : :class:`dask.array.Array`; shape: (N,)
        the right ascension angular coordinate
    dec : :class:`dask.array.Array`; shape: (N,)
        the declination angular coordinate
    degrees : bool, optional
        specifies whether ``ra`` and ``dec`` are in degrees or radians

    Returns
    -------
    pos : :class:`dask.array.Array`; shape: (N,3)
        the cartesian position coordinates, where columns represent
        ``x``, ``y``, and ``z``

    Raises
    ------
    TypeError
        If the input columns are not dask arrays
    """
    if not all(isinstance(col, da.Array) for col in [ra, dec]):
        raise TypeError("both ``ra`` and ``dec`` must be dask arrays")

    # put into radians from degrees
    if degrees:
        ra  = da.deg2rad(ra)
        dec = da.deg2rad(dec)

    # cartesian coordinates
    x = da.cos( dec ) * da.cos( ra )
    y = da.cos( dec ) * da.sin( ra )
    z = da.sin( dec )
    return da.vstack([x,y,z]).T
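A quick usage sketch for `SkyToUnitSphere` as defined above, using two easy directions:

import numpy as np
import dask.array as da

ra = da.from_array(np.array([0.0, 90.0]), chunks=1)
dec = da.from_array(np.array([0.0, 0.0]), chunks=1)
pos = SkyToUnitSphere(ra, dec)
print(pos.compute())   # approximately [[1, 0, 0], [0, 1, 0]]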
Example #31
def StackColumns(*cols):
    """
    Stack the input dask arrays vertically, column by column.

    This uses :func:`dask.array.vstack`.

    Parameters
    ----------
    *cols : :class:`dask.array.Array`
        the dask arrays to stack vertically together

    Returns
    -------
    :class:`dask.array.Array` :
        the dask array where columns correspond to the input arrays

    Raises
    ------
    TypeError
        If the input columns are not dask arrays
    """
    cols = da.broadcast_arrays(*cols)
    return da.vstack(cols).T
Example #32
def StackColumns(*cols):
    """
    Stack the input dask arrays vertically, column by column.

    This uses :func:`dask.array.vstack`.

    Parameters
    ----------
    *cols : :class:`dask.array.Array`
        the dask arrays to stack vertically together

    Returns
    -------
    :class:`dask.array.Array` :
        the dask array where columns correspond to the input arrays

    Raises
    ------
    TypeError
        If the input columns are not dask arrays
    """
    cols = da.broadcast_arrays(*cols)
    return da.vstack(cols).T
Example #33
def SkyToUnitSphere(ra, dec, degrees=True, frame='icrs'):
    """
    Convert sky coordinates (``ra``, ``dec``) to Cartesian coordinates on
    the unit sphere.

    Parameters
    ----------
    ra : :class:`dask.array.Array`; shape: (N,)
        the right ascension angular coordinate
    dec : :class:`dask.array.Array`; shape: (N,)
        the declination angular coordinate
    degrees : bool, optional
        specifies whether ``ra`` and ``dec`` are in degrees or radians
    frame : string ('icrs' or 'galactic')
        specifies which frame the Cartesian coordinates are in. Useful if you
        know the simulation (usually Cartesian) is in galactic coordinates but
        you want to convert to the icrs (ra, dec) frame usually used in surveys.

    Returns
    -------
    pos : :class:`dask.array.Array`; shape: (N,3)
        the cartesian position coordinates, where columns represent
        ``x``, ``y``, and ``z``

    Raises
    ------
    TypeError
        If the input columns are not dask arrays
    """
    ra, dec = da.broadcast_arrays(ra, dec)

    if frame == 'icrs':
        # no frame transformation
        # put into radians from degrees
        if degrees:
            ra  = da.deg2rad(ra)
            dec = da.deg2rad(dec)

        # cartesian coordinates
        x = da.cos( dec ) * da.cos( ra )
        y = da.cos( dec ) * da.sin( ra )
        z = da.sin( dec )
        return da.vstack([x,y,z]).T
    else:
        from astropy.coordinates import SkyCoord

        if degrees:
            ra  = da.deg2rad(ra)
            dec = da.deg2rad(dec)

        def eq_to_cart(ra, dec):
            try:
                sc = SkyCoord(ra, dec, unit='rad', representation_type='unitspherical', frame='icrs')
            except Exception:
                # older astropy versions use the ``representation`` keyword instead
                sc = SkyCoord(ra, dec, unit='rad', representation='unitspherical', frame='icrs')

            scg = sc.transform_to(frame=frame)
            scg = scg.cartesian

            x, y, z = scg.x.value, scg.y.value, scg.z.value
            return numpy.stack([x, y, z], axis=1)

        arr = da.apply_gufunc(eq_to_cart, '(),()->(p)', ra, dec, output_dtypes=[ra.dtype], output_sizes={'p': 3})
        return arr
Example #34
 def vstack(self, *others, **kwargs):
     others = tuple(ensure_dask_array(d) for d in others)
     tup = (self,) + others
     out = da.vstack(tup)
     return view_subclass(out, type(self))