def _expand_tiepoint_array_5km(self, arr, lines, cols):
    arr = da.repeat(arr, lines * 2, axis=1)
    arr = da.repeat(arr.reshape((-1, self.cscan_full_width - 1)), cols, axis=1)
    if self.cscan_full_width == 271:
        return da.hstack((arr[:, :2], arr, arr[:, -2:]))
    else:
        return da.hstack((arr[:, :2], arr, arr[:, -5:], arr[:, -2:]))
Example #2
def navigate_dnb(h5f):

    scans = h5f.get_node("/All_Data/NumberOfScans").read()[0]
    geo_dset = h5f.get_node("/All_Data/VIIRS-DNB-GEO_All")
    all_c_align = geo_dset.AlignmentCoefficient.read()[np.newaxis,
                                                       np.newaxis, :,
                                                       np.newaxis]
    all_c_exp = geo_dset.ExpansionCoefficient.read()[np.newaxis, np.newaxis, :,
                                                     np.newaxis]
    all_lon = geo_dset.Longitude.read()
    all_lat = geo_dset.Latitude.read()

    res = []

    # FIXME: this supposes there is only one tiepoint zone in the
    # track direction
    scan_size = h5f.get_node_attr("/All_Data/VIIRS-DNB-SDR_All",
                                  "TiePointZoneSizeTrack")[0]
    track_offset = h5f.get_node_attr("/All_Data/VIIRS-DNB-SDR_All",
                                     "PixelOffsetTrack")[0]
    scan_offset = h5f.get_node_attr("/All_Data/VIIRS-DNB-SDR_All",
                                    "PixelOffsetScan")[0]

    try:
        group_locations = geo_dset.TiePointZoneGroupLocationScanCompact.read()
    except KeyError:
        group_locations = [0]
    param_start = 0
    for tpz_size, nb_tpz, start in \
        zip(h5f.get_node_attr("/All_Data/VIIRS-DNB-SDR_All",
                              "TiePointZoneSizeScan"),
            geo_dset.NumberOfTiePointZonesScan.read(),
            group_locations):
        lon = all_lon[:, start:start + nb_tpz + 1]
        lat = all_lat[:, start:start + nb_tpz + 1]
        c_align = all_c_align[:, :, param_start:param_start + nb_tpz, :]
        c_exp = all_c_exp[:, :, param_start:param_start + nb_tpz, :]
        param_start += nb_tpz
        nties = nb_tpz
        if (np.max(lon) - np.min(lon) > 90) or (np.max(abs(lat)) > 60):
            x, y, z = lonlat2xyz(lon, lat)
            x, y, z = (expand_array(x, scans, c_align, c_exp, scan_size,
                                    tpz_size, nties, track_offset,
                                    scan_offset),
                       expand_array(y, scans, c_align, c_exp, scan_size,
                                    tpz_size, nties, track_offset,
                                    scan_offset),
                       expand_array(z, scans, c_align, c_exp, scan_size,
                                    tpz_size, nties, track_offset,
                                    scan_offset))
            res.append(xyz2lonlat(x, y, z))
        else:
            res.append(
                (expand_array(lon, scans, c_align, c_exp, scan_size, tpz_size,
                              nties, track_offset, scan_offset),
                 expand_array(lat, scans, c_align, c_exp, scan_size, tpz_size,
                              nties, track_offset, scan_offset)))
    lons, lats = zip(*res)
    return da.hstack(lons), da.hstack(lats)
Example #3
def _expand_tiepoint_array_5km(self, arr, lines, cols):
    arr = da.repeat(arr, lines * 2, axis=1)
    arr = da.repeat(arr.reshape((-1, self.cscan_full_width - 1)), cols, axis=1)
    factor = self.fscan_width // self.cscan_width
    if self.cscan_full_width == 271:
        return da.hstack((arr[:, :2 * factor], arr, arr[:, -2 * factor:]))
    else:
        return da.hstack((arr[:, :2 * factor], arr,
                          arr[:, -self.fscan_width:], arr[:, -2 * factor:]))
Example #4
def test_hstack():
    x = np.arange(5)
    y = np.ones(5)
    a = da.arange(5, chunks=2)
    b = da.ones(5, chunks=2)

    assert_eq(np.hstack((x[None, :], y[None, :])), da.hstack((a[None, :], b[None, :])))
    assert_eq(np.hstack((x, y)), da.hstack((a, b)))
Example #5
def test_hstack():
    x = np.arange(5)
    y = np.ones(5)
    a = da.arange(5, chunks=2)
    b = da.ones(5, chunks=2)

    assert_eq(np.hstack((x[None, :], y[None, :])),
              da.hstack((a[None, :], b[None, :])))
    assert_eq(np.hstack((x, y)), da.hstack((a, b)))
Example #6
def navigate_dnb(h5f):

    scans = h5f.get_node("/All_Data/NumberOfScans").read()[0]
    geo_dset = h5f.get_node("/All_Data/VIIRS-DNB-GEO_All")
    all_c_align = geo_dset.AlignmentCoefficient.read()[
        np.newaxis, np.newaxis, :, np.newaxis]
    all_c_exp = geo_dset.ExpansionCoefficient.read()[np.newaxis, np.newaxis, :,
                                                     np.newaxis]
    all_lon = geo_dset.Longitude.read()
    all_lat = geo_dset.Latitude.read()

    res = []

    # FIXME: this supposes there is only one tiepoint zone in the
    # track direction
    scan_size = h5f.get_node_attr("/All_Data/VIIRS-DNB-SDR_All",
                                  "TiePointZoneSizeTrack")[0]
    track_offset = h5f.get_node_attr("/All_Data/VIIRS-DNB-SDR_All",
                                     "PixelOffsetTrack")[0]
    scan_offset = h5f.get_node_attr("/All_Data/VIIRS-DNB-SDR_All",
                                    "PixelOffsetScan")[0]

    try:
        group_locations = geo_dset.TiePointZoneGroupLocationScanCompact.read()
    except KeyError:
        group_locations = [0]
    param_start = 0
    for tpz_size, nb_tpz, start in \
        zip(h5f.get_node_attr("/All_Data/VIIRS-DNB-SDR_All",
                              "TiePointZoneSizeScan"),
            geo_dset.NumberOfTiePointZonesScan.read(),
            group_locations):
        lon = all_lon[:, start:start + nb_tpz + 1]
        lat = all_lat[:, start:start + nb_tpz + 1]
        c_align = all_c_align[:, :, param_start:param_start + nb_tpz, :]
        c_exp = all_c_exp[:, :, param_start:param_start + nb_tpz, :]
        param_start += nb_tpz
        nties = nb_tpz
        if (np.max(lon) - np.min(lon) > 90) or (np.max(abs(lat)) > 60):
            x, y, z = lonlat2xyz(lon, lat)
            x, y, z = (
                expand_array(x, scans, c_align, c_exp, scan_size, tpz_size,
                             nties, track_offset, scan_offset),
                expand_array(y, scans, c_align, c_exp, scan_size, tpz_size,
                             nties, track_offset, scan_offset),
                expand_array(z, scans, c_align, c_exp, scan_size, tpz_size,
                             nties, track_offset, scan_offset))
            res.append(xyz2lonlat(x, y, z))
        else:
            res.append(
                (expand_array(lon, scans, c_align, c_exp, scan_size, tpz_size,
                              nties, track_offset, scan_offset),
                 expand_array(lat, scans, c_align, c_exp, scan_size, tpz_size,
                              nties, track_offset, scan_offset)))
    lons, lats = zip(*res)
    return da.hstack(lons), da.hstack(lats)
Example #7
    def _create_dask_slice_from_block_line(self, current_line, chunks):
        """Create a dask slice from the blocks at the current line."""
        pieces = self._get_array_pieces_for_current_line(current_line)
        dask_pieces = self._get_padded_dask_pieces(pieces, chunks)
        new_slice = da.hstack(dask_pieces)

        return new_slice
    def _sampling_reconst(self, session, std_scales, random_latent=None):

        if random_latent is None:
            random_latent = list()
            for m, sig, sc in zip(self.config.latent_mean,
                                  self.config.latent_std, std_scales):
                random_latent.append(
                    session.run(
                        tf.random_normal((self.config.batch_size, 1),
                                         m,
                                         sc * sig,
                                         dtype=tf.float32)))
            random_latent = da.hstack(random_latent)
        else:
            try:
                random_latent = random_latent.compute()
            except AttributeError:
                # random_latent is already an in-memory array; use it as-is
                pass
            for m, sig, sc, ic in zip(self.config.latent_mean,
                                      self.config.latent_std, std_scales,
                                      range(random_latent.shape[0])):
                random_latent[:, ic] = m + (sc * np.sqrt(sig) *
                                            random_latent[:, ic])

        tensors = [self.x_recons]
        feed_dict = {self.latent_batch: random_latent}

        return session.run(tensors, feed_dict=feed_dict)
def test_func(default_val, dataset_flat, shape, dataset):
    shift_up = array.hstack([
        array.zeros((shape[0], 1, shape[2])), dataset[:, :-1, :]
    ]).transpose([1, 2, 0]).reshape([shape[1] * shape[2], -1])

    shift_up_mult = dataset_flat * shift_up
    del shift_up
    return array.mean(shift_up_mult, axis=1)
def _expand_tiepoint_array_1km(self, arr, lines, cols):
    arr = da.repeat(arr, lines, axis=1)
    arr = da.concatenate(
        (arr[:, :lines // 2, :], arr, arr[:, -(lines // 2):, :]), axis=1)
    arr = da.repeat(arr.reshape((-1, self.cscan_full_width - 1)),
                    cols,
                    axis=1)
    return da.hstack((arr, arr[:, -cols:]))
Example #11
    def angles(self, azi_name, zen_name):

        all_lat = self.geostuff["Latitude"].value
        all_zen = self.geostuff[zen_name].value
        all_azi = self.geostuff[azi_name].value

        res = []

        param_start = 0
        for tpz_size, nb_tpz, start in zip(self.tpz_sizes, self.nb_tpzs,
                                           self.group_locations):
            lat = all_lat[:, start:start + nb_tpz + 1]
            zen = all_zen[:, start:start + nb_tpz + 1]
            azi = all_azi[:, start:start + nb_tpz + 1]

            c_align = self.c_align[:, :, param_start:param_start + nb_tpz, :]
            c_exp = self.c_exp[:, :, param_start:param_start + nb_tpz, :]

            param_start += nb_tpz

            if (np.max(azi) - np.min(azi) > 5) or (np.min(zen) < 10) or (
                    np.max(abs(lat)) > 80):
                expanded = []
                for data in angle2xyz(azi, zen):
                    expanded.append(
                        expand_array(data, self.scans, c_align, c_exp,
                                     self.scan_size, tpz_size, nb_tpz,
                                     self.track_offset, self.scan_offset))

                azi, zen = xyz2angle(*expanded)
                res.append((azi, zen))
            else:
                expanded = []
                for data in (azi, zen):
                    expanded.append(
                        expand_array(data, self.scans, c_align, c_exp,
                                     self.scan_size, tpz_size, nb_tpz,
                                     self.track_offset, self.scan_offset))
                res.append(expanded)

        azi, zen = zip(*res)
        return da.hstack(azi), da.hstack(zen)
Example #12
    def angles(self, azi_name, zen_name):
        """Compute the angle datasets."""
        all_lat = da.from_array(self.geostuff["Latitude"])
        all_lon = da.from_array(self.geostuff["Longitude"])
        all_zen = da.from_array(self.geostuff[zen_name])
        all_azi = da.from_array(self.geostuff[azi_name])

        res = []
        param_start = 0
        for tpz_size, nb_tpz, start in zip(self.tpz_sizes, self.nb_tpzs,
                                           self.group_locations):
            lat = all_lat[:, start:start + nb_tpz + 1]
            lon = all_lon[:, start:start + nb_tpz + 1]
            zen = all_zen[:, start:start + nb_tpz + 1]
            azi = all_azi[:, start:start + nb_tpz + 1]

            c_align = self.c_align[:, :, param_start:param_start + nb_tpz, :]
            c_exp = self.c_exp[:, :, param_start:param_start + nb_tpz, :]

            param_start += nb_tpz

            if (np.max(azi) - np.min(azi) > 5) or (np.min(zen) < 10) or (
                    np.max(abs(lat)) > 80):
                expanded = []
                cart = convert_from_angles(azi, zen, lon, lat)
                for data in cart:
                    expanded.append(expand_array(
                        data, self.scans, c_align, c_exp, self.scan_size,
                        tpz_size, nb_tpz, self.track_offset, self.scan_offset))

                azi, zen = convert_to_angles(*expanded, lon=self.lons, lat=self.lats)
                res.append((azi, zen))
            else:
                expanded = []
                for data in (azi, zen):
                    expanded.append(expand_array(
                        data, self.scans, c_align, c_exp, self.scan_size,
                        tpz_size, nb_tpz, self.track_offset, self.scan_offset))
                res.append(expanded)

        azi, zen = zip(*res)
        return da.hstack(azi), da.hstack(zen)
Example #13
    def _create_dask_slice_from_block_line(self, current_line, chunks):
        """Create a dask slice from the blocks at the current line."""
        current_blocks = self._find_blocks_covering_line(current_line)
        current_blocks.sort(key=(lambda x: x.coords['x'][0]))

        next_line = min((arr.coords['y'][-1] for arr in current_blocks))
        current_y = np.arange(current_line, next_line + 1)

        pieces = [arr.sel(y=current_y) for arr in current_blocks]
        dask_pieces = self._get_padded_dask_pieces(pieces, chunks)
        new_slice = da.hstack(dask_pieces)
        return new_slice
Example #14
    def angles(self, azi_name, zen_name):

        all_lat = self.geostuff["Latitude"].value
        all_zen = self.geostuff[zen_name].value
        all_azi = self.geostuff[azi_name].value

        res = []

        param_start = 0
        for tpz_size, nb_tpz, start in zip(self.tpz_sizes, self.nb_tpzs,
                                           self.group_locations):
            lat = all_lat[:, start:start + nb_tpz + 1]
            zen = all_zen[:, start:start + nb_tpz + 1]
            azi = all_azi[:, start:start + nb_tpz + 1]

            c_align = self.c_align[:, :, param_start:param_start + nb_tpz, :]
            c_exp = self.c_exp[:, :, param_start:param_start + nb_tpz, :]

            param_start += nb_tpz

            if (np.max(azi) - np.min(azi) > 5) or (np.min(zen) < 10) or (
                    np.max(abs(lat)) > 80):
                expanded = []
                for data in angle2xyz(azi, zen):
                    expanded.append(expand_array(
                        data, self.scans, c_align, c_exp, self.scan_size,
                        tpz_size, nb_tpz, self.track_offset, self.scan_offset))

                azi, zen = xyz2angle(*expanded)
                res.append((azi, zen))
            else:
                expanded = []
                for data in (azi, zen):
                    expanded.append(expand_array(
                        data, self.scans, c_align, c_exp, self.scan_size,
                        tpz_size, nb_tpz, self.track_offset, self.scan_offset))
                res.append(expanded)

        azi, zen = zip(*res)
        return da.hstack(azi), da.hstack(zen)
Example #15
    def navigate(self):

        all_lon = self.geostuff["Longitude"].value
        all_lat = self.geostuff["Latitude"].value

        res = []
        param_start = 0
        for tpz_size, nb_tpz, start in zip(self.tpz_sizes, self.nb_tpzs,
                                           self.group_locations):

            lon = all_lon[:, start:start + nb_tpz + 1]
            lat = all_lat[:, start:start + nb_tpz + 1]

            c_align = self.c_align[:, :, param_start:param_start + nb_tpz, :]
            c_exp = self.c_exp[:, :, param_start:param_start + nb_tpz, :]

            param_start += nb_tpz

            expanded = []
            switch_to_cart = ((np.max(lon) - np.min(lon) > 90)
                              or (np.max(abs(lat)) > 60))

            if switch_to_cart:
                arrays = lonlat2xyz(lon, lat)
            else:
                arrays = (lon, lat)

            for data in arrays:
                expanded.append(
                    expand_array(data, self.scans, c_align, c_exp,
                                 self.scan_size, tpz_size, nb_tpz,
                                 self.track_offset, self.scan_offset))

            if switch_to_cart:
                res.append(xyz2lonlat(*expanded))
            else:
                res.append(expanded)
        lons, lats = zip(*res)
        return da.hstack(lons), da.hstack(lats)
def _expand_tiepoint_array_5km(self, arr, lines, cols):
    if self.level == 2:  # Repeat the last column to complete L2 data
        arr = da.dstack([arr, arr[:, :, -1]])
    arr = da.repeat(arr, lines * 2, axis=1)
    if self.level == 1:
        arr = da.repeat(arr.reshape((-1, self.cscan_full_width - 1)),
                        cols,
                        axis=1)
    elif self.level == 2:
        arr = da.repeat(arr.reshape((-1, self.cscan_full_width)),
                        cols,
                        axis=1)
    return da.hstack((arr[:, :2], arr, arr[:, -2:]))
Example #17
    def navigate(self):

        all_lon = self.geostuff["Longitude"].value
        all_lat = self.geostuff["Latitude"].value

        res = []
        param_start = 0
        for tpz_size, nb_tpz, start in zip(self.tpz_sizes, self.nb_tpzs,
                                           self.group_locations):

            lon = all_lon[:, start:start + nb_tpz + 1]
            lat = all_lat[:, start:start + nb_tpz + 1]

            c_align = self.c_align[:, :, param_start:param_start + nb_tpz, :]
            c_exp = self.c_exp[:, :, param_start:param_start + nb_tpz, :]

            param_start += nb_tpz

            expanded = []
            switch_to_cart = ((np.max(lon) - np.min(lon) > 90) or
                              (np.max(abs(lat)) > 60))

            if switch_to_cart:
                arrays = lonlat2xyz(lon, lat)
            else:
                arrays = (lon, lat)

            for data in arrays:
                expanded.append(expand_array(
                    data, self.scans, c_align, c_exp, self.scan_size,
                    tpz_size, nb_tpz, self.track_offset, self.scan_offset))

            if switch_to_cart:
                res.append(xyz2lonlat(*expanded))
            else:
                res.append(expanded)
        lons, lats = zip(*res)
        return da.hstack(lons), da.hstack(lats)
Example #18
def project_cone(K, x):
    s = x[0].compute()
    v = x[1:]
    norm_v = da.linalg.norm(v).compute()

    if norm_v <= -s:
        projx = 0 * x
    elif norm_v <= s:
        projx = 1 * x
    else:
        scal = 0.5 * (1 + s / norm_v)
        s = da.from_array(np.array([norm_v]), chunks=(1, ))
        projx = scal * da.hstack((s, v))
    return projx
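
A minimal usage sketch of the projection above, under the assumption that the cone argument K is unused by the function body (so None can be passed purely for illustration):

import numpy as np
import dask.array as da

x = da.from_array(np.array([1.0, 3.0, 4.0]), chunks=3)
projx = project_cone(None, x)          # scal = 0.5 * (1 + 1/5) = 0.6
print(projx.compute())                 # approximately [3.0, 1.8, 2.4]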
Example #19
    def navigate(self):
        """Generate lon and lat datasets."""
        all_lon = da.from_array(self.geostuff["Longitude"])
        all_lat = da.from_array(self.geostuff["Latitude"])

        res = []
        param_start = 0
        for tpz_size, nb_tpz, start in zip(self.tpz_sizes, self.nb_tpzs,
                                           self.group_locations):

            lon = all_lon[:, start:start + nb_tpz + 1]
            lat = all_lat[:, start:start + nb_tpz + 1]

            c_align = self.c_align[:, :, param_start:param_start + nb_tpz, :]
            c_exp = self.c_exp[:, :, param_start:param_start + nb_tpz, :]

            param_start += nb_tpz

            expanded = []

            if self.switch_to_cart:
                arrays = lonlat2xyz(lon, lat)
            else:
                arrays = (lon, lat)

            for data in arrays:
                expanded.append(
                    expand_array(data, self.scans, c_align, c_exp,
                                 self.scan_size, tpz_size, nb_tpz,
                                 self.track_offset, self.scan_offset))

            if self.switch_to_cart:
                res.append(xyz2lonlat(*expanded))
            else:
                res.append(expanded)
        lons, lats = zip(*res)
        return da.hstack(lons), da.hstack(lats)
Example #20
def setup_input(samples, input_pattern, seqid, field):
    log('Setting up input array ...')
    input_paths = [input_pattern.format(sample=s) for s in samples]
    input_stores = [zarr.ZipStore(ip, mode='r') for ip in input_paths]
    input_roots = [zarr.group(store) for store in input_stores]
    input_arrays = [
        root[s][seqid][field] for root, s in zip(input_roots, samples)
    ]
    input_arrays = [da.from_array(a, chunks=a.chunks) for a in input_arrays]

    # add a dimension so the hstack works: every array must have shape (X, 1)
    input_arrays = [a[:, None] if a.ndim == 1 else a for a in input_arrays]

    input_array = da.hstack(input_arrays)
    log('Input array:', input_array)
    return input_array
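
A minimal sketch, with assumed toy arrays, of the dimension trick noted above: 1-D inputs are promoted to column vectors so that da.hstack concatenates them side by side rather than end to end.

import dask.array as da

a = da.arange(4, chunks=2)                                   # shape (4,)
b = da.arange(4, chunks=2) * 10                              # shape (4,)
cols = [x[:, None] if x.ndim == 1 else x for x in (a, b)]
stacked = da.hstack(cols)
print(stacked.shape)                                         # (4, 2)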
Example #21
def dask_array_resolver(obj, resolver, **kw):
    def get_partition(obj_id):
        client = vineyard.connect()
        np_value = client.get(obj_id)
        return da.from_array(np_value)

    meta = obj.meta
    num = int(meta['partitions_-size'])
    dask_client = Client(kw['dask_scheduler'])
    futures = []
    indices = []
    with_index = True
    for i in range(num):
        ts = meta.get_member('partitions_-%d' % i)
        instance_id = int(ts.meta['instance_id'])

        partition_index = json.loads(ts.meta['partition_index_'])
        if partition_index:
            indices.append((partition_index[0], partition_index[1], i))
        else:
            with_index = False

        futures.append(
            # we require the 1-on-1 alignment of vineyard instances and dask workers.
            # vineyard_sockets maps vineyard instance_ids into ipc_sockets, while
            # dask_workers maps vineyard instance_ids into names of dask workers.
            dask_client.submit(get_partition,
                               ts.meta.id,
                               workers={kw['dask_workers'][instance_id]}))

    arrays = dask_client.gather(futures)
    if with_index:
        indices = list(sorted(indices))
        nx = indices[-1][0] + 1
        ny = indices[-1][1] + 1
        assert nx * ny == num
        rows = []
        for i in range(nx):
            cols = []
            for j in range(ny):
                cols.append(arrays[indices[i * ny + j][2]])
            rows.append(da.hstack(cols))
        return da.vstack(rows)

    return da.vstack(arrays)
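
A minimal sketch, assuming a 2x2 partition grid, of the reassembly pattern above: each row of partitions is stitched together with da.hstack, then the rows are stacked with da.vstack.

import dask.array as da

grid = [[da.ones((2, 2)), da.zeros((2, 3))],
        [da.zeros((3, 2)), da.ones((3, 3))]]
full = da.vstack([da.hstack(row) for row in grid])
print(full.shape)  # (5, 5)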
Example #22
    def _hstack(self, Xs):
        """
        Stacks X horizontally.

        Supports input types (X): list of
            numpy arrays, sparse arrays and DataFrames
        """
        types = set(type(X) for X in Xs)

        if self.sparse_output_:
            return sparse.hstack(Xs).tocsr()
        elif dd.Series in types or dd.DataFrame in types:
            return dd.concat(Xs, axis="columns")
        elif da.Array in types:
            return da.hstack(Xs)
        elif self.preserve_dataframe and (pd.Series in types or pd.DataFrame in types):
            return pd.concat(Xs, axis="columns")
        else:
            return np.hstack(Xs)
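
A hedged sketch, outside the class and with assumed inputs, of the type-based dispatch above: dask arrays are stacked with da.hstack, plain NumPy arrays with np.hstack.

import numpy as np
import dask.array as da

Xs = [da.ones((4, 2), chunks=2), da.zeros((4, 3), chunks=2)]
out = da.hstack(Xs) if da.Array in set(type(X) for X in Xs) else np.hstack(Xs)
print(out.shape)  # (4, 5)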
Example #23
def create_fft_freq(dt, nfft, full):
    """
    Creates the list of Fourier frequencies.
    Inputs:
    =======
    dt: float, sample spacing in seconds (i.e. the inverse of the sampling frequency)
    nfft: int, number of data points the FFT is applied to
    full: bool, if True, return the full DFT frequencies -fN...0...fN; if False, return only the half-DFT frequencies 0...fN

    Returns:
    ========
    freqs, ndarray(float): Frequencies of the Fourier Coefficients calculated by FFT.
    """
    freqs = da.fft.fftfreq(nfft, d=dt)

    if (nfft % 2 == 0):
        freqs = da.hstack(
            [freqs[0:nfft // 2], -freqs[nfft // 2], freqs[nfft // 2:nfft]])
    if full:
        freqs = da.fft.fftshift(freqs)
    else:
        freqs = freqs[0:nfft // 2 + 1]

    return (freqs)
Example #24
def read_bgen(
    path: PathType,
    metafile_path: Optional[PathType] = None,
    sample_path: Optional[PathType] = None,
    chunks: Union[str, int, Tuple[int, int, int]] = "auto",
    lock: bool = False,
    persist: bool = True,
    contig_dtype: DType = "str",
    gp_dtype: DType = "float32",
) -> Dataset:
    """Read BGEN dataset.

    Loads a single BGEN dataset as dask arrays within a Dataset
    from a ``.bgen`` file.

    Parameters
    ----------
    path
        Path to BGEN file.
    metafile_path
        Path to companion index file used to determine BGEN byte offsets.
        Defaults to ``path`` + ".metafile" if not provided.
        This file is necessary for reading BGEN genotype probabilities and it will be
        generated the first time the file is read if it does not already exist.
        If it needs to be created, it can make the first call to this function
        much slower than subsequent calls.
    sample_path
        Path to ``.sample`` file, by default None. This is used to fetch sample identifiers
        and when provided it is preferred over sample identifiers embedded in the ``.bgen`` file.
    chunks
        Chunk size for genotype probability data (3 dimensions),
        by default "auto".
    lock
        Whether or not to synchronize concurrent reads of
        file blocks, by default False. This is passed through to
        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
    persist
        Whether or not to persist variant information in memory, by default True.
        This is an important performance consideration as the metadata file for this data will
        be read multiple times when False.
    contig_dtype
        Data type for contig names, by default "str".
        This may also be an integer type (e.g. "int"), but will fail if any of the contig names
        cannot be converted to integers.
    gp_dtype
        Data type for genotype probabilities, by default "float32".

    Warnings
    --------
    Only bi-allelic, diploid BGEN files are currently supported.

    Returns
    -------
    A dataset containing the following variables:

    - :data:`sgkit.variables.variant_id_spec` (variants)
    - :data:`sgkit.variables.variant_contig_spec` (variants)
    - :data:`sgkit.variables.variant_position_spec` (variants)
    - :data:`sgkit.variables.variant_allele_spec` (variants)
    - :data:`sgkit.variables.sample_id_spec` (samples)
    - :data:`sgkit.variables.call_dosage_spec` (variants, samples)
    - :data:`sgkit.variables.call_dosage_mask_spec` (variants, samples)
    - :data:`sgkit.variables.call_genotype_probability_spec` (variants, samples, genotypes)
    - :data:`sgkit.variables.call_genotype_probability_mask_spec` (variants, samples, genotypes)

    """
    if isinstance(chunks, tuple) and len(chunks) != 3:
        raise ValueError(f"`chunks` must be tuple with 3 items, not {chunks}")
    if not np.issubdtype(gp_dtype, np.floating):
        raise ValueError(
            f"`gp_dtype` must be a floating point data type, not {gp_dtype}"
        )
    if not np.issubdtype(contig_dtype, np.integer) and np.dtype(
        contig_dtype
    ).kind not in {"U", "S"}:
        raise ValueError(
            f"`contig_dtype` must be of string or int type, not {contig_dtype}"
        )

    path = Path(path)
    sample_path = Path(sample_path) if sample_path else path.with_suffix(".sample")

    if sample_path.exists():
        sample_id = read_samples(sample_path).sample_id.values.astype("U")
    else:
        sample_id = _default_sample_ids(path)

    bgen_reader = BgenReader(path, metafile_path=metafile_path, dtype=gp_dtype)

    df = read_metafile(bgen_reader.metafile_path)
    if persist:
        df = df.persist()
    arrs = dataframe_to_dict(df, METAFILE_DTYPE)

    variant_id = arrs["id"]
    variant_contig = arrs["chrom"].astype(contig_dtype)
    variant_contig, variant_contig_names = encode_contigs(variant_contig)
    variant_contig_names = list(variant_contig_names)
    variant_position = arrs["pos"]
    variant_allele = da.hstack((arrs["a1"][:, np.newaxis], arrs["a2"][:, np.newaxis]))

    call_genotype_probability = da.from_array(
        bgen_reader,
        chunks=chunks,
        lock=lock,
        fancy=False,
        asarray=False,
        name=f"{bgen_reader.name}:read_bgen:{path}",
    )
    call_dosage = _to_dosage(call_genotype_probability)

    ds: Dataset = create_genotype_dosage_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_allele=variant_allele,
        sample_id=sample_id,
        call_dosage=call_dosage,
        call_genotype_probability=call_genotype_probability,
        variant_id=variant_id,
    )

    return ds
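
A hedged usage sketch of read_bgen; the file path below is hypothetical and must point at an existing .bgen file (the companion .metafile is generated on first read):

ds = read_bgen("study.bgen")                    # hypothetical path
print(ds["call_genotype_probability"].dims)     # ('variants', 'samples', 'genotypes')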
    xinfo['xtermcols'] = json.load(jf)

h5read = tables.open_file('knockoff-data.h5', mode='r')
h5regression = tables.open_file('regression-data.h5', mode='r')
X = da.from_array(h5read.root.X)
pdim = X.shape[1]
Xtilde = da.from_array(h5read.root.Xtilde)
Y = da.from_array(h5regression.root.Y)
keepcols_svd = list(h5read.root.keepcols)
xcolnames_pdim = []
for k in xinfo['xcolnames']:
    if xinfo['xcolnames'][k] in keepcols_svd:
        xcolnames_pdim.append(k)

with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler() as cprof:
    Xaug = da.hstack([X, Xtilde])
    betahat_aug = da.linalg.solve(da.matmul(Xaug.T, Xaug),
                                  da.matmul(Xaug.T, Y)).compute()
    Wstat = [
        abs(betahat_aug[i]) - abs(betahat_aug[i + pdim]) for i in range(pdim)
    ]
    threshold = knockoff_threshold(Wstat, FDR, offset=1)
    sel = [Wstat[j] >= threshold for j in range(pdim)]
    Xdrop = X[:, sel]
    betahat_final = da.linalg.solve(da.matmul(Xdrop.T, Xdrop),
                                    da.matmul(Xdrop.T, Y)).compute()
    colnames_final = [i for i, j in zip(xcolnames_pdim, sel) if j]
    colnames_dropped = [i for i, j in zip(xcolnames_pdim, sel) if not j]
    print("desired FDR: ")
    print(FDR)
    print("\nKnockoff drops these columns:\n")
Example #26
def make_blobs(
    n_samples=100,
    n_features=2,
    centers=None,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=None,
    chunks=None,
):
    """
    Generate isotropic Gaussian blobs for clustering.

    This can be used to generate very large Dask arrays on a cluster of
    machines. When using Dask in distributed mode, the client machine
    only needs to allocate a single block's worth of data.

    Parameters
    ----------
    n_samples : int or array-like, optional (default=100)
        If int, it is the total number of points equally divided among
        clusters.
        If array-like, each element of the sequence indicates
        the number of samples per cluster.

    n_features : int, optional (default=2)
        The number of features for each sample.

    centers : int or array of shape [n_centers, n_features], optional
        (default=None)
        The number of centers to generate, or the fixed center locations.
        If n_samples is an int and centers is None, 3 centers are generated.
        If n_samples is array-like, centers must be
        either None or an array of length equal to the length of n_samples.

    cluster_std : float or sequence of floats, optional (default=1.0)
        The standard deviation of the clusters.

    center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
        The bounding box for each cluster center when centers are
        generated at random.

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    chunks : int, tuple
        How to chunk the array. Must be one of the following forms:
        -   A blocksize like 1000.
        -   A blockshape like (1000, 1000).
        -   Explicit sizes of all blocks along all dimensions like
            ((1000, 1000, 500), (400, 400)).

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels for cluster membership of each sample.

    Examples
    --------
    >>> from dask_ml.datasets import make_blobs
    >>> X, y = make_blobs(n_samples=100000, chunks=10000)
    >>> X
    dask.array<..., shape=(100000, 2), dtype=float64, chunksize=(10000, 2)>
    >>> y
    dask.array<concatenate, shape=(100000,), dtype=int64, chunksize=(10000,)>

    See Also
    --------
    make_classification: a more intricate variant
    """
    chunks = da.core.normalize_chunks(chunks, (n_samples, n_features))
    _check_axis_partitioning(chunks, n_features)

    if centers is None:
        # TODO: non-int n_samples?
        centers = 3
    if isinstance(centers, numbers.Integral):
        # Make a prototype
        n_centers = centers
        X, y = sklearn.datasets.make_blobs(
            n_samples=chunks[0][0],
            n_features=n_features,
            centers=centers,
            shuffle=shuffle,
            cluster_std=cluster_std,
            center_box=center_box,
            random_state=random_state,
        )
        centers = []
        centers = np.zeros((n_centers, n_features))

        for i in range(n_centers):
            centers[i] = X[y == i].mean(0)

    objs = [
        dask.delayed(sklearn.datasets.make_blobs, nout=2)(
            n_samples=n_samples_per_block,
            n_features=n_features,
            centers=centers,
            cluster_std=cluster_std,
            shuffle=shuffle,
            center_box=center_box,
            random_state=i,
        )
        for i, n_samples_per_block in enumerate(chunks[0])
    ]
    Xobjs, yobjs = zip(*objs)

    Xarrs = [
        da.from_delayed(arr, shape=(n, n_features), dtype="f8")
        for arr, n in zip(Xobjs, chunks[0])
    ]
    X_big = da.vstack(Xarrs)

    yarrs = [
        da.from_delayed(arr, shape=(n,), dtype=np.dtype("int"))
        for arr, n in zip(yobjs, chunks[0])
    ]
    y_big = da.hstack(yarrs)
    return X_big, y_big
Example #27
def read_plink(
    *,
    path: Optional[PathType] = None,
    bed_path: Optional[PathType] = None,
    bim_path: Optional[PathType] = None,
    fam_path: Optional[PathType] = None,
    chunks: Union[str, int, tuple] = "auto",  # type: ignore[type-arg]
    fam_sep: str = " ",
    bim_sep: str = "\t",
    bim_int_contig: bool = False,
    count_a1: bool = True,
    lock: bool = False,
    persist: bool = True,
) -> Dataset:
    """Read PLINK dataset.

    Loads a single PLINK dataset as dask arrays within a Dataset
    from bed, bim, and fam files.

    Parameters
    ----------
    path : Optional[PathType]
        Path to PLINK file set.
        This should not include a suffix, i.e. if the files are
        at `data.{bed,fam,bim}` then only 'data' should be
        provided (suffixes are added internally).
        Either this path must be provided or all 3 of
        `bed_path`, `bim_path` and `fam_path`.
    bed_path: Optional[PathType]
        Path to PLINK bed file.
        This should be a full path including the `.bed` extension
        and cannot be specified in conjunction with `path`.
    bim_path: Optional[PathType]
        Path to PLINK bim file.
        This should be a full path including the `.bim` extension
        and cannot be specified in conjunction with `path`.
    fam_path: Optional[PathType]
        Path to PLINK fam file.
        This should be a full path including the `.fam` extension
        and cannot be specified in conjunction with `path`.
    chunks : Union[str, int, tuple], optional
        Chunk size for genotype (i.e. `.bed`) data, by default "auto"
    fam_sep : str, optional
        Delimiter for `.fam` file, by default " "
    bim_sep : str, optional
        Delimiter for `.bim` file, by default "\t"
    bim_int_contig : bool, optional
        Whether or not the contig/chromosome name in the `.bim`
        file should be interpreted as an integer, by default False.
        If False, then the `variant/contig` field in the resulting
        dataset will contain the indexes of corresponding strings
        encountered in the first `.bim` field.
    count_a1 : bool, optional
        Whether or not allele counts should be for A1 or A2,
        by default True. Typically A1 is the minor allele
        and should be counted instead of A2. This is not enforced
        by PLINK though and it is up to the data generating process
        to ensure that A1 is in fact an alternate/minor/effect
        allele. See https://www.cog-genomics.org/plink/1.9/formats
        for more details.
    lock : bool, optional
        Whether or not to synchronize concurrent reads of `.bed`
        file blocks, by default False. This is passed through to
        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
    persist : bool, optional
        Whether or not to persist `.fam` and `.bim` information in
        memory, by default True. This is an important performance
        consideration as the plain text files for this data will
        be read multiple times when False. This can lead to load
        times that are upwards of 10x slower.

    Returns
    -------
    Dataset
        A dataset containing genotypes as 3 dimensional calls along with
        all accompanying pedigree and variant information. The content
        of this dataset matches that of `sgkit.create_genotype_call_dataset`
        with all pedigree-specific fields defined as:
            - sample_family_id: Family identifier commonly referred to as FID
            - sample_id: Within-family identifier for sample
            - sample_paternal_id: Within-family identifier for father of sample
            - sample_maternal_id: Within-family identifier for mother of sample
            - sample_sex: Sex code equal to 1 for male, 2 for female, and -1
                for missing
            - sample_phenotype: Phenotype code equal to 1 for control, 2 for case,
                and -1 for missing

        See https://www.cog-genomics.org/plink/1.9/formats#fam for more details.

    Raises
    ------
    ValueError
        If `path` and one of `bed_path`, `bim_path` or `fam_path` are provided.
    """
    if path and (bed_path or bim_path or fam_path):
        raise ValueError(
            "Either `path` or all 3 of `{bed,bim,fam}_path` must be specified but not both"
        )
    if path:
        bed_path, bim_path, fam_path = [
            f"{path}.{ext}" for ext in ["bed", "bim", "fam"]
        ]

    # Load axis data first to determine dimension sizes
    df_fam = read_fam(fam_path, sep=fam_sep)  # type: ignore[arg-type]
    df_bim = read_bim(bim_path, sep=bim_sep)  # type: ignore[arg-type]

    if persist:
        df_fam = df_fam.persist()
        df_bim = df_bim.persist()

    arr_fam = _to_dict(df_fam, dtype=FAM_ARRAY_DTYPE)
    arr_bim = _to_dict(df_bim, dtype=BIM_ARRAY_DTYPE)

    # Load genotyping data
    call_genotype = da.from_array(
        # Make sure to use asarray=False in order for masked arrays to propagate
        BedReader(bed_path, (len(df_bim), len(df_fam)),
                  count_A1=count_a1),  # type: ignore[arg-type]
        chunks=chunks,
        # Lock must be true with multiprocessing dask scheduler
        # to not get bed-reader errors (it works w/ threading backend though)
        lock=lock,
        asarray=False,
        name=f"bed_reader:read_plink:{bed_path}",
    )

    # If contigs are already integers, use them as-is
    if bim_int_contig:
        variant_contig = arr_bim["contig"].astype("int16")
        variant_contig_names = da.unique(variant_contig).astype(str)
        variant_contig_names = list(variant_contig_names.compute())
    # Otherwise create index for contig names based
    # on order of appearance in underlying .bim file
    else:
        variant_contig, variant_contig_names = encode_array(
            arr_bim["contig"].compute())
        variant_contig = variant_contig.astype("int16")
        variant_contig_names = list(variant_contig_names)

    variant_position = arr_bim["pos"]
    a1 = arr_bim["a1"].astype("str")
    a2 = arr_bim["a2"].astype("str")

    # Note: column_stack not implemented in Dask, must use [v|h]stack
    variant_alleles = da.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
    variant_alleles = variant_alleles.astype("S")
    variant_id = arr_bim["variant_id"]

    sample_id = arr_fam["member_id"]

    ds = create_genotype_call_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_alleles=variant_alleles,
        sample_id=sample_id,
        call_genotype=call_genotype,
        variant_id=variant_id,
    )

    # Assign PLINK-specific pedigree fields
    ds = ds.assign(
        **{
            f"sample_{f}": (DIM_SAMPLE, arr_fam[f])
            for f in arr_fam if f != "member_id"
        })
    return ds  # type: ignore[no-any-return]
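
A hedged usage sketch of read_plink; the path prefix below is hypothetical and data.bed, data.bim and data.fam must exist:

ds = read_plink(path="data")           # hypothetical prefix for data.{bed,bim,fam}
print(ds["call_genotype"].dims)        # ('variants', 'samples', 'ploidy')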
    def run_analysis_wrinkling(self,
                               filter_width,
                               filter_type,
                               c_analytical=False,
                               Parallel=False,
                               every_nth=1):
        '''
        :param filter_width: DNS points to filter
        :param filter_type: use 'TOPHAT' rather than 'GAUSSIAN'
        :param c_analytical: compute c minus analytically
        :param Parallel: use False
        :param every_nth: every nth DNS point to compute the isoArea
        :return:
        '''
        # run the analysis and compute the wrinkling factor -> real 3D cases
        # interval is like nth point, skips some nodes
        self.filter_type = filter_type

        # joblib parallel computing of c_iso
        self.Parallel = Parallel

        self.every_nth = int(every_nth)

        print('You are using %s filter!' % self.filter_type)

        self.filter_width = int(filter_width)

        self.c_analytical = c_analytical
        if self.c_analytical is True:
            print('You are using Hypergeometric function for c_minus (Eq.35)!')

        # filter the c and rho field
        print('Filtering c field ...')
        self.rho_filtered = self.apply_filter(self.rho_data_np)
        self.c_filtered = self.apply_filter(self.c_data_np)

        # # reduce c for computation of conditioned wrinkling factor
        # self.reduce_c(c_min=0.75,c_max=0.85)
        # self.c_filtered_reduced = self.apply_filter(self.c_data_reduced_np)

        # Compute the scaled Delta (Pfitzner PDF)
        self.Delta_LES = self.delta_x * self.filter_width * self.Sc * self.Re * np.sqrt(
            self.p / self.p_0)
        print('Delta_LES is: %.3f' % self.Delta_LES)
        flame_thickness = self.compute_flamethickness()
        print('Flame thickness: ', flame_thickness)

        #maximum possible wrinkling factor
        print('Maximum possible wrinkling factor: ',
              self.Delta_LES / flame_thickness)

        # Set the Gauss kernel
        self.set_gaussian_kernel()

        # compute the wrinkling factor
        self.get_wrinkling()
        self.compute_Pfitzner_model()

        #c_bins = self.compute_c_binning(c_low=0.8,c_high=0.9)

        # start = time.time()
        # if self.Parallel is True:
        #     isoArea_coefficient = self.compute_isoArea_parallel(c_iso=0.85)
        # else:
        #     isoArea_coefficient = self.compute_isoArea(c_iso=0.85)
        #
        # end=time.time()
        print('No c_iso was computed')

        # write the filtered omega and omega_model * isoArea to file
        print(
            'writing omega DNS filtered and omega_model x isoArea to file ...')
        filename = join(
            self.case, 'filtered_data', 'omega_filtered_modeled_' +
            str(self.filter_width) + '_nth' + str(self.every_nth) + '.csv')

        # om_iso = self.omega_model_cbar*isoArea_coefficient
        om_wrinkl = self.omega_model_cbar * self.wrinkling_factor

        # pd.DataFrame(data=np.hstack([self.omega_DNS.reshape(self.Nx**3,1),
        #                    self.omega_DNS_filtered.reshape(self.Nx**3,1),
        #                    om_iso.reshape(self.Nx**3,1),
        #                    om_wrinkl.reshape(self.Nx**3,1),
        #                    self.c_filtered.reshape(self.Nx ** 3, 1)]),
        #                    columns=['omega_DNS',
        #                             'omega_filtered',
        #                             'omega_model_by_isoArea',
        #                             'omega_model_by_wrinkling',
        #                             'c_bar']).to_csv(filename)

        # create dask array and reshape all data
        dataArray_da = da.hstack([
            self.c_filtered.reshape(self.Nx**3, 1),
            self.wrinkling_factor.reshape(self.Nx**3, 1),
            # isoArea_coefficient.reshape(self.Nx**3,1),
            # self.wrinkling_factor_LES.reshape(self.Nx ** 3, 1),
            # self.wrinkling_factor_reduced.reshape(self.Nx ** 3, 1),
            # self.wrinkling_factor_LES_reduced.reshape(self.Nx ** 3, 1),
            self.omega_model_cbar.reshape(self.Nx**3, 1),
            self.omega_DNS_filtered.reshape(self.Nx**3, 1),
            #self.omega_LES_noModel.reshape(self.Nx**3,1),
            self.c_plus.reshape(self.Nx**3, 1),
            self.c_minus.reshape(self.Nx**3, 1)
        ])

        if self.c_analytical is True:
            # write data to csv file
            filename = join(
                self.case, 'filter_width_' + self.filter_type + '_' +
                str(self.filter_width) + '_analytical.csv')
        else:
            # write data to csv file
            filename = join(
                self.case, 'filter_width_' + self.filter_type + '_' +
                str(self.filter_width) + '.csv')

        self.dataArray_dd = dd.io.from_dask_array(
            dataArray_da,
            columns=[
                'c_bar',
                'wrinkling',
                # 'isoArea',
                # 'wrinkling_LES',
                # 'wrinkling_reduced',
                # 'wrinkling_LES_reduced',
                'omega_model',
                'omega_DNS_filtered',
                # 'omega_cbar',
                'c_plus',
                'c_minus'
            ])

        # filter the data set and remove unnecessary entries
        self.dataArray_dd = self.dataArray_dd[
            self.dataArray_dd['c_bar'] > 0.01]
        self.dataArray_dd = self.dataArray_dd[
            self.dataArray_dd['c_bar'] < 0.99]

        if self.case == 'planar_flame_test':
            self.dataArray_dd = self.dataArray_dd[
                self.dataArray_dd['wrinkling'] < 1.1]

        self.dataArray_dd = self.dataArray_dd[
            self.dataArray_dd['wrinkling'] > 0.99]
        #self.dataArray_dd = self.dataArray_dd[self.dataArray_dd['isoArea'] >= 1.0]

        # this is to reduce the storage size
        #self.dataArray_dd = self.dataArray_dd.sample(frac=0.3)

        print('Computing data array ...')
        self.dataArray_df = self.dataArray_dd.compute()

        print('Writing output to csv ...')
        self.dataArray_df.to_csv(filename, index=False)
        print('Data has been written.\n\n')
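
A minimal sketch, with assumed toy data, of the column-stacking pattern used above: reshaped column vectors are hstacked into one dask array and then wrapped as a dask dataframe with named columns.

import numpy as np
import dask.array as da
import dask.dataframe as dd

n = 8
cols = [np.random.rand(n).reshape(n, 1) for _ in range(3)]
arr = da.hstack(cols)                  # shape (n, 3)
df = dd.from_dask_array(arr, columns=['c_bar', 'wrinkling', 'omega_model'])
print(df.compute())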
Example #29
def plsa_em_step_dask(
    block_rows_ndarray,
    block_cols_ndarray,
    block_vals_ndarray,
    p_w_given_z,
    p_z_given_d,
    block_row_size,
    block_col_size,
    e_step_thresh=1e-32,
):
    n_d_blocks = block_rows_ndarray.shape[0]
    n_w_blocks = block_rows_ndarray.shape[1]

    n = p_z_given_d.shape[0]
    m = p_w_given_z.shape[1]
    k = p_z_given_d.shape[1]

    result_p_w_given_z = [[] for i in range(n_w_blocks)]
    result_p_z_given_d = [[] for i in range(n_d_blocks)]
    result_norm_pwz = []
    result_norm_pdz = [[] for i in range(n_d_blocks)]

    for i in range(n_d_blocks):

        row_start = block_row_size * i
        row_end = min(row_start + block_row_size, n)

        for j in range(n_w_blocks):
            col_start = block_col_size * j
            col_end = min(col_start + block_col_size, m)

            row_block = block_rows_ndarray[i, j]
            col_block = block_cols_ndarray[i, j]
            val_block = block_vals_ndarray[i, j]

            kernel_results = plsa_em_step_block_kernel(
                row_block,
                col_block,
                val_block,
                p_w_given_z[:, col_start:col_end],
                p_z_given_d[row_start:row_end, :],
                e_step_thresh=e_step_thresh,
            )

            result_p_w_given_z[j].append(
                da.from_delayed(kernel_results[0], (k, block_col_size),
                                dtype=np.float32))
            result_p_z_given_d[i].append(
                da.from_delayed(kernel_results[1], (block_row_size, k),
                                dtype=np.float32))
            result_norm_pwz.append(
                da.from_delayed(kernel_results[2], (k, ), dtype=np.float32))

            result_norm_pdz[i].append(
                da.from_delayed(kernel_results[3], (block_row_size, ),
                                dtype=np.float32))

    p_w_given_z_blocks = [
        da.dstack(result_p_w_given_z[i]).sum(axis=-1)
        for i in range(n_w_blocks)
    ]
    p_z_given_d_blocks = [
        da.dstack(result_p_z_given_d[i]).sum(axis=-1)
        for i in range(n_d_blocks)
    ]
    norm_pdz_blocks = [
        da.dstack(result_norm_pdz[i]).sum(axis=-1) for i in range(n_d_blocks)
    ]

    p_w_given_z = (da.hstack(p_w_given_z_blocks) /
                   da.dstack(result_norm_pwz).sum(axis=-1).T)
    p_z_given_d = da.vstack(p_z_given_d_blocks) / da.hstack(norm_pdz_blocks).T

    result = compute(p_w_given_z, p_z_given_d)

    return result
Example #30
def hstack(self, *others, **kwargs):
    others = tuple(ensure_dask_array(d) for d in others)
    tup = (self, ) + others
    out = da.hstack(tup)
    return view_subclass(out, type(self))
def _expand_tiepoint_array_1km(self, arr, lines, cols):
    arr = da.repeat(arr, lines, axis=1)
    arr = da.concatenate((arr[:, :lines//2, :], arr, arr[:, -(lines//2):, :]), axis=1)
    arr = da.repeat(arr.reshape((-1, self.cscan_full_width - 1)), cols, axis=1)
    return da.hstack((arr, arr[:, -cols:]))
def _expand_tiepoint_array_5km(self, arr, lines, cols):
    arr = da.repeat(arr, lines * 2, axis=1)
    arr = da.repeat(arr.reshape((-1, self.cscan_full_width - 1)),
                    cols,
                    axis=1)
    return da.hstack((arr[:, :2], arr, arr[:, -2:]))
    famfile = "/shared/ukbiobank_filtered/filtered_200k.2.fam"

    G = read_plink1_bin(bedfile, fam=famfile, verbose=False)

    n = G.shape[0]
    p_pheno = 11
    p = G.shape[1] + 6

    start_ind = (p // size) * rank
    end_ind   = (p // size) * (rank + 1) 
    pheno = genfromtxt("/shared/ukbiobank_filtered/ukb_short.filtered.200k.tab", skip_header=1)

    if rank != size - 1:
        X_chunk = G[:, start_ind:end_ind].data.compute()
    else:
        X_chunk = da.hstack([G[:,start_ind:].data, da.zeros((n, 6))]).compute()
        X_chunk[:, -11:] = pheno[:, 1:p_pheno + 1]

    from utils import impute_na
    X_chunk = impute_na(X_chunk)

    # normalize
    if args.normalize:
        X_chunk -= X_chunk.mean(0)
        X_chunk /= X_chunk.std(0)

    X_chunk = torch.tensor(X_chunk)
    print(X_chunk.shape)
    X = THDistMat.from_chunks(X_chunk, force_bycol=True)
    
    time = torch.tensor(pheno[:, 12]).view(-1, 1).type(TType)
fPitch = lambda Gxyz: -da.arctan2(Gxyz[0, :], da.sqrt(da.sum(da.square(Gxyz[1:, :]), 0)))
fRoll = lambda Gxyz: da.arctan2(Gxyz[1, :], Gxyz[2, :])
# alternative: da.arctan2(Gxyz[1,:], da.sqrt(da.sum(da.square(Gxyz[(0,2),:]), 0)))
#            = da.arctan2(Gxyz[1,:], da.sqrt(da.square(Gxyz[0,:]) + da.square(Gxyz[2,:])))
fInclination = lambda Gxyz: da.arctan2(da.sqrt(da.sum(da.square(Gxyz[:-1, :]), 0)), Gxyz[2, :])
fHeading = lambda H, p, r: da.arctan2(H[2, :] * da.sin(r) - H[1, :] * da.cos(r),
                                      H[0, :] * da.cos(p) + (H[1, :] * da.sin(r) + H[2, :] * da.cos(r)) * da.sin(p))

fG = lambda Axyz, Ag, Cg: da.dot(Ag.T, (Axyz - Cg[0, :]).T)
fGi = lambda Ax, Ay, Az, Ag, Cg, i: da.dot(Ag.T, (da.column_stack((Ax, Ay, Az))[slice(*i)] - Cg[0, :]).T)

fbinningClip = lambda x, bin2_iStEn, bin1_nAverage: da.mean(da.reshape(x[slice(*bin2_iStEn)], (-1, bin1_nAverage)), 1)
fbinning = lambda x, bin1_nAverage: da.mean(da.reshape(x, (-1, bin1_nAverage)), 1)
repeat3shift1 = lambda A2: [A2[t:(len(A2) - 2 + t)] for t in range(3)]
median3cols = lambda a, b, c: da.where(a < b, da.where(c < a, a, da.where(b < c, b, c)),
                                       da.where(a < c, a, da.where(c < b, b, c)))
median3 = lambda x: da.hstack((np.NaN, median3cols(*repeat3shift1(x)), np.NaN))
# not convertable to dask easily:
fVabs_old = lambda Gxyz, kVabs: np.polyval(kVabs.flat, np.sqrt(np.tan(fInclination(Gxyz))))
rep2mean = lambda x, bOk: np.interp(np.arange(len(x)), np.flatnonzero(bOk), x[bOk], np.NaN, np.NaN)
fForce2Vabs_fitted = lambda x: da.where(x > 2, 2, da.where(x < 1, 0.25 * x, 0.25 * x + 0.3 * (x - 1) ** 4))
fIncl2Force = lambda incl: da.sqrt(da.tan(incl))
fVabs = lambda Gxyz, kVabs: fForce2Vabs_fitted(fIncl2Force(fInclination(Gxyz)))
f = lambda fun, *args: fun(*args)
positiveInd = lambda i, L: np.int32(da.where(i < 0, L - i, i))
minInterval = lambda iLims1, iLims2, L: f(
    lambda iL1, iL2: da.transpose([max(iL1[:, 0], iL2[:, 0]), min(iL1[:, -1], iL2[:, -1])]), positiveInd(iLims1, L),
    positiveInd(iLims2, L))
fStEn2bool = lambda iStEn, length: da.hstack(
    [(da.ones(iEn2iSt, dtype=np.bool8) if b else da.zeros(iEn2iSt, dtype=np.bool8)) for iEn2iSt, b in da.vstack((
        da.diff(
            da.hstack(
Example #35
def hstack(self, *others, **kwargs):
    others = tuple(ensure_dask_array(d) for d in others)
    tup = (self,) + others
    out = da.hstack(tup)
    return view_subclass(out, type(self))