Example #1
File: ds.py Project: elaeon/ML
    # Assumes module-level imports: numpy as np, dask, dask.array as da,
    # and tabulate from the tabulate package.
    def stadistics(self):
        headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%", "max", "nonzero", "nonan", "unique", "dtype"]
        self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
        table = []
        for group, (dtype, _) in self.dtypes.fields.items():
            values = dict()
            values["dtype"] = dtype
            values["group"] = group
            darray = self.data[group].da
            if dtype == np.dtype(float) or dtype == np.dtype(int):
                da_mean = da.around(darray.mean(), decimals=3)
                da_std = da.around(darray.std(), decimals=3)
                da_min = da.around(darray.min(), decimals=3)
                da_max = da.around(darray.max(), decimals=3)
                result = dask.compute([da_mean, da_std, da_min, da_max])[0]
                values["mean"] = result[0] if not np.isnan(result[0]) else da.around(da.nanmean(darray), decimals=3).compute()
                values["std dev"] = result[1] if not np.isnan(result[0]) else da.around(da.nanstd(darray), decimals=3).compute()
                values["min"] = result[2] if not np.isnan(result[0]) else da.around(da.nanmin(darray), decimals=3).compute()
                values["max"] = result[3] if not np.isnan(result[0]) else da.around(da.nanmax(darray), decimals=3).compute()
                if len(self.shape[group]) == 1:
                    da_percentile = da.around(da.percentile(darray, [25, 50, 75]), decimals=3)
                    result = da_percentile.compute()
                    values["25%"] = result[0]
                    values["50%"] = result[1]
                    values["75%"] = result[2]
                else:
                    values["25%"] = "-"
                    values["50%"] = "-"
                    values["75%"] = "-"
                values["nonzero"] = da.count_nonzero(darray).compute()
                values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
                values["unique"] = "-"
            else:
                values["mean"] = "-"
                values["std dev"] = "-"
                values["min"] = "-"
                values["max"] = "-"
                values["25%"] = "-"
                values["50%"] = "-"
                values["75%"] = "-"
                values["nonzero"] = "-"
                values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
                vunique = darray.to_dask_dataframe().fillna('').nunique().compute()
                values["unique"] = vunique

            row = []
            for column in headers:
                row.append(values[column])
            table.append(row)

        print("# rows {}".format(self.shape[0]))
        return tabulate(table, headers)
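
The method above batches its four reductions into one dask.compute call so the data is scanned once, then retries with the nan-aware variants only when needed. A minimal standalone sketch of that pattern (the sample array is illustrative):

import dask
import dask.array as da
import numpy as np

# Illustrative column with a NaN to exercise the fallback path
darray = da.from_array(np.array([1.0, 2.0, np.nan, 4.0]), chunks=2)

# Batch the four lazy reductions into a single compute() pass
mean, std, mn, mx = dask.compute(darray.mean(), darray.std(), darray.min(), darray.max())

# Plain reductions propagate NaN, so switch to the nan-aware variants
if np.isnan(mean):
    mean, std, mn, mx = dask.compute(da.nanmean(darray), da.nanstd(darray),
                                     da.nanmin(darray), da.nanmax(darray))
print([round(float(v), 3) for v in (mean, std, mn, mx)])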
Example #2
def test_count_nonzero_str():
    x = np.array(list("Hello world"))
    d = da.from_array(x, chunks=(4, ))

    x_c = np.count_nonzero(x)
    d_c = da.count_nonzero(d)

    assert x_c == d_c.compute()
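
For string arrays only the empty string counts as zero, so every character of "Hello world" (the space included) is nonzero. A quick sketch of that behavior:

import numpy as np
import dask.array as da

x = np.array(["a", "", " ", "b"])  # only the empty string is falsy
d = da.from_array(x, chunks=2)
assert np.count_nonzero(x) == 3
assert da.count_nonzero(d).compute() == 3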
Example #3
def test_count_nonzero_str():
    x = np.array(list("Hello world"))
    d = da.from_array(x, chunks=(4,))

    x_c = np.count_nonzero(x)
    d_c = da.count_nonzero(d)

    assert x_c == d_c.compute()
Example #4
def test_count_nonzero_obj():
    x = np.random.randint(10, size=(15, 16)).astype(object)
    d = da.from_array(x, chunks=(4, 5))

    x_c = np.count_nonzero(x)
    d_c = da.count_nonzero(d)

    if d_c.shape == tuple():
        assert x_c == d_c.compute()
    else:
        assert_eq(x_c, d_c)
Example #5
def test_count_nonzero_obj():
    x = np.random.randint(10, size=(15, 16)).astype(object)
    d = da.from_array(x, chunks=(4, 5))

    x_c = np.count_nonzero(x)
    d_c = da.count_nonzero(d)

    if d_c.shape == tuple():
        assert x_c == d_c.compute()
    else:
        assert_eq(x_c, d_c)
Example #6
def count_value(pixel_value):
    global total_count
    if total_count >= max_pixels:
        return 0
    # Use the lazy dask reduction when running in parallel, numpy otherwise
    if parallel:
        count = da.count_nonzero(dataset == pixel_value).compute()
    else:
        count = np.count_nonzero(dataset == pixel_value)
    total_count += count
    progress.setValue(int(total_count * 100 / max_pixels))
    return count
Example #7
def test_count_nonzero_axis(axis):
    for shape, chunks in [((0, 0), (0, 0)), ((15, 16), (4, 5))]:
        x = np.random.randint(10, size=shape)
        d = da.from_array(x, chunks=chunks)

        x_c = np.count_nonzero(x, axis)
        d_c = da.count_nonzero(d, axis)

        if d_c.shape == tuple():
            assert x_c == d_c.compute()
        else:
            assert_eq(x_c, d_c)
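
With an axis argument the result is itself an array, which is why the test falls back to assert_eq (from dask.array.utils) in the non-scalar case. A small sketch:

import numpy as np
import dask.array as da
from dask.array.utils import assert_eq

x = np.array([[0, 1, 2], [3, 0, 0]])
d = da.from_array(x, chunks=(1, 2))

# axis=0 counts nonzeros down each column and yields a lazy 1-d array
assert_eq(np.count_nonzero(x, axis=0), da.count_nonzero(d, axis=0))
print(da.count_nonzero(d, axis=0).compute())  # [1 1 1]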
Example #8
def test_count_nonzero_axis(axis):
    for shape, chunks in [((0, 0), (0, 0)), ((15, 16), (4, 5))]:
        x = np.random.randint(10, size=shape)
        d = da.from_array(x, chunks=chunks)

        x_c = np.count_nonzero(x, axis)
        d_c = da.count_nonzero(d, axis)

        if d_c.shape == tuple():
            assert x_c == d_c.compute()
        else:
            assert_eq(x_c, d_c)
Example #9
def coverage_total(division_dict, cost_coverage=False):

    conus_path = DP.join("rasters", "albers", "acre", "masks", "conus.tif")
    code_path = DP.join("rasters", "albers", "acre", "cost_codes.tif")
    cost_path = DP.join("rasters/albers/acre/rent_map.tif")
    chunks = {"band": 1, "x": 5000, "y": 5000}

    # Read in the tifs
    codes = xr.open_rasterio(code_path, chunks=chunks)[0].data
    conus = xr.open_rasterio(conus_path, chunks=chunks)[0].data
    costs = xr.open_rasterio(cost_path, chunks=chunks)[0].data
    divisions = xr.open_rasterio(DIVISIONS_PATH, chunks=chunks)[0].data

    # Set nans to zero (count_nonzero counts nans)
    codes[da.isnan(codes)] = 0

    coverages = {}
    with Client():
        for key, item in tqdm(division_dict.items(), position=0):

            div = conus[divisions == key]
            total = da.count_nonzero(div)

            # If calculating costs
            if cost_coverage:
                coverage = codes[((costs > 0) | (codes == 9999))
                                 & (divisions == key)]
                coded = da.count_nonzero(coverage)
            else:
                coverage = codes[divisions == key]
                coded = da.count_nonzero(coverage)
            ratio = coded / total
            coverages[item] = ratio.compute()

    df = pd.DataFrame(coverages, index=[0]).T
    df.columns = ["total_coverage"]

    return df
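
The codes[da.isnan(codes)] = 0 step above exists because NaN is truthy to count_nonzero. A minimal demonstration, assuming a dask version that supports boolean-mask assignment:

import numpy as np
import dask.array as da

a = da.from_array(np.array([0.0, 1.0, np.nan]), chunks=2)
print(da.count_nonzero(a).compute())  # 2 -- the NaN is counted as nonzero
a[da.isnan(a)] = 0                    # zero the NaNs first
print(da.count_nonzero(a).compute())  # 1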
Example #10
def get_counts(cost_coverage=False):
    """Get cell counts for each category."""

    code_dict = get_codes(cost_coverage)

    # Read in code and conus rasters
    chunks = {"band": 1, "x": 5000, "y": 5000}
    code_path = DP.join("rasters/albers/acre/cost_codes.tif")
    cost_path = DP.join("rasters/albers/acre/rent_map.tif")
    conus_path = DP.join("rasters/albers/acre/masks/conus.tif")
    codes = xr.open_rasterio(code_path, chunks=chunks)[0].data
    costs = xr.open_rasterio(cost_path, chunks=chunks)[0].data
    conus = xr.open_rasterio(conus_path, chunks=chunks)[0].data

    # Dask array's `count_nonzero` counts na values
    codes[da.isnan(codes)] = 0
    conus[da.isnan(conus)] = 0

    # If calculating costs
    if cost_coverage:
        coverage = codes[(costs > 0) | (codes == 9999)]  # No exclusion in cost
    else:
        coverage = codes.copy()

    # Extract code from dictionary
    blm_codes = code_dict["blm"]
    tribal_codes = code_dict["tribal"]
    state_codes = code_dict["state"]
    private_codes = code_dict["private"]

    # Arrays
    developable = conus[codes != 9999]
    dev_covered = coverage[coverage != 9999]
    excl = coverage[coverage == 9999]
    blm = coverage[da.isin(coverage, blm_codes)]
    tribal = coverage[da.isin(coverage, tribal_codes)]
    state = coverage[da.isin(coverage, state_codes)]
    private = coverage[da.isin(coverage, private_codes)]
    arrays = {"excl": excl, "blm": blm, "tribal": tribal, "state": state,
              "private": private, "covered": coverage, "total": conus, 
              "developable": developable, "dev_covered": dev_covered}

    # Collect counts
    counts = {}
    with Client():
        for key, item in tqdm(arrays.items(), position=0):
            counts["n" + key] = da.count_nonzero(item).compute()

    return counts
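
da.isin mirrors np.isin and drives the ownership masks above. A short sketch with made-up values (blm_codes here is illustrative, not the project's real code list):

import numpy as np
import dask.array as da

codes = da.from_array(np.array([1, 2, 3, 9999, 2]), chunks=2)
blm_codes = [2, 3]                      # illustrative code list
blm = codes[da.isin(codes, blm_codes)]  # values 2, 3, 2 survive the mask
print(da.count_nonzero(blm).compute())  # 3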
Example #11
def test_count_nonzero_obj_axis(axis):
    x = np.random.randint(10, size=(15, 16)).astype(object)
    d = da.from_array(x, chunks=(4, 5))

    x_c = np.count_nonzero(x, axis)
    d_c = da.count_nonzero(d, axis)

    if d_c.shape == tuple():
        assert x_c == d_c.compute()
    else:
        #######################################################
        # Workaround oddness with Windows and object arrays.  #
        #                                                     #
        # xref: https://github.com/numpy/numpy/issues/9468    #
        #######################################################
        assert_eq(x_c.astype(np.int64), d_c)
Example #12
def test_count_nonzero_obj_axis(axis):
    x = np.random.randint(10, size=(15, 16)).astype(object)
    d = da.from_array(x, chunks=(4, 5))

    x_c = np.count_nonzero(x, axis)
    d_c = da.count_nonzero(d, axis)

    if d_c.shape == tuple():
        assert x_c == d_c.compute()
    else:
        #######################################################
        # Workaround oddness with Windows and object arrays.  #
        #                                                     #
        # xref: https://github.com/numpy/numpy/issues/9468    #
        #######################################################
        assert_eq(x_c.astype(np.intp), d_c)
Example #13
def two_steps(x, y, r0, r, method):
    n = y.shape[0]
    if method == 'uni':
        pis_uni = np.repeat(1 / n, n)
        x_uni, y_uni, pi_uni = subsample(x, y, r + r0, pis_uni)
        beta_uni = newton(x_uni, y_uni, pi_uni)
        result = beta_uni

    else:
        n1 = da.count_nonzero(y).compute()
        n0 = n - n1
        pis_prop = y.compute() * (1 / (2 * n1))
        for count in range(y.shape[0]):
            if pis_prop[count] == 0:
                pis_prop[count] = 1 / (2 * n0)
        pis_prop = da.from_array(pis_prop, chunks=(500000,))
        x_prop, y_prop, pis_prop = subsample(x, y, r0, pis_prop)
        beta0 = newton(x_prop, y_prop, pis_prop)

        if method == 'mvc':
            pi_mVc = pis_mVc(x, y, beta0)
            x_mVc, y_mVc, pismVc = subsample(x, y, r, pi_mVc)
            x1 = da.concatenate([x_prop, x_mVc])
            y1 = da.concatenate([y_prop, y_mVc])
            pis1 = da.concatenate([pis_prop, pismVc])
            beta_mVc = newton(x1, y1, pis1)
            result = beta_mVc

        elif method == 'mmse':
            pi_mMSE = pis_mMSE(x, y, x_prop, pis_prop, beta0)
            # pi_mMSE = pis_mMSE(x, y, beta0)
            x_mMSE, y_mMSE, pismMSE = subsample(x, y, r, pi_mMSE)
            x1 = np.append(x_prop, x_mMSE, axis=0)
            y1 = np.append(y_prop, y_mMSE, axis=0)
            pis1 = np.append(pis_prop, pismMSE, axis=0)
            beta_mMSE = newton(x1, y1, pis1)
            result = beta_mMSE

        elif method == 'lcc':
            pi_LCC = pis_LCC(x, y, beta0)
            x_LCC, y_LCC, pisLCC = subsample(x, y, r, pi_LCC)
            beta_LCC = mle(x_LCC, y_LCC) + beta0
            result = beta_LCC
    return result
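
The per-element loop that builds pis_prop in the else branch can be expressed as one vectorized da.where. A sketch under the same assumptions (binary labels y in {0, 1} stored as a dask array):

import numpy as np
import dask.array as da

y = da.from_array(np.array([1, 0, 0, 1, 0]), chunks=2)
n = y.shape[0]
n1 = da.count_nonzero(y).compute()  # number of positive labels
n0 = n - n1

# Each class receives half the probability mass, split evenly within the class
pis_prop = da.where(y == 1, 1 / (2 * n1), 1 / (2 * n0))
print(pis_prop.compute())  # [0.25 0.16666667 0.16666667 0.25 0.16666667]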
Example #14
    def _benchmark_pca(self, gt):
        # Count alleles at each variant
        self.benchmark_profiler.start_benchmark('PCA: Count alleles')
        ac = gt.count_alleles()
        self.benchmark_profiler.end_benchmark()

        # Count number of multiallelic SNPs
        self.benchmark_profiler.start_benchmark('PCA: Count multiallelic SNPs')
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            num_multiallelic_snps = da.count_nonzero(
                ac.max_allele() > 1).compute()
        else:
            num_multiallelic_snps = np.count_nonzero(ac.max_allele() > 1)
        self.benchmark_profiler.end_benchmark()
        del num_multiallelic_snps

        # Count number of biallelic singletons
        self.benchmark_profiler.start_benchmark(
            'PCA: Count biallelic singletons')
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            num_biallelic_singletons = da.count_nonzero(
                (ac.max_allele() == 1) & ac.is_singleton(1)).compute()
        else:
            num_biallelic_singletons = np.count_nonzero((ac.max_allele() == 1)
                                                        & ac.is_singleton(1))
        self.benchmark_profiler.end_benchmark()
        del num_biallelic_singletons

        # Apply filtering to remove singletons and multiallelic SNPs
        flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
        flt_count = np.count_nonzero(flt)
        self.benchmark_profiler.start_benchmark(
            'PCA: Remove singletons and multiallelic SNPs')
        if flt_count > 0:
            if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
                gf = gt.take(np.flatnonzero(flt), axis=0)
            else:
                gf = gt.compress(condition=flt, axis=0)
        else:
            # Don't apply filtering
            print(
                '[Exec][PCA] Cannot remove singletons and multiallelic SNPs as no data would remain. Skipping...'
            )
            gf = gt
        self.benchmark_profiler.end_benchmark()
        del ac, flt, flt_count

        # Transform genotype data into 2-dim matrix
        self.benchmark_profiler.start_benchmark(
            'PCA: Transform genotype data for PCA')
        gn = gf.to_n_alt()
        self.benchmark_profiler.end_benchmark()
        del gf

        # Randomly choose subset of SNPs
        if self.bench_conf.pca_subset_size == -1:
            print('[Exec][PCA] Including all ({}) variants for PCA.'.format(
                gn.shape[0]))
            gnr = gn
        else:
            n = min(gn.shape[0], self.bench_conf.pca_subset_size)
            print(
                '[Exec][PCA] Including {} random variants for PCA.'.format(n))
            vidx = np.random.choice(gn.shape[0], n, replace=False)
            vidx.sort()
            if self.bench_conf.genotype_array_type in [
                    config.GENOTYPE_ARRAY_NORMAL, config.GENOTYPE_ARRAY_CHUNKED
            ]:
                gnr = gn.take(vidx, axis=0)
            elif self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
                gnr = gn[
                    vidx]  # Use indexing workaround since Dask Array's take() method is not working properly
            else:
                print(
                    '[Exec][PCA] Error: Unrecognized genotype array type specified.'
                )
                exit(1)
            del vidx

        if self.bench_conf.pca_ld_enabled:
            if self.bench_conf.genotype_array_type != config.GENOTYPE_ARRAY_DASK:
                # Apply LD pruning to subset of SNPs
                size = self.bench_conf.pca_ld_pruning_size
                step = self.bench_conf.pca_ld_pruning_step
                threshold = self.bench_conf.pca_ld_pruning_threshold
                n_iter = self.bench_conf.pca_ld_pruning_number_iterations

                self.benchmark_profiler.start_benchmark(
                    'PCA: Apply LD pruning')
                gnu = self._pca_ld_prune(gnr,
                                         size=size,
                                         step=step,
                                         threshold=threshold,
                                         n_iter=n_iter)
                self.benchmark_profiler.end_benchmark()
            else:
                print(
                    '[Exec][PCA] Cannot apply LD pruning because Dask genotype arrays do not support this operation.'
                )
                gnu = gnr
        else:
            print('[Exec][PCA] LD pruning disabled. Skipping this operation.')
            gnu = gnr

        # Run PCA analysis
        pca_num_components = self.bench_conf.pca_number_components
        scaler = self.bench_conf.pca_data_scaler

        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Rechunk Dask array to work with Dask's svd function (single chunk for transposed column)
            gnu_pca_conv = gnu.rechunk({0: -1, 1: gt.values.chunksize[1]})
        else:
            gnu_pca_conv = gnu

        # Run conventional PCA analysis
        self.benchmark_profiler.start_benchmark(
            'PCA: Run conventional PCA analysis (scaler: {})'.format(
                scaler if scaler is not None else 'none'))
        coords, model = allel.pca(gnu_pca_conv,
                                  n_components=pca_num_components,
                                  scaler=scaler)
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            coords.compute()
        self.benchmark_profiler.end_benchmark()
        del gnu_pca_conv, coords, model

        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Rechunk Dask array to match original genotype chunk size
            gnu_pca_rand = gnu.rechunk(
                (gt.values.chunksize[0], gt.values.chunksize[1]))
        else:
            gnu_pca_rand = gnu

        # Run randomized PCA analysis
        self.benchmark_profiler.start_benchmark(
            'PCA: Run randomized PCA analysis (scaler: {})'.format(
                scaler if scaler is not None else 'none'))
        coords, model = allel.randomized_pca(gnu_pca_rand,
                                             n_components=pca_num_components,
                                             scaler=scaler)
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            coords.compute()
        self.benchmark_profiler.end_benchmark()
        del gnu_pca_rand, coords, model
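
The repeated dask/numpy branching in _benchmark_pca reduces to one pattern: build a boolean condition, then dispatch to the matching count_nonzero. A library-agnostic sketch (is_dask stands in for the config check, and the sample array is illustrative):

import numpy as np
import dask.array as da

def count_where(cond, is_dask):
    # Count True entries, materializing lazily only for dask inputs
    if is_dask:
        return da.count_nonzero(cond).compute()
    return np.count_nonzero(cond)

max_allele = da.from_array(np.array([1, 2, 1, 3]), chunks=2)
print(count_where(max_allele > 1, is_dask=True))  # 2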
Example #15
    in_cat[5]['Mass'] = in_cat[5]['BH_Mass']
    
    # Make a combined catalog of all particles and masses
    if sim_type == 'hydro':
        part_types = ['gas','DM','stars','BHs']
        comb_cat = nbk.MultipleSpeciesCatalog(part_types,
                                              in_cat[0],in_cat[1],in_cat[4],in_cat[5])
    elif sim_type == 'baryons':
        part_types = ['gas','stars','BHs']
        comb_cat = nbk.MultipleSpeciesCatalog(part_types,
                                              in_cat[0],in_cat[4],in_cat[5])

    if do_mass_moments_only:
        # Compute moments of particle mass distribution. I don't know a one-line
        # command for taking the mean over multiple fields, so I do it manually.
        m_len = {pt: comb_cat.compute(da.count_nonzero(comb_cat[pt + '/Mass'])) for pt in part_types}
        m_sum = {pt: comb_cat.compute(comb_cat[pt + '/Mass'].sum()) for pt in part_types}
        m2_arr = {pt: comb_cat[pt + '/Mass']**2. for pt in part_types}
        m2_sum = {pt: comb_cat.compute(m2_arr[pt].sum()) for pt in part_types}
        m3_arr = {pt: comb_cat[pt + '/Mass']**3. for pt in part_types}
        m3_sum = {pt: comb_cat.compute(m3_arr[pt].sum()) for pt in part_types}
        
        # Materialize the dict views: np.sum cannot reduce them directly in Python 3
        m_len_all = np.sum(list(m_len.values()))
        m_mean = np.sum(list(m_sum.values())) / m_len_all
        m2_mean = np.sum(list(m2_sum.values())) / m_len_all
        m3_mean = np.sum(list(m3_sum.values())) / m_len_all

        print_status(comm,start_time,'Mean particle mass: %g' % m_mean)
        print_status(comm,start_time,'Mean squared particle mass: %g' % m2_mean)
        print_status(comm,start_time,'Mean cubed particle mass: %g' % m3_mean)
    else: