Example #1
0
def targets_entrypoint(
    batchMB: float,
    shapefile: str,
    records: List[str],
    name: str,
    every: int,
    categorical: bool,
    normalise: bool,
    random_seed: int,
) -> None:
    """Targets entrypoint without click cruft.

    Reads point coordinates (and either categorical or continuous record
    fields) from a shapefile and writes them, with metadata, into
    ``targets_<name>.hdf5`` in the current working directory.

    Args:
        batchMB: Memory budget per read batch, in megabytes.
        shapefile: Path to the input shapefile.
        records: Names of the record fields to extract as targets.
        name: Basename used for the output HDF5 filename and title.
        every: Unused in this function; kept for CLI interface compatibility.
        categorical: If True, treat the records as categorical labels;
            otherwise treat them as continuous values.
        normalise: If True, compute mean/sd statistics for continuous targets.
        random_seed: Seed forwarded to the shapefile array sources.
    """
    log.info("Loading shapefile targets")
    out_filename = os.path.join(os.getcwd(), "targets_{}.hdf5".format(name))
    nworkers = 0  # shapefile reading breaks with concurrency

    with tables.open_file(out_filename, mode="w", title=name) as h5file:
        log.info("Reading shapefile point coordinates")
        cocon_src = CoordinateShpArraySource(shapefile, random_seed)
        cocon_batchsize = mb_to_points(batchMB,
                                       ndim_con=0,
                                       ndim_cat=0,
                                       ndim_coord=2)
        write_coordinates(cocon_src, h5file, cocon_batchsize)

        if categorical:
            log.info("Reading shapefile categorical records")
            cat_source = CategoricalShpArraySource(shapefile, records,
                                                   random_seed)
            cat_batchsize = mb_to_points(batchMB,
                                         ndim_con=0,
                                         ndim_cat=cat_source.shape[-1])
            catdata = get_maps(cat_source, cat_batchsize)
            mappings, counts = catdata.mappings, catdata.counts
            # One entry per label column: number of distinct category values.
            ncats = np.array([len(m) for m in mappings])
            write_categorical(cat_source, h5file, nworkers, cat_batchsize,
                              mappings)
            cat_meta = meta.CategoricalTarget(
                N=cat_source.shape[0],
                labels=cat_source.columns,
                nvalues=ncats,
                mappings=mappings,
                counts=counts,
            )
            write_target_metadata(cat_meta, h5file)
        else:
            log.info("Reading shapefile continuous records")
            con_source = ContinuousShpArraySource(shapefile, records,
                                                  random_seed)
            con_batchsize = mb_to_points(batchMB,
                                         ndim_con=con_source.shape[-1],
                                         ndim_cat=0)
            # BUG FIX: the original line
            #   mean, sd = get_stats(...) if normalise else None, None
            # parsed as ``mean, sd = (get_stats(...) if normalise else None), None``
            # so ``mean`` was assigned the whole (mean, sd) tuple and ``sd``
            # was always None whenever normalisation was requested.
            if normalise:
                mean, sd = get_stats(con_source, con_batchsize)
            else:
                mean, sd = None, None
            write_continuous(con_source, h5file, nworkers, con_batchsize)
            con_meta = meta.ContinuousTarget(N=con_source.shape[0],
                                             labels=con_source.columns,
                                             means=mean,
                                             sds=sd)
            write_target_metadata(con_meta, h5file)
    log.info("Target import complete")
Example #2
0
def test_get_categories(mocker):
    """get_maps must recover each column's unique values and their counts.

    Builds a small deterministic categorical array, wraps it in an in-memory
    source, and checks that the per-column mappings and counts match numpy
    ground truth.
    """
    rnd = np.random.RandomState(seed=666)
    x = rnd.randint(0, 10, size=(20, 3), dtype=CategoricalType)
    missing_in = -1
    columns = ["1", "2", "3"]
    source = NPCatArraySource(x, missing_in, columns)
    batchsize = 3
    res = category.get_maps(source, batchsize)
    mappings, counts = res.mappings, res.counts
    # BUG FIX: the original loop used ``x`` as the loop variable, shadowing
    # the data array itself — it only worked because x.T was evaluated before
    # the rebinding. Use distinct names per column instead.
    for mapping, count, col in zip(mappings, counts, x.T):
        assert set(col) == set(mapping)
        for value, n in zip(mapping, count):
            assert n == np.sum(col == value)
Example #3
0
def tifs_entrypoint(nworkers: int, batchMB: float, categorical: List[str],
                    continuous: List[str], normalise: bool, name: str,
                    ignore_crs: bool) -> None:
    """Entrypoint for tifs without click cruft.

    Stacks continuous and/or categorical GeoTIFF files into a single
    ``features_<name>.hdf5`` file in the current working directory, along
    with feature-set metadata.

    Args:
        nworkers: Number of worker processes for batched writes.
        batchMB: Memory budget per read batch, in megabytes.
        categorical: Paths/directories to search for categorical TIFs.
        continuous: Paths/directories to search for continuous TIFs.
        normalise: If True, compute and apply mean/sd normalisation to the
            continuous data before writing.
        name: Basename used for the output HDF5 filename and title.
        ignore_crs: If True, skip CRS consistency checks across images.

    Raises:
        errors.NoTifFilesFound: If no TIF files are found at all.
        errors.ZeroDeviation: If any continuous band has zero std deviation.
        errors.ConCatNMismatch: If continuous and categorical pixel counts
            disagree.
    """
    out_filename = os.path.join(os.getcwd(), "features_{}.hdf5".format(name))

    con_filenames = tifnames(continuous)
    cat_filenames = tifnames(categorical)
    log.info("Found {} continuous TIF files".format(len(con_filenames)))
    log.info("Found {} categorical TIF files".format(len(cat_filenames)))
    has_con = len(con_filenames) > 0
    has_cat = len(cat_filenames) > 0
    all_filenames = con_filenames + cat_filenames
    # Idiom fix: rely on list truthiness instead of ``not len(x) > 0``.
    if not all_filenames:
        raise errors.NoTifFilesFound()

    N_con, N_cat = None, None
    con_meta, cat_meta = None, None
    spec = shared_image_spec(all_filenames, ignore_crs)

    with tables.open_file(out_filename, mode="w", title=name) as outfile:
        if has_con:
            con_source = ContinuousStackSource(spec, con_filenames)
            ndims_con = con_source.shape[-1]
            con_rows_per_batch = mb_to_rows(batchMB, spec.width, ndims_con, 0)
            N_con = con_source.shape[0] * con_source.shape[1]
            N = N_con
            log.info("Continuous missing value set to {}".format(
                con_source.missing))
            stats = None
            if normalise:
                stats = get_stats(con_source, con_rows_per_batch)
                sd = stats[1]
                # Zero deviation would make normalisation divide by zero.
                if any(sd == 0.0):
                    raise errors.ZeroDeviation(sd, con_source.columns)
                log.info("Writing normalised continuous data to output file")
            else:
                log.info("Writing unnormalised continuous data to output file")
            con_meta = meta.ContinuousFeatureSet(labels=con_source.columns,
                                                 missing=con_source.missing,
                                                 stats=stats)
            write_continuous(con_source, outfile, nworkers, con_rows_per_batch,
                             stats)

        if has_cat:
            cat_source = CategoricalStackSource(spec, cat_filenames)
            N_cat = cat_source.shape[0] * cat_source.shape[1]
            N = N_cat
            # BUG FIX: the original guard ``if N_con and ...`` skipped the
            # mismatch check whenever N_con was 0 (falsy). Test for presence
            # explicitly so a zero-sized continuous stack is still validated.
            if N_con is not None and N_cat != N_con:
                raise errors.ConCatNMismatch(N_con, N_cat)

            ndims_cat = cat_source.shape[-1]
            cat_rows_per_batch = mb_to_rows(batchMB, spec.width, 0, ndims_cat)
            log.info("Categorical missing value set to {}".format(
                cat_source.missing))
            catdata = get_maps(cat_source, cat_rows_per_batch)
            maps, counts = catdata.mappings, catdata.counts
            # One entry per band: number of distinct category values.
            ncats = np.array([len(m) for m in maps])
            log.info("Writing mapped categorical data to output file")
            cat_meta = meta.CategoricalFeatureSet(labels=cat_source.columns,
                                                  missing=cat_source.missing,
                                                  nvalues=ncats,
                                                  mappings=maps,
                                                  counts=counts)
            write_categorical(cat_source, outfile, nworkers,
                              cat_rows_per_batch, maps)
        # N is guaranteed bound: the NoTifFilesFound guard above ensures at
        # least one of has_con / has_cat is True.
        m = meta.FeatureSet(continuous=con_meta,
                            categorical=cat_meta,
                            image=spec,
                            N=N,
                            halfwidth=0)
        write_feature_metadata(m, outfile)
    log.info("Tif import complete")