Example #1
    def test_complicated_subdirs(self):
        """Check whether FileSet can find files in subdirectories that contain
        text and placeholders.
        """
        # The Pinocchio fileset from the cloud toolbox: a folder name contains
        # normal text and a placeholder:
        pinocchio = FileSet(
            join(
                self.refdir,
                "pinocchio",
                "t{year2}{month}{day}",
                "tm{year2}{month}{day}{hour}{minute}{second}{millisecond}.jpg",
            ),
        )

        # Find all files:
        files = list(pinocchio)

        check = [
            FileInfo(
                join(self.refdir, 'pinocchio', 't171102',
                     'tm171102132855573.jpg'), [
                         datetime.datetime(2017, 11, 2, 13, 28, 55, 573000),
                         datetime.datetime(2017, 11, 2, 13, 28, 55, 573000)
                     ], {}),
        ]
        assert files == check
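
For orientation, the expected FileInfo above simply mirrors the timestamp encoded in the filename. A minimal sketch (the equivalent strptime call, not FileSet's internals) of how tm171102132855573.jpg maps to that datetime:

from datetime import datetime

# "tm{year2}{month}{day}{hour}{minute}{second}{millisecond}.jpg" -> "tm171102132855573.jpg"
stamp = datetime.strptime("171102132855", "%y%m%d%H%M%S").replace(microsecond=573 * 1000)
print(stamp)  # 2017-11-02 13:28:55.573000 -- start and end time are identical here
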
def main():
    args = get_cmd_line_parser().parse_args()

    # TODO: Delete this line if you fixed this script
    print("Please check the code before running this.")
    #exit()

    images_path = join(
        args.root_dir,
        "{temperature}/m{year2}{month}{day}{hour}{minute}{second}*.jpg")
    images = FileSet(
        path=images_path,
        handler=cloud.pinocchio.ThermalCam(to_temperatures=False),
        name="Calibration Images",
    )

    # Create the calibration mask. Only a small part of the image will show
    # the correct pixel values for the corresponding temperature. The rest will
    # not see the calibration target.
    calibration_mask = np.zeros((252, 336))
    calibration_mask[115:130, 160:180] = 1
    calibration_mask = calibration_mask.astype("bool")

    # Get the temperatures (normally from a file, but this is not implemented
    # yet):
    temperature = get_temperatures()

    create_calibration_file(
        images,
        temperature,
        calibration_mask,
        'pinocchio_calibration_%Y%m%d.csv',
    )
    def test_search(self):
        """Collocate fake MHS filesets"""
        fake_mhs1 = FileSet(
            path=join(
                self.refdir, "{satname}_mhs_{year}", "{month}", "{day}",
                "*NSS.MHSX.*.S{hour}{minute}.E{end_hour}{end_minute}.*.h5"),
            handler=MHS_HDF(),
        )
        fake_mhs2 = fake_mhs1.copy()

        with TemporaryDirectory() as outdir:
            collocations = Collocations(
                path=join(
                    outdir, "{year}-{month}-{day}",
                    "{hour}{minute}{second}-{end_hour}{end_minute}{end_second}",
                ),
            )
            collocations.search([fake_mhs1, fake_mhs2],
                                start="2007",
                                end="2008",
                                max_interval="1h",
                                max_distance="10km")
def collect_spareice(version):
    spareice_files = FileSet(
        name="SPAREICE",
        path=f"/work/um0878/user_data/jmrziglod/spareice/{version}/noaa18/"
        "{year}/{month}/{day}/{year}{month}{day}_{hour}{minute}{second}-"
        "{end_hour}{end_minute}{end_second}.nc",
        max_processes=PROCESSES,
        placeholder={"version": version})

    print("Collect SPARE-ICE...")
    data_list = spareice_files.map(
        get_gridded_mean,
        start=START,
        end=END,
        on_content=True,
        pass_info=True,
    )
    data = xr.concat(data_list, dim="time")
    #data.to_netcdf(f"data/{version}_SPARE-ICE_{START}.nc")
    return data
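
get_gridded_mean is not shown in this snippet. A purely hypothetical sketch of such a worker (assuming typhon hands it the opened file content plus the FileInfo because on_content=True and pass_info=True are set; the exact argument order and the variable names below are assumptions, not taken from the original code):

import xarray as xr

def get_gridded_mean(data, file_info):
    # Hypothetical: average one file's retrieval onto coarse latitude bins and
    # tag the result with the file's start time, so the results can be
    # concatenated along "time" as above.
    gridded = data.groupby_bins("lat", bins=36).mean()
    return gridded.expand_dims(time=[file_info.times[0]])
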
def collect_cloudsat():
    cloudsat_files = FileSet(
        name="2C-ICE",
        path="/work/um0878/data/cloudsat/2C-ICE.P1_R04/{year}/{doy}/"
        "{year}{doy}{hour}{minute}{second}_*.hdf.zip",
        handler=CloudSat(),
        # Each CloudSat file covers exactly 5933 seconds. Stating this here
        # makes indexing the files much faster.
        time_coverage="5933 seconds",
        # Load only the fields that we need:
        read_args={
            "fields": ["ice_water_path"],
        },
        max_threads=15,
    )

    print("Collect 2C-ICE...")
    data = xr.concat(cloudsat_files[START:END], dim="scnline")

    data.to_netcdf(f"data/2C-ICE_{START}.nc")
    return data
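
A minimal sketch (illustrative dates, not part of the snippet) of why the fixed time_coverage helps: the end time of every 2C-ICE granule follows from the start time in its filename alone, so no file has to be opened during indexing:

from datetime import datetime, timedelta

start = datetime(2010, 1, 1, 0, 4, 13)   # as parsed from {year}{doy}{hour}{minute}{second}
end = start + timedelta(seconds=5933)    # the stated coverage of each granule
print(start, "-", end)
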
Example #6
    not_null.lat, range=[[0, 90], [-90, 90]], bisectrix=False,
    cmap="density", vmin=1,
)
scat.cmap.set_under("w")
plt.colorbar(scat)
plt.savefig(f"experiments/{experiment}/scnpos_lat_heatmap.png")

test_ratio = 0.3
train_data, test_data = train_test_split(
    bdata, test_size=test_ratio, shuffle=True, random_state=5
)

print(f"Use {int(not_null.lat.size*(1-test_ratio))} points for training")
print(f"Use {int(not_null.lat.size*test_ratio)} points for testing")

experiments = FileSet("experiments/{experiment}/spareice.json")

if plot_all:
    spareice = SPAREICE(
        verbose=2, processes=processes,
        sea_mask_file="data/land_water_mask_5min.png",
        elevation_file="data/surface_elevation_1deg.nc",
    )
    for parameters in experiments:
        try:
            print(f"plot experiment {parameters.attr['experiment']}")
            spareice.load(parameters)
            spareice.report(
                "experiments", parameters.attr["experiment"], test_data
            )
        except:
Example #7
    def __init__(
        self,
        images,
        labels=None,
        augmentator=None,
        reader=None,
        batch_size=None,
        balance=False,
        label_encoding='one-hot',
        yield_mode='both',
        shuffle=True,
        random_seed=42,
        max_workers=None,
        classes=None,
        preprocess_input=None,
        target_size=None,
    ):
        """Create an ImageLoader

        Args:
            images: Must be either an iterable of image filenames, a path to a
                directory (e.g. /path/to/images/*.tif) or a path containing the
                placeholder *{label}* (e.g. /path/to/{label}/*.tif to match
                /path/to/car/001.tif). In the latter case, you do not have to
                set the parameter *labels*.
            labels: This must be given or *images* must contain a placeholder
                with *{label}* if you want to balance this dataset. Must be
                an iterable of labels with the same length as *images*.
            reader: Function to read the images. If None, images will be read
                with OpenCV's imread function. Default: None.
            shuffle: Shuffle the dataset once before yielding. Default: True.
            random_seed: Number to initialize a random state. Default: 42.
            augmentator: Use your favourite augmentator object. Actively
                supported are Keras, imgaug, and Albumentations image
                augmentators. Can also be set to a function that will be called
                on each image before yielding it to the model. Default: None.
            classes: Classes which will be encoded in this dataset.
            batch_size: Size of one batch. Default: 32.
            balance: Can be either:
                * *True*: the minority classes are going to be oversampled so
                    they have the same number as the majority class. If this is
                    used, *labels* must be given.
                * *iterable*: An iterable with the weights for each sample. The
                    sum of all weights should be 1.
                Default: False.
            balance_batch: If *True*, all classes appear with equal numbers in
                each batch. Works only if the number of classes is equal or
                lower than the batch size. Default: False.
            label_encoding: Can be either:
                * *False*: No encoding.
                * *one-hot*: 1D numpy array of binary labels
                * *binary*: Use when you have only two classes. One will be
                    labelled with 0, the other one with 1.
                ...
                Default: *one-hot*.
            yield_mode: Defines what the ImageLoader will yield for each batch:
                * *both*: Yields inputs and labels (required for training models).
                * *inputs*: Yields only the inputs.
                * *labels*: Yields only the labels.
                Default: *both*.
            target_size: Set target size of images as a tuple of (height, width)
                in pixels. Default: None

        Examples:

            from ai4eo.preprocessing import ImageLoader
            from keras.preprocessing.image import ImageDataGenerator


            keras_augmentator = ImageDataGenerator(
                featurewise_center=True,
                featurewise_std_normalization=True,
                rotation_range=20,
                width_shift_range=0.2,
                height_shift_range=0.2,
                horizontal_flip=True
            )

            data = ImageLoader(
                '/path/to/images/{label}/*.tif', augmentator=keras_augmentator,
            )

            # Create keras model
            model = ...

            model.fit_generator(data, ...)
        """

        if isinstance(images, str):
            # Let's try to find all images in the given path
            files = FileSet(images).to_dataframe()
            images = files.index.values

            if "label" in files.columns:
                labels = files['label'].values

        if labels is not None and len(labels) != len(images):
            raise ValueError("images and labels must have the same length!")

        self.images = np.array(images)
        self.labels = None if labels is None else np.array(labels)

        self.classes = classes
        if self.classes is None and self.labels is not None:
            self.classes = np.unique(self.labels)

        if self.classes is not None:
            self.class_indices = {
                index: label
                for index, label in enumerate(self.classes)
            }
        else:
            self.class_indices = None

        if self.labels is not None:
            if label_encoding == 'one-hot' or label_encoding == 'binary':
                self.labels = label_binarize(self.labels, classes=self.classes)
                if label_encoding == 'binary':
                    self.labels = np.squeeze(self.labels)

        self.yield_mode = yield_mode

        self.reader = reader
        self.augmentator = augmentator
        self.augmentator_type = None
        if callable(getattr(self.augmentator, "random_transform", None)):
            self.augmentator_type = 'keras'
        elif callable(getattr(self.augmentator, "augment_batches", None)):
            self.augmentator_type = 'imgaug'
        # elif callable(getattr(self.augmentator, "augment_batches", None)):
        #     self.augmentator_type = 'imgaug'
        self.batch_size = batch_size or 32
        self.preprocess_input = preprocess_input
        self.target_size = target_size

        # To make the experiments reproducible:
        self.random_state = np.random.RandomState(random_seed)
        self.random_seed = random_seed

        self.max_workers = max_workers

        self._indices = list(range(len(self.images)))
        if shuffle:
            self.random_state.shuffle(self._indices)

        if not balance:
            self._weights = None
        # Check explicitly for True because iterables could also return True in
        # a boolean context
        elif balance is True:
            # We want to oversample the minority classes, i.e. set the
            # weights accordingly (the fewer samples a class has, the higher
            # the weight of each of its samples).
            if self.labels is None:
                raise ValueError('Cannot balance samples by myself without '
                                 'having any labels! Please set *labels*!')
            unique_labels, counts = np.unique(labels, return_counts=True)
            label_counts = pd.Series(counts, index=unique_labels)
            self._weights = \
                1 / label_counts.loc[labels].values / len(label_counts)
        else:
            self._weights = balance

        self.reset()
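
To illustrate the balancing branch above with made-up labels (a sketch, not part of ImageLoader): a sample's weight is the inverse of its class count, normalised by the number of classes, so every class gets the same total weight and all weights sum to 1:

import numpy as np
import pandas as pd

labels = ["car", "car", "car", "tree"]                    # toy labels
unique_labels, counts = np.unique(labels, return_counts=True)
label_counts = pd.Series(counts, index=unique_labels)     # car: 3, tree: 1
weights = 1 / label_counts.loc[labels].values / len(label_counts)
print(weights)  # roughly [0.167 0.167 0.167 0.5]: each class contributes 0.5, total is 1
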
Example #8
    def init_filesets(self):
        if self.filesets is not None:
            return self.filesets

        self.filesets = FileSetManager()

        self.filesets += FileSet(
            join(
                self.refdir, "tutorial", "{satellite}", "{year}-{month}-{day}",
                "{hour}{minute}{second}-{end_hour}{end_minute}{end_second}.nc"
            ),
            name="tutorial",
        )

        self.filesets += FileSet(
            join(
                self.refdir,
                "single_file.nc",
            ),
            name="single",
            time_coverage=["2018-01-01", "2018-01-03"],
        )

        def sequence_get_info(file_info, **kwargs):
            """Small helper function for sequence fileset."""
            with open(file_info) as f:
                file_info.times[0] = datetime.datetime.strptime(
                    f.readline().rstrip(), "Start: %Y-%m-%d %H:%M:%S")
                file_info.times[1] = datetime.datetime.strptime(
                    f.readline().rstrip(), "End: %Y-%m-%d %H:%M:%S")
            return file_info

        self.filesets += FileSet(
            join(
                self.refdir,
                "sequence",
                "{year}",
                "{doy}",
                "sequence*.txt",
            ),
            name="sequence-wildcard",
            handler=FileHandler(info=sequence_get_info),
            info_via="handler",
        )
        self.filesets += FileSet(
            join(
                self.refdir,
                "sequence",
                "{year}",
                "{doy}",
                "sequence{id}.txt",
            ),
            handler=FileHandler(info=sequence_get_info),
            name="sequence-placeholder",
            info_via="both",
            placeholder={"id": r"\d{4}"},
        )

        self.filesets += FileSet(
            join(
                self.refdir,
                # NSS.HIRX.NJ.D99127.S0632.E0820.B2241718.WI.gz
                "regex",
                "NSS.HIR[XS].{satcode}.D{year2}{doy}.S{hour}"
                "{minute}.E{end_hour}{end_minute}.B{B}.{station}.gz"),
            name="regex-HIRS",
        )
        self.filesets["regex-HIRS"].set_placeholders(
            satcode=".{2}",
            B=r"\d{7}",
            station=".{2}",
        )

        return self.filesets
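
Two assumptions behind the last two filesets, spelled out for clarity (illustrative only, not taken from the test data): every sequence*.txt file must begin with two header lines in exactly the strptime formats used by sequence_get_info, and the regex placeholders resolve against names like the one quoted in the comment:

import re

# Assumed first two lines of a sequence*.txt file:
#   Start: 2018-01-01 00:00:00
#   End: 2018-01-01 12:00:00

# Roughly what the regex-HIRS path expands to (a sketch, not FileSet's actual
# internals; {year2} -> \d{2}, {doy} -> \d{3}, and so on):
pattern = (r"NSS\.HIR[XS]\..{2}\.D\d{2}\d{3}\.S\d{2}\d{2}"
           r"\.E\d{2}\d{2}\.B\d{7}\..{2}\.gz")
print(bool(re.match(pattern, "NSS.HIRX.NJ.D99127.S0632.E0820.B2241718.WI.gz")))  # True
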
Example #9
    def test_glob(self):
        files = FileSet(
            join(self.refdir, "tutorial", "{satellite}", "*", "*.nc"),
            placeholder={"satellite": 'SatelliteA'},
        )

        self._print_files(list(files))

        # Sort by path rather than by time (because the times are all
        # equal)
        check = list(
            sorted([
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                         '000000-040000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                         '080000-120000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                         '200000-000000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                         '040000-080000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                         '120000-160000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-02',
                         '160000-200000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                         '000000-040000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                         '080000-120000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                         '200000-000000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                         '040000-080000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                         '120000-160000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
                FileInfo(
                    join(self.refdir, 'tutorial', 'SatelliteA', '2018-01-01',
                         '160000-200000.nc'),
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteA'}),
            ],
                   key=lambda x: x.path))

        assert list(sorted(files, key=lambda x: x.path)) == check
"""Create fold files for training and test images
"""

import numpy as np
import pandas as pd
from typhon.files import FileSet

df = pd.concat([
    FileSet(
        '/scratch-a/jmrziglod/sen2agri/data/malawi_summer/patches/original/{label}/*.png'
    ).to_dataframe(),
    FileSet(
        '/scratch-a/jmrziglod/sen2agri/data/malawi_summer/patches/augmented/{label}/*.png'
    ).to_dataframe()
])

unique_ids = np.unique(df.id.values)
shuffled_ids = np.random.choice(unique_ids,
                                size=len(unique_ids),
                                replace=False)
ratio = 0.1
test_ids = shuffled_ids[:int(shuffled_ids.size * ratio)]
train_ids = shuffled_ids[int(shuffled_ids.size * ratio):]

with open(
        '/home/jmrziglod/projects/sen2agri/drone-crop-type/folds/malawi_summer/train_test_all_mosaics/train.txt',
        'w') as txt_file:
    txt_file.write("\n".join(df.index[df.id.isin(train_ids)].tolist()))
with open(
        '/home/jmrziglod/projects/sen2agri/drone-crop-type/folds/malawi_summer/train_test_all_mosaics/test.txt',
        'w') as txt_file:
    txt_file.write("\n".join(df.index[df.id.isin(test_ids)].tolist()))

START_TIME = "2007"
END_TIME = "March 2010"
PROCESSES = 12

TRAINING_FILE = "spareice_training_data.nc"

# Define a fileset with the files from MHS / NOAA18:
mhs = FileSet(
    name="MHS",
    path="/work/um0878/data/amsub_mhs_l1c_hdf/AAPP7_13/noaa18"
    "_mhs_{year}/{month}/{day}/*NSS.MHSX.NN.*."
    "S{hour}{minute}.E{end_hour}{end_minute}.*.h5",
    handler=MHS_HDF(),
    # Load only the fields that we need:
    read_args={
        "fields": [
            "Data/btemps",
            "Geolocation/Satellite_azimuth_angle",
            "Geolocation/Satellite_zenith_angle",
            "Geolocation/Solar_azimuth_angle",
            "Geolocation/Solar_zenith_angle",
        ]
    },
)

# Define a fileset with files from CloudSat / 2C-ICE:
cloudsat = FileSet(
    name="2C-ICE",
    path="/work/um0878/data/cloudsat/2C-ICE.P1_R04/{year}/{doy}/"
    "{year}{doy}{hour}{minute}{second}_*.hdf.zip",
    handler=CloudSat(),
Example #12
END_TIME = "21 Jun 2013"
PROCESSES = 1
SAVE_COLLOCATIONS = False

version = "typhon"

# Define a fileset with the files from MHS / NOAA18:
mhs_files = FileSet(
    name="MHS",
    path="/work/um0878/data/amsub_mhs_l1c_hdf/AAPP7_13/noaa18"
         "_mhs_{year}/{month}/{day}/*NSS.MHSX.NN.*."
         "S{hour}{minute}.E{end_hour}{end_minute}.*.h5",
    handler=MHS_HDF(),
    # Load only the fields that we need:
    read_args={
        "fields": [
            "Data/btemps",
            "Geolocation/Satellite_azimuth_angle",
            "Geolocation/Satellite_zenith_angle",
            "Geolocation/Solar_azimuth_angle",
            "Geolocation/Solar_zenith_angle",
        ]
    },
)

# Define a fileset with the files from AVHRR / NOAA18:
avhrr_files = FileSet(
    name="AVHRR",
    path="/work/um0878/user_data/jmrziglod/avhrr_gac_hdf5/noaa18_gac_"
         "{year}/{month}/{day}/NSS.*."
         "S{hour}{minute}.E{end_hour}{end_minute}.*.h5",
Example #13
def load_filesets(config):
    """Load all filesets into one FileSetManager object

    Args:
        config: Dictionary with configuration keys and values.

    Returns:
        A FileSetManager object.
    """

    basedir = config["General"]["basedir"]

    # This FileSetManager can handle all dataset objects:
    filesets = FileSetManager()

    ###########################################################################
    # Pinocchio - FileSets:
    filesets += FileSet(
        name="Pinocchio-netcdf",
        path=os.path.join(
            config["General"]["basedir"],
            config["Pinocchio"]["nc_files"],
        ),
        max_processes=int(config["General"]["processes"]),
    )
    filesets += FileSet(
        name="Pinocchio-archive",
        path=os.path.join(basedir, config["Pinocchio"]["archive_files"]),
        max_processes=int(config["General"]["processes"]),
    )

    # Load logbook from Pinocchio. This logbook contains time intervals where
    # the data is corrupted or bad.
    logbook = None
    if "logbook" in config["Pinocchio"]:
        logbook = load_logbook(
            os.path.join(basedir, config["Pinocchio"]["logbook"]))

    pinocchio_calibration = os.path.join(
        config["General"]["basedir"],
        config["Pinocchio"]["calibration"],
    )
    filesets += FileSet(
        name="Pinocchio-raw",
        path=os.path.join(
            config["General"]["basedir"],
            os.path.splitext(config["Pinocchio"]["archive_files"])[0],
            config["Pinocchio"]["files_in_archive"],
        ),
        # Set the pinocchio file handler with the calibration file
        handler=pinocchio.ThermalCam(calibration_file=pinocchio_calibration),
        max_processes=int(config["General"]["processes"]),
        # Exclude the time intervals from the logbook when searching for files:
        exclude=logbook,
    )

    filesets += FileSet(
        path=os.path.join(basedir, config["Pinocchio"]["stats"]),
        name="Pinocchio-stats",
        max_processes=int(config["General"]["processes"]),
    )
    ###########################################################################

    ###########################################################################
    # Dumbo - FileSets:
    filesets += FileSet(
        name="Dumbo-netcdf",
        path=os.path.join(
            config["General"]["basedir"],
            config["Dumbo"]["nc_files"],
        ),
        max_processes=int(config["General"]["processes"]),
    )

    # Load logbook from Dumbo:
    logbook = None
    if "logbook" in config["Dumbo"]:
        logbook = load_logbook(
            os.path.join(basedir, config["Dumbo"]["logbook"]))

    filesets += FileSet(
        name="Dumbo-raw",
        path=os.path.join(
            config["General"]["basedir"],
            config["Dumbo"]["raw_files"],
        ),
        handler=dumbo.ThermalCamASCII(),
        # Since the raw files have no temporal information in their filename,
        # we have to retrieve it via their handler.
        info_via="handler",
        max_processes=int(config["General"]["processes"]),
        # Exclude the time intervals from the logbook when searching for files:
        exclude=logbook,
    )
    filesets += FileSet(
        path=os.path.join(basedir, config["Dumbo"]["stats"]),
        name="Dumbo-stats",
        max_processes=int(config["General"]["processes"]),
    )
    ###########################################################################

    filesets += FileSet(
        path=os.path.join(basedir, config["Ceilometer"]["files"]),
        name="Ceilometer",
        # Each file covers roughly 24 hours:
        time_coverage="24 hours",
        max_processes=int(config["General"]["processes"]),
    )
    filesets += FileSet(
        path=os.path.join(basedir, config["DShip"]["files"]),
        handler=metadata.ShipMSM(),
        name="DShip",
        max_processes=int(config["General"]["processes"]),
    )
    filesets += FileSet(
        path=os.path.join(basedir, config["Plots"]["files"]),
        name="plots",
        handler=Plotter(),
        max_processes=int(config["General"]["processes"]),
    )

    return filesets
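
load_logbook is not shown here. As a purely hypothetical sketch (neither the logbook file format nor the type that exclude= expects is visible in this snippet), it could read the bad periods as pairs of timestamps:

import pandas as pd

def load_logbook(filename):
    # Hypothetical helper: one row per corrupted/bad period with "start" and
    # "end" columns, returned as a list of (start, end) tuples.
    periods = pd.read_csv(filename, parse_dates=["start", "end"])
    return list(zip(periods["start"], periods["end"]))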