Example #1
    def run_datasets(self, sources, start=None, end=None, output=None,
                     inputs=None, extra_fields=None, cleaner=None):
        """Predict the target values for data coming from datasets

        Args:
            sources: List of Dataset objects.
            start: Start date either as datetime object or as string
                ("YYYY-MM-DD hh:mm:ss"). Year, month and day are required.
                Hours, minutes and seconds are optional.
            end: End date. Same format as "start".
            output: Either None (default), a path as string containing
                placeholders or a Dataset-like object. If None, all data will
                be returned as one object.
            inputs: A dictionary of input field names. The keys must be the
                same labels as used in :meth:`train`. The values are the field
                names in the original data coming from *sources*.
            extra_fields: Extra fields that should be copied to output. If you
                want to save the output to a Dataset, this must contain a
                *time* field.
            cleaner: A filter function that can be used to clean the input
                data.

        Returns:
            If *output* is None, a GroupedArrays / xarray.Dataset with the
            retrieved data (or None if nothing could be retrieved). Otherwise
            the data is written to *output* and nothing is returned.
        """

        if output is None or isinstance(output, Dataset):
            pass
        elif isinstance(output, str):
            output = Dataset(path=output, name="RetrievedData")
        else:
            raise ValueError("The parameter output must be None, a string or "
                             "a Dataset object!")

        results = []

        # Slide through all input sources and apply the regression on them
        for files, data in DataSlider(start, end, *sources):
            retrieved_data = self.run(data, inputs, extra_fields, cleaner)

            if retrieved_data is None:
                continue

            if output is None:
                results.append(retrieved_data)
            else:
                # Store the generated data.
                times = retrieved_data["time"].min().item(0), \
                        retrieved_data["time"].max().item(0)

                output.write(retrieved_data, times=times, in_background=True)

        if output is None:
            if results:
                return GroupedArrays.concat(results)
            else:
                return None
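A runnable sketch of the accumulate-or-write pattern used above, needing only numpy: the chunk generator and the write step are stand-ins for DataSlider and Dataset.write, not the real API.

import numpy as np

def sliding_chunks():
    # Stand-in for DataSlider(start, end, *sources): yields one retrieved
    # chunk per time slice.
    for start in range(0, 9, 3):
        yield np.arange(start, start + 3, dtype=float)

def run_chunks(output=None):
    results = []
    for retrieved in sliding_chunks():
        if output is None:
            # Collect everything and concatenate it at the end.
            results.append(retrieved)
        else:
            # Stand-in for output.write(retrieved, times=..., in_background=True).
            output.append(retrieved)
    if output is None:
        return np.concatenate(results) if results else None

print(run_chunks())            # all chunks returned as one array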
Example #2
    def train(self, sources, inputs, targets, cleaner=None, test_size=None,
              verbose=0):
        """Train this retriever with data from arrays

        Args:
            sources: Sources where the data come from. Must be a dictionary of
                dict-like objects (such as xarray.Dataset) with numpy arrays.
            inputs: A dictionary of input field names. The keys are labels of
                the input fields. The values are the field names in the
                original data coming from *sources*.
            targets: A dictionary of target field names. The keys are labels of
                the target fields. The values are the field names in the
                original data coming from *sources*.
            cleaner: A filter function that can be used to clean the training
                data.
            test_size: Fraction of the data that should be used for testing
                instead of training.
            verbose: Level of verbosity (=number of debug messages). Default is
                0.

        Returns:
            The data from *sources* split into training input, testing input,
            training target and testing target.
        """

        if self.trainer is None:
            self.trainer = self._default_trainer()

        # Save the input and target labels because :meth:`run` needs them as
        # well.
        self.parameter["inputs"] = inputs
        self.parameter["targets"] = targets

        if isinstance(sources, dict):
            data = GroupedArrays.from_dict(sources)
        else:
            raise ValueError("Only a dictionary of arrays is allowed!")

        train_input, test_input, train_target, test_target = \
            self._prepare_training_data(
                data, inputs, targets, cleaner, test_size
            )

        # Unleash the trainer
        self.trainer.verbose = verbose
        self.trainer.fit(train_input, train_target)

        # Use the best estimator from now on:
        self.estimator = self.trainer.best_estimator_

        if verbose:
            print()

        return (
            train_input, test_input, train_target, test_target
        )
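The trainer used above behaves like a scikit-learn grid search: it is fitted on the training split and its best_estimator_ is kept for later predictions. A minimal, self-contained sketch of that pattern, with invented data and an invented parameter grid:

import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))                   # stacked input fields
y = X @ np.array([1.0, -2.0, 0.5])              # synthetic target

train_input, test_input, train_target, test_target = \
    train_test_split(X, y, test_size=0.3)

trainer = GridSearchCV(
    MLPRegressor(max_iter=2000),
    {"hidden_layer_sizes": [(10,), (20,)]},
    cv=3,
)
trainer.fit(train_input, train_target)
estimator = trainer.best_estimator_             # used for later predictions
print(estimator.score(test_input, test_target))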
Example #3
    def read(self, file_info, extra_fields=None, mapping=None):
        """Read and parse HDF4 files and load them to an GroupedArrays.

        Args:
            file_info: Path and name of the file as string or FileInfo object.
            extra_fields: Additional field names that you want to extract from
                this file as a list.
            mapping: A dictionary that maps old field names to new field names.
                If given, *extra_fields* must contain the old field names.

        Returns:
            A GroupedArrays object.
        """

        dataset = GroupedArrays(name="CloudSat")

        # The files are in HDF4 format, therefore we cannot use the netCDF4
        # module. This code is taken from
        # http://hdfeos.org/zoo/OTHER/2010128055614_21420_CS_2B-GEOPROF_GRANULE_P_R04_E03.hdf.py
        # and adapted by John Mrziglod. A description of all variables in the
        # CloudSat dataset can be found at
        # http://www.cloudsat.cira.colostate.edu/data-products/level-2c/2c-ice?term=53.

        file = HDF.HDF(file_info.path)

        try:
            vs = file.vstart()

            # Extract the standard fields:
            dataset["time"] = self._get_time_field(vs, file_info)
            dataset["lat"] = self._get_field(vs, "Latitude")
            dataset["lon"] = self._get_field(vs, "Longitude")
            dataset["scnline"] = Array(
                np.arange(dataset["time"].size), dims=["time_id"]
            )
            dataset["scnpos"] = Array(
                [1 for _ in range(dataset["time"].size)], dims=["time_id"]
            )

            # Get the extra fields:
            if extra_fields is not None:
                for field, dimensions in self.parse_fields(extra_fields):
                    data = self._get_field(vs, field)

                    # Add the field data to the dataset.
                    dataset[field] = self.select(data, dimensions)
        finally:
            file.close()

        return dataset
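A hedged sketch of how a single vdata field could be read with pyhdf, similar to what _get_field() presumably does above. The file name is a placeholder and an actual HDF4 granule is required to run it.

import numpy as np
from pyhdf import HDF

file = HDF.HDF("granule.hdf")          # placeholder path to an HDF4 file
try:
    vs = file.vstart()                 # open the vdata interface
    vdata = vs.attach("Latitude")      # attach the named vdata
    latitude = np.asarray(vdata[:]).ravel()
    vdata.detach()
    vs.end()
finally:
    file.close()

print(latitude.shape)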
Example #4
    def _add_fields_to_data(data, original_dataset, group, fields):
        """Read *fields* from the original file of *group* and merge them
        into the collocated data."""
        try:
            original_file = data[group].attrs["__original_file"]
        except KeyError:
            raise KeyError(
                "The collocation file does not contain information about "
                "its original files.")
        original_data = original_dataset.read(original_file)[fields]
        original_indices = data[group]["__original_indices"]
        data[group] = GroupedArrays.merge(
            [data[group], original_data[original_indices]],
            overwrite_error=False
        )

        return data
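A minimal, self-contained illustration of the merge-by-stored-indices idea: the rows of the freshly read original data are re-selected with the saved __original_indices so they line up with the collocated points. The arrays are invented for the demo.

import numpy as np

original_field = np.array([10., 11., 12., 13., 14.])   # field from the original file
original_indices = np.array([4, 0, 0, 2])              # stored with the collocations

aligned = original_field[original_indices]             # one value per collocated point
print(aligned)                                         # [14. 10. 10. 12.]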
Example #5
    def read(self,
             filename,
             fields=None,
             mapping=None,
             main_group=None,
             **kwargs):
        """Reads and parses NetCDF files and load them to an GroupedArrays.

        If you need another return value, change it via the parameter
        *return_type* of the :meth:`__init__` method.

        Args:
            filename: Path and name of the file as string or FileInfo object.
                If *return_type* is *GroupedArrays*, this can also be a tuple/list
                of file names.
            fields: List of field names that should be read. The other fields
                will be ignored.
            mapping: A dictionary which is used for renaming the fields. The
                keys are the old and the values are the new names.
            main_group: If the file contains multiple groups, the main group
                will be linked to this one (only valid for GroupedArrays).

        Returns:
            A GroupedArrays object.
        """

        # GroupedArrays supports reading from multiple files.
        if self.return_type == "GroupedArrays":
            ds = GroupedArrays.from_netcdf(filename.path, fields, **kwargs)
            if not ds:
                return None
            if main_group is not None:
                ds.set_main_group(main_group)
        elif self.return_type == "xarray":
            ds = xr.open_dataset(filename.path, **kwargs)
            if not ds.variables:
                return None
        else:
            raise ValueError("Unknown return type '%s'!" % self.return_type)

        if fields is not None:
            ds = ds[fields]

        if mapping is not None:
            ds.rename(mapping, inplace=True)

        return ds
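A small round-trip sketch of the xarray branch above: write a toy file, read it back, keep only the requested fields and rename them. It assumes a netCDF backend (netCDF4 or h5netcdf/scipy) is installed; the file and field names are invented.

import numpy as np
import xarray as xr

xr.Dataset({
    "T_b": ("time", np.arange(5.0)),
    "lat": ("time", np.linspace(-10, 10, 5)),
}).to_netcdf("demo.nc")

ds = xr.open_dataset("demo.nc")
ds = ds[["T_b"]]                                      # keep only the requested fields
ds = ds.rename({"T_b": "brightness_temperature"})     # newer xarray returns a new object
print(ds)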
Example #6
    def read(self, filename, fields=None, **read_csv):
        """Read a CSV file and return an GroupedArrays object with its content.

        Args:
            filename: Path and name of the file as string or FileInfo object.
            fields: Fields that you want to extract from the file. If not
                given, all fields are extracted.
            **read_csv: Additional keyword arguments for the pandas function
                `pandas.read_csv`. For more details, see:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

        Returns:
            A GroupedArrays object.
        """

        kwargs = self.read_csv.copy()
        kwargs.update(read_csv)

        if self.return_type == "GroupedArrays":
            return GroupedArrays.from_csv(filename.path, fields, **kwargs)
        else:
            dataframe = pd.read_csv(filename.path, **kwargs)
            return xr.Dataset.from_dataframe(dataframe)
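A self-contained sketch of the pandas/xarray branch: parse CSV text (here from an in-memory buffer instead of a FileInfo path), forward a read_csv keyword, and convert the DataFrame to an xarray.Dataset. The CSV content is invented.

import io
import pandas as pd
import xarray as xr

csv_text = "time,lat,lon\n0,10.0,20.0\n1,10.5,20.5\n"
dataframe = pd.read_csv(io.StringIO(csv_text), usecols=["time", "lat"])
ds = xr.Dataset.from_dataframe(dataframe)
print(ds)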
Example #7
    def read(self, file_info, extra_fields=None, mapping=None):
        """"Read and parse HDF4 files and load them to an GroupedArrays.

        Args:
            file_info: Path and name of the file as string or FileInfo object.
            extra_fields: Additional field names that you want to extract from
                this file as a list.
            mapping: A dictionary that maps old field names to new field names.
                If given, *extra_fields* must contain the old field names.

        Returns:
            A GroupedArrays object.
        """

        if extra_fields is None:
            extra_fields = []

        fields = self.standard_fields | set(extra_fields)

        dataset = GroupedArrays.from_netcdf(file_info.path, fields)
        dataset.name = "MHS"

        # We do the internal mapping first so we do not deal with difficult
        # names in the following loop.
        dataset.rename(self.mapping, inplace=True)

        # Handle the standard fields:
        dataset["time"] = self._get_time_field(dataset)

        # Flatten the latitude and longitude vectors:
        dataset["lon"] = dataset["lon"].flatten()
        dataset["lat"] = dataset["lat"].flatten()

        # Repeat the scanline and create the scnpos:
        dataset["scnpos"] = np.tile(np.arange(1, 91), dataset["scnline"].size)
        dataset["scnline"] = np.repeat(dataset["scnline"], 90)

        # Remove fields that we no longer need (except if the user asked
        # for them explicitly):
        dataset.drop({"Data/scnlinyr", "Data/scnlindy", "Data/scnlintime"} -
                     set(extra_fields),
                     inplace=True)

        # Some fields need special treatment
        for var in dataset.vars(deep=True):
            # Unfold the variable automatically if it is a swath variable.
            if len(dataset[var].shape) > 1 and dataset[var].shape[1] == 90:
                # Unfold the dimensions of the variable to match the
                # flattened time vector.
                dataset[var] = dataset[var].reshape(-1, dataset[var].shape[-1])

            # Some variables are scaled. If the user wants us to do
            # rescaling, we do it and delete the note in the attributes.
            if self.apply_scaling and "Scale" in dataset[var].attrs:
                dataset[var] = dataset[var] * dataset[var].attrs["Scale"]
                del dataset[var].attrs["Scale"]

            dataset[var].dims = ["time_id"]

        if "Data/btemps" in dataset:
            # Mask error values:
            dataset["Data/btemps"][dataset["Data/btemps"] <= -9999.] = np.nan

        if mapping is not None:
            dataset.rename(mapping, inplace=True)

        return dataset
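A runnable illustration of the swath flattening performed above, using plain numpy and toy sizes: scan positions are tiled, scanline numbers are repeated, and a swath variable with a channel dimension is reshaped so its first dimension matches the flattened time axis.

import numpy as np

n_lines, n_pos = 3, 90
scnline = np.array([101, 102, 103])

scnpos = np.tile(np.arange(1, n_pos + 1), n_lines)     # 1..90 for every scanline
scnline_flat = np.repeat(scnline, n_pos)               # each scanline number 90 times

btemps = np.random.rand(n_lines, n_pos, 5)             # swath variable with 5 channels
btemps_flat = btemps.reshape(-1, btemps.shape[-1])     # (n_lines * n_pos, channels)

print(scnpos.shape, scnline_flat.shape, btemps_flat.shape)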
Example #8
    def run(self, sources, inputs, extra_fields=None, cleaner=None):
        """Predict the target values for data coming from arrays

        Args:
            sources: Sources where the data come from. Must be a dict of
                dict-like objects (such as xarray.Dataset) with numpy arrays.
            inputs: A dictionary of input field names. The keys must be the
                same labels as used in :meth:`train`. The values are the field
                names in the original data coming from *sources*.
            extra_fields: Extra fields that should be copied to output. If you
                want to save the output to a Dataset, this must contain a
                *time* field.
            cleaner: A filter function that can be used to clean the input
                data.

        Returns:
            A GroupedArrays / xarray.Dataset with the retrieved data, or None
            if the input data is empty after cleaning.

        Examples:

        .. code-block:: python

            # TODO
        """

        if not isinstance(sources, dict):
            raise ValueError("Only a dictionary of arrays is allowed!")

        # We run SPARE-ICE only on arrays:
        data = GroupedArrays.from_dict(sources)

        if callable(cleaner):
            data = data[cleaner(data)]

        input_data = np.asmatrix([
            data[field]
            for _, field in sorted(inputs.items())
        ]).T

        # Skip too-small datasets
        if not input_data.any():
            print("Skip this data!")
            return None

        # Retrieve the data from the neural network:
        output_data = self.predict(input_data)

        if len(self.parameter["targets"]) == 1:
            retrieved_data = GroupedArrays()
            target_label = list(self.parameter["targets"].keys())[0]
            retrieved_data[target_label] = output_data
        else:
            retrieved_data = GroupedArrays.from_dict({
                name: output_data[:, i]
                for i, name in enumerate(sorted(self.parameter["targets"]))
            })

        if extra_fields is not None:
            for new_name, old_name in extra_fields.items():
                retrieved_data[new_name] = data[old_name]

        return retrieved_data
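A self-contained sketch of how the input matrix and the per-target output fields are assembled in run(): the columns are ordered by the sorted input labels, and the estimator's output columns are mapped back to the sorted target labels. The field and target names are invented, and np.column_stack stands in for np.asmatrix(...).T.

import numpy as np

data = {"bt_89": np.array([1.0, 2.0, 3.0]), "bt_157": np.array([4.0, 5.0, 6.0])}
inputs = {"channel_1": "bt_89", "channel_2": "bt_157"}

# Columns ordered by the sorted input labels, as in run():
input_data = np.column_stack(
    [data[field] for _, field in sorted(inputs.items())]
)

# Pretend the estimator returned one column per target:
output_data = np.column_stack([input_data.sum(axis=1), input_data.mean(axis=1)])

targets = {"IWP": "iwp", "ice_fraction": "ice_frac"}
retrieved = {
    name: output_data[:, i]
    for i, name in enumerate(sorted(targets))
}
print(sorted(retrieved))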
Example #9
def _store_collocations(
        output, datasets, raw_data, collocations,
        files, **collocate_args):
    """Merge the data, original indices, collocation indices and
    additional information of the datasets to one GroupedArrays object.

    Args:
        output:
        datasets:
        raw_data:
        collocations:
        files:

    Returns:
        List with number of collocations
    """

    # The data that will be stored to a file:
    output_data = GroupedArrays(name="CollocatedData")

    # We need this name to store the collocation metadata in an adequate
    # group
    collocations_name = datasets[0].name+"."+datasets[1].name
    output_data["__collocations/"+collocations_name] = GroupedArrays()
    metadata = output_data["__collocations/"+collocations_name]

    max_interval = collocate_args.get("max_interval", None)
    if max_interval is not None:
        max_interval = to_timedelta(max_interval).total_seconds()
    metadata.attrs["max_interval"] = f"Max. interval in secs: {max_interval}"

    max_distance = collocate_args.get("max_distance", None)
    metadata.attrs["max_distance"] = \
        f"Max. distance in kilometers: {max_distance}"
    metadata.attrs["primary"] = datasets[0].name
    metadata.attrs["secondary"] = datasets[1].name

    pairs = []
    number_of_collocations = []

    for i, dataset in enumerate(datasets):
        dataset_data = raw_data[dataset.name]

        if "__collocations" in dataset_data.groups():
            # This dataset contains already-collocated datasets,
            # therefore we do not select any data but copy all of them.
            # This keeps the indices valid, which point to the original
            # files and data:
            output_data = GroupedArrays.merge(
                [output_data, dataset_data]
            )

            # Add the collocation indices. We do not have to adjust them
            # since we do not change the original data.
            pairs.append(collocations[i])
            continue

        # These are the indices of the points in the original data that
        # have collocations. Remove the duplicates since we want to copy
        # the required data only once:
        original_indices = pd.unique(collocations[i])

        number_of_collocations.append(len(original_indices))

        # After selecting the collocated data, the original indices cannot
        # be applied any longer. We need new indices that indicate the
        # pairs in the collocated data.
        indices_in_collocated_data = {
            original_index: new_index
            for new_index, original_index in enumerate(original_indices)
        }
        collocation_indices = [
            indices_in_collocated_data[index]
            for index in collocations[i]
        ]

        # Save the collocation indices in the metadata group:
        pairs.append(collocation_indices)

        data = dataset_data[original_indices]
        data["__original_indices"] = Array(
            original_indices, dims=["time_id", ],
            attrs={
                "long_name": "Index in the original file",
            }
        )

        if "__original_files" not in data:
            # Set where the data came from:
            data.attrs["__original_files"] = \
                ";".join(file.path for file in files[datasets[i].name])
        output_data[datasets[i].name] = data

    metadata["pairs"] = pairs

    # Use only the times of the primary dataset as start and end time (makes it
    # easier to find corresponding files later):
    time_coverage = output_data[datasets[0].name].get_range(
        "time",
    )
    output_data.attrs["start_time"] = \
        time_coverage[0].strftime("%Y-%m-%dT%H:%M:%S.%f")
    output_data.attrs["end_time"] = \
        time_coverage[1].strftime("%Y-%m-%dT%H:%M:%S.%f")

    # Prepare the name for the output file:
    filename = output.generate_filename(time_coverage)

    # Write the data to the file.
    output.write(output_data, filename)

    return filename, number_of_collocations
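A runnable illustration of the index remapping above: duplicate original indices are removed before copying the data, and the collocation pairs are then expressed as positions in the copied (deduplicated) data. The index values are invented.

import numpy as np
import pandas as pd

collocations_i = np.array([7, 3, 7, 9])          # original indices, with duplicates

original_indices = pd.unique(collocations_i)     # [7 3 9] - copy each point only once
indices_in_collocated_data = {
    original_index: new_index
    for new_index, original_index in enumerate(original_indices)
}
collocation_indices = [indices_in_collocated_data[i] for i in collocations_i]

print(original_indices, collocation_indices)     # [7 3 9] [0, 1, 0, 2]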
Example #10
    def collapse_data(
            collocated_data, file_info, reference, include_stats, collapser):
        """TODO: Write documentation."""

        # Get the bin indices from the main dataset onto which all others
        # shall be collapsed:
        reference_bins = list(
            collocated_data[reference][COLLOCATION_FIELD].group().values()
        )

        collapsed_data = GroupedArrays()

        # Add additional statistics about one binned variable:
        if include_stats is not None:
            statistic_functions = {
                "variation": scipy.stats.variation,
                "mean": np.nanmean,
                "number": lambda x, _: x.shape[0],
                "std": np.nanstd,
            }

            # Create the bins for the variable from which you want to have
            # the statistics:
            group, _ = GroupedArrays.parse(include_stats)
            bins = collocated_data[group][COLLOCATION_FIELD].bin(
                reference_bins
            )
            collapsed_data["__statistics"] = \
                collocated_data[include_stats].apply_on_bins(
                    bins, statistic_functions
                )
            collapsed_data["__statistics"].attrs["description"] = \
                "Statistics about the collapsed bins of '{}'.".format(
                    include_stats
                )

        for dataset in collocated_data.groups():
            if dataset.startswith("__"):
                # Internal groups (e.g. the collocation metadata) are
                # copied as they are:
                collapsed_data[dataset] = collocated_data[dataset]
                continue

            collocations = collocated_data[dataset][COLLOCATION_FIELD]

            if (dataset == reference
                or collocated_data[dataset].attrs.get("COLLAPSED_TO", None)
                    == reference):
                # The collocation indices will become useless
                del collocated_data[dataset][COLLOCATION_FIELD]

                # This is the main dataset to which all others will be
                # collapsed. Therefore, we do not need to collapse
                # explicitly here.
                collapsed_data[dataset] = \
                    collocated_data[dataset][np.unique(collocations)]
            else:
                # We do not need the original and collocation indices from the
                # dataset that will be collapsed because they will soon become
                # useless. Moreover, they could have a different dimension
                # length than the other variables and lead to errors in the
                # selecting process.

                del collocated_data[dataset]["__original_indices"]
                del collocated_data[dataset][COLLOCATION_FIELD]

                bins = collocations.bin(reference_bins)

                # We ignore some warnings rather than fixing them
                # TODO: Maybe fix them?
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message="invalid value encountered in double_scalars")
                    collapsed_data[dataset] = \
                        collocated_data[dataset].collapse(
                            bins, collapser=collapser,
                        )

                collapsed_data[dataset].attrs["COLLAPSED_TO"] = reference

        # Set the collapsed flag:
        collapsed_data.attrs["COLLAPSED"] = 1

        # Return the collapsed data so the caller can overwrite the old file:
        return collapsed_data
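A minimal, self-contained version of the per-bin statistics applied above: sample indices are grouped by reference bin and the same statistic functions are applied bin by bin. The values and bins are invented.

import numpy as np
import scipy.stats

values = np.array([1.0, 2.0, 4.0, 8.0, 16.0])
reference_bins = [np.array([0, 1]), np.array([2, 3, 4])]   # sample indices per bin

statistic_functions = {
    "mean": np.nanmean,
    "std": np.nanstd,
    "variation": scipy.stats.variation,
}
collapsed = {
    name: np.array([func(values[bin_]) for bin_ in reference_bins])
    for name, func in statistic_functions.items()
}
print(collapsed["mean"])     # approximately [1.5  9.333]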