Example #1
    def generate_dataset(self,
                         ahead=1,
                         mode=None,
                         ensemble=False,
                         ens_slice=None,
                         remote=None):
        """
        Generates the dataset for training, test and validation

          0 = One site - wind
          1 = One site - all variables
          2 = All sites - wind
          3 = All sites - all variables
          4 = All sites - all variables stacked
          5 = Uses neighbor sites around a radius

        The names of the wind datafiles are taken from self.config['datanames'].

        :param ahead: number of steps ahead for prediction (an int, or a [first, last] pair)
        :param mode: type of dataset (pair indicating the dimensionality of input and output)
        :param ensemble: (not yet used)
        :param ens_slice: (not yet used)
        :param remote: if set, fetch the wind datafiles from the remote server before loading
        :return:
        """
        self.generated = True
        self.mode = mode

        datanames = self.config['datanames']
        datasize = self.config['datasize']
        testsize = self.config['testsize']

        lag = self.config['lag']
        vars = self.config['vars']
        wind = {}
        if 'angle' in self.config:
            angle = self.config['angle']
        else:
            angle = False

        #ahead = self.config['ahead'] if (type(self.config['ahead']) == list) else [1, self.config['ahead']]

        if type(ahead) == list:
            dahead = ahead[1]
            slice = (ahead[1] - ahead[0]) + 1
        else:
            dahead = ahead
            slice = ahead

        # Augment the dataset with the closest neighbors
        if self.config['dataset'] == 5 or self.config['dataset'] == 31:
            if 'radius' not in self.config:
                raise NameError(
                    "Radius missing for neighbours augmented dataset")
            else:
                radius = self.config['radius']
            if 'nneighbors' in self.config:
                datanames = get_closest_k_neighbors(datanames[0], radius,
                                                    self.config['nneighbors'])
            else:
                datanames = get_all_neighbors(datanames[0], radius)
        # Reads numpy arrays for all sites and keeps only selected columns
        for d in datanames:
            if remote:
                srv = pysftp.Connection(host=remote_data[0],
                                        username=remote_data[1])
                srv.get(remote_wind_data_path + f"/{d}.npy",
                        self.data_path + f"/{d}.npy")
                srv.close()
            if angle:
                wind[d] = np.load(self.data_path + '_angle' + f"/{d}.npy")
            else:
                wind[d] = np.load(self.data_path + f"/{d}.npy")
            if remote:
                os.remove(self.data_path + f"/{d}.npy")

            # If the vars attribute is a list, it must contain valid column indices (ints)
            if type(vars) == list:
                for v in vars:
                    if type(v) != int or v >= wind[d].shape[1]:
                        raise NameError('Error in variable selection')
                wind[d] = wind[d][:, vars]

        if (self.config['dataset'] == 0) or (self.config['dataset']
                                             == 'onesiteonevar'):
            if not ensemble:
                self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                    self._generate_dataset_one_var(wind[datanames[0]][:, 0].reshape(-1, 1), datasize, testsize,
                                                   lag=lag, ahead=dahead, slice=slice, mode=mode)
            else:
                self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                    self._generate_dataset_one_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], 0].reshape(-1, 1),
                                                   datasize, testsize,
                                                   lag=lag, ahead=dahead, slice=slice, mode=mode)

        elif (self.config['dataset'] == 1) or (self.config['dataset']
                                               == 'onesitemanyvar'):
            if not ensemble:
                self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                    self._generate_dataset_multiple_var(wind[datanames[0]], datasize, testsize,
                                                        lag=lag, ahead=dahead, slice=slice, mode=mode)
            else:
                self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                    self._generate_dataset_multiple_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], :], datasize,
                                                        testsize,
                                                        lag=lag, ahead=dahead, slice=slice, mode=mode)

        elif self.config['dataset'] == 2 or self.config[
                'dataset'] == 'manysiteonevar':
            stacked = np.vstack([wind[d][:, 0] for d in datanames]).T
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_multiple_var(stacked, datasize, testsize,
                                                    lag=lag, ahead=dahead, slice=slice, mode=mode)
        elif self.config['dataset'] == 3 or self.config[
                'dataset'] == 31 or self.config['dataset'] == 'manysitemanyvar':
            stacked = np.hstack([wind[d] for d in datanames])
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_multiple_var(stacked, datasize, testsize,
                                                    lag=lag, ahead=dahead, slice=slice, mode=mode)
        elif self.config['dataset'] == 4 or self.config['dataset'] == 5 or \
                self.config['dataset'] == 'manysitemanyvarstack':
            stacked = [
                self._generate_dataset_multiple_var(wind[d],
                                                    datasize,
                                                    testsize,
                                                    lag=lag,
                                                    ahead=dahead,
                                                    slice=slice,
                                                    mode=mode)
                for d in datanames
            ]

            self.train_x = np.vstack([x[0] for x in stacked])
            self.train_y = np.vstack([x[1] for x in stacked])

            self.val_x = stacked[0][2]
            self.val_y = stacked[0][3]
            self.test_x = stacked[0][4]
            self.test_y = stacked[0][5]
        else:
            raise NameError('ERROR: No such dataset type')
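
The listing above drives everything from self.config, so a caller only has to fill in the configuration dictionary and then read the generated splits from the object. The following is a minimal, hypothetical usage sketch: the Dataset class name, its constructor, and all of the concrete config values are assumptions made for illustration and do not come from the example itself.

# Hypothetical usage sketch (class name, constructor and config values are assumed)
config = {
    'datanames': ['site-95240'],   # hypothetical datafile name (without the .npy extension)
    'datasize': 43834,             # samples used for training
    'testsize': 17534,             # samples used for validation and test
    'lag': 12,                     # length of the input window
    'vars': [0, 1, 2],             # columns kept from each site's array
    'dataset': 1,                  # 1 = one site, all variables
}

ds = Dataset(config)                     # assumed constructor
ds.generate_dataset(ahead=[1, 12])       # predict steps 1..12 ahead
print(ds.train_x.shape, ds.train_y.shape)
print(ds.val_x.shape, ds.test_x.shape)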
Example #2
    def generate_dataset(self,
                         ahead=1,
                         mode=None,
                         ensemble=False,
                         ens_slice=None,
                         remote=None):
        """
        Generates the dataset for training, test and validation

          0 = One site - wind
          1 = One site - all variables
          2 = All sites - wind
          3 = All sites - all variables
          4 = All sites - all variables stacked
          5 = Uses neighbor sites around a radius
          6 = Uses random sites outside a radius

        The names of the wind datafiles are taken from self.config['datanames'].

        :param ahead: number of steps ahead for prediction (an int, or a [first, last] pair)
        :param mode: type of dataset (pair indicating the dimensionality of input and output)
        :param ensemble: (not yet used)
        :param ens_slice: (not yet used)
        :param remote: if set, fetch the wind datafiles from the remote server before loading
        :return:
        """
        self.generated = True
        self.mode = mode

        datanames = self.config['datanames']
        datasize = self.config['datasize']
        testsize = self.config['testsize']

        lag = self.config['lag']
        vars = self.config['vars']
        period = self.config['period'] if 'period' in self.config else None
        wind = {}
        if 'angle' in self.config:
            angle = self.config['angle']
        else:
            angle = False

        # ahead = self.config['ahead'] if (type(self.config['ahead']) == list) else [1, self.config['ahead']]

        if type(ahead) == list:
            dahead = ahead[1]
            slice = (ahead[1] - ahead[0]) + 1
        else:
            dahead = ahead
            slice = ahead

        # Augment the dataset with the closest neighbors
        if self.config['dataset'] == 5 or self.config['dataset'] == 31:
            if 'radius' not in self.config:
                raise NameError(
                    "Radius missing for neighbours augmented dataset")
            else:
                radius = self.config['radius']
            if 'nneighbors' in self.config:
                datanames = get_closest_k_neighbors(datanames[0], radius,
                                                    self.config['nneighbors'])
            else:
                datanames = get_all_neighbors(datanames[0], radius)

        # Augment the dataset with random non-neighbor sites (outside a given radius)
        if self.config['dataset'] == 6:
            if 'radius' not in self.config:
                raise NameError(
                    "Radius missing for neighbours augmented dataset")
            else:
                radius = self.config['radius']
            nonneigh = self.config['nonneighbors'] if 'nonneighbors' in self.config else 100
            nndnames = get_random_k_nonneighbors(datanames[0], radius,
                                                 nonneigh)
            # print(nndnames)
            datanames.extend(nndnames)

        # Reads numpy arrays for all sites and keeps only selected columns
        for d in datanames:
            if remote:
                srv = pysftp.Connection(host=remote_data[0],
                                        username=remote_data[1])
                srv.get(remote_wind_data_path + f"/{d}.npy",
                        self.data_path + f"/{d}.npy")
                srv.close()
            if angle:
                wind[d] = np.load(self.data_path + '_angle' + f"/{d}.npy")
            else:
                wind[d] = np.load(self.data_path + f"/{d}.npy")
            if remote:
                os.remove(self.data_path + f"/{d}.npy")

            # If the vars attribute is a list, it must contain valid column indices (ints)
            if type(vars) == list:
                for v in vars:
                    if type(v) != int or v >= wind[d].shape[1]:
                        raise NameError('Error in variable selection')
                wind[d] = wind[d][:, vars]
            # If the period flag is set, add sinusoidal variables with daily and yearly periods
            if period is not None:
                day = np.zeros((wind[d].shape[0], 1))
                freq = int(24 * 60 / period)
                for i in range(freq):
                    day[i::freq] = np.sin((2 * np.pi / freq) * i)
                # print(day.shape)
                year = np.zeros((wind[d].shape[0], 1))
                freq = int(365 * 24 * 60 / period)
                for i in range(freq):
                    year[i::freq] = np.sin((2 * np.pi / freq) * i)
                # print(year.shape)
                # print(wind[d].shape)
                wind[d] = np.concatenate((wind[d], day, year), axis=1)

        if (self.config['dataset'] == 0) or (self.config['dataset']
                                             == 'onesiteonevar'):
            if not ensemble:
                self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                    self._generate_dataset_one_var(wind[datanames[0]][:, 0].reshape(-1, 1), datasize, testsize,
                                                   lag=lag, ahead=dahead, slice=slice, mode=mode)
            else:
                self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                    self._generate_dataset_one_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], 0].reshape(-1, 1),
                                                   datasize, testsize,
                                                   lag=lag, ahead=dahead, slice=slice, mode=mode)

        elif (self.config['dataset'] == 1) or (self.config['dataset']
                                               == 'onesitemanyvar'):
            if not ensemble:
                self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                    self._generate_dataset_multiple_var(wind[datanames[0]], datasize, testsize,
                                                        lag=lag, ahead=dahead, slice=slice, mode=mode)
            else:
                self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                    self._generate_dataset_multiple_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], :], datasize,
                                                        testsize,
                                                        lag=lag, ahead=dahead, slice=slice, mode=mode)

        elif self.config['dataset'] == 2 or self.config[
                'dataset'] == 'manysiteonevar':
            stacked = np.vstack([wind[d][:, 0] for d in datanames]).T
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_multiple_var(stacked, datasize, testsize,
                                                    lag=lag, ahead=dahead, slice=slice, mode=mode)
        elif self.config['dataset'] == 3 or self.config[
                'dataset'] == 31 or self.config['dataset'] == 'manysitemanyvar':
            stacked = np.hstack([wind[d] for d in datanames])
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_multiple_var(stacked, datasize, testsize,
                                                    lag=lag, ahead=dahead, slice=slice, mode=mode)
        elif self.config['dataset'] == 4 or self.config['dataset'] == 5 or \
                self.config['dataset'] == 'manysitemanyvarstack':
            stacked = [
                self._generate_dataset_multiple_var(wind[d],
                                                    datasize,
                                                    testsize,
                                                    lag=lag,
                                                    ahead=dahead,
                                                    slice=slice,
                                                    mode=mode)
                for d in datanames
            ]

            self.train_x = np.vstack([x[0] for x in stacked])
            self.train_y = np.vstack([x[1] for x in stacked])

            self.val_x = stacked[0][2]
            self.val_y = stacked[0][3]
            self.test_x = stacked[0][4]
            self.test_y = stacked[0][5]
        # Training augmenting the dataset with random sites outside a radius
        elif self.config['dataset'] == 6:
            stacked = [
                self._generate_dataset_multiple_var(wind[d],
                                                    datasize,
                                                    testsize,
                                                    lag=lag,
                                                    ahead=dahead,
                                                    slice=slice,
                                                    mode=mode)
                for d in datanames
            ]
            # Training with all the sites
            self.train_x = np.vstack([x[0] for x in stacked])
            self.train_y = np.vstack([x[1] for x in stacked])

            # Testing and validating only with the experiment site
            self.val_x = stacked[0][2]
            self.val_y = stacked[0][3]
            self.test_x = stacked[0][4]
            self.test_y = stacked[0][5]
        else:
            raise NameError('ERROR: No such dataset type')
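
Example #2 additionally appends two sinusoidal time features per site when a 'period' entry (the sampling interval in minutes) is present in the configuration. The following self-contained sketch reproduces that construction on dummy data so the shape of the added columns is easy to inspect; the period value, the number of samples and the random input array are assumptions made only for illustration.

import numpy as np

period = 60                                   # assumed: one sample per hour
n_samples = 24 * 7                            # assumed: one week of hourly data
data = np.random.rand(n_samples, 3)           # stand-in for a site's wind array

# Daily sinusoid: one full cycle every 24*60/period samples
day = np.zeros((n_samples, 1))
freq = int(24 * 60 / period)
for i in range(freq):
    day[i::freq] = np.sin((2 * np.pi / freq) * i)

# Yearly sinusoid: one full cycle every 365*24*60/period samples
year = np.zeros((n_samples, 1))
freq = int(365 * 24 * 60 / period)
for i in range(freq):
    year[i::freq] = np.sin((2 * np.pi / freq) * i)

# The periodic columns are concatenated to the original variables, as in the listing above
augmented = np.concatenate((data, day, year), axis=1)
print(augmented.shape)                        # (168, 5)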