Example #1
def get_X_and_y(self, dataset):
    """Return X and y extracted from a dataset."""
    if self.type == 'ar':
        if self.order > 0:
            X, y = dataset_to_numpy_order(dataset,
                                          self.order,
                                          bias=self.bias)
        else:
            X, y = dataset_to_numpy(dataset, bias=self.bias)
    else:
        X, y = dataset_to_numpy_order_traditional_ar(dataset,
                                                     order=self.order,
                                                     bias=self.bias)
    return X, y
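As a point of reference, here is a minimal sketch of the lagged design matrix that a helper like `dataset_to_numpy_order` presumably produces for the autoregressive case. This reimplementation is an assumption for illustration only: it builds just the lagged-target columns and the optional bias column, whereas the project's helper also includes the meteorological predictors (q, t2m, r, sp).

import numpy as np

def lagged_design_matrix(values, order, bias=True):
    # Hypothetical sketch: each row of X holds the `order` previous
    # target values, and y is the current value. An optional column of
    # ones acts as the bias term.
    n = len(values) - order
    cols = [values[lag:lag + n] for lag in range(order)]
    if bias:
        cols.append(np.ones(n))
    X = np.column_stack(cols)
    y = values[order:]
    return X, y

# Example: an order-2 design matrix for a toy series.
X, y = lagged_design_matrix(np.arange(10.0), order=2)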
def train_ar_model(transform=False,
                   bias=False,
                   sig=False,
                   order=0,
                   overwrite_results=True):
    if transform and bias:
        print('Not a valid model configuration.')
        raise ValueError('Not a valid model config')

    import xarray as xr

    path = '/global/D1/homes/hannasv/ar_data3/'
    path_ar_results = '/home/hannasv/results_ar_stripped/'
    list_files = glob.glob('{}*.nc'.format(path))

    SPATIAL_RESOLUTION = 0.25

    latitudes = np.arange(30.0,
                          50.0 + SPATIAL_RESOLUTION,
                          step=SPATIAL_RESOLUTION)
    longitudes = np.arange(-15,
                           25 + SPATIAL_RESOLUTION,
                           step=SPATIAL_RESOLUTION)
    base = '/home/hannasv/results/stats/2014-01-01_2018-12-31/'

    if transform:
        # transform_X below needs per-pixel statistics; they are loaded
        # the same way as in Example #4.
        ds_tcc = xr.open_dataset(base + 'stats_pixel_tcc_all.nc')
        ds_r = xr.open_dataset(base + 'stats_pixel_r_all.nc')
        ds_q = xr.open_dataset(base + 'stats_pixel_q_all.nc')
        ds_t2m = xr.open_dataset(base + 'stats_pixel_t2m_all.nc')
        ds_sp = xr.open_dataset(base + 'stats_pixel_sp_all.nc')

        stats_data = {
            'q': ds_q,
            't2m': ds_t2m,
            'r': ds_r,
            'sp': ds_sp,
            'tcc': ds_tcc
        }

    explaination = ['q', 't2m', 'r', 'sp', 'bias', 'L1']
    tr_e = []
    tr_index = 4

    full_name = generate_model_name('AR', bias, transform, sig, order)
    config = get_config_from_model_name(full_name)

    full_name_tr = generate_model_name('TR', bias, transform, sig, order)
    tr_config = get_config_from_model_name(full_name_tr)
    path_ar_results = os.path.join(path_ar_results, full_name)

    if not os.path.isdir(path_ar_results):
        os.mkdir(path_ar_results)
        print('Created directory {}'.format(path_ar_results))
    print(full_name)
    list_to_regridd = get_list_of_files_to_regridd(model_name=full_name,
                                                   path=path_ar_results)
    list_files = list_to_regridd
    print('Detected {}/13041 files left to regrid.'.format(
        len(list_to_regridd)))
    corrupt_files = []
    for i, fil in enumerate(list_files):
        print('Working on file {}/{}'.format(i + 1, len(list_files)))
        print(fil)
        splits = fil.split('_')
        latitude = float(splits[-2])
        longitude = float(splits[-1][:-3])

        data = xr.open_dataset(fil, engine='netcdf4')
        explain = explaination.copy()
        tr_explain = tr_e.copy()

        # Lag is fixed to one in this example; Example #4 loops over
        # range(order + 1) instead.
        o = 1
        name = full_name + '-L{}'.format(o)
        tr_name = full_name_tr + '-L{}'.format(o)
        start_time = timeit()
        try:
            X_train, y_train = dataset_to_numpy_order(
                dataset=data.sel(time=slice('2004', '2013')),
                order=order,
                bias=bias)
            X_test, y_test = dataset_to_numpy_order(
                dataset=data.sel(time=slice('2014', '2018')),
                order=order,
                bias=bias)

            if transform:
                X_train = transform_X(X_train,
                                      lat=latitude,
                                      lon=longitude,
                                      data=stats_data,
                                      order=o)
                X_test = transform_X(X_test,
                                     lat=latitude,
                                     lon=longitude,
                                     data=stats_data,
                                     order=o)

            if sig:
                y_train = inverse_sigmoid(y_train)
                y_test = inverse_sigmoid(y_test)

            eval_dict = {}
            weights_dict = {}

            Xtr, ytr = drop_nans(X_train[:, :int(tr_index + o + bias)],
                                 y_train)
            Xte, yte = drop_nans(X_test[:, :int(tr_index + o + bias)], y_test)

            ############# Fitting
            try:
                coeffs = fit_pixel(Xtr, ytr)

                y_test_pred = predict_pixel(Xte, coeffs)
                y_train_pred = predict_pixel(Xtr, coeffs)

                ################ Evaluation
                mse_test = mean_squared_error(y_test_pred, yte)
                mse_train = mean_squared_error(y_train_pred, ytr)

                mae_test = mean_absolute_error(y_test_pred, yte)
                mae_train = mean_absolute_error(y_train_pred, ytr)

            except np.linalg.LinAlgError:
                # Singular design matrix: fall back to placeholder
                # weights and NaN scores.
                coeffs = np.ones(len(explain))
                mse_test = [np.nan]
                mse_train = [np.nan]
                mae_test = [np.nan]
                mae_train = [np.nan]

            ##################### Adding the autoregressive model
            weights_dict['coeffs'] = (['weights'], coeffs.flatten())

            eval_dict['mse_test'] = mse_test[0]
            eval_dict['mse_train'] = mse_train[0]
            eval_dict['mae_test'] = mae_test[0]
            eval_dict['mae_train'] = mae_train[0]

            eval_dict['num_test_samples'] = len(yte)
            eval_dict['num_train_samples'] = len(ytr)

            eval_dict.update(config)
            weights_dict.update(config)

            stop_time = timeit()
            eval_dict['time_elapsed_seconds'] = stop_time - start_time

            w_filename = '{}/weights_{}_{}_{}.nc'.format(
                path_ar_results, name, longitude, latitude)
            p_filename = '{}/performance_{}_{}_{}.nc'.format(
                path_ar_results, name, longitude, latitude)
            print(w_filename)
            ds = xr.Dataset(weights_dict,
                            coords={
                                'latitude': (['latitude'], [latitude]),
                                'longitude': (['longitude'], [longitude]),
                                'weights': (['weights'], explain)
                            })
            ds.to_netcdf(w_filename, engine='h5netcdf')

            ds = xr.Dataset(eval_dict,
                            coords={
                                'latitude': (['latitude'], [latitude]),
                                'longitude': (['longitude'], [longitude])
                            })
            ds.to_netcdf(p_filename, engine='h5netcdf')
            print(
                'finished calibrating bias {}, sigmoid {}, Transform {}, order/Lag {} - ({}, {})'
                .format(bias, sig, transform, o, longitude, latitude))
        except RuntimeError as e:
            print('RuntimeError while reading {}: {}'.format(fil, e))
            corrupt_files.append(fil)
        except ValueError as e:
            print('ValueError (time dimension) for {}: {}'.format(fil, e))
            corrupt_files.append(fil)

    # Report files that could not be processed once the loop is done.
    print('Corrupt files:')
    print(corrupt_files)
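A minimal sketch of how one might sweep this trainer over its flag combinations (purely illustrative; it assumes `train_ar_model` is in scope and that lag order 1 is wanted). The `transform`-plus-`bias` combination is skipped because the function rejects it.

from itertools import product

for transform, bias, sig in product([False, True], repeat=3):
    if transform and bias:
        continue  # rejected by train_ar_model
    train_ar_model(transform=transform, bias=bias, sig=sig, order=1)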
Example #4
def train_ar_model(transform=False,
                   bias=False,
                   sig=False,
                   order=0,
                   overwrite_results=True):
    if transform and bias:
        print('Not a valid model configuration.')
        raise ValueError('Not a valid model config')

    lagr_path = '/uio/lagringshotell/geofag/students/metos/hannasv/'
    path_transform = '{}results/stats/2014-01-01_2018-12-31'.format(lagr_path)

    path = '{}ar_data/'.format(lagr_path)
    path_ar_results = '{}/results/ar/'.format(lagr_path)

    SPATIAL_RESOLUTION = 0.25

    latitudes = np.arange(30.0,
                          50.0 + SPATIAL_RESOLUTION,
                          step=SPATIAL_RESOLUTION)
    longitudes = np.arange(-15,
                           25 + SPATIAL_RESOLUTION,
                           step=SPATIAL_RESOLUTION)
    base = '{}/results/stats/2014-01-01_2018-12-31/'.format(lagr_path)

    if transform:
        ds_tcc = xr.open_dataset(base + 'stats_pixel_tcc_all.nc')
        ds_r = xr.open_dataset(base + 'stats_pixel_r_all.nc')
        ds_q = xr.open_dataset(base + 'stats_pixel_q_all.nc')
        ds_t2m = xr.open_dataset(base + 'stats_pixel_t2m_all.nc')
        ds_sp = xr.open_dataset(base + 'stats_pixel_sp_all.nc')

        stats_data = {
            'q': ds_q,
            't2m': ds_t2m,
            'r': ds_r,
            'sp': ds_sp,
            'tcc': ds_tcc
        }

    explaination = ['q', 't2m', 'r', 'sp']
    tr_e = []
    tr_index = 4

    if bias:
        explaination.append('bias')
        tr_e.append('bias')

    full_name = generate_model_name('AR', bias, transform, sig, order)
    config = get_config_from_model_name(full_name)

    full_name_tr = generate_model_name('TR', bias, transform, sig, order)
    tr_config = get_config_from_model_name(full_name_tr)

    for latitude in latitudes:
        for longitude in longitudes:

            explain = explaination.copy()
            tr_explain = tr_e.copy()

            for o in range(0, order + 1):
                name = full_name + '-L{}'.format(o)
                tr_name = full_name_tr + '-L{}'.format(o)

                w_filename = '{}weights_{}_{}_{}.nc'.format(
                    path_ar_results, name, longitude, latitude)
                p_filename = '{}performance_{}_{}_{}.nc'.format(
                    path_ar_results, name, longitude, latitude)

                if not (os.path.exists(w_filename)
                        and os.path.exists(p_filename)) or overwrite_results:
                    fil = 'all_vars_lat_lon_{}_{}.nc'.format(
                        latitude, longitude)
                    data = xr.open_dataset(path + fil)
                    if o > 0:
                        explain.append('O{}'.format(o))
                        tr_explain.append('O{}'.format(o))
                    start_time = timeit()

                    X_train, y_train = dataset_to_numpy_order(
                        dataset=data.sel(time=slice('2004', '2013')),
                        order=order,
                        bias=bias)
                    X_test, y_test = dataset_to_numpy_order(
                        dataset=data.sel(time=slice('2014', '2018')),
                        order=order,
                        bias=bias)
                    if transform:
                        X_train = transform_X(X_train,
                                              lat=latitude,
                                              lon=longitude,
                                              data=stats_data,
                                              order=o)
                        X_test = transform_X(X_test,
                                             lat=latitude,
                                             lon=longitude,
                                             data=stats_data,
                                             order=o)

                    if sig:
                        y_train = inverse_sigmoid(y_train)
                        y_test = inverse_sigmoid(y_test)

                    eval_dict = {}
                    eval_tr_dict = {}
                    weights_dict = {}
                    weights_tr_dict = {}

                    Xtr, ytr = drop_nans(X_train[:, :int(tr_index + o)],
                                         y_train)
                    Xte, yte = drop_nans(X_test[:, :int(tr_index + o)], y_test)

                    if sig:
                        yte = sigmoid(yte)
                        ytr = sigmoid(ytr)

                    if np.isnan(yte).any():
                        print('Warning: NaNs detected in test data')

                    if np.isnan(ytr).any():
                        print('Warning: NaNs detected in training data')

                    if o > 0:
                        # updating predictors for the traditional (TR) model
                        Tr_Xtr = Xtr[:, tr_index:]
                        Tr_Xte = Xte[:, tr_index:]
                        coeffs_tr = fit_pixel(Tr_Xtr, ytr)

                        y_test_pred_tr = predict_pixel(Tr_Xte, coeffs_tr)
                        y_train_pred_tr = predict_pixel(Tr_Xtr, coeffs_tr)

                        mse_test_tr = mean_squared_error(y_test_pred_tr, yte)
                        mse_train_tr = mean_squared_error(y_train_pred_tr, ytr)

                        mae_test_tr = mean_absolute_error(y_test_pred_tr, yte)
                        mae_train_tr = mean_absolute_error(
                            y_train_pred_tr, ytr)

                    ############# Fitting
                    coeffs = fit_pixel(Xtr, ytr)

                    y_test_pred = predict_pixel(Xte, coeffs)
                    y_train_pred = predict_pixel(Xtr, coeffs)

                    ################ Evaluation
                    mse_test = mean_squared_error(y_test_pred, yte)
                    mse_train = mean_squared_error(y_train_pred, ytr)

                    mae_test = mean_absolute_error(y_test_pred, yte)
                    mae_train = mean_absolute_error(y_train_pred, ytr)

                    ##################### Adding the autoregressive model
                    weights_dict['coeffs'] = (['weights'], coeffs.flatten())

                    eval_dict['mse_test'] = mse_test[0]
                    eval_dict['mse_train'] = mse_train[0]
                    eval_dict['mae_test'] = mae_test[0]
                    eval_dict['mae_train'] = mae_train[0]

                    eval_dict['num_test_samples'] = len(yte)
                    eval_dict['num_train_samples'] = len(ytr)

                    eval_dict.update(config)
                    weights_dict.update(config)

                    ###################### Adding traditional model
                    if o > 0:
                        weights_tr_dict['coeffs'] = (['weights'],
                                                     coeffs_tr.flatten())

                        eval_tr_dict['mse_test'] = mse_test_tr[0]
                        eval_tr_dict['mse_train'] = mse_train_tr[0]
                        eval_tr_dict['mae_test'] = mae_test_tr[0]
                        eval_tr_dict['mae_train'] = mae_train_tr[0]

                        eval_tr_dict['num_test_samples'] = len(yte)
                        eval_tr_dict['num_train_samples'] = len(ytr)

                        eval_tr_dict.update(tr_config)
                        weights_tr_dict.update(tr_config)

                        w_tr_filename = '{}weights_{}_{}_{}.nc'.format(
                            path_ar_results, tr_name, longitude, latitude)
                        p_tr_filename = '{}performance_{}_{}_{}.nc'.format(
                            path_ar_results, tr_name, longitude, latitude)

                        ds = xr.Dataset(weights_tr_dict,
                                        coords={
                                            'latitude': (['latitude'], [latitude]),
                                            'longitude': (['longitude'], [longitude]),
                                            'weights': (['weights'], tr_explain)
                                        })
                        ds.to_netcdf(w_tr_filename)

                        ds = xr.Dataset(eval_tr_dict,
                                        coords={
                                            'latitude': (['latitude'], [latitude]),
                                            'longitude': (['longitude'], [longitude])
                                        })
                        ds.to_netcdf(p_tr_filename)

                    stop_time = timeit()
                    eval_dict['time_elapsed_seconds'] = stop_time - start_time

                    w_filename = '{}weights_{}_{}_{}.nc'.format(
                        path_ar_results, name, longitude, latitude)
                    p_filename = '{}performance_{}_{}_{}.nc'.format(
                        path_ar_results, name, longitude, latitude)
                    ds = xr.Dataset(weights_dict,
                                    coords={
                                        'latitude': (['latitude'], [latitude]),
                                        'longitude': (['longitude'], [longitude]),
                                        'weights': (['weights'], explain)
                                    })
                    ds.to_netcdf(w_filename)

                    ds = xr.Dataset(eval_dict,
                                    coords={
                                        'latitude': (['latitude'], [latitude]),
                                        'longitude': (['longitude'], [longitude])
                                    })
                    ds.to_netcdf(p_filename)
                    print(
                        'finished calibrating bias {}, sigmoid {}, Transform {}, order/Lag {} - ({}, {})'
                        .format(bias, sig, transform, o, longitude, latitude))
                else:
                    print(
                        'Model config already calibrated bias {}, sigmoid {}, Transform {}, order/Lag {} - ({}, {})'
                        .format(bias, sig, transform, o, longitude, latitude))
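For orientation, plausible minimal implementations of the imported helpers `fit_pixel`, `predict_pixel`, `sigmoid`, and `inverse_sigmoid` are sketched below. These are assumptions, not the project's code; they are merely consistent with how the examples use them (a least-squares solve that can raise `np.linalg.LinAlgError`, and a logit round-trip on the targets).

import numpy as np

def fit_pixel(X, y):
    # Hypothetical sketch: ordinary least squares via the normal
    # equations. np.linalg.solve raises LinAlgError for a singular Gram
    # matrix, matching the except clause in Example #1.
    return np.linalg.solve(X.T @ X, X.T @ y)

def predict_pixel(X, coeffs):
    # Linear prediction with the fitted coefficients.
    return X @ coeffs

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def inverse_sigmoid(p):
    # Logit transform; p equal to 0 or 1 maps to -inf/+inf, which is
    # presumably why the targets are cleaned before fitting.
    return np.log(p / (1.0 - p))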