# Standard-library and third-party imports inferred from usage below
# ('scale' is assumed to be sklearn.preprocessing.scale). The remaining
# helpers (get_training_params, get_model_type, get_zip_info, preprocess,
# generate_visualizations, generate_file_output,
# generate_weights_visualization) come from this project's own modules.
import ast
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import scale


def test_universality(path_to_config_file, fluxnet_site_type, num_iter):
    config = get_training_params(path_to_config_file)
    # '_train'-suffixed names identify the label columns used for training
    train_labels = [l + '_train' for l in config['labels']]

    base_model = get_model_type(config['model_type'])

    zip_file_info_for_climate_sites = get_zip_info(fluxnet_site_type)
    print(zip_file_info_for_climate_sites)

    # preprocess data for each site and store it so we don't redo it s^2 times
    processed_site_data = {}
    common_variable_set = set(config['target_variables'])
    for zf in zip_file_info_for_climate_sites:
        site_data, site_variables = preprocess(*zf,
                                               config['granularity'],
                                               config['target_variables'],
                                               config['backup_variables'],
                                               config['labels'], [],
                                               offset=config['offset'])
        print(site_variables)

        # scale every feature except the trailing time-index column
        site_data[site_variables[:-1]] = scale(site_data[site_variables[:-1]])
        processed = site_data[site_variables + train_labels +
                              config['labels']].astype('float64')
        processed_site_data[zf[1]] = processed

        common_variable_set = common_variable_set.intersection(
            set(site_variables))

    site_test_data = {}
    common_variables = list(common_variable_set) + [
        'avg_fAPAR_interpol', 'time_index'
    ]
    print("Common variables: " + str(common_variables))
    cumulative_train_data = np.zeros((1, len(common_variables)))
    cumulative_val_data = np.zeros((1, len(common_variables)))
    cumulative_train_labels = np.zeros((1, 1))
    cumulative_val_labels = np.zeros((1, 1))
    for site_name in processed_site_data:
        d = processed_site_data[site_name]
        site_train, site_test, site_y_train, site_y_test = train_test_split(
            d[common_variables].to_numpy(),
            d[train_labels].to_numpy(),
            test_size=config['test_size'],
            shuffle=False)
        # set aside test data for later
        site_test_data[site_name] = (site_test, site_y_test)

        # create train-val split
        site_train, site_val, site_y_train, site_y_val = train_test_split(
            site_train,
            site_y_train,
            test_size=config['val_size'],
            shuffle=False)

        cumulative_train_data = np.concatenate(
            (cumulative_train_data, site_train))
        cumulative_train_labels = np.concatenate(
            (cumulative_train_labels, site_y_train))
        cumulative_val_data = np.concatenate((cumulative_val_data, site_val))
        cumulative_val_labels = np.concatenate(
            (cumulative_val_labels, site_y_val))

    cumulative_train_data = cumulative_train_data[1:]
    cumulative_train_labels = cumulative_train_labels[1:]
    cumulative_val_data = cumulative_val_data[1:]
    cumulative_val_labels = cumulative_val_labels[1:]

    all_training_data = np.concatenate(
        (cumulative_train_data, cumulative_val_data))
    all_training_labels = np.concatenate(
        (cumulative_train_labels, cumulative_val_labels))
    train = list(range(len(cumulative_train_data)))
    test = list(range(len(cumulative_train_data), len(all_training_data)))
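    # `train` and `test` hold row indices for one explicit train/validation
    # fold; GridSearchCV accepts an iterable of (train_indices, test_indices)
    # pairs as `cv`, which avoids shuffled k-fold splits on time-ordered data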

    num_key_variables = len(common_variables) - 1  # exclude the time index

    clf = GridSearchCV(base_model(num_key_variables, 1),
                       config['hyperparameter_grid'],
                       cv=[(train, test)],
                       refit=False,
                       n_jobs=-1)
    clf.fit(all_training_data, all_training_labels)

    best_params = clf.best_params_

    path = config['out'] + '/' + fluxnet_site_type
    if not os.path.exists(path):
        os.makedirs(path)

    with open(path + '/universality_best_params.txt', 'w') as f:
        f.write("Common variable set: \n")
        f.write(str(common_variables) + '\n')
        f.write("Best parameters: \n")
        f.write(str(best_params) + '\n')

    site_name = []
    r2_score = []

    for i in range(num_iter):
        print("Iteration " + str(i))
        model = base_model(num_key_variables, 1)
        model.set_params(**best_params)
        # LSTM-style fit: the time-index column stays in the feature matrix
        model.fit(all_training_data, all_training_labels)
        model.set_params(scoring='r2')
        for sn in site_test_data:
            site_name.append(sn)
            reference_data, reference_labels = site_test_data[sn]
            r2_score.append(model.score(reference_data, reference_labels))

    df = pd.DataFrame({'site': site_name, 'r2': r2_score})
    df.to_csv(path + '/universality_test.txt', index=False)


def test_performance(site_zip_info, path_to_config_file, num_iter):
    config = get_training_params(path_to_config_file)
    # '_train'-suffixed names identify the label columns used for training
    train_labels = [l + '_train' for l in config['labels']]

    data, variables = preprocess(*site_zip_info,
                                 config['granularity'],
                                 config['target_variables'],
                                 config['backup_variables'],
                                 config['labels'], [],
                                 offset=config['offset'])
    print("Training Variables for " + site_zip_info[1] + ":")
    print(variables)
    # don't count the time index; it is only used by the custom sampler
    num_key_variables = len(variables) - 1

    data[variables[:-1]] = scale(data[variables[:-1]])
    processed = data[variables + train_labels +
                     config['labels']].astype('float64')

    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        processed[variables].to_numpy(),
        processed[train_labels].to_numpy(),
        test_size=config['test_size'],
        shuffle=False
    )  # can't shuffle because time-series sequence order matters
    # parse the best-parameter dict (a printed dict literal) from the site's
    # hyperparameter-search output file
    best_params = None
    with open(config['out'] + '/' + site_zip_info[1] +
              '_out.txt') as hyperparam_out:
        for line in hyperparam_out:
            if '{' in line:
                best_params = ast.literal_eval(line)
                break

    base_model = get_model_type(config['model_type'])

    model = base_model(num_key_variables, 1)
    model.set_params(**best_params)
    predictions = []
    scores = []
    for i in range(num_iter):
        print("Iteration " + str(i))
        if config['model_type'] == 'lstm':
            model.fit(X_train, y_train)
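            # the first (sequence_length - 1) timesteps get NaN because the
            # LSTM needs a full input window before it can predict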
            test_pred = ([np.nan] *
                         (model.sequence_length - 1)) + model.predict(X_test)
        elif config['model_type'] == 'ann' or config['model_type'] == 'cgan':
            model.fit(X_train[:, :-1], y_train)
            test_pred = model.predict(X_test[:, :-1])
        predictions.append(test_pred)
        prev_scoring = model.scoring
        model.set_params(scoring='r2')
        if config['model_type'] == 'lstm':
            scores.append(model.score(X_test, y_test))
        elif config['model_type'] == 'ann' or config['model_type'] == 'cgan':
            scores.append(model.score(X_test[:, :-1], y_test))
        model.set_params(scoring=prev_scoring)

    # one row per iteration; columns are labeled by X_test's time-index column
    df = pd.DataFrame(data=np.array(predictions), columns=X_test[:, -1])
    df['r2'] = scores
    path = config['out'] + '/predictions'
    if not os.path.exists(path):
        os.makedirs(path)
    df.to_csv(path + '/' + site_zip_info[1] + '.txt', index=False)


def test_generalizability(site_zip_info, path_to_config_file,
                          fluxnet_site_type, num_iter):
    config = get_training_params(path_to_config_file)

    # '_train'-suffixed names identify the label columns used for training
    train_labels = [l + '_train' for l in config['labels']]
    base_model = get_model_type(config['model_type'])

    zip_file_info_for_climate_sites = get_zip_info(fluxnet_site_type)
    print(zip_file_info_for_climate_sites)

    # preprocess data for each site and store it so we don't redo it s^2 times
    processed_site_data = {}
    X_train, X_test, y_train, y_test = None, None, None, None
    found = False
    for zf in zip_file_info_for_climate_sites:
        data, variables = preprocess(*zf,
                                     config['granularity'],
                                     config['target_variables'],
                                     config['backup_variables'],
                                     config['labels'], [],
                                     offset=config['offset'])

        data[variables[:-1]] = scale(data[variables[:-1]])
        processed = data[variables + train_labels +
                         config['labels']].astype('float64')
        processed_site_data[zf[1]] = (processed, variables)

        if zf[1] == site_zip_info[1]:
            found = True
            reference_data, reference_variables = data, variables
            print("Training Variables for evaluating " + site_zip_info[1] +
                  ":")
            print(reference_variables)
            num_key_variables = len(reference_variables) - 1

            # split the dataset
            X_train, X_test, y_train, y_test = train_test_split(
                reference_data[reference_variables].to_numpy(),
                reference_data[train_labels].to_numpy(),
                test_size=config['test_size'],
                shuffle=False
            )  # can't shuffle because time-series sequence order matters

    if not found:
        raise RuntimeError("Reference site not found within data directory")

    best_params = None
    with open(config['out'] + '/' + site_zip_info[1] + '_out.txt',
              'r') as hyperparam_out:
        for line in hyperparam_out:
            if '{' in line:
                best_params = ast.literal_eval(line)
                break

    # train on the reference site, then score on every other site
    site_name = []
    trained_on = []
    r2_score = []
    for i in range(num_iter):
        print(site_zip_info[1] + " iteration: " + str(i))
        model = base_model(num_key_variables, 1)
        if best_params is not None:
            model.set_params(**best_params)
        model.fit(X_train, y_train)
        model.set_params(scoring='r2')

        for other_site_name in processed_site_data:
            if other_site_name != site_zip_info[1]:
                other_site_data, other_site_variables = processed_site_data[
                    other_site_name]

                # zero-fill reference-site variables missing at this site so
                # the feature columns match the trained model's inputs
                for v in reference_variables:
                    if v not in other_site_variables:
                        other_site_data[v] = np.zeros(
                            len(other_site_data.index))
                print("Trained on: " + site_zip_info[1] + ", Scoring on: " +
                      other_site_name)
                site_name.append(other_site_name)
                trained_on.append(site_zip_info[1])
                r2_score.append(
                    model.score(
                        other_site_data[reference_variables].to_numpy(),
                        other_site_data[train_labels].to_numpy()))

    df = pd.DataFrame({
        'site': site_name,
        'trained_on': trained_on,
        'r2': r2_score
    })
    path = config['out'] + '/' + fluxnet_site_type
    if not os.path.exists(path):
        os.makedirs(path)
    df.to_csv(path + '/' + site_zip_info[1] + '_generalizability_test.txt',
              index=False)


def train_on_site(site_zip_info, path_to_config_file):
    """Train a model with the given configurations using data from the specified site.
    General pipeline:
        - preprocess data
        - hyperparameter search
        - scoring
        - graphing

    Keyword arguments:
    site_zip_info -- zipfile information about specified site (tup)
    path_to_config_file -- relative path to config file (str)
    """
    config = get_training_params(path_to_config_file)
    # '_train'-suffixed names identify the label columns used for training
    train_labels = [l + '_train' for l in config['labels']]

    file_output = []
    data, variables = preprocess(*site_zip_info,
                                 config['granularity'],
                                 config['target_variables'],
                                 config['backup_variables'],
                                 config['labels'],
                                 file_output,
                                 offset=config['offset'])
    print("Training Variables for " + site_zip_info[1] + ":")
    print(variables)
    # don't count the time index; it is only used by the custom sampler
    num_key_variables = len(variables) - 1

    data[variables[:-1]] = scale(data[variables[:-1]])
    processed = data[variables + train_labels +
                     config['labels']].astype('float64')

    base_model = get_model_type(config['model_type'])

    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        processed[variables].to_numpy(),
        processed[train_labels].to_numpy(),
        test_size=config['test_size'],
        shuffle=False
    )  # don't shuffle time-series data; sequence order matters

    clf = GridSearchCV(base_model(num_key_variables, 1),
                       config['hyperparameter_grid'],
                       cv=config['k'])
    clf.fit(X_train, y_train)

    # test best model
    best_model = clf.best_estimator_
    file_output.append("Number of epochs trained for on this site:")
    file_output.append(str(best_model.trained_for + 1))
    file_output.append("Best parameters set found on for this site:")
    file_output.append(str(clf.best_params_))
    file_output.append("Model score on test set with best parameters (" +
                       clf.best_estimator_.scoring + "):")
    file_output.append(str(best_model.score(X_test, y_test)))
    best_model.set_params(scoring='r2')
    file_output.append("R2 score on train set with best parameters:")
    file_output.append(str(best_model.score(X_train, y_train)))
    file_output.append("R2 score on test set with best parameters:")
    file_output.append(str(best_model.score(X_test, y_test)))

    # visualize results (y_train/y_test are reused to hold the predictions)
    y_train = best_model.predict(X_train)
    y_test = best_model.predict(X_test)
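    # realign LSTM predictions with the original timestamps: NaN-pad the
    # warm-up window (plus any label offset) and trim `offset` steps off the end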
    if config['model_type'] == 'lstm':
        if config['offset'] > 0:
            y_train = ([np.nan] *
                       (best_model.sequence_length + config['offset'] -
                        1)) + y_train[:-config['offset']]
            y_test = ([np.nan] *
                      (best_model.sequence_length + config['offset'] -
                       1)) + y_test[:-config['offset']]
        else:
            y_train = ([np.nan] * (best_model.sequence_length - 1)) + y_train
            y_test = ([np.nan] * (best_model.sequence_length - 1)) + y_test

    generate_visualizations(processed['time_index'].to_numpy().squeeze(),
                            processed[config['labels']].to_numpy().squeeze(),
                            y_test, y_train, config['granularity'],
                            data['TIMESTAMP'].iloc[0], config['labels'],
                            site_zip_info[1], config['viz'])
    generate_file_output(file_output, site_zip_info[1], config['out'])

    if config['model_type'] == 'lstm':
        generate_weights_visualization(best_model, variables[:-1],
                                       site_zip_info[1], config['viz'])


def quantify_weight_variability(site_zip_info, path_to_config_file, num_iter):
    config = get_training_params(path_to_config_file)
    # '_train'-suffixed names identify the label columns used for training
    train_labels = [l + '_train' for l in config['labels']]

    site = []
    target_variable = []
    weight_type = []
    weight_sum_across_nodes = []
    data, variables = preprocess(*site_zip_info,
                                 config['granularity'],
                                 config['target_variables'],
                                 config['backup_variables'],
                                 config['labels'], [],
                                 offset=config['offset'])
    print(variables)
    num_key_variables = len(variables) - 1  # don't count the time index

    data[variables[:-1]] = scale(data[variables[:-1]])
    processed = data[variables + train_labels +
                     config['labels']].astype('float64')

    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        processed[variables].to_numpy(),
        processed[train_labels].to_numpy(),
        test_size=config['test_size'],
        shuffle=False
    )  # can't shuffle because time-series sequence order matters

    best_params = None
    with open(config['out'] + '/' + site_zip_info[1] +
              '_out.txt') as hyperparam_out:
        for line in hyperparam_out:
            if '{' in line:
                best_params = ast.literal_eval(line)
                break

    base_model = get_model_type(config['model_type'])

    if config['model_type'] == 'lstm':
        lstm = base_model(num_key_variables, 1)
        lstm.set_params(**best_params)
        # lstm.set_params(epochs=1)

        for i in range(num_iter):
            lstm.fit(X_train, y_train)

            param_list = list(lstm.model.parameters())
            dim = lstm.hidden_dim
            input_weights = param_list[0].data.cpu()
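            # assuming lstm.model wraps torch.nn.LSTM, param_list[0] is
            # weight_ih_l0, which stacks the four gates along dim 0 in the
            # order input, forget, cell (g), output, hidden_dim rows each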
            ii_weights = torch.sum(input_weights[:dim], dim=0)
            if_weights = torch.sum(input_weights[dim:dim * 2], dim=0)
            ig_weights = torch.sum(input_weights[dim * 2:dim * 3], dim=0)
            io_weights = torch.sum(input_weights[dim * 3:dim * 4], dim=0)

            # record each input-gate weight sum, tagged by gate and variable
            gate_weight_sums = {
                "input-input": ii_weights,
                "input-forget": if_weights,
                "input-cell state": ig_weights,
                "input-output": io_weights,
            }
            for gate_name, gate_weights in gate_weight_sums.items():
                for j in range(len(gate_weights)):
                    site.append(site_zip_info[1])
                    target_variable.append(variables[j])
                    weight_type.append(gate_name)
                    weight_sum_across_nodes.append(gate_weights[j].item())
    else:
        raise ValueError(
            "weight variability testing not supported for specified model type")

    df = pd.DataFrame({
        "site": site,
        "target_variable": target_variable,
        "weight_type": weight_type,
        "variability": weight_sum_across_nodes
    })
    path = config['out'] + '/weights'
    if not os.path.exists(path):
        os.makedirs(path)
    df.to_csv(path + '/' + site_zip_info[1] + '_weight_variance.txt',
              index=False)
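

# A minimal usage sketch (hypothetical: the config path and site-type code
# below are illustrative placeholders, not from the original project).
# get_zip_info(site_type) yields the per-site zip-info tuples these functions
# take as site_zip_info; train_on_site runs first because the other
# experiments parse best hyperparameters from its per-site _out.txt output.
if __name__ == '__main__':
    example_config = 'configs/experiment_config.yml'  # hypothetical path
    example_site_type = 'GRA'  # hypothetical FLUXNET site-type code

    for zf in get_zip_info(example_site_type):
        train_on_site(zf, example_config)
        test_performance(zf, example_config, num_iter=5)
        test_generalizability(zf, example_config, example_site_type,
                              num_iter=5)
        quantify_weight_variability(zf, example_config, num_iter=5)

    test_universality(example_config, example_site_type, num_iter=5)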