Esempio n. 1
0
def main():
    """Cross-validate the multi-head MLP forecaster on the ``leavon*`` sonde files.

    For every model in ``models`` and target in ``targets`` the function walks
    over the ``leavon*.csv`` files in ``Sondes_data/train_Summer/``. For each
    file and each input window length it builds the windowed inputs, splits
    them with ``func.custom_cv_2folds`` and trains one network per
    (neurons, epochs) pair on every fold.

    Files written below ``Results/bookThree/output_{Cat|Reg}_<model>/``:

    * one ``results_<target>_<file>_<timestamp>.csv`` per input file, holding
      a row of column names followed by one metrics row per trained network;
    * one ``predictions_...`` CSV (actual vs. predicted values) and one
      ``.jpg`` scatter plot per trained network.

    The function takes no arguments and returns ``None``.
    """
    method = 'OrgData'

    # Category targets ('DOcategory', 'pHcategory') would select the
    # classification branch below. ysi_blue_green_algae is left out: it has
    # negative values for leavon whose meaning is unclear.
    targets = ['dissolved_oxygen', 'ph']

    models = ['multihead_MLP']
    path = 'Sondes_data/train_Summer/'
    files = [f for f in os.listdir(path)
             if f.endswith(".csv") and f.startswith('leavon')]

    # Series read alongside each leavon file; each one contributes one extra
    # input array to the model.
    companion_files = ['bgsusd_all.csv', 'osugi.csv',
                       'utlcp.csv', 'leoc_1.csv']

    # The plotting and CSV code assumes exactly nine forecast horizons.
    horizons = ['1', '3', '6', '12', '24', '36', '48', '60', '72']
    plot_titles = ['t+' + h for h in horizons]
    actual_columns = ['a+' + h for h in horizons]
    predicted_columns = ['p+' + h for h in horizons]

    for model_name in models:
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                # NOTE(review): this column order differs from the order of
                # the classification rows appended further down - confirm
                # before enabling category targets.
                header_names = ['CV', 'target_names', 'method_names',
                                'temporalhorizons', 'window_nuggets',
                                'file_names', 'F1_0', 'F1_1', 'P_0', 'P_1',
                                'R_0', 'R_1', 'acc0_1', 'F1_0_1', 'F1_all',
                                'fbeta', 'configs', 'scores']
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                header_names = ['target_names', 'method_names',
                                'window_nuggets', 'temporalhorizons', 'CV',
                                'file_names', 'mape', 'me', 'mae', 'mse',
                                'rmse', 'R2', 'configs']
            # Kept in its own variable: the result rows built inside the
            # loops must never overwrite it, otherwise every file after the
            # first would get a stale result row instead of column names.
            header = {name: name for name in header_names}
            os.makedirs(directory, exist_ok=True)

            for file in files:

                result_filename = 'results_'+target + \
                    '_'+file + '_'+str(time.time())+'.csv'
                pd.DataFrame(data=header, index=[0]).to_csv(
                    directory+result_filename, index=False)
                PrH_index = 0

                # The raw files do not depend on the window length, so read
                # them once per input file. Only column selections (copies)
                # are handed to the helpers below.
                raw = pd.read_csv(path+file)
                companions = [pd.read_csv(path+name)
                              for name in companion_files]

                for n_steps_in in [1, 3, 6, 12, 24, 36]:
                    print(n_steps_in)

                    dataset = raw[['year', 'month', 'day', 'hour', target]]
                    print(dataset.head())

                    dataset = temporal_horizon(dataset, PrH_index, target)
                    train_X_grid, train_y_grid = split_sequences(
                        dataset, n_steps_in)

                    companion_series = [
                        temporal_horizon(frame[[target]], PrH_index, target)
                        for frame in companions]
                    companion_windows = [
                        split_sequences(series, n_steps_in)
                        for series in companion_series]

                    input_dim = train_X_grid.shape
                    n_features = 1

                    # One input array per series: the last feature of every
                    # time step in the window.
                    heads = [train_X_grid[:, :, -1]]
                    heads.extend(windows[:, :, -1]
                                 for windows, _ in companion_windows)
                    y = train_y_grid

                    n_steps_out = y.shape[1]
                    if cat:
                        y = to_categorical(y, 3)

                    # Flatten (samples, steps, features) to 2-D so the date
                    # parts of the last step can be picked by column below.
                    train_X_grid = train_X_grid.reshape(
                        train_X_grid.shape[0],
                        train_X_grid.shape[1]*train_X_grid.shape[2])

                    start_time = time.time()

                    i_cv = 1
                    neurons = [32, 64, 128]
                    epochs = [500, 1000, 2000]
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for train_index, test_index in custom_cv:
                        train_X = [head[train_index] for head in heads]
                        train_y = y[train_index]
                        test_X = [head[test_index] for head in heads]
                        test_y = y[test_index]
                        if cat == 1:
                            # Decode once per fold. Doing this inside the
                            # configuration loops would re-apply argmax to
                            # its own output on the second configuration.
                            test_y = np.argmax(test_y, axis=1)

                        # Assumes the last five flattened columns are year,
                        # month, day, hour, target of the final step -
                        # TODO confirm against split_sequences.
                        test_time = train_X_grid[test_index]
                        dftime = pd.DataFrame({
                            'year': np.array(test_time[:, -5]).astype(int),
                            'month': np.array(test_time[:, -4]).astype(int),
                            'day': np.array(test_time[:, -3]).astype(int),
                            'hour': np.array(test_time[:, -2]).astype(int),
                        })
                        df_time = pd.to_datetime(dftime, format='%Y%m%d %H')

                        for neuron in neurons:
                            for epoch in epochs:
                                model = algofind(
                                    model_name, neuron, input_dim, cat,
                                    n_steps_in, n_features, n_steps_out)
                                model.fit(train_X, train_y,
                                          epochs=epoch, verbose=0)

                                configs = (neuron, epoch)
                                predictions = model.predict(test_X)

                                fpath = 'predictions_' + method+target+'_Window' +\
                                    str(n_steps_in) + '_TH' +\
                                    str(PrH_index)+'_CV' + \
                                    str(i_cv)+str(neuron)+str(epoch)+file

                                # One row of six metrics per forecast horizon.
                                cm0 = np.zeros((n_steps_out, 6))
                                for t in range(n_steps_out):
                                    cm0[t, :] = func.forecast_accuracy(
                                        predictions[:, t], test_y[:, t], cat)
                                print(cm0)

                                # 5x2 grid, filled row by row; the tenth
                                # panel stays empty.
                                _fig, ax = plt.subplots(
                                    nrows=5, ncols=2,  figsize=(50, 50))
                                for k, title in enumerate(plot_titles):
                                    panel = ax[k // 2, k % 2]
                                    panel.scatter(
                                        df_time.values, test_y[:, k])
                                    panel.scatter(
                                        df_time.values, predictions[:, k])
                                    panel.set_title(title)
                                    panel.legend(['y', 'yhat'])

                                plt.savefig(directory+fpath+'.jpg')
                                plt.close()

                                df_actual = pd.DataFrame(
                                    data=test_y, columns=actual_columns)
                                df_predictions = pd.DataFrame(
                                    data=predictions,
                                    columns=predicted_columns)
                                pd.concat([df_actual, df_predictions],
                                          axis=1).to_csv(
                                    directory+fpath, index=False)

                                row = {'target_names': target,
                                       'method_names': method,
                                       'window_nuggets': n_steps_in,
                                       'temporalhorizons': PrH_index,
                                       'CV': i_cv,
                                       'file_names': file}
                                if cat == 1:
                                    # NOTE(review): this branch indexes cm0
                                    # rows 0-9 although cm0 holds six metric
                                    # columns per horizon - looks stale,
                                    # confirm before enabling category
                                    # targets.
                                    row.update({
                                        'F1_0': cm0[0], 'F1_1': cm0[1],
                                        'P_0': cm0[2], 'P_1': cm0[3],
                                        'R_0': cm0[4], 'R_1': cm0[5],
                                        'acc0_1': cm0[6], 'F1_0_1': cm0[7],
                                        'F1_all': cm0[8],
                                        'fbeta': [cm0[9]],
                                        'configs': [configs]})
                                else:
                                    # Each cell stores the metric for all
                                    # horizons as one array.
                                    row.update({
                                        'mape': [cm0[:, 0]],
                                        'me': [cm0[:, 1]],
                                        'mae': [cm0[:, 2]],
                                        'mse': [cm0[:, 3]],
                                        'rmse': [cm0[:, 4]],
                                        'R2': [cm0[:, 5]],
                                        'configs': [configs]})

                                pd.DataFrame(data=row, index=[0]).to_csv(
                                    directory+result_filename,
                                    index=False, mode='a', header=False)

                                elapsed_time = time.time() - start_time
                                print(time.strftime("%H:%M:%S",
                                                    time.gmtime(elapsed_time)))
                        i_cv = i_cv+1
def main():
    """Run a randomized hyper-parameter search per (window, horizon) pair.

    For every model in ``models`` and target in ``targets`` the function
    loads the training CSV whose name starts with ``sondefilename`` and runs
    a ``RandomizedSearchCV`` for each combination of window length and
    prediction horizon. It then re-splits the same data into three folds and
    scores the fitted search object on each fold's test part.

    Files written below ``Results/balance_data/output_{Cat|Reg}_<model>/``:

    * one ``results_<target><timestamp>.csv`` per target, holding a row of
      column names followed by one metrics row per evaluated fold;
    * one actual-vs-predicted scatter plot (``.jpg``) per evaluated fold;
    * for category targets, one ROC plot per evaluated fold.

    The function takes no arguments and returns ``None``.

    Raises:
        FileNotFoundError: if no CSV starting with ``sondefilename`` exists
            in the training directory.
    """
    models = ['RF']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    targets = ['ph']  # ['DOcategory', 'pHcategory'] # 'ph','dissolved_oxygen'
    # ph TH: 24,36,48
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'

    # Columns shared by the classification and regression result files.
    common_columns = ['target_names', 'method_names', 'window_nuggets',
                      'temporalhorizons', 'CV', 'file_names',
                      'std_test_score', 'mean_test_score', 'params',
                      'bestscore']

    for model_name in models:
        print(model_name)

        is_keras = model_name == 'LSTM' or model_name == 'NN'
        # Decided per model: LSTM/NN searches run with a single job, every
        # other model uses all cores. A single shared flag would stay at 1
        # for every model that follows an LSTM/NN entry.
        n_job = 1 if is_keras else -1

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/balance_data/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                  'acc0_1', 'F1_0_1', 'F1_all', 'fbeta',
                                  'imfeatures', 'best_thresh_0',
                                  'best_thresh_1', 'best_thresh_2']
            else:
                cat = 0
                directory = 'Results/balance_data/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2',
                                  'imfeatures']
            header = {name: name for name in common_columns + metric_columns}

            os.makedirs(directory, exist_ok=True)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            pd.DataFrame(data=header, index=[0]).to_csv(
                directory + resultFileName, index=False, header=False)

            if model_name == 'DT' or model_name == 'RF':
                path = 'Sondes_data/train/train_data/'
                method = 'OrgData'
            else:
                method = 'StandardScaler'
                path = 'Sondes_data/train/train_data_normalized/' + method + '/' + target + '/'

            # The listing does not depend on window or horizon, so resolve
            # the input file once per target.
            files = [
                f for f in os.listdir(path)
                if f.endswith('.csv') and f.startswith(sondefilename)
            ]
            if not files:
                raise FileNotFoundError(
                    'no CSV file starting with ' + repr(sondefilename) +
                    ' found in ' + path)
            file = files[0]

            for n_steps in [1, 3, 6, 12]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    # Re-read on every iteration so func.preparedata always
                    # receives a fresh frame.
                    dataset = pd.read_csv(path + file)
                    train_X_grid, train_y_grid, input_dim, _features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)

                    if cat == 1 and is_keras:
                        train_y_grid = to_categorical(train_y_grid, 3)

                    start_time = time.time()

                    model = func.algofind(model_name, input_dim, n_steps, cat)
                    pipeline = Pipeline(steps=[('model', model)])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)
                    gs = RandomizedSearchCV(
                        estimator=pipeline,
                        param_distributions=func.param_grid['param_grid_' +
                                                            model_name +
                                                            str(cat)],
                        n_iter=10,
                        cv=custom_cv,
                        verbose=0,
                        random_state=42,
                        n_jobs=n_job)

                    if cat == 1 and is_keras:
                        clf = gs.fit(train_X_grid,
                                     train_y_grid,
                                     model__class_weight={
                                         0: 1,
                                         1: 50,
                                         2: 100
                                     })
                    else:
                        clf = gs.fit(train_X_grid, train_y_grid)

                    # Averages over all sampled parameter settings.
                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()

                    print('Mean test scores: %.3f' % test_Score)

                    # NOTE(review): the folds below are cut from the same
                    # data the search was fitted on, so these scores are
                    # in-sample - confirm this is intended.
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    # `fold` is used only as the fold counter; the ROC loops
                    # use their own variable so the logged 'CV' value and
                    # the file names stay correct.
                    for fold, (_train_index, test_index) in enumerate(
                            custom_cv, start=1):
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]
                        predictions = clf.predict(test_X)
                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(fold)+file

                        if cat == 1:
                            # Per-class ROC curves from predicted
                            # probabilities, one-vs-rest.
                            yhat = clf.predict_proba(test_X)
                            y = label_binarize(test_y, classes=[0, 1, 2])

                            fpr = dict()
                            tpr = dict()
                            roc_auc = dict()
                            best_thresh = dict()
                            for class_id in range(3):
                                fpr[class_id], tpr[class_id], thresholds = roc_curve(
                                    y[:, class_id], yhat[:, class_id])
                                roc_auc[class_id] = auc(
                                    fpr[class_id], tpr[class_id])
                                # Threshold that maximises TPR - FPR.
                                J = tpr[class_id] - fpr[class_id]
                                ix = argmax(J)
                                best_thresh[class_id] = thresholds[ix]
                                print('Best Threshold=%f, roc_auc=%.3f' %
                                      (best_thresh[class_id],
                                       roc_auc[class_id]))

                            # Micro-average ROC curve and ROC area.
                            fpr["micro"], tpr["micro"], _ = roc_curve(
                                y.ravel(), yhat.ravel())
                            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
                            plt.plot(
                                fpr["micro"],
                                tpr["micro"],
                                label='micro-average ROC curve (area = {0:0.2f})'
                                ''.format(roc_auc["micro"]),
                                color='deeppink',
                                linestyle=':',
                                linewidth=4)

                            class_colors = ['aqua', 'darkorange',
                                            'cornflowerblue']
                            for class_id, color in zip(range(3),
                                                       class_colors):
                                plt.plot(
                                    fpr[class_id],
                                    tpr[class_id],
                                    color=color,
                                    lw=2,
                                    label='ROC curve of class {0} (area = {1:0.2f})'
                                    ''.format(class_id, roc_auc[class_id]))
                            # Diagonal reference line.
                            plt.plot([0, 1], [0, 1],
                                     linestyle='--',
                                     label='No Skill')
                            plt.xlabel('False Positive Rate')
                            plt.ylabel('True Positive Rate')
                            plt.title(
                                'Some extension of Receiver operating characteristic to multi-class'
                            )
                            plt.legend(loc="lower right")
                            plt.savefig(directory + fpath + 'ROC_curve.jpg')
                            plt.close()

                        if cat == 1 and is_keras:
                            # Labels were one-hot encoded for the network;
                            # turn them back into class ids for scoring.
                            test_y = argmax(test_y, axis=1)
                        if cat == 0:
                            predictions, test_y = func.transform(
                                predictions, test_y, method, target, file)

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions,
                                    s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        plt.savefig(directory + fpath + '.jpg')
                        plt.close()

                        print(test_y.shape)
                        print(predictions.shape)

                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps,
                            'temporalhorizons': PrH_index,
                            'CV': fold,
                            'file_names': fpath,
                            'std_test_score': [test_std],
                            'mean_test_score': [test_Score],
                            'params': [clf.best_params_],
                            'bestscore': [clf.best_score_],
                        }
                        if cat == 1:
                            row.update({
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]],
                                'imfeatures': [clf.best_estimator_],
                                'best_thresh_0': best_thresh[0],
                                'best_thresh_1': best_thresh[1],
                                'best_thresh_2': best_thresh[2]
                            })
                        else:
                            row.update({
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5],
                                'imfeatures': [clf.best_estimator_]
                            })

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directory + resultFileName,
                            index=False,
                            mode='a',
                            header=False)

                        elapsed_time = time.time() - start_time
                        print(
                            time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))

                    # Drop the references first so the collection below can
                    # actually reclaim the fitted search object.
                    del clf, gs
                    Kb.clear_session()
                    gc.collect()
def main():

    models = ['RF']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    targets = ['dissolved_oxygen', 'ph']  # ['DOcategory', 'pHcategory']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1

    for model_name in models:
        print(model_name)

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/balance_data/output_Cat_' + model_name+'/final_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names', 'F1_0': 'F1_0', 'F1_1': 'F1_1', 'P_0': 'P_0', 'P_1': 'P_1', 'R_0': 'R_0', 'R_1': 'R_1', 'acc0_1': 'acc0_1', 'F1_0_1': 'F1_0_1'}
            else:
                cat = 0
                directory = 'Results/balance_data/output_Reg_' + model_name+'/final_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names',  'mape': 'mape', 'me': 'me', 'mae': 'mae', 'mpe': 'mpe', 'rmse': 'rmse',  'R2': 'R2'}

            if not os.path.exists(directory):
                os.makedirs(directory)

            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)

            resultFileName = 'results_'+target+str(time.time())+'.csv'

            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directoryresult+resultFileName,
                            index=False, header=False)

            if model_name == 'DT' or model_name == 'RF':
                method = 'OrgData'
                path = 'Sondes_data/train/train_data/'
                testpath = 'Sondes_data/test/test_data/'
            else:
                method = 'StandardScaler'
                path = 'Sondes_data/train/train_data_normalized/'+method+'/'+target+'/'
                testpath = 'Sondes_data/test/train_data_normalized/' + method+'/'+target+'/'

            for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                params = func.trained_param_grid[
                    'param_grid_'+model_name+str(cat)]
                lags = func.getlags_window(
                    model_name, params['param_'+target+'_'+str(PrH_index)], cat)

                files = [f for f in os.listdir(path) if f.endswith(
                    '.csv') and f.startswith(sondefilename)]
                file1 = files[0]
                print(' TH: ' +
                      str(PrH_index)+' '+method+' '+target+' '+file1)

                dataset = pd.read_csv(path+file1)
                train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)

                if model_name == 'LSTM' or model_name == 'NN':
                    n_job = 1

                start_time = time.time()

                clf = func.getModel(
                    model_name, input_dim, params['param_'+target+'_'+str(PrH_index)], n_job, cat)

                print('clf: '+str(clf))

                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    train_y_grid = to_categorical(train_y_grid, 3)
                    clf = clf.fit(train_X_grid, train_y_grid,
                                  model__class_weight={0: 1, 1: 50, 2: 100})
                else:
                    clf = clf.fit(train_X_grid, train_y_grid)

                # save the model to disk
                filename = model_name+'_model_' + \
                    target+'_'+str(PrH_index)+'.sav'
                joblib.dump(clf, directory+filename)

                # if model_name == 'RF' or model_name=='DT':
                #     featurenames = func.setfeatures(features, lags)

                #     if not os.path.exists(directory+'trees/'):
                #         os.makedirs(directory+'trees/')

                #     i_tree = 0
                #     class_names = ['0', '1', '2']
                #     print(len(clf))
                #     for tree_in_forest in clf:
                #         dot_data = tree.export_graphviz(tree_in_forest, out_file=None,
                #                                         feature_names=featurenames,
                #                                         class_names=class_names,
                #                                         filled=True, rounded=True,
                #                                         special_characters=True)
                #         graph = pydotplus.graph_from_dot_data(dot_data)
                #         graph.write_pdf(
                #             directory+'trees/tree_'+filename+str(i_tree)+".pdf")
                #         i_tree = i_tree + 1
                #         if(i_tree > 1):
                #             break

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                start_time = time.time()
                testsondefilename = re.sub('wo_', '', sondefilename)
                files = [f for f in os.listdir(testpath) if f.endswith(
                    '.csv')and f.startswith(testsondefilename)]
                file1 = files[0]
                print('Window: '+str(lags) + ' TH: ' +
                      str(PrH_index)+' '+method+' '+target+file1)

                dataset = pd.read_csv(testpath+file1)

                test_X_grid, test_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)

                i = 1
                custom_cv = func.custom_cv_kfolds_testdataonly(
                    test_X_grid, 100)
                for test_index in custom_cv:
                    test_X = test_X_grid[test_index]
                    test_y = test_y_grid[test_index]

                    predictions = clf.predict(test_X)

                    if model_name == 'LSTM' or model_name == 'NN':
                        test_y = argmax(test_y, axis=1)
                        # predictions = argmax(predictions, axis=1)

                    # test_y = test_y.astype(int)
                    # predictions = predictions.astype(int)

                    if i % 10 == 0:
                        plt.scatter(np.arange(len(test_y)),
                                    test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(lags) + '_TH'+str(PrH_index) + \
                            '_CV' + str(i)+file1
                        plt.savefig(directoryresult+fpath+'.jpg')

                        plt.close()
                    #     data = {'Actual': test_y, 'Predictions': predictions}
                    #     print(test_y.shape)
                    #     print(predictions.shape)
                    #     if model_name == 'RF':
                    #         df = pd.DataFrame(data=data)
                    #     else:
                    #         df = pd.DataFrame(data=data, index=[0])

                    #     df.to_csv(directoryresult+filename +
                    #             '_CV'+str(i)+'.csv', index=False)

                    cm0 = func.forecast_accuracy(predictions, test_y, cat)

                    if cat == 1:
                        data = {'target_names': target, 'method_names': method, 'temporalhorizons': PrH_index, 'CV': i,
                                'file_names': filename,  'F1_0': cm0[0], 'F1_1': cm0[1], 'P_0': cm0[2], 'P_1': cm0[3], 'R_0': cm0[4], 'R_1': cm0[5], 'acc0_1': cm0[6], 'F1_0_1': cm0[7]}
                    elif cat == 0:
                        data = {'target_names': target, 'method_names': method, 'temporalhorizons': PrH_index, 'CV': i,
                                'file_names': filename, 'mape': cm0[0], 'me': cm0[1], 'mae': cm0[2], 'mpe': cm0[3], 'rmse': cm0[4], 'R2': cm0[5]}

                    df = pd.DataFrame(data=data, index=[0])
                    df.to_csv(directoryresult+resultFileName,
                              index=False, mode='a', header=False)

                    elapsed_time = time.time() - start_time
                    print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                    i = i+1
                Kb.clear_session()
                gc.collect()
                del clf
def main():
    """Tune a multi-step forecaster per target/window and log per-fold metrics.

    For every model name in ``models`` and every target in ``targets`` this:

    1. picks the categorical or regression output directory (a target is
       treated as categorical when ``'category'`` occurs after its first
       character) and creates it;
    2. starts one timestamped results CSV per training file found under
       ``path`` whose name starts with ``'leavon'``;
    3. for each input window length in ``[36, 48, 60]`` builds sequences with
       ``temporal_horizon`` / ``split_sequences``, flattens each window and
       runs a 25-iteration ``RandomizedSearchCV`` over the pipeline;
    4. for every fold yielded by ``func.custom_cv_2folds`` predicts with the
       fitted search object, scores each of the 9 output steps with
       ``func.forecast_accuracy``, saves a scatter figure and a predictions
       CSV, and appends one metrics row to the results CSV.

    Takes no arguments and returns ``None``; everything is written to disk.
    """
    method = 'OrgData'

    # Only the regression targets are enabled. 'DOcategory', 'pHcategory' and
    # 'ysi_blue_green_algae' were used in earlier runs.
    targets = ['dissolved_oxygen', 'ph']

    models = ['LSTM']
    path = 'Sondes_data/train_Summer/'
    files = [
        name for name in os.listdir(path)
        if name.endswith(".csv") and name.startswith('leavon')
    ]

    # Input columns kept from each sonde file. Earlier experiments also used
    # 'water_conductivity', 'ysi_blue_green_algae', 'DOcategory' and
    # 'pHcategory', and stacked additional sondes (bgsusd, osugi, utlcp,
    # leoc_1); those paths are disabled.
    feature_columns = [
        'Water_Temperature_at_Surface', 'ysi_chlorophyll',
        'dissolved_oxygen_saturation', 'dissolved_oxygen',
        'ph', 'year', 'month', 'day', 'hour'
    ]

    # Suffixes of the per-step labels ('t+1', 'a+1', 'p+1', ...). Their count
    # fixes the number of forecast steps requested from split_sequences.
    step_suffixes = ['1', '3', '6', '12', '24', '36', '48', '60', '72']
    n_steps_out = len(step_suffixes)

    # Results-file columns: shared leading/trailing parts around the metrics.
    leading_columns = [
        'target_names', 'method_names', 'window_nuggets', 'temporalhorizons',
        'CV', 'file_names', 'std_test_score', 'mean_test_score', 'params',
        'bestscore'
    ]
    trailing_columns = ['imfeatures', 'configs', 'scores']
    metric_columns = {
        1: ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1', 'acc0_1', 'F1_0_1',
            'F1_all', 'fbeta'],
        # NOTE(review): sibling scripts label metric index 3 'mpe', not 'mse';
        # confirm against func.forecast_accuracy.
        0: ['mape', 'me', 'mae', 'mse', 'rmse', 'R2'],
    }

    for model_name in models:
        for target in targets:
            print(target)
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name + '/oversampling_cv_models/'
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name + '/oversampling_cv_models/'

            # Kept separate from the per-fold rows below: reusing one variable
            # made every file after the first start with the previous file's
            # last metrics row instead of the column names.
            header = {
                name: name
                for name in (leading_columns + metric_columns[cat] +
                             trailing_columns)
            }

            os.makedirs(directory, exist_ok=True)

            for file in files:
                result_filename = 'results_' + target + \
                    '_' + file + '_' + str(time.time()) + '.csv'
                # NOTE(review): with the default header=True this writes the
                # column names twice (header line plus a row of names).
                # Appended rows use header=False; confirm downstream readers
                # expect the duplicate before changing it.
                pd.DataFrame(data=header, index=[0]).to_csv(
                    directory + result_filename, index=False)
                PrH_index = 0
                for n_steps_in in [36, 48, 60]:
                    print(model_name)
                    print(str(n_steps_in))

                    dataset = pd.read_csv(path + file)
                    dataset = dataset[feature_columns]
                    print(dataset.head())

                    dataset = temporal_horizon(dataset, PrH_index, target)

                    train_X_grid, y = split_sequences(dataset, n_steps_in,
                                                      n_steps_out)

                    n_features = train_X_grid.shape[2]
                    print('n_fetures: ' + str(n_features))

                    # Flatten every (steps, features) window to one row; the
                    # pipeline and the search operate on 2-D input.
                    XX = train_X_grid.reshape(
                        train_X_grid.shape[0],
                        train_X_grid.shape[1] * train_X_grid.shape[2])

                    print(XX.shape)
                    input_dim = XX.shape

                    model = algofind(model_name, input_dim, cat, n_steps_in,
                                     n_features, n_steps_out)

                    start_time = time.time()

                    # 'RF' and 'DT' are fitted on raw features; every other
                    # model name gets a StandardScaler in front.
                    if model_name in ('RF', 'DT'):
                        pipeline = Pipeline(steps=[('model', model)])
                    else:
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()), ('model', model)])

                    custom_cv = func.custom_cv_2folds(XX, 3)

                    gs = RandomizedSearchCV(
                        estimator=pipeline,
                        param_distributions=func.param_grid['param_grid_' +
                                                            model_name +
                                                            str(cat)],
                        n_iter=25,
                        cv=custom_cv,
                        verbose=0,
                        n_jobs=1)
                    if model_name in ('ConvEnLSTM', 'endecodeLSTM',
                                      'CNNLSTM'):
                        clf = gs.fit(XX, y.reshape(y.shape[0], 1, n_steps_out))
                    else:
                        clf = gs.fit(XX, y)

                    test_Score = clf.cv_results_['mean_test_score']
                    configs = clf.cv_results_['params']

                    test_Score_mean = test_Score.mean()
                    test_std_mean = clf.cv_results_['std_test_score'].mean()

                    # The splitter is rebuilt here rather than reusing the
                    # object already handed to the search above.
                    folds = func.custom_cv_2folds(XX, 3)
                    for i_cv, (_, test_index) in enumerate(folds, start=1):
                        test_X = XX[test_index]
                        test_y = y[test_index]

                        print(test_X[0])
                        # assumes the last four columns of each flattened
                        # window are year, month, day, hour - depends on
                        # split_sequences; TODO confirm
                        dftime = pd.DataFrame({
                            'year': np.array(test_X[:, -4]).astype(int),
                            'month': np.array(test_X[:, -3]).astype(int),
                            'day': np.array(test_X[:, -2]).astype(int),
                            'hour': np.array(test_X[:, -1]).astype(int),
                        })
                        df_time = pd.to_datetime(dftime, format='%Y%m%d %H')

                        predictions = clf.predict(test_X)

                        print(predictions.shape)
                        predictions = predictions.reshape(-1, n_steps_out)

                        fpath = 'predictions_' + method + target + '_Window' + \
                            str(n_steps_in) + '_TH' + \
                            str(PrH_index) + '_CV' + str(i_cv) + file

                        # NOTE(review): the categorical path looks unfinished.
                        # argmax leaves test_y 1-D, so the per-step indexing
                        # below would fail, and cm0 has 6 columns while the
                        # categorical row reads up to index 9. Only regression
                        # targets are enabled, so it is left untouched.
                        if cat == 1:
                            test_y = np.argmax(test_y, axis=1)

                        # One row of six metrics per forecast step.
                        cm0 = np.zeros((n_steps_out, 6))
                        for t in range(n_steps_out):
                            cm0[t, :] = func.forecast_accuracy(
                                predictions[:, t], test_y[:, t], cat)

                        # 5x2 grid filled row by row; the tenth panel stays
                        # empty because there are nine steps.
                        _fig, axes = plt.subplots(nrows=5,
                                                  ncols=2,
                                                  figsize=(50, 50))
                        titles = ['t+' + s for s in step_suffixes]
                        for k, (axis, title) in enumerate(
                                zip(axes.flat, titles)):
                            axis.scatter(df_time.values, test_y[:, k])
                            axis.scatter(df_time.values, predictions[:, k])
                            axis.set_title(title)
                            axis.legend(['actual', 'prediction'])

                        plt.savefig(directory + fpath + '.png')
                        plt.close()

                        df_actual = pd.DataFrame(
                            data=test_y,
                            columns=['a+' + s for s in step_suffixes])
                        df_predictions = pd.DataFrame(
                            data=predictions,
                            columns=['p+' + s for s in step_suffixes])

                        # Timestamps, actuals and predictions side by side.
                        fold_frame = pd.concat(
                            [df_time, df_actual, df_predictions], axis=1)
                        fold_frame.to_csv(directory + fpath, index=False)

                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps_in,
                            'temporalhorizons': PrH_index,
                            'CV': i_cv,
                            'file_names': file,
                            'std_test_score': [test_std_mean],
                            'mean_test_score': [test_Score_mean],
                            'params': [clf.best_params_],
                            'bestscore': [clf.best_score_],
                        }
                        if cat == 1:
                            row.update({
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]],
                            })
                        elif cat == 0:
                            # Each cell holds the metric for all steps as one
                            # array, wrapped in a list so it fits one row.
                            row.update({
                                'mape': [cm0[:, 0]],
                                'me': [cm0[:, 1]],
                                'mae': [cm0[:, 2]],
                                'mse': [cm0[:, 3]],
                                'rmse': [cm0[:, 4]],
                                'R2': [cm0[:, 5]],
                            })
                        row.update({
                            'imfeatures': [clf.best_estimator_],
                            'configs': [configs],
                            'scores': [test_Score],
                        })

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directory + result_filename,
                            index=False,
                            mode='a',
                            header=False)

                        # Time since the search started, not per fold.
                        elapsed_time = time.time() - start_time
                        print(
                            time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))
def main():
    """Score saved moving-average residual models on the test sonde.

    For each target and each prediction horizon in
    ``[1, 3, 6, 12, 24, 36, 48, 60, 72]`` this loads the arrays
    ``MA_model_<target>_<horizon>.npy`` (``coef``) and
    ``MA_data_<target>_<horizon>.npy`` (``lag``) from the model directory,
    reads the first test CSV whose name starts with ``testsondefilename``,
    and walks forward over every fold yielded by
    ``func.custom_cv_kfolds_testdataonly``: each forecast is the current
    value plus the error returned by ``predict`` over the most recent
    errors. Metrics from ``func.forecast_accuracy`` are appended to a
    timestamped results CSV; every tenth fold also gets a scatter plot and
    a CSV of actual vs. predicted values.

    The ``.npy`` files must already exist. The training step that produced
    them (``movingAverage`` on the 'leavon' sonde under
    'Sondes_data/train_Summer/') is not run by this function.

    Takes no arguments and returns ``None``; everything is written to disk.
    """
    models = ['MA']
    targets = ['ph', 'dissolved_oxygen']  # 'pHcategory', 'DOcategory'

    testsondefilename = 'utlcp'
    testpath = 'Sondes_data/test_Summer/'
    method = 'OrgData'

    # Results-file columns: a shared prefix followed by the metric names.
    leading_columns = [
        'target_names', 'method_names', 'window_nuggets', 'temporalhorizons',
        'CV', 'file_names'
    ]
    metric_columns = {
        1: ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1', 'acc0_1', 'F1_0_1',
            'F1_all', 'fbeta'],
        0: ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2'],
    }

    for model_name in models:
        print(model_name)

        for target in targets:
            # Categorical when 'category' occurs after the first character.
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/1sonde/output_Cat_' + \
                    model_name + '/final_models/'
            else:
                cat = 0
                directory = 'Results/bookThree/1sonde/output_Reg_' + \
                    model_name + '/final_models/'
            header = {
                name: name
                for name in leading_columns + metric_columns[cat]
            }

            os.makedirs(directory, exist_ok=True)

            directoryresult = directory + 'Results/'
            os.makedirs(directoryresult, exist_ok=True)
            print(directoryresult)

            resultFileName = 'results_' + testsondefilename + '_' + \
                target + str(time.time()) + '.csv'
            # header=False: the single row of names written here is the
            # file's header line.
            pd.DataFrame(data=header, index=[0]).to_csv(
                directoryresult + resultFileName,
                index=False,
                header=False)

            for n_steps in [1]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48, 60, 72]:
                    coef = np.load(directory + 'MA_model_' + target + '_' +
                                   str(PrH_index) + '.npy')
                    lag = np.load(directory + 'MA_data_' + target + '_' +
                                  str(PrH_index) + '.npy')

                    ######################
                    # TEST sets
                    ######################
                    files = [
                        name for name in os.listdir(testpath)
                        if name.endswith('.csv')
                        and name.startswith(testsondefilename)
                    ]
                    file1 = files[0]
                    print('Window: ' + str(len(lag)) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target + file1)

                    testdataset = pd.read_csv(testpath + file1)
                    testdataset = temporal_horizon(testdataset, PrH_index,
                                                   target)

                    test = testdataset[target]
                    test_target = testdataset['Target_' + target]

                    custom_cv = func.custom_cv_kfolds_testdataonly(test, 100)
                    for i, test_index in enumerate(custom_cv, start=1):
                        # Current observations (persistence input) and the
                        # values to be predicted.
                        test_y = test[test_index].values
                        test_y_targets = test_target[test_index].values

                        # Walk forward: persistence forecast corrected by the
                        # error predicted from the most recent errors.
                        history = list(lag)
                        window = len(coef)
                        predictions = []
                        for current, actual in zip(test_y, test_y_targets):
                            length = len(history)
                            # Explicit indexing, not a slice: when window
                            # exceeds len(history) the negative indices wrap
                            # around, which a slice would not reproduce.
                            recent = [
                                history[pos]
                                for pos in range(length - window, length)
                            ]
                            yhat = current + predict(coef, recent, window)
                            predictions.append(yhat)
                            history.append(actual - yhat)

                        if cat == 1:
                            predictions = np.array(predictions).astype(int)

                        fpath = 'predictions_' + method + target + '_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index) + '_CV' + \
                            str(i) + testsondefilename

                        cm0 = func.forecast_accuracy(predictions,
                                                     test_y_targets, cat)

                        # Every tenth fold: save a plot and the raw series.
                        if i % 10 == 0:
                            # Plot the targets as 'actual', matching the
                            # metrics and the CSV below. The original plotted
                            # the persistence inputs (test_y) here.
                            plt.scatter(np.arange(len(test_y_targets)),
                                        test_y_targets,
                                        s=1)
                            plt.scatter(np.arange(len(predictions)),
                                        predictions,
                                        s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.savefig(directoryresult + fpath + '.png')
                            plt.close()

                            series = pd.DataFrame(data={
                                'Actual': test_y_targets,
                                'Predictions': predictions
                            })
                            series.to_csv(directoryresult + fpath,
                                          index=False)

                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': testsondefilename,
                        }
                        if cat == 1:
                            row.update({
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]],
                            })
                        elif cat == 0:
                            row.update({
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5],
                            })

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directoryresult + resultFileName,
                            index=False,
                            mode='a',
                            header=False)
def main():
    """Score a "repeat the last value" baseline on every CSV in the test folder.

    For each method/target pair a results directory is chosen (classification
    or regression), and for each CSV under ``path`` a per-file results CSV is
    started.  For each temporal horizon the file is reloaded, reduced to the
    date columns plus the target, prepared with ``func.preparedata`` (window
    size fixed at 1) and split into folds by
    ``func.custom_cv_kfolds_testdataonly``.  In every fold the last column of
    the prepared inputs is used directly as the prediction, the metrics
    returned by ``func.forecast_accuracy`` are appended as one row to the
    results CSV, and on every tenth fold a scatter plot and the raw
    actual/prediction pairs are saved under ``more/``.

    NOTE(review): the block after the horizon loop (starting at
    ``if target == 'ph':``) uses names that this function never initialises;
    see the comment there.
    """
    methods = ['OrgData']
    # 'dissolved_oxygen', 'ph', 'DOcategory', 'pHcategory']
    targets = ['ysi_blue_green_algae']
    model_name = 'baseline'
    # test_Summer train_Summer  # bookTwo: Sondes_data/old/test/test_data/
    path = 'Sondes_data/test_Summer/'
    files = [f for f in os.listdir(path) if f.endswith(".csv")]

    for method in methods:
        for target in targets:
            # A target whose name contains 'category' (at any position other
            # than 0) is handled as classification (cat = 1); anything else
            # as regression (cat = 0).
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name+'/final_models/Results/'  # final_models/Results  oversampling_cv_models/ #2
                # Header row for classification results: each column name
                # mapped to itself.
                data = {
                    'CV': 'CV',
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'window_nuggets': 'window_nuggets',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name+'/final_models/Results/'  # final_models/Results  oversampling_cv_models  #3
                # Header row for regression results: each column name mapped
                # to itself.
                data = {
                    'CV': 'CV',
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'window_nuggets': 'window_nuggets',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mpe': 'mpe',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }
            if not os.path.exists(directory):
                os.makedirs(directory)
            for file in files:
                print(file)
                result_filename = 'results_' + target + '_' + file
                # NOTE(review): `data` is rebound to per-fold row dicts inside
                # the fold loop below, so on any file after the first this
                # would write the previous file's last metrics row instead of
                # the column names. Also, `header=False` is not passed here,
                # so pandas emits its own header line in addition to this
                # row - confirm that is intended.
                dfheader = pd.DataFrame(data=data, index=[0])
                dfheader.to_csv(directory + result_filename, index=False)
                # The window size is fixed at 1 for this baseline.
                n_steps = 1

                for PrH_index in [1, 3, 6, 12, 24, 36, 48, 60, 72]:

                    # The file is re-read for every temporal horizon.
                    dataset = pd.read_csv(path + file)

                    # Only the Target
                    dataset = dataset[['year', 'month', 'day', 'hour', target]]

                    # dataset = dataset.dropna()
                    # print(dataset.head())

                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    # `input_dim` and `features` are returned but not used
                    # in this function.
                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)
                    # print(train_y_grid[0:1])

                    start_time = time.time()

                    # `i` is the 1-based fold counter written to the 'CV'
                    # column and used in output file names.
                    i = 1
                    # For Test files: #4
                    # NOTE(review): the meaning of the second argument (100)
                    # is defined by func.custom_cv_kfolds_testdataonly, which
                    # is not visible here - confirm.
                    custom_cv = func.custom_cv_kfolds_testdataonly(
                        train_X_grid, 100)
                    for test_index in custom_cv:

                        # For Train files:
                        # custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                        # for train_index, test_index in custom_cv:
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        # current value would be the same in the future predictions
                        # (assumes the last column of the prepared inputs
                        # holds the current target value - TODO confirm
                        # against func.preparedata)
                        predictions = test_X[:, -1]

                        # Columns 0-3 of the prepared inputs are read as
                        # year, month, day and hour; this relies on the
                        # column order produced by func.preparedata.
                        df_time = pd.DataFrame({
                            'year':
                            np.array(test_X[:, 0]).astype(int),
                            'month':
                            np.array(test_X[:, 1]).astype(int),
                            'day':
                            np.array(test_X[:, 2]).astype(int),
                            'hour':
                            np.array(test_X[:, 3]).astype(int),
                        })
                        # print(df_time.head())

                        # Combine the date-part columns into timestamps for
                        # the plot axis and the saved CSV.
                        timeline = pd.to_datetime(df_time, format='%Y%m%d %H')
                        # print(timeline.head())

                        # timeline = timeline.reshape(len(time),)

                        # Classification labels are cast to integers before
                        # scoring.
                        if cat == 1:
                            predictions = np.array(predictions).astype(int)
                            test_y = np.array(test_y).astype(int)

                        # Flatten both arrays to one dimension.
                        test_y = test_y.reshape(len(test_y), )
                        predictions = predictions.reshape(len(predictions), )

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        filename = file + '_' + \
                            target+'_TH' + \
                            str(PrH_index)+'_lag' + \
                            str(n_steps)+'_'+str(i)

                        # First test files
                        # Only every tenth fold gets a plot and a dump of the
                        # raw actual/prediction pairs.
                        if i % 10 == 0:  # or i <= 3:  # 5
                            plt.scatter(timeline.values, test_y, s=1)
                            plt.scatter(timeline.values, predictions, s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.xticks(rotation=45)

                            directorydeeper = directory + 'more/'
                            if not os.path.exists(directorydeeper):
                                os.makedirs(directorydeeper)
                            plt.savefig(directorydeeper + filename + '.jpg')

                            # plt.show()

                            plt.close()
                            data = {
                                'time': timeline,
                                'Actual': test_y,
                                'Predictions': predictions
                            }
                            df = pd.DataFrame(data=data)

                            df.to_csv(directorydeeper + filename + '.csv',
                                      index=False)

                        # Build the metrics row; key order matches the header
                        # dict chosen above for this `cat`.
                        if cat == 1:
                            data = {
                                'CV': i,
                                'target_names': target,
                                'method_names': method,
                                'temporalhorizons': PrH_index,
                                'window_nuggets': 1,
                                'file_names': filename,
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]]
                            }
                        elif cat == 0:
                            data = {
                                'CV': i,
                                'target_names': target,
                                'method_names': method,
                                'temporalhorizons': PrH_index,
                                'window_nuggets': 1,
                                'file_names': filename,
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5]
                            }

                        # Append the row to the per-file results CSV without
                        # repeating the header.
                        df = pd.DataFrame(data=data, index=[0])
                        df.to_csv(directory + result_filename,
                                  index=False,
                                  mode='a',
                                  header=False)

                        # Elapsed time since this horizon started, printed
                        # as HH:MM:SS.
                        elapsed_time = time.time() - start_time
                        print(
                            time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))
                        i = i + 1
                    gc.collect()
                # NOTE(review): everything from here to the end of the
                # function reads `results`, `i_cv` and `th`, none of which is
                # assigned earlier in this function. Because `results` and
                # `i_cv` are also assigned below, Python treats them as
                # locals, so the first read raises UnboundLocalError. This
                # looks like a fragment of a different routine that was
                # appended here - confirm, then either supply the missing
                # inputs or remove it.
                if target == 'ph':
                    # Drop rows whose 'Actual' value is below 8.
                    indexNames = results[results['Actual'] < 8].index
                    results.drop(indexNames, inplace=True)

                else:
                    # Drop rows whose 'Actual' value is below 4.
                    indexNames = results[results['Actual'] < 4].index
                    results.drop(indexNames, inplace=True)

                results = results.reset_index()

                test_y = results[['Actual']].values
                predictions = results[['Predictions']].values
                # print(predictions)
                # print(test_y)

                # Scored with cat fixed to 0 regardless of the target.
                cm0 = func.forecast_accuracy(predictions, test_y, 0)

                data = {
                    'target_names': target,
                    'CV': i_cv,
                    'TH': th,
                    'file_names': file,
                    'mape': cm0[0],
                    'me': cm0[1],
                    'mae': cm0[2],
                    'mpe': cm0[3],
                    'rmse': cm0[4],
                    'R2': cm0[5]
                }
                i_cv = i_cv + 10
                # NOTE(review): `df` is built here but nothing visible in
                # this function writes it out.
                df = pd.DataFrame(data=data, index=[0])
Esempio n. 8
0
def main():
    """Tune each configured model with a randomized search and log per-fold scores.

    For every model/target pair a results CSV is created under
    ``Results/bookOne/output_{Cat|Reg}_<model>/oversampling_cv_models/`` and
    seeded with a row of column names.  Then, for every window size and
    temporal horizon, the training sonde file is prepared with
    ``func.preparedata``, a ``RandomizedSearchCV`` is fitted on it, and the
    fitted search is scored on each test fold from
    ``func.custom_cv_2folds(train_X_grid, 3)``.  Each fold appends one row to
    the results CSV and writes a scatter plot plus the raw actual/prediction
    pairs next to it.

    Raises:
        FileNotFoundError: if the training folder holds no CSV whose name
            starts with ``sondefilename``.
    """
    # 'LR', 'DT', 'SVC', 'LSTM', 'NN', # 'MLP', 'CNN', 'LSTM', 'ConvLSTM', 'CNNLSTM', 'EncodeDecodeLSTMs'
    models = ['RF']
    targets = ['DOcategory', 'pHcategory', 'ph', 'dissolved_oxygen']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    path = 'Sondes_data/train/train_data/'
    method = 'OrgData'

    # Column layout of the results CSV.  Rows are appended without a header,
    # so the header row and every data row must use exactly this order.
    leading_columns = ['target_names', 'method_names', 'window_nuggets',
                       'temporalhorizons', 'CV', 'file_names',
                       'std_test_score', 'mean_test_score', 'params',
                       'bestscore']
    cat_metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                          'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
    reg_metric_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

    # The training file does not depend on model, target, window or horizon,
    # so it is located once instead of inside the innermost loops.  As
    # before, the first match in os.listdir order is used.
    files = [f for f in os.listdir(path)
             if f.endswith('.csv') and f.startswith(sondefilename)]
    if not files:
        raise FileNotFoundError(
            'No CSV starting with ' + repr(sondefilename) + ' found in ' + path)
    file = files[0]

    for model_name in models:
        print(model_name)

        # 'LSTM' and 'NN' get special handling throughout: one-hot targets,
        # class weights, no custom scorer, and a single worker.  n_job is
        # derived per model so a neural model earlier in `models` no longer
        # forces n_jobs=1 on the models that follow it.
        is_neural = model_name == 'LSTM' or model_name == 'NN'
        n_job = 1 if is_neural else -1

        for target in targets:
            # Targets with 'category' in their name (not at position 0) are
            # classification problems; everything else is regression.
            cat = 1 if target.find('category') > 0 else 0
            if cat == 1:
                directory = 'Results/bookOne/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = cat_metric_columns
            else:
                directory = 'Results/bookOne/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = reg_metric_columns

            os.makedirs(directory, exist_ok=True)

            # Seed the results file with one row holding the column names.
            columns = leading_columns + metric_columns + ['imfeatures']
            header = {name: name for name in columns}
            resultFileName = 'results_'+target+str(time.time())+'.csv'
            pd.DataFrame(data=header, index=[0]).to_csv(
                directory+resultFileName, index=False, header=False)

            for n_steps in [1, 3, 6, 12]:  #
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    print('Window: '+str(n_steps) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target)

                    dataset = pd.read_csv(path+file)

                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)
                    print(train_X_grid[0:1])

                    # Neural classifiers are trained on one-hot targets with
                    # three classes.
                    if cat == 1 and is_neural:
                        train_y_grid = to_categorical(train_y_grid, 3)

                    start_time = time.time()
                    model = func.algofind(model_name, input_dim, n_steps, cat)

                    if cat == 1:
                        metric = make_scorer(f2_measure)
                    else:
                        metric = make_scorer(R2_measure)

                    # Tree models are used as-is; every other model gets a
                    # StandardScaler in front of it.
                    if model_name == 'RF' or model_name == 'DT':
                        pipeline = Pipeline(steps=[('model', model)])
                    else:
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()), ('model', model)])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)

                    # One search definition for all cases.  Neural models
                    # are searched without the custom scorer, and neural
                    # classifiers additionally receive class weights at fit
                    # time.
                    search_kwargs = dict(
                        estimator=pipeline,
                        param_distributions=func.param_grid[
                            'param_grid_'+model_name+str(cat)],
                        n_iter=20,
                        cv=custom_cv,
                        verbose=0,
                        random_state=42,
                        n_jobs=n_job)
                    if not is_neural:
                        search_kwargs['scoring'] = metric
                    fit_kwargs = {}
                    if cat == 1 and is_neural:
                        fit_kwargs['model__class_weight'] = {
                            0: 1, 1: 50, 2: 100}

                    gs = RandomizedSearchCV(**search_kwargs)
                    clf = gs.fit(train_X_grid, train_y_grid, **fit_kwargs)

                    # Averages over all sampled parameter settings.
                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()

                    print('Mean test scores: %.3f' % test_Score)

                    # Score the refitted search on each test fold of a
                    # second split; `i` is the 1-based fold number.
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for i, (train_index, test_index) in enumerate(custom_cv, start=1):
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        predictions = clf.predict(test_X)

                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(i)+file

                        # Undo the one-hot encoding before scoring.
                        if cat == 1 and is_neural:
                            test_y = argmax(test_y, axis=1)

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        plt.scatter(np.arange(len(test_y)),
                                    test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        plt.savefig(directory+fpath+'.jpg')
                        plt.close()

                        print(test_y.shape)
                        print(predictions.shape)

                        pairs = pd.DataFrame(
                            data={'Actual': test_y, 'Predictions': predictions})
                        pairs.to_csv(directory+fpath, index=False)

                        # Assemble the row in header order: leading columns,
                        # then metrics, then the fitted estimator.
                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': fpath,
                            'std_test_score': [test_std],
                            'mean_test_score': [test_Score],
                            'params': [clf.best_params_],
                            'bestscore': [clf.best_score_],
                        }
                        if cat == 1:
                            row.update({
                                'F1_0': cm0[0], 'F1_1': cm0[1],
                                'P_0': cm0[2], 'P_1': cm0[3],
                                'R_0': cm0[4], 'R_1': cm0[5],
                                'acc0_1': cm0[6], 'F1_0_1': cm0[7],
                                'F1_all': cm0[8], 'fbeta': [cm0[9]],
                            })
                        else:
                            row.update({
                                'mape': cm0[0], 'me': cm0[1], 'mae': cm0[2],
                                'mpe': cm0[3], 'rmse': cm0[4], 'R2': cm0[5],
                            })
                        row['imfeatures'] = [clf.best_estimator_]

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directory+resultFileName,
                            index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
def main():
    """Fit a classical time-series model per horizon and score it walk-forward.

    For each model/target pair the output folders and a results CSV (seeded
    with a row of column names) are created.  For every temporal horizon the
    first matching training CSV is loaded, each training fold is fitted and
    the fitted model saved, and then the first matching test CSV is scored
    fold by fold: for every test observation the model is refitted on the
    history so far, one forecast is taken, and the true value is appended to
    the history.  Each test fold appends one metrics row to the results CSV;
    the first five folds and every tenth fold also save a plot and the raw
    actual/prediction pairs.
    """
    models = ['SARIMA']
    targets = ['dissolved_oxygen']
    sondefilename = 'leavon'
    n_job = -1

    # Columns shared by the classification and regression result files.
    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names']

    for model_name in models:
        print(model_name)

        for target in targets:
            # 'category' anywhere after position 0 marks a classification
            # target.
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/1sonde/output_Cat_' + \
                    model_name+'/final_models/'
                score_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                 'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookThree/1sonde/output_Reg_' + model_name + '/final_models/'
                score_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']
            header = {name: name for name in id_columns + score_columns}

            # Saved models go in `directory`, everything else in its
            # 'Results/' subfolder.
            directoryresult = directory + 'Results/'
            for folder in (directory, directoryresult):
                if not os.path.exists(folder):
                    os.makedirs(folder)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            pd.DataFrame(data=header, index=[0]).to_csv(
                directoryresult + resultFileName, index=False, header=False)

            path = 'Sondes_data/train_Summer/'
            testpath = 'Sondes_data/test_Summer/'
            method = 'OrgData'

            for n_steps in [1]:
                for PrH_index in [48, 60]:  # 1, 3, 6, 12, 24, 36,
                    train_files = [
                        name for name in os.listdir(path)
                        if name.endswith('.csv') and name.startswith(sondefilename)
                    ]
                    file = train_files[0]
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    train = pd.read_csv(path + file)[target]

                    ######################
                    # FOR  ARIMA
                    ######################
                    for train_index in custom_cv_kfolds_testdataonly(
                            train, 1, PrH_index):
                        train_y = train[train_index].values
                        config = getconfig(target, PrH_index, model_name)

                        # Fit on the training fold, then persist the model.
                        # The saved file name starts with the model name.
                        if model_name == 'ARIMA':
                            model_fit = ARIMAregression(train_y, config)
                        elif model_name == 'ETS':
                            model_fit = ETSregression(train_y, config)
                        elif model_name == 'SARIMA':
                            model_fit = SARIMAregression(train_y, config)
                        if model_name in ('ARIMA', 'ETS', 'SARIMA'):
                            model_fit.save(directory + model_name + '_model' +
                                           target + '_' + str(PrH_index) +
                                           '.pkl')

                        ######################
                        # TEST sets
                        ######################
                        start_time = time.time()
                        testsondefilename = sondefilename
                        test_files = [
                            name for name in os.listdir(testpath)
                            if name.endswith('.csv')
                            and name.startswith(testsondefilename)
                        ]
                        file1 = test_files[0]

                        test = pd.read_csv(testpath + file1)[target]

                        # `i` is the 1-based test-fold number.
                        test_folds = custom_cv_kfolds_testdataonly(
                            test, 5, PrH_index)
                        for i, test_index in enumerate(test_folds, start=1):
                            test_y = test[test_index].values

                            # Walk forward: start from the training fold and
                            # grow the history by one true value per step.
                            history = list(train_y)
                            predictions = []

                            for observed in test_y:
                                if model_name == 'ARIMA':
                                    model_fit = ARIMA(
                                        history, order=(config)).fit(disp=0)
                                    yhat, stderr, conf = model_fit.forecast()
                                elif model_name == 'ETS':
                                    yhat = ETSregression(
                                        history, config).forecast()
                                elif model_name == 'SARIMA':
                                    yhat = SARIMAregression(
                                        history, config).forecast()

                                predictions.append(yhat)
                                history.append(observed)

                            if cat == 1:
                                predictions = np.array(predictions).astype(int)

                            # Output name is built from the *training* file
                            # name, as in the header row's layout.
                            fpath = 'predictions_' + method+target+'_Window' + \
                                str(n_steps) + '_TH' + \
                                str(PrH_index)+'_CV' + str(i) + file

                            cm0 = func.forecast_accuracy(
                                predictions, test_y, cat)

                            # Detailed artefacts only for the first five
                            # folds and every tenth one.
                            if i <= 5 or i % 10 == 0:
                                for series in (test_y, predictions):
                                    plt.scatter(np.arange(len(series)),
                                                series,
                                                s=1)
                                plt.legend(['actual', 'predictions'],
                                           loc='upper right')
                                plt.savefig(directoryresult + fpath + '.png')
                                plt.close()

                                pairs = pd.DataFrame(data={
                                    'Actual': test_y,
                                    'Predictions': predictions
                                })
                                pairs.to_csv(directoryresult + fpath,
                                             index=False)

                            # Metrics row, in the same column order as the
                            # header row.
                            row = {
                                'target_names': target,
                                'method_names': method,
                                'window_nuggets': n_steps,
                                'temporalhorizons': PrH_index,
                                'CV': i,
                                'file_names': fpath,
                            }
                            if cat == 1:
                                row.update({
                                    'F1_0': cm0[0],
                                    'F1_1': cm0[1],
                                    'P_0': cm0[2],
                                    'P_1': cm0[3],
                                    'R_0': cm0[4],
                                    'R_1': cm0[5],
                                    'acc0_1': cm0[6],
                                    'F1_0_1': cm0[7],
                                    'F1_all': cm0[8],
                                    'fbeta': [cm0[9]],
                                })
                            elif cat == 0:
                                row.update({
                                    'mape': cm0[0],
                                    'me': cm0[1],
                                    'mae': cm0[2],
                                    'mpe': cm0[3],
                                    'rmse': cm0[4],
                                    'R2': cm0[5],
                                })

                            pd.DataFrame(data=row, index=[0]).to_csv(
                                directoryresult + resultFileName,
                                index=False,
                                mode='a',
                                header=False)
def saveResults(predictions, test_y, cat, directory, file, target, PrH_index,
                n_steps, i, result_filename, timeline, config):
    """Persist one evaluation run: raw pairs, a scatter plot and a metrics row.

    The predictions and actual values are scored with
    ``func.forecast_accuracy``.  The time/actual/prediction table and a
    scatter plot are written to ``directory + 'more/'`` under a name built
    from the file, target, horizon, lag, run index and config.  One metrics
    row is then appended (without header) to ``directory + result_filename``.

    Args:
        predictions: Predicted values; cast to int when ``cat == 1``.
        test_y: Actual values; cast to int when ``cat == 1``.
        cat: 1 for classification metrics, 0 for regression metrics.
        directory: Output folder prefix (used by plain concatenation).
        file: Source file name; used in output names and the 'file_names'
            column.
        target: Target column name.
        PrH_index: Temporal horizon recorded in the row and file name.
        n_steps: Lag value used in the file name only; the row always
            records a window of 1.
        i: Run/fold index recorded in the 'CV' column.
        result_filename: Name of the results CSV to append to.
        timeline: Timestamps for the x-axis and the saved table; must
            expose ``.values``.
        config: Model configuration recorded in the row and file name.
    """
    print(cat, directory, file, target, PrH_index, n_steps, i, result_filename,
          config)

    is_classification = cat == 1
    if is_classification:
        predictions = np.array(predictions).astype(int)
        test_y = np.array(test_y).astype(int)

    scores = func.forecast_accuracy(predictions, test_y, cat)

    stem = ''.join([
        file, '_', target, '_TH', str(PrH_index), '_lag', str(n_steps), '_',
        str(i), '_config', str(config)
    ])

    detail_dir = directory + 'more/'
    if not os.path.exists(detail_dir):
        os.makedirs(detail_dir)

    # Raw table first, then the plot, both under the same stem.
    record = {'time': timeline, 'Actual': test_y, 'Predictions': predictions}
    pd.DataFrame(data=record).to_csv(detail_dir + stem + '.csv', index=False)

    for series in (test_y, predictions):
        plt.scatter(timeline.values, series, s=1)
    plt.legend(['actual', 'predictions'], loc='upper right')
    plt.xticks(rotation=45)
    plt.savefig(detail_dir + stem + '.png')
    plt.close()

    # print(detail_dir)
    # print(stem)
    # print(scores)

    method = 'OrgData'
    # `record` is only rebound for the two supported modes; any other `cat`
    # leaves it holding the raw table built above.
    if cat == 1 or cat == 0:
        record = {
            'CV': i,
            'target_names': target,
            'method_names': method,
            'temporalhorizons': PrH_index,
            'window_nuggets': 1,
            'config': [config],
            'file_names': file,
        }
        if is_classification:
            record.update({
                'F1_0': scores[0],
                'F1_1': scores[1],
                'P_0': scores[2],
                'P_1': scores[3],
                'R_0': scores[4],
                'R_1': scores[5],
                'acc0_1': scores[6],
                'F1_0_1': scores[7],
                'F1_all': scores[8],
                'fbeta': [scores[9]],
            })
        else:
            record.update({
                'mape': scores[0],
                'me': scores[1],
                'mae': scores[2],
                'mpe': scores[3],
                'rmse': scores[4],
                'R2': scores[5],
            })

    summary = pd.DataFrame(data=record, index=[0])
    summary.to_csv(directory + result_filename, index=False, mode='a',
                   header=False)
    print(directory + result_filename)
    print('-------------------------')
def main():
    """Fit the final model for each target/horizon and score it on test data.

    For every model name in ``models`` and every target in ``targets``:

    * the output directories are created and a results CSV is started whose
      first row contains the column names;
    * for each prediction horizon a pipeline is fitted on the first training
      file whose name starts with ``sondefilename`` and pickled into
      ``directory``;
    * the fitted pipeline is evaluated on the test file (same name with
      ``'wo_'`` removed), one fold at a time; each fold appends a metrics
      row to the results CSV and every 10th fold also stores a scatter plot
      and the raw actual/prediction pairs.
    """
    models = ['NN']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    # 'DOcategory', 'pHcategory','ph', 'dissolved_oxygen',
    targets = ['pHcategory']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1

    for model_name in models:
        print(model_name)

        for target in targets:
            # A target is treated as categorical when 'category' occurs in
            # its name at any position other than the very start.
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookOne/output_Cat_' + model_name + '/final_models/'
                header_row = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookOne/output_Reg_' + model_name + '/final_models/'
                header_row = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mpe': 'mpe',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }

            # Targets are one-hot encoded (and decoded again before scoring)
            # only for categorical targets handled by the LSTM/NN models.
            uses_one_hot = cat == 1 and model_name in ('LSTM', 'NN')

            if not os.path.exists(directory):
                os.makedirs(directory)

            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'

            # The dict values are the column names, so writing it without the
            # pandas header yields exactly one header line in the CSV.
            dfheader = pd.DataFrame(data=header_row, index=[0])
            dfheader.to_csv(directoryresult + resultFileName,
                            index=False,
                            header=False)

            path = 'Sondes_data/train/train_data/'
            testpath = 'Sondes_data/test/test_data/'
            method = 'OrgData'

            for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                params = func.trained_param_grid['param_grid_' + model_name +
                                                 str(cat)]
                horizon_params = params['param_' + target + '_' +
                                        str(PrH_index)]
                lags = func.getlags_window(model_name, horizon_params, cat)

                files = [
                    f for f in os.listdir(path)
                    if f.endswith('.csv') and f.startswith(sondefilename)
                ]
                file1 = files[0]
                print(' TH: ' + str(PrH_index) + ' ' + method + ' ' + target +
                      ' ' + file1)

                dataset = pd.read_csv(path + file1)
                train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)
                print(input_dim)

                if uses_one_hot:
                    train_y_grid = to_categorical(train_y_grid, 3)

                start_time = time.time()

                mo = func.getModel(model_name, input_dim, horizon_params,
                                   n_job, cat)

                # RF and DT are fitted on the raw features; every other model
                # gets a StandardScaler in front of it.
                if model_name == 'RF' or model_name == 'DT':
                    pipeline = Pipeline(steps=[('model', mo)])
                else:
                    pipeline = Pipeline(steps=[('n', StandardScaler()),
                                               ('model', mo)])

                # save the model to disk
                filename = model_name+'_model_' + \
                    target+'_'+str(PrH_index)+'.sav'

                if uses_one_hot:
                    clf = pipeline.fit(train_X_grid,
                                       train_y_grid,
                                       model__class_weight={
                                           0: 1,
                                           1: 50,
                                           2: 100
                                       })
                else:
                    clf = pipeline.fit(train_X_grid, train_y_grid)

                # Use a context manager so the file handle is flushed and
                # closed even if pickling fails.
                with open(directory + filename, 'wb') as model_file:
                    pickle.dump(clf, model_file)

                # To load the model, open the file in reading and binary mode
                # load_lr_model =pickle.load(open(filename, 'rb'))

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                start_time = time.time()
                testsondefilename = re.sub('wo_', '', sondefilename)
                files = [
                    f for f in os.listdir(testpath)
                    if f.endswith('.csv') and f.startswith(testsondefilename)
                ]
                file1 = files[0]
                print('Window: ' + str(lags) + ' TH: ' + str(PrH_index) + ' ' +
                      method + ' ' + target + file1)

                dataset = pd.read_csv(testpath + file1)

                test_X_grid, test_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)

                if uses_one_hot:
                    test_y_grid = to_categorical(test_y_grid, 3)

                i = 1
                custom_cv = func.custom_cv_kfolds_testdataonly(
                    test_X_grid, 100)
                for test_index in custom_cv:
                    test_X = test_X_grid[test_index]
                    test_y = test_y_grid[test_index]

                    predictions = clf.predict(test_X)

                    # Undo the one-hot encoding applied above. Guarded by the
                    # same condition as the encoding so that regression
                    # targets are never collapsed by argmax.
                    if uses_one_hot:
                        test_y = argmax(test_y, axis=1)
                        # predictions = argmax(predictions, axis=1)

                    if cat == 1:
                        predictions = np.array(predictions).astype(int)
                        test_y = np.array(test_y).astype(int)
                        test_y = test_y.reshape(len(test_y), )
                        predictions = predictions.reshape(len(predictions), )

                    # Keep a plot and the raw values for every 10th fold only.
                    if i % 10 == 0:
                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions,
                                    s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        fpath = filename + '_CV' + str(i) + file1
                        # 'predictions_' + method+target+'_Window' + str(lags) + '_TH'+str(PrH_index) + \'_CV' + str(i)+file1
                        plt.savefig(directoryresult + fpath + '.jpg')

                        plt.close()
                        fold_values = {
                            'Actual': test_y,
                            'Predictions': predictions
                        }
                        print(test_y.shape)
                        print(predictions.shape)
                        df = pd.DataFrame(data=fold_values)
                        # file1 already ends in '.csv', so fpath does too.
                        df.to_csv(directoryresult + fpath, index=False)

                    cm0 = func.forecast_accuracy(predictions, test_y, cat)

                    if cat == 1:
                        row = {
                            'target_names': target,
                            'method_names': method,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': filename,
                            'F1_0': cm0[0],
                            'F1_1': cm0[1],
                            'P_0': cm0[2],
                            'P_1': cm0[3],
                            'R_0': cm0[4],
                            'R_1': cm0[5],
                            'acc0_1': cm0[6],
                            'F1_0_1': cm0[7],
                            'F1_all': cm0[8],
                            'fbeta': [cm0[9]]
                        }
                    elif cat == 0:
                        row = {
                            'target_names': target,
                            'method_names': method,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': filename,
                            'mape': cm0[0],
                            'me': cm0[1],
                            'mae': cm0[2],
                            'mpe': cm0[3],
                            'rmse': cm0[4],
                            'R2': cm0[5]
                        }

                    # Append this fold's metrics below the header row.
                    df = pd.DataFrame(data=row, index=[0])
                    df.to_csv(directoryresult + resultFileName,
                              index=False,
                              mode='a',
                              header=False)

                    elapsed_time = time.time() - start_time
                    print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                    i = i + 1

                # Drop the last reference to the fitted pipeline before
                # collecting, so the collection can actually reclaim it.
                del clf
                Kb.clear_session()
                gc.collect()
Esempio n. 12
0
def main():
    """Train a 9-step-ahead model per sonde file and score it on test data.

    For every model name in ``models``, target in ``targets`` and training
    file in ``files``:

    * the file and the companion ``bgsusd_all.csv`` are windowed with
      ``split_sequences``, flattened and stacked side by side into one
      feature matrix on which a pipeline is fitted;
    * the same preparation is applied to the files of the same names under
      the test directory, and the fitted pipeline is evaluated fold by fold;
    * each fold appends one metrics row to a per-file results CSV, and every
      10th fold also stores a grid of scatter plots and a CSV with the
      timestamps, actual values and predictions.

    The results CSV starts with a single row holding the column names.
    """
    # models = ['endecodeLSTM', 'CNNLSTM', 'ConvEnLSTM',
    #           'NN', 'SVC', 'RF_onereg', 'DT_onereg']

    models = ['LSTM']  # save the models later
    # 'DOcategory', 'pHcategory','ph', 'dissolved_oxygen',
    targets = ['dissolved_oxygen', 'ph']
    path = 'Sondes_data/train_Summer/'
    testpath = 'Sondes_data/test_Summer/'
    # files = [f for f in os.listdir(path) if f.endswith(
    #     ".csv") and f.startswith('leavon')]  # leavon
    files = ['osugi.csv', 'utlcp.csv', 'leoc_1.csv', 'leavon.csv']
    n_job = -1
    PrH_index = 0

    # Columns kept from every CSV that is read (train and test alike).
    feature_columns = [
        'Water_Temperature_at_Surface', 'ysi_chlorophyll',
        'dissolved_oxygen_saturation', 'dissolved_oxygen', 'ph',
        'year', 'month', 'day', 'hour'
    ]
    # Forecast offsets used to label the n_steps_out output columns.
    step_labels = ['1', '3', '6', '12', '24', '36', '48', '60', '72']

    for model_name in models:
        print(model_name)
        for target in targets:
            print(target)
            # A target is treated as categorical when 'category' occurs in
            # its name at any position other than the very start.
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/2sondes/output_Cat_' + \
                    model_name+'/final_models/'
                header_row = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'window_nuggets': 'window_nuggets',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookThree/2sondes/output_Reg_' + \
                    model_name+'/final_models/'
                header_row = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'window_nuggets': 'window_nuggets',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mse': 'mse',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }

            if not os.path.exists(directory):
                os.makedirs(directory)
            print(directory)
            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)

            for file in files:

                method = 'OrgData'

                params = func.trained_param_grid['param_grid_' + model_name +
                                                 str(cat)]
                model_params = params['param_' + target + '_' +
                                      str(PrH_index)]
                n_steps_in = func.getlags_window(model_name, model_params,
                                                 cat)
                print(n_steps_in)

                dataset = pd.read_csv(path + file)[feature_columns]
                dataset_bgsusd = pd.read_csv(
                    path + 'bgsusd_all.csv')[feature_columns]

                dataset = temporal_horizon(dataset, PrH_index, target)
                dataset_bgsusd = temporal_horizon(dataset_bgsusd, PrH_index,
                                                  target)

                n_steps_out = 9
                train_X_grid, y = split_sequences(dataset, n_steps_in,
                                                  n_steps_out)
                print(train_X_grid.shape)

                n_features = train_X_grid.shape[2]
                print('n_fetures: ' + str(n_features))

                # Only the inputs of the companion file are used; the targets
                # come from the file being processed.
                train_X_grid_bgsusd, _ = split_sequences(
                    dataset_bgsusd, n_steps_in, n_steps_out)

                # Flatten each (window, feature) sample into one row.
                train_X_grid = train_X_grid.reshape(
                    train_X_grid.shape[0],
                    train_X_grid.shape[1] * train_X_grid.shape[2])

                train_X_grid_bgsusd = train_X_grid_bgsusd.reshape(
                    train_X_grid_bgsusd.shape[0],
                    train_X_grid_bgsusd.shape[1] *
                    train_X_grid_bgsusd.shape[2])

                XX = hstack((train_X_grid_bgsusd, train_X_grid))
                # XX = train_X_grid  # for final multivariate training model on LSTM
                print(XX.shape)
                input_dim = XX.shape

                start_time = time.time()

                model = algofind(model_name, input_dim, cat, n_features,
                                 n_steps_out, model_params, n_job)

                # RF and DT are fitted on the raw features; every other model
                # gets a StandardScaler in front of it.
                if model_name == 'RF' or model_name == 'DT':
                    pipeline = Pipeline(steps=[('model', model)])
                else:
                    pipeline = Pipeline(
                        steps=[('n', StandardScaler()), ('model', model)])

                if model_name == 'ConvEnLSTM' or model_name == 'endecodeLSTM' or model_name == 'CNNLSTM':
                    clf = pipeline.fit(XX, y.reshape(y.shape[0], 1,
                                                     n_steps_out))
                else:
                    clf = pipeline.fit(XX, y)

                # The fitted pipeline is intentionally not persisted here
                # (see the "save the models later" note on ``models``).

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                # testfiles = ['lelorain.csv', 'utlcp.csv',
                #              'lementor_1.csv', 'lebiww.csv']
                # for testfile in testfiles:
                testfile = file
                result_filename = 'results_'+testfile+'_'+target + \
                    '_'+file+'_'+str(time.time())+'.csv'

                # The dict values are the column names, so the pandas header
                # is suppressed to get exactly one header line. A dedicated
                # ``header_row`` (never reassigned below) guarantees every
                # file's CSV starts with the names rather than with metrics
                # left over from the previous file.
                dfheader = pd.DataFrame(data=header_row, index=[0])
                dfheader.to_csv(directory + result_filename,
                                index=False,
                                header=False)

                dataset = pd.read_csv(testpath + testfile)[feature_columns]
                dataset_bgsusd = pd.read_csv(
                    testpath + 'bgsusd_all.csv')[feature_columns]

                dataset = temporal_horizon(dataset, PrH_index, target)
                dataset_bgsusd = temporal_horizon(dataset_bgsusd, PrH_index,
                                                  target)

                test_X_grid, y = split_sequences(dataset, n_steps_in,
                                                 n_steps_out)
                test_X_grid_bgsusd, _ = split_sequences(
                    dataset_bgsusd, n_steps_in, n_steps_out)

                test_X_grid = test_X_grid.reshape(
                    test_X_grid.shape[0],
                    test_X_grid.shape[1] * test_X_grid.shape[2])

                test_X_grid_bgsusd = test_X_grid_bgsusd.reshape(
                    test_X_grid_bgsusd.shape[0],
                    test_X_grid_bgsusd.shape[1] * test_X_grid_bgsusd.shape[2])

                test_XX = hstack((test_X_grid_bgsusd, test_X_grid))
                # test_XX = test_X_grid

                i_cv = 1
                custom_cv = func.custom_cv_kfolds_testdataonly(test_XX, 100)
                for test_index in custom_cv:
                    test_X = test_XX[test_index]
                    test_y = y[test_index]

                    # The last four columns of the stacked feature matrix are
                    # read as year / month / day / hour of each sample.
                    dftime = pd.DataFrame({
                        'year': np.array(test_X[:, -4]).astype(int),
                        'month': np.array(test_X[:, -3]).astype(int),
                        'day': np.array(test_X[:, -2]).astype(int),
                        'hour': np.array(test_X[:, -1]).astype(int),
                    })
                    df_time = pd.to_datetime(dftime, format='%Y%m%d %H')

                    predictions = clf.predict(test_X)
                    predictions = predictions.reshape(-1, n_steps_out)

                    fpath = 'predictions_' + method+target+'_Window' +\
                        str(n_steps_in) + '_CV' + str(i_cv)+testfile

                    # Keep plots and raw values for every 10th fold only.
                    if i_cv % 10 == 0:
                        _, ax = plt.subplots(nrows=5,
                                             ncols=2,
                                             figsize=(50, 50))
                        titles = ['t+' + label for label in step_labels]
                        # ax.flat walks the 5x2 grid row by row; the tenth
                        # panel stays empty because there are nine outputs.
                        for k, (panel, title) in enumerate(
                                zip(ax.flat, titles)):
                            panel.scatter(df_time.values, test_y[:, k])
                            panel.scatter(df_time.values, predictions[:, k])
                            panel.set_title(title)
                            panel.legend(['actual', 'prediction'])

                        plt.savefig(directoryresult + fpath + '.png')
                        plt.close()

                        df_actual = pd.DataFrame(
                            data=test_y,
                            columns=['a+' + label for label in step_labels])
                        df_predictions = pd.DataFrame(
                            data=predictions,
                            columns=['p+' + label for label in step_labels])

                        # concatenate dataframes
                        df = pd.concat(
                            [df_time, df_actual, df_predictions], axis=1)
                        # testfile already ends in '.csv', so fpath does too.
                        df.to_csv(directoryresult + fpath, index=False)

                    # One row of six metrics per forecast step.
                    cm0 = np.zeros((n_steps_out, 6))
                    for t in range(n_steps_out):
                        cm0[t, :] = func.forecast_accuracy(
                            predictions[:, t], test_y[:, t], cat)

                    if cat == 1:
                        # NOTE(review): cm0 has n_steps_out (9) rows of 6
                        # columns, so cm0[9] below raises IndexError; this
                        # branch needs rework before category targets are
                        # enabled for this script.
                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps_in,
                            'temporalhorizons': PrH_index,
                            'CV': i_cv,
                            'file_names': testfile,
                            'F1_0': cm0[0],
                            'F1_1': cm0[1],
                            'P_0': cm0[2],
                            'P_1': cm0[3],
                            'R_0': cm0[4],
                            'R_1': cm0[5],
                            'acc0_1': cm0[6],
                            'F1_0_1': cm0[7],
                            'F1_all': cm0[8],
                            'fbeta': [cm0[9]]
                        }
                    elif cat == 0:
                        # Each cell holds the metric for all forecast steps,
                        # wrapped in a list so it lands in a single row.
                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps_in,
                            'temporalhorizons': PrH_index,
                            'CV': i_cv,
                            'file_names': testfile,
                            'mape': [cm0[:, 0]],
                            'me': [cm0[:, 1]],
                            'mae': [cm0[:, 2]],
                            'mse': [cm0[:, 3]],
                            'rmse': [cm0[:, 4]],
                            'R2': [cm0[:, 5]]
                        }

                    # Append this fold's metrics below the header row.
                    df = pd.DataFrame(data=row, index=[0])
                    df.to_csv(directory + result_filename,
                              index=False,
                              mode='a',
                              header=False)

                    i_cv = i_cv + 1
def main():
    """Cross-validate ARIMA orders on the training sonde and log the metrics.

    For each target, the first training CSV whose name starts with
    ``sondefilename`` is loaded for every window size / prediction horizon
    pair, split into folds by ``custom_cv_2folds``, and every (p, d, q)
    combination is fitted per fold, except those where all three values are
    equal. Each fitted combination produces a scatter plot, a CSV with the
    actual and predicted values, and one metrics row appended to the results
    CSV, whose first row holds the column names.
    """
    models = ['ARIMA']
    targets = ['dissolved_oxygen']  # , 'DOcategory', 'pHcategory']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1
    # evaluate parameters
    p_values = range(1, 3)
    d_values = range(0, 2)
    q_values = range(0, 3)

    # Columns shared by both result layouts, followed by the metric columns
    # in the order in which forecast_accuracy's results are indexed.
    shared_columns = [
        'target_names', 'method_names', 'window_nuggets', 'temporalhorizons',
        'CV', 'file_names'
    ]
    category_metrics = [
        'F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1', 'acc0_1', 'F1_0_1',
        'F1_all', 'fbeta'
    ]
    regression_metrics = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

    for model_name in models:
        print(model_name)

        for target in targets:
            # Categorical when 'category' occurs anywhere but at position 0.
            cat = 1 if target.find('category') > 0 else 0
            if cat == 1:
                directory = 'Results/bookTwo/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = category_metrics
            else:
                directory = 'Results/bookTwo/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = regression_metrics

            if not os.path.exists(directory):
                os.makedirs(directory)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'

            # Each value equals its key, so this row doubles as the header.
            header = {
                name: name
                for name in shared_columns + metric_columns
            }
            pd.DataFrame(data=header, index=[0]).to_csv(
                directory + resultFileName, index=False, header=False)

            path = 'Sondes_data/train/train_data/'
            method = 'OrgData'

            for n_steps in (1, ):
                for PrH_index in (1, 3, 6):
                    candidates = [
                        name for name in os.listdir(path)
                        if name.endswith('.csv')
                        and name.startswith(sondefilename)
                    ]
                    file = candidates[0]
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    dataset = pd.read_csv(path + file)
                    ######################
                    # FOR AR and ARIMA
                    ######################
                    train = dataset[target]
                    custom_cv = custom_cv_2folds(train, 1, PrH_index)

                    ######################
                    # FOR MA
                    ######################
                    # dataset = temporal_horizon(dataset, PrH_index, target)
                    # train = dataset[target]
                    # train_target = dataset['Target_'+target]
                    # custom_cv = func.custom_cv_2folds(train, 3)

                    ######################
                    # Cross Validation sets
                    ######################
                    for i, (train_index, test_index) in enumerate(custom_cv):
                        train_y = train[train_index].values
                        test_y = train[test_index].values

                        # FOR ARIMA
                        for p in p_values:
                            for d in d_values:
                                for q in q_values:
                                    print(p, d, q)
                                    # Combinations with p == d == q are only
                                    # reported, never fitted.
                                    if p == q and d == q:
                                        continue

                                    predictions = ARIMAregression(
                                        train_y, test_y, p, d, q)
                                    if cat == 1:
                                        predictions = np.array(
                                            predictions).astype(int)

                                    fpath = 'predictions_' + method+target+'_Window' + \
                                        str(n_steps) + '_TH' + \
                                        str(PrH_index)+'_CV' + str(i) + \
                                        '_vals_'+str(p)+'_'+str(d) + \
                                        '_'+str(q)+'_'+file

                                    cm0 = func.forecast_accuracy(
                                        predictions, test_y, cat)

                                    # Actual vs. predicted, by position.
                                    plt.scatter(np.arange(len(test_y)),
                                                test_y,
                                                s=1)
                                    plt.scatter(np.arange(len(predictions)),
                                                predictions,
                                                s=1)
                                    plt.legend(['actual', 'predictions'],
                                               loc='upper right')
                                    plt.savefig(directory + fpath + '.jpg')
                                    plt.close()

                                    # fpath ends with the source file name,
                                    # which already carries '.csv'.
                                    values = pd.DataFrame(
                                        data={
                                            'Actual': test_y,
                                            'Predictions': predictions
                                        })
                                    values.to_csv(directory + fpath,
                                                  index=False)

                                    row = {
                                        'target_names': target,
                                        'method_names': method,
                                        'window_nuggets': n_steps,
                                        'temporalhorizons': PrH_index,
                                        'CV': i,
                                        'file_names': fpath,
                                    }
                                    for position, metric in enumerate(
                                            metric_columns):
                                        row[metric] = cm0[position]

                                    # Append below the header row.
                                    pd.DataFrame(data=row, index=[0]).to_csv(
                                        directory + resultFileName,
                                        index=False,
                                        mode='a',
                                        header=False)
Esempio n. 14
0
def main():
    """Fit an AR model per target and horizon and score it on test folds.

    For every target and temporal horizon the first training CSV whose name
    starts with ``sondefilename`` is read, ``AutoRegression`` is fitted on
    each training fold and the returned coefficients / lag values are saved
    as ``.npy`` files.  The model is then rolled forward over every test
    fold (one-step prediction, then the observed value is appended to the
    history) and one row of accuracy metrics per fold is appended to a
    timestamped results CSV.  Every tenth fold additionally gets a scatter
    plot and a CSV with the actual and predicted values.

    Raises:
        IndexError: if no matching training or test CSV file is found.
    """
    models = ['AR']
    targets = ['ph', 'dissolved_oxygen']  # , 'pHcategory', 'DOcategory']
    sondefilename = 'leavon'
    path = 'Sondes_data/train_Summer/'
    testpath = 'Sondes_data/test_Summer/'
    method = 'OrgData'

    # Column layout of the results file: identifying columns, then metrics.
    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names']
    cat_metrics = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                   'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
    reg_metrics = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

    # The input listings depend on neither model, target nor horizon, so
    # they are resolved once instead of on every loop iteration.
    train_files = [f for f in os.listdir(path)
                   if f.endswith('.csv') and f.startswith(sondefilename)]
    train_file = train_files[0]
    testsondefilename = re.sub('wo_', '', sondefilename)
    test_files = [f for f in os.listdir(testpath)
                  if f.endswith('.csv') and f.startswith(testsondefilename)]
    test_file = test_files[0]

    for model_name in models:
        print(model_name)

        for target in targets:
            # A target is categorical when 'category' occurs in its name
            # at any position other than the very start.
            cat = 1 if target.find('category') > 0 else 0
            metric_names = cat_metrics if cat == 1 else reg_metrics
            directory = 'Results/bookThree/1sonde/output_' + \
                ('Cat' if cat == 1 else 'Reg') + '_' + \
                model_name + '/final_models/'
            directoryresult = directory + 'Results/'
            # Creates ``directory`` too; exist_ok avoids the race between
            # checking for the directory and creating it.
            os.makedirs(directoryresult, exist_ok=True)

            # The first row of the results file holds the column names
            # (written as data, hence header=False).
            resultFileName = 'results_'+target+str(time.time())+'.csv'
            columns = id_columns + metric_names
            dfheader = pd.DataFrame(
                data={name: name for name in columns}, index=[0])
            dfheader.to_csv(directoryresult+resultFileName,
                            index=False, header=False)

            for n_steps in [1]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48, 60, 72]:
                    print('Window: '+str(n_steps) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target)

                    dataset = pd.read_csv(path+train_file)

                    ######################
                    # Training: fit and persist the AR model
                    ######################
                    train = dataset[target]
                    train_folds = custom_cv_kfolds_testdataonly(
                        train, 1, PrH_index)
                    for train_index in train_folds:
                        train_y = train[train_index].values

                        coef, lag = AutoRegression(train_y)
                        np.save(directory+'AR_model_'+target +
                                '_'+str(PrH_index)+'.npy', coef)
                        np.save(directory+'AR_data_'+target +
                                '_'+str(PrH_index)+'.npy', lag)

                        ######################
                        # TEST sets
                        ######################
                        print('Window: ' + str(len(lag)) + ' TH: ' +
                              str(PrH_index)+' '+method+' '+target+test_file)

                        testdataset = pd.read_csv(testpath+test_file)
                        test = testdataset[target]
                        window = len(coef)

                        test_folds = custom_cv_kfolds_testdataonly(
                            test, 100, PrH_index)
                        for fold, test_index in enumerate(test_folds, start=1):
                            test_y = test[test_index].values

                            # Every fold restarts from the lag values
                            # returned by the fit, copied into a list.
                            history = [lag[k] for k in range(len(lag))]
                            predictions = []

                            for observed in test_y:
                                length = len(history)
                                # Index-based selection is kept on purpose:
                                # when ``window`` exceeds the history length
                                # the negative indices wrap around, which a
                                # slice would not reproduce.
                                recent = [history[k] for k in
                                          range(length-window, length)]
                                predictions.append(
                                    predict(coef, recent, window))
                                # Walk forward using the observed value.
                                history.append(observed)

                            if cat == 1:
                                predictions = np.array(
                                    predictions).astype(int)

                            fpath = 'predictions_' + method+target+'_Window' + \
                                str(n_steps) + '_TH' + \
                                str(PrH_index)+'_CV' + str(fold) + train_file

                            cm0 = func.forecast_accuracy(
                                predictions, test_y, cat)

                            # Only every tenth fold is plotted / dumped.
                            if fold % 10 == 0:
                                plt.scatter(np.arange(len(test_y)),
                                            test_y, s=1)
                                plt.scatter(np.arange(len(predictions)),
                                            predictions, s=1)
                                plt.legend(['actual', 'predictions'],
                                           loc='upper right')
                                plt.savefig(directoryresult+fpath+'.jpg')
                                plt.close()

                                fold_values = pd.DataFrame(
                                    data={'Actual': test_y,
                                          'Predictions': predictions})
                                fold_values.to_csv(directoryresult +
                                                   fpath, index=False)

                            row = {'target_names': target,
                                   'method_names': method,
                                   'window_nuggets': n_steps,
                                   'temporalhorizons': PrH_index,
                                   'CV': fold,
                                   'file_names': fpath}
                            for position, name in enumerate(metric_names):
                                row[name] = cm0[position]
                            if cat == 1:
                                # fbeta stays wrapped in a one-element list,
                                # as before, so it fills exactly one cell.
                                row['fbeta'] = [row['fbeta']]

                            df = pd.DataFrame(data=row, index=[0])
                            df.to_csv(directoryresult+resultFileName,
                                      index=False, mode='a', header=False)
def _cv_metric_names(cat):
    """Return the metric column names for categorical (1) or regression runs."""
    if cat == 1:
        return ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
    return ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']


def _cv_fold_timeline(test_X):
    """Build the datetime Series of a test fold.

    Columns 0-3 of ``test_X`` are read as year, month, day and hour.
    """
    df_time = pd.DataFrame({
        'year': np.array(test_X[:, 0]).astype(int),
        'month': np.array(test_X[:, 1]).astype(int),
        'day': np.array(test_X[:, 2]).astype(int),
        'hour': np.array(test_X[:, 3]).astype(int),
    })
    return pd.to_datetime(df_time, format='%Y%m%d %H')


def _cv_plot_fold(directory, filename, timeline, actual, predictions):
    """Save an actual-vs-predictions scatter plot as ``<filename>.jpg``.

    The image goes to ``directory + 'more/'`` (created when missing); that
    directory is returned so the caller can store the fold values beside it.
    """
    plt.scatter(timeline.values, actual, s=1)
    plt.scatter(timeline.values, predictions, s=1)
    plt.legend(['actual', 'predictions'], loc='upper right')
    plt.xticks(rotation=45)

    directorydeeper = directory + 'more/'
    os.makedirs(directorydeeper, exist_ok=True)
    plt.savefig(directorydeeper + filename + '.jpg')
    plt.close()
    return directorydeeper


def _cv_append_metrics(results_path, cat, fold, target, method, PrH_index,
                       n_steps, filename, cm0):
    """Append one row of fold metrics to the results CSV at ``results_path``.

    The row follows the header layout, including the ``config`` column,
    which is left empty because the callers have no configuration to report.
    """
    row = {
        'CV': fold,
        'target_names': target,
        'method_names': method,
        'temporalhorizons': PrH_index,
        'window_nuggets': n_steps,
        # Keeps the remaining values under the right header columns.
        'config': '',
        'file_names': filename,
    }
    for position, name in enumerate(_cv_metric_names(cat)):
        row[name] = cm0[position]
    if cat == 1:
        # fbeta stays wrapped in a one-element list, as before, so it fills
        # exactly one cell.
        row['fbeta'] = [row['fbeta']]

    df = pd.DataFrame(data=row, index=[0])
    df.to_csv(results_path, index=False, mode='a', header=False)


def main():
    """Cross-validate univariate forecasting models on the training sondes.

    For every model / target / training file / temporal horizon the CSV is
    reduced to its time columns plus the target and split into folds:

    * ``MA`` uses ``func.preparedata`` and ``ufunc.movingAverage``;
    * ``BL`` predicts each value with the previous one of the test fold;
    * ``AR`` uses ``ufunc.AutoRegression``;
    * ``ETS`` / ``ARIMA`` / ``SARIMA`` call ``ufunc.score_model`` once per
      candidate configuration (it is handed the output directory and results
      file name, so it presumably writes its own rows - confirm).

    MA, BL and AR save a plot and a predictions CSV per fold under
    ``<directory>more/`` and append one metrics row to the results file.
    """
    method = 'OrgData'

    # 'DOcategory', 'pHcategory',ysi_blue_green_algae (has negative values for leavon... what does negative mean!?)
    # 'ysi_blue_green_algae']  # , 'dissolved_oxygen', 'ph']
    targets = ['ph']
    # 'ARIMA', 'SARIMA', 'ETS', 'AR', 'MA'
    models = ['SARIMA']
    path = 'Sondes_data/train_Summer/'
    files = [
        f for f in os.listdir(path)
        if f.endswith(".csv") and f.startswith('leavon')
    ]  # leavon bgsusd_all

    id_columns = ['CV', 'target_names', 'method_names', 'temporalhorizons',
                  'window_nuggets', 'config', 'file_names']

    for model_name in models:
        for target in targets:
            # A target is categorical when 'category' occurs in its name
            # at any position other than the very start.
            cat = 1 if target.find('category') > 0 else 0
            directory = 'Results/bookThree/output_' + \
                ('Cat' if cat == 1 else 'Reg') + '_' + \
                model_name+'/oversampling_cv_models/'
            # exist_ok avoids the race between checking and creating.
            os.makedirs(directory, exist_ok=True)

            # Column-name template.  It has its own name so the per-fold
            # rows written below can never replace it for the next file.
            header_row = {
                name: name for name in id_columns + _cv_metric_names(cat)
            }

            for file in files:
                print(file)
                result_filename = 'results_'+target + \
                    '_'+file + '_'+str(time.time())+'.csv'
                # NOTE: to_csv also emits the keys as a header line, so the
                # column names appear twice (header line + first data row).
                dfheader = pd.DataFrame(data=header_row, index=[0])
                dfheader.to_csv(directory + result_filename, index=False)
                n_steps = 1

                for PrH_index in [1, 3, 6, 12, 24, 36]:

                    dataset = pd.read_csv(path + file)

                    # Only the Target
                    dataset = dataset[['year', 'month', 'day', 'hour', target]]

                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    i = 1

                    if model_name == 'MA':
                        train_X_grid, train_y_grid, _, _ = func.preparedata(
                            dataset, PrH_index, n_steps, target, cat)

                        start_time = time.time()
                        # For Train files:
                        custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                        for train_index, test_index in custom_cv:
                            train_X = train_X_grid[train_index]
                            train_y = train_y_grid[train_index]
                            train_X_uni = train_X[:, -1]

                            test_X = train_X_grid[test_index]
                            # actual future values
                            test_X_uni = test_X[:, -1]
                            test_y = train_y_grid[test_index]

                            predictions = ufunc.movingAverage(
                                train_X_uni, train_y, test_X_uni, test_y)

                            timeline = _cv_fold_timeline(test_X)

                            if cat == 1:
                                predictions = np.array(predictions).astype(int)
                                test_y = np.array(test_y).astype(int)

                            cm0 = func.forecast_accuracy(
                                predictions, test_y, cat)

                            filename = file + '_' + \
                                target+'_TH' + \
                                str(PrH_index)+'_lag' + \
                                str(n_steps)+'_'+str(i)

                            directorydeeper = _cv_plot_fold(
                                directory, filename, timeline,
                                test_y, predictions)

                            fold_values = pd.DataFrame(data={
                                'time': timeline,
                                'Actual': test_y,
                                'Predictions': predictions
                            })
                            fold_values.to_csv(
                                directorydeeper + filename + '.csv',
                                index=False)

                            _cv_append_metrics(
                                directory + result_filename, cat, i, target,
                                method, PrH_index, n_steps, filename, cm0)
                            i = i + 1

                            elapsed_time = time.time() - start_time
                            print(
                                time.strftime("%H:%M:%S",
                                              time.gmtime(elapsed_time)))

                    if model_name in ('ARIMA', 'AR', 'ETS', 'SARIMA', 'BL'):
                        start_time = time.time()
                        train_X_grid = dataset.values
                        custom_cv = ufunc.custom_cv_2folds(
                            train_X_grid, 1, PrH_index)

                        ######################
                        # Cross Validation sets
                        ######################
                        i = 1
                        for train_index, test_index in custom_cv:
                            train_X = train_X_grid[train_index]
                            train_X_uni = train_X[:, -1]

                            test_X = train_X_grid[test_index]
                            # actual future values
                            test_X_uni = test_X[:, -1]

                            timeline = _cv_fold_timeline(test_X)

                            if model_name == 'BL':
                                # Baseline: the prediction for each step is
                                # the value one step earlier.  DataFrames
                                # are used so rows can be dropped by index.
                                test_X_uni = pd.DataFrame(test_X_uni)

                                # Actual values: everything but the first
                                # row, re-indexed from zero.
                                target_values = test_X_uni.drop(
                                    test_X_uni.index[0:1], axis=0)
                                target_values.index = np.arange(
                                    0, len(target_values))

                                # Predictions: everything but the last row.
                                predictions = test_X_uni.drop(
                                    test_X_uni.index[len(test_X_uni) -
                                                     1:len(test_X_uni)],
                                    axis=0)
                                test_X_uni = target_values

                                # Trim the timeline to the same length.
                                timeline = timeline.drop(
                                    timeline.index[len(timeline) -
                                                   1:len(timeline)],
                                    axis=0)

                                cm0 = func.forecast_accuracy(
                                    predictions, test_X_uni, cat)

                                filename = file + '_' + \
                                    target+'_TH' + \
                                    str(PrH_index)+'_lag' + \
                                    str(n_steps)+'_'+str(i)

                                directorydeeper = _cv_plot_fold(
                                    directory, filename, timeline,
                                    test_X_uni, predictions)

                                print(predictions.head())
                                print(test_X_uni.head())
                                print(timeline.head())

                                frames = [timeline, test_X_uni, predictions]
                                fold_values = pd.concat(frames, axis=1)
                                fold_values.to_csv(
                                    directorydeeper + filename + '.csv',
                                    index=False,
                                    header=['time', 'Actual', 'Predictions'])

                                _cv_append_metrics(
                                    directory + result_filename, cat, i,
                                    target, method, PrH_index, n_steps,
                                    filename, cm0)

                            if model_name == 'AR':
                                predictions = ufunc.AutoRegression(
                                    train_X_uni, test_X_uni)
                                if cat == 1:
                                    predictions = np.array(predictions).astype(
                                        int)
                                    test_X_uni = np.array(test_X_uni).astype(
                                        int)

                                cm0 = func.forecast_accuracy(
                                    predictions, test_X_uni, cat)

                                filename = file + '_' + \
                                    target+'_TH' + \
                                    str(PrH_index)+'_lag' + \
                                    str(n_steps)+'_'+str(i)

                                directorydeeper = _cv_plot_fold(
                                    directory, filename, timeline,
                                    test_X_uni, predictions)

                                fold_values = pd.DataFrame(data={
                                    'time': timeline,
                                    'Actual': test_X_uni,
                                    'Predictions': predictions
                                })
                                fold_values.to_csv(
                                    directorydeeper + filename + '.csv',
                                    index=False)

                                _cv_append_metrics(
                                    directory + result_filename, cat, i,
                                    target, method, PrH_index, n_steps,
                                    filename, cm0)

                            # Grid-searched models: pick the candidate
                            # configurations, then score each one on this
                            # fold.  The list stays empty for BL and AR.
                            cfg_list = list()
                            if model_name == 'ETS':
                                cfg_list = ufunc.exp_smoothing_configs()
                            elif model_name == 'ARIMA':
                                cfg_list = ufunc.ARIMA_configs()
                            elif model_name == 'SARIMA':
                                cfg_list = ufunc.sarima_configs()

                            scores = [
                                ufunc.score_model(model_name, train_X_uni,
                                                  test_X_uni, cfg, cat,
                                                  directory, file, target,
                                                  PrH_index, n_steps, i,
                                                  result_filename,
                                                  timeline)
                                for cfg in cfg_list
                            ]

                            i = i + 1
                            elapsed_time = time.time() - start_time
                            print(
                                time.strftime("%H:%M:%S",
                                              time.gmtime(elapsed_time)))