# NOTE: imports assumed by the standalone examples below; 'func' (project
# helpers such as preparedata, getModel, getlags_window, trained_param_grid,
# custom_cv_kfolds_testdataonly and forecast_accuracy), 'temporal_horizon',
# 'split_sequences', 'predict', 'movingAverage' and 'algofind' are
# project-local utilities not shown here.
import gc
import os
import pickle
import re
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import argmax, hstack
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import backend as Kb  # backend alias 'Kb' as used below
from tensorflow.keras.utils import to_categorical


def main():

    models = ['RF']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    targets = ['dissolved_oxygen', 'ph']  # ['DOcategory', 'pHcategory']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1

    for model_name in models:
        print(model_name)

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/balance_data/output_Cat_' + model_name+'/final_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names', 'F1_0': 'F1_0', 'F1_1': 'F1_1', 'P_0': 'P_0', 'P_1': 'P_1', 'R_0': 'R_0', 'R_1': 'R_1', 'acc0_1': 'acc0_1', 'F1_0_1': 'F1_0_1'}
            else:
                cat = 0
                directory = 'Results/balance_data/output_Reg_' + model_name+'/final_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names',  'mape': 'mape', 'me': 'me', 'mae': 'mae', 'mpe': 'mpe', 'rmse': 'rmse',  'R2': 'R2'}

            if not os.path.exists(directory):
                os.makedirs(directory)

            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)

            resultFileName = 'results_'+target+str(time.time())+'.csv'

            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directoryresult+resultFileName,
                            index=False, header=False)

            if model_name == 'DT' or model_name == 'RF':
                method = 'OrgData'
                path = 'Sondes_data/train/train_data/'
                testpath = 'Sondes_data/test/test_data/'
            else:
                method = 'StandardScaler'
                path = 'Sondes_data/train/train_data_normalized/'+method+'/'+target+'/'
                testpath = 'Sondes_data/test/test_data_normalized/' + method+'/'+target+'/'

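            # One final model is trained per prediction horizon (in time steps
            # ahead), using the hyperparameters stored in func.trained_param_grid
            # for that target and horizon.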
            for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                params = func.trained_param_grid[
                    'param_grid_'+model_name+str(cat)]
                lags = func.getlags_window(
                    model_name, params['param_'+target+'_'+str(PrH_index)], cat)

                files = [f for f in os.listdir(path) if f.endswith(
                    '.csv') and f.startswith(sondefilename)]
                file1 = files[0]
                print(' TH: ' +
                      str(PrH_index)+' '+method+' '+target+' '+file1)

                dataset = pd.read_csv(path+file1)
                train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)

                if model_name == 'LSTM' or model_name == 'NN':
                    n_job = 1

                start_time = time.time()

                clf = func.getModel(
                    model_name, input_dim, params['param_'+target+'_'+str(PrH_index)], n_job, cat)

                print('clf: '+str(clf))

                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    train_y_grid = to_categorical(train_y_grid, 3)
                    clf = clf.fit(train_X_grid, train_y_grid,
                                  model__class_weight={0: 1, 1: 50, 2: 100})
                else:
                    clf = clf.fit(train_X_grid, train_y_grid)

                # save the model to disk
                filename = model_name+'_model_' + \
                    target+'_'+str(PrH_index)+'.sav'
                joblib.dump(clf, directory+filename)

                # if model_name == 'RF' or model_name=='DT':
                #     featurenames = func.setfeatures(features, lags)

                #     if not os.path.exists(directory+'trees/'):
                #         os.makedirs(directory+'trees/')

                #     i_tree = 0
                #     class_names = ['0', '1', '2']
                #     print(len(clf))
                #     for tree_in_forest in clf:
                #         dot_data = tree.export_graphviz(tree_in_forest, out_file=None,
                #                                         feature_names=featurenames,
                #                                         class_names=class_names,
                #                                         filled=True, rounded=True,
                #                                         special_characters=True)
                #         graph = pydotplus.graph_from_dot_data(dot_data)
                #         graph.write_pdf(
                #             directory+'trees/tree_'+filename+str(i_tree)+".pdf")
                #         i_tree = i_tree + 1
                #         if(i_tree > 1):
                #             break

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                start_time = time.time()
                testsondefilename = re.sub('wo_', '', sondefilename)
                files = [f for f in os.listdir(testpath) if f.endswith(
                    '.csv')and f.startswith(testsondefilename)]
                file1 = files[0]
                print('Window: '+str(lags) + ' TH: ' +
                      str(PrH_index)+' '+method+' '+target+file1)

                dataset = pd.read_csv(testpath+file1)

                test_X_grid, test_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)

                i = 1
                custom_cv = func.custom_cv_kfolds_testdataonly(
                    test_X_grid, 100)
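                # Score the fitted model on 100 folds of the held-out test set;
                # every 10th fold is also plotted and saved as a .jpg.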
                for test_index in custom_cv:
                    test_X = test_X_grid[test_index]
                    test_y = test_y_grid[test_index]

                    predictions = clf.predict(test_X)

                    if model_name == 'LSTM' or model_name == 'NN':
                        test_y = argmax(test_y, axis=1)
                        # predictions = argmax(predictions, axis=1)

                    # test_y = test_y.astype(int)
                    # predictions = predictions.astype(int)

                    if i % 10 == 0:
                        plt.scatter(np.arange(len(test_y)),
                                    test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(lags) + '_TH'+str(PrH_index) + \
                            '_CV' + str(i)+file1
                        plt.savefig(directoryresult+fpath+'.jpg')

                        plt.close()
                    #     data = {'Actual': test_y, 'Predictions': predictions}
                    #     print(test_y.shape)
                    #     print(predictions.shape)
                    #     if model_name == 'RF':
                    #         df = pd.DataFrame(data=data)
                    #     else:
                    #         df = pd.DataFrame(data=data, index=[0])

                    #     df.to_csv(directoryresult+filename +
                    #             '_CV'+str(i)+'.csv', index=False)

                    cm0 = func.forecast_accuracy(predictions, test_y, cat)

                    if cat == 1:
                        data = {'target_names': target, 'method_names': method, 'temporalhorizons': PrH_index, 'CV': i,
                                'file_names': filename,  'F1_0': cm0[0], 'F1_1': cm0[1], 'P_0': cm0[2], 'P_1': cm0[3], 'R_0': cm0[4], 'R_1': cm0[5], 'acc0_1': cm0[6], 'F1_0_1': cm0[7]}
                    elif cat == 0:
                        data = {'target_names': target, 'method_names': method, 'temporalhorizons': PrH_index, 'CV': i,
                                'file_names': filename, 'mape': cm0[0], 'me': cm0[1], 'mae': cm0[2], 'mpe': cm0[3], 'rmse': cm0[4], 'R2': cm0[5]}

                    df = pd.DataFrame(data=data, index=[0])
                    df.to_csv(directoryresult+resultFileName,
                              index=False, mode='a', header=False)

                    elapsed_time = time.time() - start_time
                    print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                    i = i+1
                Kb.clear_session()
                gc.collect()
                del clf
def main():
    methods = ['OrgData']
    # 'dissolved_oxygen', 'ph', 'DOcategory', 'pHcategory']
    targets = ['ysi_blue_green_algae']
    model_name = 'baseline'
    # test_Summer train_Summer  # bookTwo: Sondes_data/old/test/test_data/
    path = 'Sondes_data/test_Summer/'
    files = [f for f in os.listdir(path) if f.endswith(".csv")]

    for method in methods:
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name+'/final_models/Results/'  # final_models/Results  oversampling_cv_models/ #2
                data = {
                    'CV': 'CV',
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'window_nuggets': 'window_nuggets',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name+'/final_models/Results/'  # final_models/Results  oversampling_cv_models  #3
                data = {
                    'CV': 'CV',
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'window_nuggets': 'window_nuggets',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mpe': 'mpe',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }
            if not os.path.exists(directory):
                os.makedirs(directory)
            for file in files:
                print(file)
                result_filename = 'results_' + target + '_' + file
                dfheader = pd.DataFrame(data=data, index=[0])
                dfheader.to_csv(directory + result_filename, index=False)
                n_steps = 1
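                # n_steps = 1: the persistence baseline only needs the current
                # observation, which is carried forward as the prediction at
                # every horizon.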

                for PrH_index in [1, 3, 6, 12, 24, 36, 48, 60, 72]:

                    dataset = pd.read_csv(path + file)

                    # Only the Target
                    dataset = dataset[['year', 'month', 'day', 'hour', target]]

                    # dataset = dataset.dropna()
                    # print(dataset.head())

                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)
                    # print(train_y_grid[0:1])

                    start_time = time.time()

                    i = 1
                    # For Test files: #4
                    custom_cv = func.custom_cv_kfolds_testdataonly(
                        train_X_grid, 100)
                    for test_index in custom_cv:

                        # For Train files:
                        # custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                        # for train_index, test_index in custom_cv:
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        # current value would be the same in the future predictions
                        predictions = test_X[:, -1]

                        df_time = pd.DataFrame({
                            'year':
                            np.array(test_X[:, 0]).astype(int),
                            'month':
                            np.array(test_X[:, 1]).astype(int),
                            'day':
                            np.array(test_X[:, 2]).astype(int),
                            'hour':
                            np.array(test_X[:, 3]).astype(int),
                        })
                        # print(df_time.head())

                        timeline = pd.to_datetime(df_time, format='%Y%m%d %H')
                        # print(timeline.head())

                        # timeline = timeline.reshape(len(time),)

                        if cat == 1:
                            predictions = np.array(predictions).astype(int)
                            test_y = np.array(test_y).astype(int)

                        test_y = test_y.reshape(len(test_y), )
                        predictions = predictions.reshape(len(predictions), )

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        filename = file + '_' + \
                            target+'_TH' + \
                            str(PrH_index)+'_lag' + \
                            str(n_steps)+'_'+str(i)

                        # First test files
                        if i % 10 == 0:  # or i <= 3:  # 5
                            plt.scatter(timeline.values, test_y, s=1)
                            plt.scatter(timeline.values, predictions, s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.xticks(rotation=45)

                            directorydeeper = directory + 'more/'
                            if not os.path.exists(directorydeeper):
                                os.makedirs(directorydeeper)
                            plt.savefig(directorydeeper + filename + '.jpg')

                            # plt.show()

                            plt.close()
                            data = {
                                'time': timeline,
                                'Actual': test_y,
                                'Predictions': predictions
                            }
                            df = pd.DataFrame(data=data)

                            df.to_csv(directorydeeper + filename + '.csv',
                                      index=False)

                        if cat == 1:
                            data = {
                                'CV': i,
                                'target_names': target,
                                'method_names': method,
                                'temporalhorizons': PrH_index,
                                'window_nuggets': 1,
                                'file_names': filename,
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]]
                            }
                        elif cat == 0:
                            data = {
                                'CV': i,
                                'target_names': target,
                                'method_names': method,
                                'temporalhorizons': PrH_index,
                                'window_nuggets': 1,
                                'file_names': filename,
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5]
                            }

                        df = pd.DataFrame(data=data, index=[0])
                        df.to_csv(directory + result_filename,
                                  index=False,
                                  mode='a',
                                  header=False)

                        elapsed_time = time.time() - start_time
                        print(
                            time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))
                        i = i + 1
                    gc.collect()
def main():

    models = ['NN']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    # 'DOcategory', 'pHcategory','ph', 'dissolved_oxygen',
    targets = ['pHcategory']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1

    for model_name in models:
        print(model_name)

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookOne/output_Cat_' + model_name + '/final_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookOne/output_Reg_' + model_name + '/final_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mpe': 'mpe',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }

            if not os.path.exists(directory):
                os.makedirs(directory)

            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'

            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directoryresult + resultFileName,
                            index=False,
                            header=False)

            path = 'Sondes_data/train/train_data/'
            testpath = 'Sondes_data/test/test_data/'
            method = 'OrgData'

            for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                params = func.trained_param_grid['param_grid_' + model_name +
                                                 str(cat)]
                lags = func.getlags_window(
                    model_name,
                    params['param_' + target + '_' + str(PrH_index)], cat)

                files = [
                    f for f in os.listdir(path)
                    if f.endswith('.csv') and f.startswith(sondefilename)
                ]
                file1 = files[0]
                print(' TH: ' + str(PrH_index) + ' ' + method + ' ' + target +
                      ' ' + file1)

                dataset = pd.read_csv(path + file1)
                train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)
                print(input_dim)

                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    train_y_grid = to_categorical(train_y_grid, 3)

                start_time = time.time()

                mo = func.getModel(
                    model_name, input_dim,
                    params['param_' + target + '_' + str(PrH_index)], n_job,
                    cat)

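                # Tree-based models (RF, DT) skip scaling; every other estimator
                # is wrapped with a StandardScaler step so train and test data
                # are scaled consistently inside the pipeline.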
                if model_name == 'RF' or model_name == 'DT':
                    pipeline = Pipeline(steps=[('model', mo)])
                else:
                    pipeline = Pipeline(steps=[('n',
                                                StandardScaler()), ('model',
                                                                    mo)])

                # save the model to disk
                filename = model_name+'_model_' + \
                    target+'_'+str(PrH_index)+'.sav'

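                # For categorical Keras models, the minority classes (1 and 2)
                # are up-weighted during training to counter class imbalance.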
                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    clf = pipeline.fit(train_X_grid,
                                       train_y_grid,
                                       model__class_weight={
                                           0: 1,
                                           1: 50,
                                           2: 100
                                       })
                else:
                    clf = pipeline.fit(train_X_grid, train_y_grid)

                # joblib.dump(clf, directory+filename)
                pickle.dump(clf, open(directory + filename, 'wb'))

                # To load the model, open the file in reading and binary mode
                # load_lr_model =pickle.load(open(filename, 'rb'))

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                start_time = time.time()
                testsondefilename = re.sub('wo_', '', sondefilename)
                files = [
                    f for f in os.listdir(testpath)
                    if f.endswith('.csv') and f.startswith(testsondefilename)
                ]
                file1 = files[0]
                print('Window: ' + str(lags) + ' TH: ' + str(PrH_index) + ' ' +
                      method + ' ' + target + file1)

                dataset = pd.read_csv(testpath + file1)

                test_X_grid, test_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)

                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    test_y_grid = to_categorical(test_y_grid, 3)

                i = 1
                custom_cv = func.custom_cv_kfolds_testdataonly(
                    test_X_grid, 100)
                for test_index in custom_cv:
                    test_X = test_X_grid[test_index]
                    test_y = test_y_grid[test_index]

                    predictions = clf.predict(test_X)

                    if model_name == 'LSTM' or model_name == 'NN':
                        test_y = argmax(test_y, axis=1)
                        # predictions = argmax(predictions, axis=1)

                    if cat == 1:
                        predictions = np.array(predictions).astype(int)
                        test_y = np.array(test_y).astype(int)
                        test_y = test_y.reshape(len(test_y), )
                        predictions = predictions.reshape(len(predictions), )

                    if i % 10 == 0:
                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions,
                                    s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        fpath = filename + '_CV' + str(i) + file1
                        # 'predictions_' + method+target+'_Window' + str(lags) + '_TH'+str(PrH_index) + \'_CV' + str(i)+file1
                        plt.savefig(directoryresult + fpath + '.jpg')

                        plt.close()
                        data = {'Actual': test_y, 'Predictions': predictions}
                        print(test_y.shape)
                        print(predictions.shape)
                        df = pd.DataFrame(data=data)
                        df.to_csv(directoryresult + filename + '_CV' + str(i) +
                                  file1,
                                  index=False)

                    cm0 = func.forecast_accuracy(predictions, test_y, cat)

                    if cat == 1:
                        data = {
                            'target_names': target,
                            'method_names': method,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': filename,
                            'F1_0': cm0[0],
                            'F1_1': cm0[1],
                            'P_0': cm0[2],
                            'P_1': cm0[3],
                            'R_0': cm0[4],
                            'R_1': cm0[5],
                            'acc0_1': cm0[6],
                            'F1_0_1': cm0[7],
                            'F1_all': cm0[8],
                            'fbeta': [cm0[9]]
                        }
                    elif cat == 0:
                        data = {
                            'target_names': target,
                            'method_names': method,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': filename,
                            'mape': cm0[0],
                            'me': cm0[1],
                            'mae': cm0[2],
                            'mpe': cm0[3],
                            'rmse': cm0[4],
                            'R2': cm0[5]
                        }

                    df = pd.DataFrame(data=data, index=[0])
                    df.to_csv(directoryresult + resultFileName,
                              index=False,
                              mode='a',
                              header=False)

                    elapsed_time = time.time() - start_time
                    print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                    i = i + 1
                Kb.clear_session()
                gc.collect()
                del clf
def main():

    models = ['MA']
    targets = ['ph', 'dissolved_oxygen']  # 'pHcategory', 'DOcategory'
    sondefilename = 'leavon'

    for model_name in models:
        print(model_name)

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/1sonde/output_Cat_' + \
                    model_name+'/final_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'window_nuggets': 'window_nuggets',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookThree/1sonde/output_Reg_' + \
                    model_name+'/final_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'window_nuggets': 'window_nuggets',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mpe': 'mpe',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }

            if not os.path.exists(directory):
                os.makedirs(directory)

            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)
            print(directoryresult)
            testsondefilename = 'utlcp'
            resultFileName = 'results_'+testsondefilename + '_' + \
                target+str(time.time())+'.csv'
            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directoryresult + resultFileName,
                            index=False,
                            header=False)

            path = 'Sondes_data/train_Summer/'
            testpath = 'Sondes_data/test_Summer/'
            method = 'OrgData'

            for n_steps in [1]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48, 60,
                                  72]:  # 1, 3, 6, 12,
                    # files = [f for f in os.listdir(path) if f.endswith(
                    #     '.csv') and f.startswith(sondefilename)]
                    # file = files[0]
                    # print('Window: '+str(n_steps) + ' TH: ' +
                    #       str(PrH_index)+' '+method+' '+target)

                    # dataset = pd.read_csv(path+file)

                    # ######################
                    # # FOR MA
                    # ######################
                    # dataset = temporal_horizon(dataset, PrH_index, target)
                    # train = dataset[target]
                    # train_target = dataset['Target_'+target]
                    # print(train.head())
                    # print(train_target.head())

                    # custom_cv = func.custom_cv_kfolds_testdataonly(
                    #     train, 1)

                    # for train_index in custom_cv:
                    #     train = train[train_index].values
                    #     train_target = train_target[train_index].values

                    # coef, lag = movingAverage(
                    #     train, train_target)
                    # np.save(directory+'MA_model_'+target +
                    #                 '_'+str(PrH_index)+'.npy', coef)
                    # np.save(directory+'MA_data_'+target +
                    #                 '_'+str(PrH_index)+'.npy', lag)

                    coef = np.load(directory + 'MA_model_' + target + '_' +
                                   str(PrH_index) + '.npy')
                    lag = np.load(directory + 'MA_data_' + target + '_' +
                                  str(PrH_index) + '.npy')
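                    # 'coef' are the fitted moving-average coefficients and
                    # 'lag' is the residual history saved during training,
                    # used to seed the walk-forward error model below.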

                    ######################
                    # TEST sets
                    ######################
                    # start_time = time.time()
                    # testsondefilename = re.sub('wo_', '', sondefilename)

                    files = [
                        f for f in os.listdir(testpath) if f.endswith('.csv')
                        and f.startswith(testsondefilename)
                    ]
                    file1 = files[0]
                    print('Window: ' + str(len(lag)) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target + file1)

                    testdataset = pd.read_csv(testpath + file1)
                    testdataset = temporal_horizon(testdataset, PrH_index,
                                                   target)

                    test = testdataset[target]
                    test_target = testdataset['Target_' + target]
                    # print(test.head())
                    # print(test_target.head())

                    i = 1
                    custom_cv = func.custom_cv_kfolds_testdataonly(test, 100)
                    for test_index in custom_cv:
                        test_y = test[test_index].values
                        # for MA
                        test_y_targets = test_target[test_index].values

                        # walk forward over time steps in test
                        history = [lag[i] for i in range(len(lag))]
                        predictions = list()
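                        # Walk forward: start from the persistence forecast
                        # (the current observation), predict the next residual
                        # from the last `window` residuals with the MA
                        # coefficients, add it as a correction, and record the
                        # new residual.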
                        for t in range(len(test_y)):
                            # persistence
                            yhat = test_y[t]
                            # predict error
                            length = len(history)
                            window = len(coef)
                            hl = [
                                history[i]
                                for i in range(length - window, length)
                            ]
                            pred_error = predict(coef, hl, window)
                            yhat = yhat + pred_error
                            predictions.append(yhat)
                            error = test_y_targets[t] - yhat
                            history.append(error)

                        if cat == 1:
                            predictions = np.array(predictions).astype(int)

                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + \
                            str(i) + testsondefilename
                        # '_vals_'+str(p)+'_'+str(d) + \
                        # '_'+str(q)+'_'+\
                        # print(len(predictions))
                        # print(len(test_y_targets))
                        cm0 = func.forecast_accuracy(predictions,
                                                     test_y_targets, cat)

                        if i % 10 == 0:
                            plt.scatter(np.arange(len(test_y_targets)),
                                        test_y,
                                        s=1)
                            plt.scatter(np.arange(len(predictions)),
                                        predictions,
                                        s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.savefig(directoryresult + fpath + '.png')
                            plt.close()

                            data = {
                                'Actual': test_y_targets,
                                'Predictions': predictions
                            }
                            df = pd.DataFrame(data=data)
                            df.to_csv(directoryresult + fpath, index=False)

                        if cat == 1:
                            data = {
                                'target_names': target,
                                'method_names': method,
                                'window_nuggets': n_steps,
                                'temporalhorizons': PrH_index,
                                'CV': i,
                                'file_names': testsondefilename,
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]]
                            }
                        elif cat == 0:
                            data = {
                                'target_names': target,
                                'method_names': method,
                                'window_nuggets': n_steps,
                                'temporalhorizons': PrH_index,
                                'CV': i,
                                'file_names': testsondefilename,
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5]
                            }

                        df = pd.DataFrame(data=data, index=[0])
                        df.to_csv(directoryresult + resultFileName,
                                  index=False,
                                  mode='a',
                                  header=False)

                        i = i + 1
def main():

    # models = ['endecodeLSTM', 'CNNLSTM', 'ConvEnLSTM',
    #           'NN', 'SVC', 'RF_onereg', 'DT_onereg']

    models = ['LSTM']  # save the models later
    # 'DOcategory', 'pHcategory','ph', 'dissolved_oxygen',
    targets = ['dissolved_oxygen', 'ph']
    path = 'Sondes_data/train_Summer/'
    # files = [f for f in os.listdir(path) if f.endswith(
    #     ".csv") and f.startswith('leavon')]  # leavon
    files = ['osugi.csv', 'utlcp.csv', 'leoc_1.csv', 'leavon.csv']
    n_job = -1
    PrH_index = 0

    for model_name in models:
        print(model_name)
        for target in targets:
            print(target)
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/2sondes/output_Cat_' + \
                    model_name+'/final_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'window_nuggets': 'window_nuggets',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookThree/2sondes/output_Reg_' + \
                    model_name+'/final_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'window_nuggets': 'window_nuggets',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mse': 'mse',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }

            if not os.path.exists(directory):
                os.makedirs(directory)
            print(directory)
            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)

            # resultFileName = 'results_'+target+str(time.time())+'.csv'

            for file in files:

                method = 'OrgData'

                params = func.trained_param_grid['param_grid_' + model_name +
                                                 str(cat)]
                n_steps_in = func.getlags_window(
                    model_name,
                    params['param_' + target + '_' + str(PrH_index)], cat)
                print(n_steps_in)

                dataset = pd.read_csv(path + file)

                dataset = dataset[[
                    'Water_Temperature_at_Surface', 'ysi_chlorophyll',
                    'dissolved_oxygen_saturation', 'dissolved_oxygen', 'ph',
                    'year', 'month', 'day', 'hour'
                ]]
                # print(dataset.head())

                dataset_bgsusd = pd.read_csv(path + 'bgsusd_all.csv')

                dataset_bgsusd = dataset_bgsusd[[
                    'Water_Temperature_at_Surface', 'ysi_chlorophyll',
                    'dissolved_oxygen_saturation', 'dissolved_oxygen', 'ph',
                    'year', 'month', 'day', 'hour'
                ]]

                dataset = temporal_horizon(dataset, PrH_index, target)

                dataset_bgsusd = temporal_horizon(dataset_bgsusd, PrH_index,
                                                  target)

                n_steps_out = 9
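                # The model forecasts nine horizons jointly (t+1 through t+72,
                # matching the column labels used in the plots below).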
                train_X_grid, y = split_sequences(dataset, n_steps_in,
                                                  n_steps_out)
                print(train_X_grid.shape)

                n_features = train_X_grid.shape[2]
                print('n_fetures: ' + str(n_features))

                train_X_grid_bgsusd, train_y_grid_bgsusd = split_sequences(
                    dataset_bgsusd, n_steps_in, n_steps_out)

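                # Flatten each (samples, n_steps_in, n_features) window and
                # stack the bgsusd sonde's features beside the target sonde's
                # to form one wide input matrix.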
                train_X_grid = train_X_grid.reshape(
                    train_X_grid.shape[0],
                    train_X_grid.shape[1] * train_X_grid.shape[2])

                train_X_grid_bgsusd = train_X_grid_bgsusd.reshape(
                    train_X_grid_bgsusd.shape[0],
                    train_X_grid_bgsusd.shape[1] *
                    train_X_grid_bgsusd.shape[2])

                XX = hstack((train_X_grid_bgsusd, train_X_grid))
                # XX = train_X_grid  # for final multivariate training model on LSTM
                print(XX.shape)
                # print(XX[0])
                input_dim = XX.shape

                start_time = time.time()

                model = algofind(
                    model_name, input_dim, cat, n_features, n_steps_out,
                    params['param_' + target + '_' + str(PrH_index)], n_job)

                if model_name == 'RF' or model_name == 'DT':
                    pipeline = Pipeline(steps=[('model', model)])
                else:
                    pipeline = Pipeline(
                        steps=[('n', StandardScaler()), ('model', model)])

                # save the model to disk
                filename = model_name+'_model_' + \
                    target+'.joblib'

                if model_name == 'ConvEnLSTM' or model_name == 'endecodeLSTM' or model_name == 'CNNLSTM':
                    clf = pipeline.fit(XX, y.reshape(y.shape[0], 1,
                                                     n_steps_out))
                else:
                    clf = pipeline.fit(XX, y)

                # joblib.dump(clf, directory+filename)
                # pickle.dump(clf, open(directory+filename, 'wb'))

                # To load the model, open the file in reading and binary mode
                # load_lr_model =pickle.load(open(filename, 'rb'))

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                start_time = time.time()
                testpath = 'Sondes_data/test_Summer/'
                # testfiles = ['lelorain.csv', 'utlcp.csv',
                #              'lementor_1.csv', 'lebiww.csv']
                # for testfile in testfiles:
                testfile = file
                result_filename = 'results_'+testfile+'_'+target + \
                    '_'+file+'_'+str(time.time())+'.csv'
                dfheader = pd.DataFrame(data=data, index=[0])
                dfheader.to_csv(directory + result_filename, index=False)

                dataset = pd.read_csv(testpath + testfile)
                dataset = dataset[[
                    'Water_Temperature_at_Surface', 'ysi_chlorophyll',
                    'dissolved_oxygen_saturation', 'dissolved_oxygen', 'ph',
                    'year', 'month', 'day', 'hour'
                ]]
                # print(dataset.head())

                dataset_bgsusd = pd.read_csv(testpath + 'bgsusd_all.csv')
                dataset_bgsusd = dataset_bgsusd[[
                    'Water_Temperature_at_Surface', 'ysi_chlorophyll',
                    'dissolved_oxygen_saturation', 'dissolved_oxygen', 'ph',
                    'year', 'month', 'day', 'hour'
                ]]

                dataset = temporal_horizon(dataset, PrH_index, target)

                dataset_bgsusd = temporal_horizon(dataset_bgsusd, PrH_index,
                                                  target)

                test_X_grid, y = split_sequences(dataset, n_steps_in,
                                                 n_steps_out)

                n_features = test_X_grid.shape[2]
                test_X_grid_bgsusd, test_y_grid_bgsusd = split_sequences(
                    dataset_bgsusd, n_steps_in, n_steps_out)

                test_X_grid = test_X_grid.reshape(
                    test_X_grid.shape[0],
                    test_X_grid.shape[1] * test_X_grid.shape[2])

                test_X_grid_bgsusd = test_X_grid_bgsusd.reshape(
                    test_X_grid_bgsusd.shape[0],
                    test_X_grid_bgsusd.shape[1] * test_X_grid_bgsusd.shape[2])

                test_XX = hstack((test_X_grid_bgsusd, test_X_grid))
                # test_XX = test_X_grid

                i_cv = 1
                custom_cv = func.custom_cv_kfolds_testdataonly(test_XX, 100)
                for test_index in custom_cv:
                    test_X = test_XX[test_index]
                    test_y = y[test_index]

                    test_time = test_XX[test_index]
                    # print(test_time[0])
                    dftime = pd.DataFrame({
                        'year':
                        np.array(test_time[:, -4]).astype(int),
                        'month':
                        np.array(test_time[:, -3]).astype(int),
                        'day':
                        np.array(test_time[:, -2]).astype(int),
                        'hour':
                        np.array(test_time[:, -1]).astype(int),
                    })
                    # print(dftime.head())
                    df_time = pd.to_datetime(dftime, format='%Y%m%d %H')

                    predictions = clf.predict(test_X)

                    # print(predictions.shape)
                    predictions = predictions.reshape(-1, n_steps_out)

                    fpath = 'predictions_' + method+target+'_Window' +\
                        str(n_steps_in) + '_CV' + str(i_cv)+testfile

                    if i_cv % 10 == 0:
                        fig, ax = plt.subplots(nrows=5,
                                               ncols=2,
                                               figsize=(50, 50))
                        i = j = 0
                        k = 0
                        columns = [
                            't+1', 't+3', 't+6', 't+12', 't+24', 't+36',
                            't+48', 't+60', 't+72'
                        ]
                        for col in columns:
                            if k < len(columns):
                                ax[i, j].scatter(df_time.values, test_y[:, k])
                                ax[i, j].scatter(df_time.values,
                                                 predictions[:, k])
                                k = k + 1
                                ax[i, j].set_title(col)
                                ax[i, j].legend(['actual', 'prediction'])
                                j += 1
                                if j > 1:
                                    i += 1
                                    j = 0

                        plt.savefig(directoryresult + fpath + '.png')
                        plt.close()

                        # print(test_y.shape)
                        # print(predictions.shape)
                        columns = [
                            'a+1', 'a+3', 'a+6', 'a+12', 'a+24', 'a+36',
                            'a+48', 'a+60', 'a+72'
                        ]
                        df_actual = pd.DataFrame(data=test_y, columns=columns)
                        columns = [
                            'p+1', 'p+3', 'p+6', 'p+12', 'p+24', 'p+36',
                            'p+48', 'p+60', 'p+72'
                        ]
                        df_predictions = pd.DataFrame(data=predictions,
                                                      columns=columns)

                        frames = [df_time, df_actual, df_predictions]
                        # concatenate dataframes
                        df = pd.concat(frames, axis=1)  # , sort=False
                        df.to_csv(directoryresult + fpath, index=False)

                    # One row of metrics per output horizon; forecast_accuracy
                    # returns 10 values for categorical targets (per the header
                    # above) and 6 for regression targets.
                    cm0 = np.zeros((n_steps_out, 10 if cat == 1 else 6))
                    for t in range(n_steps_out):
                        cm0[t, :] = func.forecast_accuracy(
                            predictions[:, t], test_y[:, t], cat)

                    if cat == 1:
                        data = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps_in,
                            'temporalhorizons': PrH_index,
                            'CV': i_cv,
                            'file_names': testfile,
                            'F1_0': [cm0[:, 0]],
                            'F1_1': [cm0[:, 1]],
                            'P_0': [cm0[:, 2]],
                            'P_1': [cm0[:, 3]],
                            'R_0': [cm0[:, 4]],
                            'R_1': [cm0[:, 5]],
                            'acc0_1': [cm0[:, 6]],
                            'F1_0_1': [cm0[:, 7]],
                            'F1_all': [cm0[:, 8]],
                            'fbeta': [cm0[:, 9]]
                        }
                    elif cat == 0:
                        data = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps_in,
                            'temporalhorizons': PrH_index,
                            'CV': i_cv,
                            'file_names': testfile,
                            'mape': [cm0[:, 0]],
                            'me': [cm0[:, 1]],
                            'mae': [cm0[:, 2]],
                            'mse': [cm0[:, 3]],
                            'rmse': [cm0[:, 4]],
                            'R2': [cm0[:, 5]]
                        }

                    df = pd.DataFrame(data=data, index=[0])
                    df.to_csv(directory + result_filename,
                              index=False,
                              mode='a',
                              header=False)

                    elapsed_time = time.time() - start_time
                    # print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                    i_cv = i_cv + 1