def main():

    models = ['RF']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    targets = ['dissolved_oxygen', 'ph']  # ['DOcategory', 'pHcategory']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1
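    # Train one final model per target and prediction horizon with the
    # hyper-parameters stored in func.trained_param_grid, then evaluate it on
    # 100 sequential test folds. Tree-based models (DT/RF) read the raw sonde
    # data; the other models read the StandardScaler-normalized files.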

    for model_name in models:
        print(model_name)

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/balance_data/output_Cat_' + model_name+'/final_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names', 'F1_0': 'F1_0', 'F1_1': 'F1_1', 'P_0': 'P_0', 'P_1': 'P_1', 'R_0': 'R_0', 'R_1': 'R_1', 'acc0_1': 'acc0_1', 'F1_0_1': 'F1_0_1'}
            else:
                cat = 0
                directory = 'Results/balance_data/output_Reg_' + model_name+'/final_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names',  'mape': 'mape', 'me': 'me', 'mae': 'mae', 'mpe': 'mpe', 'rmse': 'rmse',  'R2': 'R2'}

            if not os.path.exists(directory):
                os.makedirs(directory)

            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)

            resultFileName = 'results_'+target+str(time.time())+'.csv'
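            # `data` maps each column name to itself, so writing it with
            # header=False yields the header row of the results CSV; metric
            # rows are appended below with mode='a'.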

            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directoryresult+resultFileName,
                            index=False, header=False)

            if model_name == 'DT' or model_name == 'RF':
                method = 'OrgData'
                path = 'Sondes_data/train/train_data/'
                testpath = 'Sondes_data/test/test_data/'
            else:
                method = 'StandardScaler'
                path = 'Sondes_data/train/train_data_normalized/'+method+'/'+target+'/'
                testpath = 'Sondes_data/test/train_data_normalized/' + method+'/'+target+'/'

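            # PrH_index is the prediction horizon (time steps ahead); the lag
            # window for each (target, horizon) pair comes from the previously
            # tuned parameters.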
            for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                params = func.trained_param_grid[
                    'param_grid_'+model_name+str(cat)]
                lags = func.getlags_window(
                    model_name, params['param_'+target+'_'+str(PrH_index)], cat)

                files = [f for f in os.listdir(path) if f.endswith(
                    '.csv') and f.startswith(sondefilename)]
                file1 = files[0]
                print(' TH: ' +
                      str(PrH_index)+' '+method+' '+target+' '+file1)

                dataset = pd.read_csv(path+file1)
                train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)

                if model_name == 'LSTM' or model_name == 'NN':
                    n_job = 1

                start_time = time.time()

                clf = func.getModel(
                    model_name, input_dim, params['param_'+target+'_'+str(PrH_index)], n_job, cat)

                print('clf: '+str(clf))

                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    train_y_grid = to_categorical(train_y_grid, 3)
                    clf = clf.fit(train_X_grid, train_y_grid,
                                  model__class_weight={0: 1, 1: 50, 2: 100})
                else:
                    clf = clf.fit(train_X_grid, train_y_grid)

                # save the model to disk
                filename = model_name+'_model_' + \
                    target+'_'+str(PrH_index)+'.sav'
                joblib.dump(clf, directory+filename)

                # if model_name == 'RF' or model_name=='DT':
                #     featurenames = func.setfeatures(features, lags)

                #     if not os.path.exists(directory+'trees/'):
                #         os.makedirs(directory+'trees/')

                #     i_tree = 0
                #     class_names = ['0', '1', '2']
                #     print(len(clf))
                #     for tree_in_forest in clf:
                #         dot_data = tree.export_graphviz(tree_in_forest, out_file=None,
                #                                         feature_names=featurenames,
                #                                         class_names=class_names,
                #                                         filled=True, rounded=True,
                #                                         special_characters=True)
                #         graph = pydotplus.graph_from_dot_data(dot_data)
                #         graph.write_pdf(
                #             directory+'trees/tree_'+filename+str(i_tree)+".pdf")
                #         i_tree = i_tree + 1
                #         if(i_tree > 1):
                #             break

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                start_time = time.time()
                testsondefilename = re.sub('wo_', '', sondefilename)
                files = [f for f in os.listdir(testpath) if f.endswith(
                    '.csv')and f.startswith(testsondefilename)]
                file1 = files[0]
                print('Window: '+str(lags) + ' TH: ' +
                      str(PrH_index)+' '+method+' '+target+' '+file1)

                dataset = pd.read_csv(testpath+file1)

                test_X_grid, test_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)

                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    test_y_grid = to_categorical(test_y_grid, 3)

                i = 1
                custom_cv = func.custom_cv_kfolds_testdataonly(
                    test_X_grid, 100)
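                # Score each sequential test fold; every 10th fold also gets a
                # scatter plot of actual vs. predicted values.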
                for test_index in custom_cv:
                    test_X = test_X_grid[test_index]
                    test_y = test_y_grid[test_index]

                    predictions = clf.predict(test_X)

                    if model_name == 'LSTM' or model_name == 'NN':
                        test_y = argmax(test_y, axis=1)
                        # predictions = argmax(predictions, axis=1)

                    # test_y = test_y.astype(int)
                    # predictions = predictions.astype(int)

                    if i % 10 == 0:
                        plt.scatter(np.arange(len(test_y)),
                                    test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(lags) + '_TH'+str(PrH_index) + \
                            '_CV' + str(i)+file1
                        plt.savefig(directoryresult+fpath+'.jpg')

                        plt.close()
                    #     data = {'Actual': test_y, 'Predictions': predictions}
                    #     print(test_y.shape)
                    #     print(predictions.shape)
                    #     if model_name == 'RF':
                    #         df = pd.DataFrame(data=data)
                    #     else:
                    #         df = pd.DataFrame(data=data, index=[0])

                    #     df.to_csv(directoryresult+filename +
                    #             '_CV'+str(i)+'.csv', index=False)

                    cm0 = func.forecast_accuracy(predictions, test_y, cat)

                    if cat == 1:
                        data = {'target_names': target, 'method_names': method, 'temporalhorizons': PrH_index, 'CV': i,
                                'file_names': filename,  'F1_0': cm0[0], 'F1_1': cm0[1], 'P_0': cm0[2], 'P_1': cm0[3], 'R_0': cm0[4], 'R_1': cm0[5], 'acc0_1': cm0[6], 'F1_0_1': cm0[7]}
                    elif cat == 0:
                        data = {'target_names': target, 'method_names': method, 'temporalhorizons': PrH_index, 'CV': i,
                                'file_names': filename, 'mape': cm0[0], 'me': cm0[1], 'mae': cm0[2], 'mpe': cm0[3], 'rmse': cm0[4], 'R2': cm0[5]}

                    df = pd.DataFrame(data=data, index=[0])
                    df.to_csv(directoryresult+resultFileName,
                              index=False, mode='a', header=False)

                    elapsed_time = time.time() - start_time
                    print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                    i = i+1
                Kb.clear_session()
                gc.collect()
                del clf
def main():

    models = ['RF']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    targets = ['ph']  # ['DOcategory', 'pHcategory'] # 'ph','dissolved_oxygen'
    # ph TH: 24,36,48
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1
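    # Randomized hyper-parameter search (RandomizedSearchCV with a custom CV
    # splitter) over window sizes and prediction horizons; for categorical
    # targets, per-class ROC curves and Youden's J thresholds are computed on
    # held-out folds as well.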

    for model_name in models:
        print(model_name)

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/balance_data/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'window_nuggets': 'window_nuggets',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'std_test_score': 'std_test_score',
                    'mean_test_score': 'mean_test_score',
                    'params': 'params',
                    'bestscore': 'bestscore',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta',
                    'imfeatures': 'imfeatures',
                    'best_thresh_0': 'best_thresh_0',
                    'best_thresh_1': 'best_thresh_1',
                    'best_thresh_2': 'best_thresh_2'
                }
            else:
                cat = 0
                directory = 'Results/balance_data/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'window_nuggets': 'window_nuggets',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'std_test_score': 'std_test_score',
                    'mean_test_score': 'mean_test_score',
                    'params': 'params',
                    'bestscore': 'bestscore',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mpe': 'mpe',
                    'rmse': 'rmse',
                    'R2': 'R2',
                    'imfeatures': 'imfeatures'
                }

            if not os.path.exists(directory):
                os.makedirs(directory)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directory + resultFileName,
                            index=False,
                            header=False)

            if model_name == 'DT' or model_name == 'RF':
                path = 'Sondes_data/train/train_data/'
                method = 'OrgData'
            else:
                method = 'StandardScaler'
                path = 'Sondes_data/train/train_data_normalized/' + method + '/' + target + '/'

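            # Grid over input window sizes (n_steps) and prediction horizons.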
            for n_steps in [1, 3, 6, 12]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    files = [
                        f for f in os.listdir(path)
                        if f.endswith('.csv') and f.startswith(sondefilename)
                    ]
                    file = files[0]
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    dataset = pd.read_csv(path + file)
                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)

                    if cat == 1 and (model_name == 'LSTM'
                                     or model_name == 'NN'):
                        train_y_grid = to_categorical(train_y_grid, 3)
                    if model_name == 'LSTM' or model_name == 'NN':
                        n_job = 1

                    start_time = time.time()

                    # resample = SMOTETomek(tomek=TomekLinks(
                    #     sampling_strategy='majority'))
                    # print(train_y_grid[train_y_grid.argmax(axis=1)==2])

                    model = func.algofind(model_name, input_dim, n_steps, cat)
                    # ('r', resample),
                    # if cat == 1:
                    #     model = CalibratedClassifierCV(
                    #         model, method='isotonic')

                    pipeline = Pipeline(steps=[('model', model)])

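                    # 10 random parameter settings evaluated with the custom CV
                    # splitter; LSTM/NN additionally pass class weights to
                    # counter the class imbalance.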
                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)
                    gs = RandomizedSearchCV(
                        estimator=pipeline,
                        param_distributions=func.param_grid['param_grid_' +
                                                            model_name +
                                                            str(cat)],
                        n_iter=10,
                        cv=custom_cv,
                        verbose=0,
                        random_state=42,
                        n_jobs=n_job)

                    if cat == 1 and (model_name == 'LSTM'
                                     or model_name == 'NN'):
                        clf = gs.fit(train_X_grid,
                                     train_y_grid,
                                     model__class_weight={
                                         0: 1,
                                         1: 50,
                                         2: 100
                                     })
                    else:
                        clf = gs.fit(train_X_grid, train_y_grid)

                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()

                    print('Mean test scores: %.3f' % test_Score)

                    i = 1
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for train_index, test_index in custom_cv:
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]
                        predictions = clf.predict(test_X)
                        # predict_mine = []
                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(i)+file

                        if cat == 1:
                            # predict probabilities
                            yhat = clf.predict_proba(test_X)
                            # print(yhat[100:103])
                            y = label_binarize(test_y, classes=[0, 1, 2])
                            # print(y[100:103])

                            # roc_curve
                            fpr = dict()
                            tpr = dict()
                            roc_auc = dict()
                            best_thresh = dict()
                            # use a separate class index so the CV counter `i`
                            # is not overwritten inside this loop
                            for c in range(3):
                                fpr[c], tpr[c], thresholds = roc_curve(
                                    y[:, c], yhat[:, c])
                                roc_auc[c] = auc(fpr[c], tpr[c])
                                # Youden's J statistic picks the best threshold
                                J = tpr[c] - fpr[c]
                                ix = argmax(J)
                                best_thresh[c] = thresholds[ix]
                                print('Best Threshold=%f, roc_auc=%.3f' %
                                      (best_thresh[c], roc_auc[c]))

                            # Compute micro-average ROC curve and ROC area
                            fpr["micro"], tpr["micro"], _ = roc_curve(
                                y.ravel(), yhat.ravel())
                            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
                            plt.plot(
                                fpr["micro"],
                                tpr["micro"],
                                label='micro-average ROC curve (area = {0:0.2f})'
                                ''.format(roc_auc["micro"]),
                                color='deeppink',
                                linestyle=':',
                                linewidth=4)

                            colors = cycle(
                                ['aqua', 'darkorange', 'cornflowerblue'])
                            for c, color in zip(range(3), colors):
                                plt.plot(
                                    fpr[c],
                                    tpr[c],
                                    color=color,
                                    lw=2,
                                    label='ROC curve of class {0} (area = {1:0.2f})'.format(
                                        c, roc_auc[c]))
                            # plot the roc curve for the model
                            plt.plot([0, 1], [0, 1],
                                     linestyle='--',
                                     label='No Skill')
                            # axis labels
                            plt.xlabel('False Positive Rate')
                            plt.ylabel('True Positive Rate')
                            plt.title(
                                'Multi-class receiver operating characteristic (one-vs-rest)')
                            plt.legend(loc="lower right")
                            # show the plot
                            plt.savefig(directory + fpath + 'ROC_curve.jpg')
                            plt.close()

                        if cat == 1 and (model_name == 'LSTM'
                                         or model_name == 'NN'):
                            test_y = argmax(test_y, axis=1)
                            # predictions = argmax(predictions, axis=1)
                        if cat == 0:
                            predictions, test_y = func.transform(
                                predictions, test_y, method, target, file)

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions,
                                    s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')

                        plt.savefig(directory + fpath + '.jpg')

                        plt.close()

                        # data = {'Actual': test_y, 'Predictions': predictions}
                        print(test_y.shape)
                        print(predictions.shape)

                        # if model_name == 'RF':
                        #     df = pd.DataFrame(data=data)
                        # else:
                        #     df = pd.DataFrame(data=data, index=[0])
                        # df.to_csv(directory+fpath, index=False)

                        if cat == 1:
                            data = {
                                'target_names': target,
                                'method_names': method,
                                'window_nuggets': n_steps,
                                'temporalhorizons': PrH_index,
                                'CV': i,
                                'file_names': fpath,
                                'std_test_score': [test_std],
                                'mean_test_score': [test_Score],
                                'params': [clf.best_params_],
                                'bestscore': [clf.best_score_],
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]],
                                'imfeatures': [clf.best_estimator_],
                                'best_thresh_0': best_thresh[0],
                                'best_thresh_1': best_thresh[1],
                                'best_thresh_2': best_thresh[2]
                            }
                        elif cat == 0:
                            data = {
                                'target_names': target,
                                'method_names': method,
                                'window_nuggets': n_steps,
                                'temporalhorizons': PrH_index,
                                'CV': i,
                                'file_names': fpath,
                                'std_test_score': [test_std],
                                'mean_test_score': [test_Score],
                                'params': [clf.best_params_],
                                'bestscore': [clf.best_score_],
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5],
                                'imfeatures': [clf.best_estimator_]
                            }

                        df = pd.DataFrame(data=data, index=[0])
                        df.to_csv(directory + resultFileName,
                                  index=False,
                                  mode='a',
                                  header=False)

                        elapsed_time = time.time() - start_time
                        print(
                            time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))
                        i = i + 1
                    Kb.clear_session()
                    gc.collect()
                    del clf
def main():
    methods = ['OrgData']
    # 'dissolved_oxygen', 'ph', 'DOcategory', 'pHcategory']
    targets = ['ysi_blue_green_algae']
    model_name = 'baseline'
    # test_Summer train_Summer  # bookTwo: Sondes_data/old/test/test_data/
    path = 'Sondes_data/test_Summer/'
    files = [f for f in os.listdir(path) if f.endswith(".csv")]
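    # Persistence baseline: the most recent observed value of the target is
    # used as the prediction for every horizon, evaluated per input file.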

    for method in methods:
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name+'/final_models/Results/'  # final_models/Results  oversampling_cv_models/ #2
                data = {
                    'CV': 'CV',
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'window_nuggets': 'window_nuggets',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name+'/final_models/Results/'  # final_models/Results  oversampling_cv_models  #3
                data = {
                    'CV': 'CV',
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'window_nuggets': 'window_nuggets',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mpe': 'mpe',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }
            if not os.path.exists(directory):
                os.makedirs(directory)
            for file in files:
                print(file)
                result_filename = 'results_' + target + '_' + file
                dfheader = pd.DataFrame(data=data, index=[0])
                dfheader.to_csv(directory + result_filename, index=False)
                n_steps = 1

                for PrH_index in [1, 3, 6, 12, 24, 36, 48, 60, 72]:

                    dataset = pd.read_csv(path + file)

                    # Keep only the timestamp columns and the target
                    dataset = dataset[['year', 'month', 'day', 'hour', target]]

                    # dataset = dataset.dropna()
                    # print(dataset.head())

                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)
                    # print(train_y_grid[0:1])

                    start_time = time.time()

                    i = 1
                    # For Test files: #4
                    custom_cv = func.custom_cv_kfolds_testdataonly(
                        train_X_grid, 100)
                    for test_index in custom_cv:

                        # For Train files:
                        # custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                        # for train_index, test_index in custom_cv:
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        # persistence: the most recent observed value is used as the prediction
                        predictions = test_X[:, -1]

                        df_time = pd.DataFrame({
                            'year':
                            np.array(test_X[:, 0]).astype(int),
                            'month':
                            np.array(test_X[:, 1]).astype(int),
                            'day':
                            np.array(test_X[:, 2]).astype(int),
                            'hour':
                            np.array(test_X[:, 3]).astype(int),
                        })
                        # print(df_time.head())

                        timeline = pd.to_datetime(df_time, format='%Y%m%d %H')
                        # print(timeline.head())

                        # timeline = timeline.reshape(len(time),)

                        if cat == 1:
                            predictions = np.array(predictions).astype(int)
                            test_y = np.array(test_y).astype(int)

                        test_y = test_y.reshape(len(test_y), )
                        predictions = predictions.reshape(len(predictions), )

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        filename = file + '_' + \
                            target+'_TH' + \
                            str(PrH_index)+'_lag' + \
                            str(n_steps)+'_'+str(i)

                        # First test files
                        if i % 10 == 0:  # or i <= 3:  # 5
                            plt.scatter(timeline.values, test_y, s=1)
                            plt.scatter(timeline.values, predictions, s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.xticks(rotation=45)

                            directorydeeper = directory + 'more/'
                            if not os.path.exists(directorydeeper):
                                os.makedirs(directorydeeper)
                            plt.savefig(directorydeeper + filename + '.jpg')

                            # plt.show()

                            plt.close()
                            data = {
                                'time': timeline,
                                'Actual': test_y,
                                'Predictions': predictions
                            }
                            df = pd.DataFrame(data=data)

                            df.to_csv(directorydeeper + filename + '.csv',
                                      index=False)

                        if cat == 1:
                            data = {
                                'CV': i,
                                'target_names': target,
                                'method_names': method,
                                'temporalhorizons': PrH_index,
                                'window_nuggets': 1,
                                'file_names': filename,
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]]
                            }
                        elif cat == 0:
                            data = {
                                'CV': i,
                                'target_names': target,
                                'method_names': method,
                                'temporalhorizons': PrH_index,
                                'window_nuggets': 1,
                                'file_names': filename,
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5]
                            }

                        df = pd.DataFrame(data=data, index=[0])
                        df.to_csv(directory + result_filename,
                                  index=False,
                                  mode='a',
                                  header=False)

                        elapsed_time = time.time() - start_time
                        print(
                            time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))
                        i = i + 1
                    gc.collect()
def main():

    # models = ['LOF', 'EE', 'IF', 'SVM']
    targets = ['DOcategory', 'pHcategory']  # , 'ph', 'dissolved_oxygen']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    # n_job = -1
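    # Anomaly-detection baselines: each model is fit on the majority (normal)
    # class only and scored with F1 on the outlier class of held-out folds.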
    model, model_name = get_models()
    for j in range(len(model)):
        print(model_name[j])
        print(model[j])

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/AnomalyDetection/output_Cat_' + \
                    model_name[j]+'/oversampling_cv_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'window_nuggets': 'window_nuggets', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names', 'std_test_score': 'std_test_score', 'mean_test_score': 'mean_test_score', 'params': 'params', 'bestscore': 'bestscore', 'fbeta': 'fbeta'}

            if not os.path.exists(directory):
                os.makedirs(directory)

            resultFileName = 'results_'+target+str(time.time())+'.csv'
            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directory+resultFileName,
                            index=False, header=False)

            path = 'Sondes_data/train/train_data/'
            method = 'SS_pipeline'

            for n_steps in [1, 3, 6, 12]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    files = [f for f in os.listdir(path) if f.endswith(
                        '.csv') and f.startswith(sondefilename)]
                    file = files[0]
                    print('Window: '+str(n_steps) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target)

                    dataset = pd.read_csv(path+file)
                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)
                    print(train_X_grid[0:1])

                    start_time = time.time()

                    if model_name[j] == 'IF':
                        pipeline = Pipeline(steps=[('model', model[j])])
                    else:
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()), ('model', model[j])])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    i = 1
                    for train_index, test_index in custom_cv:
                        train_X_ = train_X_grid[train_index]
                        train_y_ = train_y_grid[train_index]
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        # fit on the majority (normal) class only
                        train_X_ = train_X_[train_y_ == 0]

                        # detect outliers in the test set
                        # if model_name[j] == 'LOF':
                        #     predictions = lof_predict(
                        #         model[j], train_X_, test_X)
                        # else:
                        pipeline.fit(train_X_)
                        predictions = pipeline.predict(test_X)

                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(i)+file

                        # mark inliers 1, outliers -1
                        test_y[test_y > 0] = -1
                        test_y[test_y == 0] = 1
                        # calculate score
                        score = f1_score(test_y, predictions, pos_label=-1)
                        print('F-measure: %.3f' % score)
                        # cm0 = predict(predictions, predictions, cat)

                        plt.scatter(np.arange(len(test_y)),
                                    test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')

                        plt.savefig(directory+fpath+'.jpg')

                        plt.close()

                        data = {'Actual': test_y, 'Predictions': predictions}
                        print(test_y.shape)
                        print(predictions.shape)

                        df = pd.DataFrame(data=data)
                        df.to_csv(directory+fpath, index=False)

                        if cat == 1:
                            data = {'target_names': target, 'method_names': method, 'window_nuggets': n_steps, 'temporalhorizons': PrH_index, 'CV': i,
                                    'file_names': fpath, 'F-measure': score}

                        df = pd.DataFrame(data=data, index=[0])
                        df.to_csv(directory+resultFileName,
                                  index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                        i = i+1
                    Kb.clear_session()
                    gc.collect()
def main():

    # 'LR', 'DT', 'SVC', 'LSTM', 'NN', # 'MLP', 'CNN', 'LSTM', 'ConvLSTM', 'CNNLSTM', 'EncodeDecodeLSTMs'
    models = ['RF']
    targets = ['DOcategory', 'pHcategory', 'ph', 'dissolved_oxygen']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1
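    # Randomized hyper-parameter search over window sizes and horizons;
    # classification runs use an F2-based scorer and regression runs an
    # R2-based scorer (the Keras models fall back to the default pipeline
    # score).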

    for model_name in models:
        print(model_name)

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookOne/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'window_nuggets': 'window_nuggets', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names',  'std_test_score': 'std_test_score', 'mean_test_score': 'mean_test_score', 'params': 'params', 'bestscore': 'bestscore', 'F1_0': 'F1_0', 'F1_1': 'F1_1', 'P_0': 'P_0', 'P_1': 'P_1', 'R_0': 'R_0', 'R_1': 'R_1', 'acc0_1': 'acc0_1', 'F1_0_1': 'F1_0_1', 'F1_all': 'F1_all', 'fbeta': 'fbeta', 'imfeatures': 'imfeatures'}
            else:
                cat = 0
                directory = 'Results/bookOne/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                data = {'target_names': 'target_names', 'method_names': 'method_names', 'window_nuggets': 'window_nuggets', 'temporalhorizons': 'temporalhorizons', 'CV': 'CV',
                        'file_names': 'file_names',  'std_test_score': 'std_test_score', 'mean_test_score': 'mean_test_score', 'params': 'params', 'bestscore': 'bestscore', 'mape': 'mape', 'me': 'me', 'mae': 'mae', 'mpe': 'mpe', 'rmse': 'rmse',  'R2': 'R2', 'imfeatures': 'imfeatures'}

            if not os.path.exists(directory):
                os.makedirs(directory)

            resultFileName = 'results_'+target+str(time.time())+'.csv'
            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directory+resultFileName,
                            index=False, header=False)

            path = 'Sondes_data/train/train_data/'
            method = 'OrgData'

            for n_steps in [1, 3, 6, 12]:  #
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    files = [f for f in os.listdir(path) if f.endswith(
                        '.csv') and f.startswith(sondefilename)]
                    file = files[0]
                    print('Window: '+str(n_steps) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target)

                    dataset = pd.read_csv(path+file)

                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)
                    print(train_X_grid[0:1])

                    if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                        train_y_grid = to_categorical(train_y_grid, 3)
                    if model_name == 'LSTM' or model_name == 'NN':
                        n_job = 1

                    start_time = time.time()
                    model = func.algofind(model_name, input_dim, n_steps, cat)

                    if cat == 1:
                        metric = make_scorer(f2_measure)
                    else:
                        metric = make_scorer(R2_measure)

                    # cat_ix = train_X_grid[:, 7:]
                    # print(cat_ix[0:2])
                    # num_ix = train_X_grid[:, : 7]
                    # print(num_ix[0:2])
                    # one hot encode categorical, normalize numerical
                    # ct = ColumnTransformer(
                    #     [('c', OneHotEncoder(), cat_ix), ('n', StandardScaler(), num_ix)])

                    if model_name == 'RF' or model_name == 'DT':
                        pipeline = Pipeline(steps=[('model', model)])

                    else:  # model_name == 'LSTM' or model_name == 'NN':
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()), ('model', model)])

                    # else:
                    #     pipeline = Pipeline(
                    #         steps=[('transforms', ct), ('model', model)])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)

                    if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                        gs = RandomizedSearchCV(
                            estimator=pipeline, param_distributions=func.param_grid['param_grid_'+model_name+str(cat)], n_iter=20, cv=custom_cv, verbose=0, random_state=42, n_jobs=n_job)
                        clf = gs.fit(train_X_grid, train_y_grid,
                                     model__class_weight={0: 1, 1: 50, 2: 100})
                    elif cat == 0 and (model_name == 'LSTM' or model_name == 'NN'):
                        gs = RandomizedSearchCV(
                            estimator=pipeline, param_distributions=func.param_grid['param_grid_'+model_name+str(cat)], n_iter=20, cv=custom_cv, verbose=0, random_state=42, n_jobs=n_job)
                        clf = gs.fit(train_X_grid, train_y_grid)
                    else:
                        gs = RandomizedSearchCV(
                            estimator=pipeline, param_distributions=func.param_grid['param_grid_'+model_name+str(cat)], n_iter=20, scoring=metric, cv=custom_cv, verbose=0, random_state=42, n_jobs=n_job)
                        clf = gs.fit(train_X_grid, train_y_grid)

                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()

                    print('Mean test scores: %.3f' % test_Score)

                    i = 1
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for train_index, test_index in custom_cv:
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        predictions = clf.predict(test_X)

                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(i)+file

                        if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                            test_y = argmax(test_y, axis=1)

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        plt.scatter(np.arange(len(test_y)),
                                    test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')

                        plt.savefig(directory+fpath+'.jpg')

                        plt.close()

                        data = {'Actual': test_y, 'Predictions': predictions}
                        print(test_y.shape)
                        print(predictions.shape)

                        df = pd.DataFrame(data=data)

                        df.to_csv(directory+fpath, index=False)

                        if cat == 1:
                            data = {'target_names': target, 'method_names': method, 'window_nuggets': n_steps, 'temporalhorizons': PrH_index, 'CV': i,
                                    'file_names': fpath, 'std_test_score': [test_std], 'mean_test_score': [test_Score], 'params': [clf.best_params_], 'bestscore': [clf.best_score_], 'F1_0': cm0[0], 'F1_1': cm0[1], 'P_0': cm0[2], 'P_1': cm0[3], 'R_0': cm0[4], 'R_1': cm0[5], 'acc0_1': cm0[6], 'F1_0_1': cm0[7], 'F1_all': cm0[8], 'fbeta': [cm0[9]], 'imfeatures': [clf.best_estimator_]}
                        elif cat == 0:
                            data = {'target_names': target, 'method_names': method, 'window_nuggets': n_steps, 'temporalhorizons': PrH_index, 'CV': i,
                                    'file_names': fpath, 'std_test_score': [test_std], 'mean_test_score': [test_Score], 'params': [clf.best_params_], 'bestscore': [clf.best_score_], 'mape': cm0[0], 'me': cm0[1], 'mae': cm0[2], 'mpe': cm0[3], 'rmse': cm0[4], 'R2': cm0[5], 'imfeatures': [clf.best_estimator_]}

                        df = pd.DataFrame(data=data, index=[0])
                        df.to_csv(directory+resultFileName,
                                  index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                        i = i+1
def main():

    models = ['NN']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    # 'DOcategory', 'pHcategory','ph', 'dissolved_oxygen',
    targets = ['pHcategory']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    n_job = -1
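    # Fit the final pipelines with the previously tuned parameters, persist
    # them with pickle, and evaluate on 100 sequential test folds.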

    for model_name in models:
        print(model_name)

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookOne/output_Cat_' + model_name + '/final_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookOne/output_Reg_' + model_name + '/final_models/'
                data = {
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'CV': 'CV',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mpe': 'mpe',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }

            if not os.path.exists(directory):
                os.makedirs(directory)

            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'

            dfheader = pd.DataFrame(data=data, index=[0])
            dfheader.to_csv(directoryresult + resultFileName,
                            index=False,
                            header=False)

            path = 'Sondes_data/train/train_data/'
            testpath = 'Sondes_data/test/test_data/'
            method = 'OrgData'

            for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                params = func.trained_param_grid['param_grid_' + model_name +
                                                 str(cat)]
                lags = func.getlags_window(
                    model_name,
                    params['param_' + target + '_' + str(PrH_index)], cat)

                files = [
                    f for f in os.listdir(path)
                    if f.endswith('.csv') and f.startswith(sondefilename)
                ]
                file1 = files[0]
                print(' TH: ' + str(PrH_index) + ' ' + method + ' ' + target +
                      ' ' + file1)

                dataset = pd.read_csv(path + file1)
                train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)
                print(input_dim)

                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    train_y_grid = to_categorical(train_y_grid, 3)

                start_time = time.time()

                mo = func.getModel(
                    model_name, input_dim,
                    params['param_' + target + '_' + str(PrH_index)], n_job,
                    cat)

                if model_name == 'RF' or model_name == 'DT':
                    pipeline = Pipeline(steps=[('model', mo)])
                else:
                    pipeline = Pipeline(
                        steps=[('n', StandardScaler()), ('model', mo)])

                # save the model to disk
                filename = model_name+'_model_' + \
                    target+'_'+str(PrH_index)+'.sav'

                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    clf = pipeline.fit(train_X_grid,
                                       train_y_grid,
                                       model__class_weight={
                                           0: 1,
                                           1: 50,
                                           2: 100
                                       })
                else:
                    clf = pipeline.fit(train_X_grid, train_y_grid)

                # joblib.dump(clf, directory+filename)
                pickle.dump(clf, open(directory + filename, 'wb'))

                # To load the model, open the file in reading and binary mode
                # load_lr_model =pickle.load(open(filename, 'rb'))
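                # For example, to reuse the saved pipeline later (assuming the
                # new data went through the same preparation as the test set;
                # new_X below is a hypothetical prepared feature matrix):
                #     with open(directory + filename, 'rb') as f:
                #         loaded_clf = pickle.load(f)
                #     loaded_predictions = loaded_clf.predict(new_X)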

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                start_time = time.time()
                testsondefilename = re.sub('wo_', '', sondefilename)
                files = [
                    f for f in os.listdir(testpath)
                    if f.endswith('.csv') and f.startswith(testsondefilename)
                ]
                file1 = files[0]
                print('Window: ' + str(lags) + ' TH: ' + str(PrH_index) + ' ' +
                      method + ' ' + target + ' ' + file1)

                dataset = pd.read_csv(testpath + file1)

                test_X_grid, test_y_grid, input_dim, features = func.preparedata(
                    dataset, PrH_index, lags, target, cat)

                if cat == 1 and (model_name == 'LSTM' or model_name == 'NN'):
                    test_y_grid = to_categorical(test_y_grid, 3)

                i = 1
                custom_cv = func.custom_cv_kfolds_testdataonly(
                    test_X_grid, 100)
                for test_index in custom_cv:
                    test_X = test_X_grid[test_index]
                    test_y = test_y_grid[test_index]

                    predictions = clf.predict(test_X)

                    if model_name == 'LSTM' or model_name == 'NN':
                        test_y = argmax(test_y, axis=1)
                        # predictions = argmax(predictions, axis=1)

                    if cat == 1:
                        predictions = np.array(predictions).astype(int)
                        test_y = np.array(test_y).astype(int)
                        test_y = test_y.reshape(len(test_y), )
                        predictions = predictions.reshape(len(predictions), )

                    if i % 10 == 0:
                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions,
                                    s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        fpath = filename + '_CV' + str(i) + file1
                        # 'predictions_' + method+target+'_Window' + str(lags) + '_TH'+str(PrH_index) + \'_CV' + str(i)+file1
                        plt.savefig(directoryresult + fpath + '.jpg')

                        plt.close()
                        data = {'Actual': test_y, 'Predictions': predictions}
                        print(test_y.shape)
                        print(predictions.shape)
                        df = pd.DataFrame(data=data)
                        df.to_csv(directoryresult + filename + '_CV' + str(i) +
                                  file1,
                                  index=False)

                    cm0 = func.forecast_accuracy(predictions, test_y, cat)

                    if cat == 1:
                        data = {
                            'target_names': target,
                            'method_names': method,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': filename,
                            'F1_0': cm0[0],
                            'F1_1': cm0[1],
                            'P_0': cm0[2],
                            'P_1': cm0[3],
                            'R_0': cm0[4],
                            'R_1': cm0[5],
                            'acc0_1': cm0[6],
                            'F1_0_1': cm0[7],
                            'F1_all': cm0[8],
                            'fbeta': [cm0[9]]
                        }
                    elif cat == 0:
                        data = {
                            'target_names': target,
                            'method_names': method,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': filename,
                            'mape': cm0[0],
                            'me': cm0[1],
                            'mae': cm0[2],
                            'mpe': cm0[3],
                            'rmse': cm0[4],
                            'R2': cm0[5]
                        }

                    df = pd.DataFrame(data=data, index=[0])
                    df.to_csv(directoryresult + resultFileName,
                              index=False,
                              mode='a',
                              header=False)

                    elapsed_time = time.time() - start_time
                    print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                    i = i + 1
                Kb.clear_session()
                gc.collect()
                del clf
def main():
    method = 'OrgData'

    # Other candidate targets: 'DOcategory', 'pHcategory', 'dissolved_oxygen',
    # 'ysi_blue_green_algae' (has negative values for leavon; it is unclear
    # what a negative reading means)
    targets = ['ph']
    # 'ARIMA', 'SARIMA', 'ETS', 'AR', 'MA'
    models = ['SARIMA']
    path = 'Sondes_data/train_Summer/'
    files = [
        f for f in os.listdir(path)
        if f.endswith(".csv") and f.startswith('leavon')
    ]  # leavon bgsusd_all
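    # Classical time-series baselines on a single target column; for the
    # moving-average ('MA') model, ufunc.movingAverage produces the forecasts
    # on each custom CV fold.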

    for model_name in models:
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                data = {
                    'CV': 'CV',
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'window_nuggets': 'window_nuggets',
                    'config': 'config',
                    'file_names': 'file_names',
                    'F1_0': 'F1_0',
                    'F1_1': 'F1_1',
                    'P_0': 'P_0',
                    'P_1': 'P_1',
                    'R_0': 'R_0',
                    'R_1': 'R_1',
                    'acc0_1': 'acc0_1',
                    'F1_0_1': 'F1_0_1',
                    'F1_all': 'F1_all',
                    'fbeta': 'fbeta'
                }
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                data = {
                    'CV': 'CV',
                    'target_names': 'target_names',
                    'method_names': 'method_names',
                    'temporalhorizons': 'temporalhorizons',
                    'window_nuggets': 'window_nuggets',
                    'config': 'config',
                    'file_names': 'file_names',
                    'mape': 'mape',
                    'me': 'me',
                    'mae': 'mae',
                    'mpe': 'mpe',
                    'rmse': 'rmse',
                    'R2': 'R2'
                }
            if not os.path.exists(directory):
                os.makedirs(directory)

            for file in files:
                print(file)
                result_filename = 'results_'+target + \
                    '_'+file + '_'+str(time.time())+'.csv'
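                # Write the column names once as the header row; per-fold
                # metric rows are appended later with mode='a', header=False.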
                dfheader = pd.DataFrame(data=data, index=[0])
                dfheader.to_csv(directory + result_filename, index=False)
                n_steps = 1
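                # Window size: a single lagged observation is passed to
                # func.preparedata and recorded in the output file names.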

                for PrH_index in [1, 3, 6, 12, 24, 36]:

                    dataset = pd.read_csv(path + file)

                    # Keep only the timestamp columns and the target variable
                    dataset = dataset[['year', 'month', 'day', 'hour', target]]

                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    i = 1

                    if model_name == 'MA':
                        train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                            dataset, PrH_index, n_steps, target, cat)

                        start_time = time.time()
                        # For Train files:
                        custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                        for train_index, test_index in custom_cv:
                            train_X = train_X_grid[train_index]
                            train_y = train_y_grid[train_index]
                            train_X_uni = train_X[:, -1]

                            test_X = train_X_grid[test_index]
                            # actual future values
                            test_X_uni = test_X[:, -1]
                            test_y = train_y_grid[test_index]

                            predictions = ufunc.movingAverage(
                                train_X_uni, train_y, test_X_uni, test_y)
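                            # ufunc.movingAverage is assumed to produce
                            # one-step-ahead moving-average forecasts aligned
                            # with test_y; its exact window handling lives in
                            # the helper module.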

                            df_time = pd.DataFrame({
                                'year':
                                np.array(test_X[:, 0]).astype(int),
                                'month':
                                np.array(test_X[:, 1]).astype(int),
                                'day':
                                np.array(test_X[:, 2]).astype(int),
                                'hour':
                                np.array(test_X[:, 3]).astype(int),
                            })

                            timeline = pd.to_datetime(df_time,
                                                      format='%Y%m%d %H')
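                            # pd.to_datetime assembles timestamps from the
                            # year/month/day/hour columns; the format string
                            # is likely ignored for DataFrame input.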

                            if cat == 1:
                                predictions = np.array(predictions).astype(int)
                                test_y = np.array(test_y).astype(int)

                            # test_y = test_y.reshape(len(test_y),)
                            # predictions = predictions.reshape(
                            #     len(predictions),)

                            cm0 = func.forecast_accuracy(
                                predictions, test_y, cat)

                            filename = file + '_' + \
                                target+'_TH' + \
                                str(PrH_index)+'_lag' + \
                                str(n_steps)+'_'+str(i)

                            plt.scatter(timeline.values, test_y, s=1)
                            plt.scatter(timeline.values, predictions, s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.xticks(rotation=45)

                            directorydeeper = directory + 'more/'
                            if not os.path.exists(directorydeeper):
                                os.makedirs(directorydeeper)
                            plt.savefig(directorydeeper + filename + '.jpg')

                            plt.close()
                            data = {
                                'time': timeline,
                                'Actual': test_y,
                                'Predictions': predictions
                            }
                            df = pd.DataFrame(data=data)

                            df.to_csv(directorydeeper + filename + '.csv',
                                      index=False)

                            if cat == 1:
                                data = {
                                    'CV': i,
                                    'target_names': target,
                                    'method_names': method,
                                    'temporalhorizons': PrH_index,
                                    'window_nuggets': 1,
                                    'file_names': filename,
                                    'F1_0': cm0[0],
                                    'F1_1': cm0[1],
                                    'P_0': cm0[2],
                                    'P_1': cm0[3],
                                    'R_0': cm0[4],
                                    'R_1': cm0[5],
                                    'acc0_1': cm0[6],
                                    'F1_0_1': cm0[7],
                                    'F1_all': cm0[8],
                                    'fbeta': [cm0[9]]
                                }
                            elif cat == 0:
                                data = {
                                    'CV': i,
                                    'target_names': target,
                                    'method_names': method,
                                    'temporalhorizons': PrH_index,
                                    'window_nuggets': 1,
                                    'file_names': filename,
                                    'mape': cm0[0],
                                    'me': cm0[1],
                                    'mae': cm0[2],
                                    'mpe': cm0[3],
                                    'rmse': cm0[4],
                                    'R2': cm0[5]
                                }

                            df = pd.DataFrame(data=data, index=[0])
                            df.to_csv(directory + result_filename,
                                      index=False,
                                      mode='a',
                                      header=False)
                            i = i + 1

                            elapsed_time = time.time() - start_time
                            print(
                                time.strftime("%H:%M:%S",
                                              time.gmtime(elapsed_time)))

                    if model_name in ('ARIMA', 'AR', 'ETS', 'SARIMA', 'BL'):
                        start_time = time.time()
                        train_X_grid = dataset.values
                        custom_cv = ufunc.custom_cv_2folds(
                            train_X_grid, 1, PrH_index)
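                        # ufunc.custom_cv_2folds is assumed to yield
                        # (train_index, test_index) pairs; here a single split
                        # is requested with the prediction horizon PrH_index.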

                        ######################
                        # Cross Validation sets
                        ######################
                        i = 1
                        for train_index, test_index in custom_cv:
                            train_X = train_X_grid[train_index]
                            train_X_uni = train_X[:, -1]

                            test_X = train_X_grid[test_index]
                            # actual future values
                            test_X_uni = test_X[:, -1]

                            df_time = pd.DataFrame({
                                'year':
                                np.array(test_X[:, 0]).astype(int),
                                'month':
                                np.array(test_X[:, 1]).astype(int),
                                'day':
                                np.array(test_X[:, 2]).astype(int),
                                'hour':
                                np.array(test_X[:, 3]).astype(int),
                            })

                            timeline = pd.to_datetime(df_time,
                                                      format='%Y%m%d %H')

                            if model_name == 'BL':

                                # BL is the naive persistence baseline: wrap
                                # the series in DataFrames and shift by one
                                # step, so each prediction is simply the
                                # previous observation.

                                test_X_uni = pd.DataFrame(test_X_uni)
                                target_values = test_X_uni.drop(
                                    test_X_uni.index[0:1], axis=0)
                                target_values.index = np.arange(
                                    0, len(target_values))

                                predictions = test_X_uni.drop(
                                    test_X_uni.index[len(test_X_uni) -
                                                     1:len(test_X_uni)],
                                    axis=0)
                                test_X_uni = target_values
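                                # e.g. a series [a, b, c, d] becomes
                                # predictions = [a, b, c] and
                                # actuals     = [b, c, d].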

                                timeline = timeline.drop(
                                    timeline.index[len(timeline) -
                                                   1:len(timeline)],
                                    axis=0)

                                cm0 = func.forecast_accuracy(
                                    predictions, test_X_uni, cat)

                                filename = file + '_' + \
                                    target+'_TH' + \
                                    str(PrH_index)+'_lag' + \
                                    str(n_steps)+'_'+str(i)

                                plt.scatter(timeline.values, test_X_uni, s=1)
                                plt.scatter(timeline.values, predictions, s=1)
                                plt.legend(['actual', 'predictions'],
                                           loc='upper right')
                                plt.xticks(rotation=45)

                                directorydeeper = directory + 'more/'
                                if not os.path.exists(directorydeeper):
                                    os.makedirs(directorydeeper)
                                plt.savefig(directorydeeper + filename +
                                            '.jpg')

                                plt.close()

                                print(predictions.head())
                                print(test_X_uni.head())
                                print(timeline.head())

                                # data = {'time': timeline,
                                #         'Actual': test_X_uni,
                                #         'Predictions': predictions}
                                frames = [timeline, test_X_uni, predictions]
                                df = pd.concat(frames, axis=1)
                                df.to_csv(
                                    directorydeeper + filename + '.csv',
                                    index=False,
                                    header=['time', 'Actual', 'Predictions'])

                                if cat == 1:
                                    data = {
                                        'CV': i,
                                        'target_names': target,
                                        'method_names': method,
                                        'temporalhorizons': PrH_index,
                                        'window_nuggets': 1,
                                        'file_names': filename,
                                        'F1_0': cm0[0],
                                        'F1_1': cm0[1],
                                        'P_0': cm0[2],
                                        'P_1': cm0[3],
                                        'R_0': cm0[4],
                                        'R_1': cm0[5],
                                        'acc0_1': cm0[6],
                                        'F1_0_1': cm0[7],
                                        'F1_all': cm0[8],
                                        'fbeta': [cm0[9]]
                                    }
                                elif cat == 0:
                                    data = {
                                        'CV': i,
                                        'target_names': target,
                                        'method_names': method,
                                        'temporalhorizons': PrH_index,
                                        'window_nuggets': 1,
                                        'file_names': filename,
                                        'mape': cm0[0],
                                        'me': cm0[1],
                                        'mae': cm0[2],
                                        'mpe': cm0[3],
                                        'rmse': cm0[4],
                                        'R2': cm0[5]
                                    }

                                df = pd.DataFrame(data=data, index=[0])
                                df.to_csv(directory + result_filename,
                                          index=False,
                                          mode='a',
                                          header=False)

                            if model_name == 'AR':
                                predictions = ufunc.AutoRegression(
                                    train_X_uni, test_X_uni)
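                                # ufunc.AutoRegression is assumed to fit an AR
                                # model on the training series and return one
                                # forecast per element of test_X_uni.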
                                if cat == 1:
                                    predictions = np.array(predictions).astype(
                                        int)
                                    test_X_uni = np.array(test_X_uni).astype(
                                        int)

                                cm0 = func.forecast_accuracy(
                                    predictions, test_X_uni, cat)

                                filename = file + '_' + \
                                    target+'_TH' + \
                                    str(PrH_index)+'_lag' + \
                                    str(n_steps)+'_'+str(i)

                                plt.scatter(timeline.values, test_X_uni, s=1)
                                plt.scatter(timeline.values, predictions, s=1)
                                plt.legend(['actual', 'predictions'],
                                           loc='upper right')
                                plt.xticks(rotation=45)

                                directorydeeper = directory + 'more/'
                                if not os.path.exists(directorydeeper):
                                    os.makedirs(directorydeeper)
                                plt.savefig(directorydeeper + filename +
                                            '.jpg')

                                plt.close()
                                data = {
                                    'time': timeline,
                                    'Actual': test_X_uni,
                                    'Predictions': predictions
                                }
                                df = pd.DataFrame(data=data)

                                df.to_csv(directorydeeper + filename + '.csv',
                                          index=False)

                                if cat == 1:
                                    data = {
                                        'CV': i,
                                        'target_names': target,
                                        'method_names': method,
                                        'temporalhorizons': PrH_index,
                                        'window_nuggets': 1,
                                        'file_names': filename,
                                        'F1_0': cm0[0],
                                        'F1_1': cm0[1],
                                        'P_0': cm0[2],
                                        'P_1': cm0[3],
                                        'R_0': cm0[4],
                                        'R_1': cm0[5],
                                        'acc0_1': cm0[6],
                                        'F1_0_1': cm0[7],
                                        'F1_all': cm0[8],
                                        'fbeta': [cm0[9]]
                                    }
                                elif cat == 0:
                                    data = {
                                        'CV': i,
                                        'target_names': target,
                                        'method_names': method,
                                        'temporalhorizons': PrH_index,
                                        'window_nuggets': 1,
                                        'file_names': filename,
                                        'mape': cm0[0],
                                        'me': cm0[1],
                                        'mae': cm0[2],
                                        'mpe': cm0[3],
                                        'rmse': cm0[4],
                                        'R2': cm0[5]
                                    }

                                df = pd.DataFrame(data=data, index=[0])
                                df.to_csv(directory + result_filename,
                                          index=False,
                                          mode='a',
                                          header=False)

                            cfg_list = list()
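                            # ETS/ARIMA/SARIMA share one pattern: each
                            # *_configs() helper enumerates candidate
                            # hyperparameter combinations, and ufunc.score_model
                            # is assumed to fit each config, score it, and
                            # append its own metrics row to result_filename
                            # (the returned scores list is not used further).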
                            if model_name == 'ETS':
                                cfg_list = ufunc.exp_smoothing_configs()
                                scores = [
                                    ufunc.score_model('ETS', train_X_uni,
                                                      test_X_uni, cfg, cat,
                                                      directory, file, target,
                                                      PrH_index, n_steps, i,
                                                      result_filename,
                                                      timeline)
                                    for cfg in cfg_list
                                ]

                            if model_name == 'ARIMA':
                                cfg_list = ufunc.ARIMA_configs()
                                scores = [
                                    ufunc.score_model('ARIMA', train_X_uni,
                                                      test_X_uni, cfg, cat,
                                                      directory, file, target,
                                                      PrH_index, n_steps, i,
                                                      result_filename,
                                                      timeline)
                                    for cfg in cfg_list
                                ]

                            if model_name == 'SARIMA':
                                cfg_list = ufunc.sarima_configs()

                                scores = [
                                    ufunc.score_model('SARIMA', train_X_uni,
                                                      test_X_uni, cfg, cat,
                                                      directory, file, target,
                                                      PrH_index, n_steps, i,
                                                      result_filename,
                                                      timeline)
                                    for cfg in cfg_list
                                ]

                            i = i + 1
                            elapsed_time = time.time() - start_time
                            print(
                                time.strftime("%H:%M:%S",
                                              time.gmtime(elapsed_time)))