from _0_DataCreation.Read_Data import load_dataframe
from Scoring.scoring_func import f1_scores_plot
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from General.Paths import Gitlab_Path

## Logistic regression on the "last" features (port of the R analysis)

# Cross-validation folds and the hold-out test set.
fold1_df = load_dataframe(filename='fold1_NA_features.dat')
fold2_df = load_dataframe(filename='fold2_NA_features.dat')
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
testset_df = load_dataframe(filename='testSet_NA_features.dat')

# First ten PCA components plus a handful of raw and NA-indicator features.
subset_cols = ['pca_{}_last'.format(i) for i in range(1, 11)]
subset_cols += ['R_VALUE_last', 'XR_MAX_last', 'NA_satellite_last',
                'NA_SHARPmask_last', 'NA_Rmask_last', 'NA_XR_MAX_last']

fold1_subset = fold1_df[subset_cols]
fold2_subset = fold2_df[subset_cols]
fold3_subset = fold3_df[subset_cols]
testset_subset = testset_df[subset_cols]

# Train on fold 1 only.
LR = LogisticRegression()
LR.fit(fold1_subset, fold1_df['label'])

# Score on fold 2.
my_preds = LR.predict_proba(fold2_subset)[:, 1]
true_vals = fold2_df['label']
temp = f1_scores_plot(my_preds, true_vals)  # almost the same as R; slightly below

# Predictions on fold 3 (with resize, to extract the best score)
my_preds = LR.predict_proba(fold3_subset)[:, 1]
### To load and save pickle objects
import pickle


def save_obj(obj, name):
    """Serialize *obj* to the file '<name>.pkl' using the highest pickle protocol."""
    path = name + '.pkl'
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    """Deserialize and return the object stored in '<name>.pkl'."""
    path = name + '.pkl'
    with open(path, 'rb') as handle:
        obj = pickle.load(handle)
    return obj


### Load the per-fold data sets and keep only the '*_last' feature columns

fold1_df = load_dataframe(filename='fold1_NA_all_last.dat')
fold2_df = load_dataframe(filename='fold2_NA_all_last.dat')
fold3_df = load_dataframe(filename='fold3_NA_all_last.dat')
test_df = load_dataframe(filename='testSet_NA_all_last.dat')

# Labels are not needed here; drop them from the training folds before stacking.
del fold1_df['label'], fold2_df['label'], fold3_df['label']

# Stack all folds plus the test set into a single frame, then free the parts.
all_sets = pd.concat([fold1_df, fold2_df, fold3_df, test_df])
del fold1_df, fold2_df, fold3_df, test_df

# Keep only columns whose names end in 'last'.
last_cols = [col for col in all_sets.columns.values if col.endswith('last')]
fold_last = all_sets[last_cols]
del all_sets

# Strip the trailing '_last' suffix from the column names.
fold_last.columns = [col[:-5] for col in fold_last.columns]
    # NOTE(review): this fragment is indented as if it belonged inside an
    # enclosing function that is not present in this file; the names
    # my_model, train_gen, valid_gen, batch_generator, Data_Path, n_lines,
    # num_features and batch_size are all undefined here. Restore the
    # enclosing definition from the original script before running.
    my_model.fit_generator(generator=train_gen,
                           validation_data=valid_gen,
                           steps_per_epoch = np.ceil( (n_lines['fold1'] + n_lines['fold2'])/ batch_size), #when should it stop the epoch and start the next one?
                           validation_steps= np.ceil( n_lines['fold3']/ batch_size), #when should it stop the epoch and start the next one?
                           epochs=3)
    
    #Create a fresh fold-3 generator for prediction
    valid_gen = batch_generator(filename=Data_Path + '/fold3_NA.dat',
                                batch_size=batch_size,
                                num_features=num_features)
    
    preds = my_model.predict_generator(valid_gen,
                                       steps = np.ceil( n_lines['fold3']/ batch_size)
                                       ).flatten()
    
    true_vals = load_dataframe(filename = 'fold3_NA_features.dat')['label']
    preds = preds[:len(true_vals)] #we predict np.ceil(...) steps, so the generator wraps around; truncate the extras
    
    f1_scores_plot(preds,true_vals)
    f1_scores_plot(preds,true_vals,resize = True)
    








from _0_DataCreation.Read_Data import load_dataframe
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
from time import time

## Build balanced training subsets for an ensemble of random forests

fold1_df = load_dataframe(filename='fold1_NA_features.dat')
fold2_df = load_dataframe(filename='fold2_NA_features.dat')

# The id column carries no predictive information.
del fold1_df['id']
del fold2_df['id']

#fold1_df = fold1_df[['label','pca_1_last','pca_2_last','pca_3_last','pca_4_last','pca_5_last','pca_6_last','pca_7_last','pca_8_last','pca_9_last','pca_10_last']]
#fold2_df = fold2_df[['label','pca_1_last','pca_2_last','pca_3_last','pca_4_last','pca_5_last','pca_6_last','pca_7_last','pca_8_last','pca_9_last','pca_10_last']]

## Split the training fold by label.
one_rows = fold1_df.loc[fold1_df['label'] == 1, :]
zero_rows = fold1_df.loc[fold1_df['label'] == 0, :]

## Partition the majority (zero) class into n_rfs roughly equal chunks,
## one chunk per regressor; the final chunk absorbs the remainder.
n_rfs = int(np.ceil(len(zero_rows) / len(one_rows)))  #We make 6 regressors
n_samples = int(round(len(zero_rows) / n_rfs, 0))
trains = [zero_rows.iloc[i * n_samples:(i + 1) * n_samples, :]
          for i in range(n_rfs - 1)]
trains.append(zero_rows.iloc[n_samples * (n_rfs - 1):, :])

## Take equal amounts of one rows and zero rows, and train random forests
n_features = int(len(fold1_df.columns) / 4)