# --- load the hydrography fields from the HDF5 data file ---
# NOTE(review): assumes `data_path`, `filename`, `np`, `h5py`, `remove_outliers`
# are defined/imported earlier in this file — confirm against full source.
f = h5py.File(data_path + filename, 'r')
N2 = f['N2'][:]
CT = f['CT'][:]
SA = f['SA'][:]
eps = f['eps'][:]
z = f['z'][:]
Np = np.shape(eps)[0]  # sample count along the first axis of eps

# *** raw data pdfs ***
#raw_pdf_plot( N2, SA, CT, eps )

# *** remove outliers ***
N2, SA, CT, eps, z = remove_outliers(N2, SA, CT, eps, z)

# *** pdfs and stats ***
#pdf_plot( N2, SA, CT, eps )
#[eps_mu,eps_sig,eps_sk,eps_fl,eps_min,eps_max] = pdf( eps )

# *** 2D contour plots ***
#contour_plots( N2, SA, CT, eps )

# *** 3D plots ***
print(type(CT))  # debug: check the array type after cleaning
zdata = np.log10(eps)  # log-scale the dissipation values for plotting
ydata = CT
import functions as f

# --- open the dataset ---
dataset_path = './dataset/'
dataset = pd.read_csv(dataset_path + 'not_standardized_dataset_drop.csv')

plot = False  # toggle the before/after outlier histograms

if plot:
    # left panel: raw target distribution (outliers still present)
    plt.subplot(121)
    plt.hist(dataset['median_house_value'], bins='auto')
    plt.title('with outliers')

# remove outliers
dataset = f.remove_outliers(dataset)

if plot:
    # right panel: cleaned distribution — no saturation of data
    plt.subplot(122)
    plt.hist(dataset['median_house_value'], bins='auto')
    plt.title('without outliers')
    plt.draw()
    # NOTE(review): nesting of this call under `if plot:` is inferred from the
    # collapsed source — confirm against the original file's indentation.
    f.correlation_plot(dataset)

# --- ridge regression with nested cross validation ---
X = dataset.drop(columns=['median_house_value']).to_numpy()  # data matrix
y = dataset['median_house_value'].to_numpy()  # labels vector
# add ones column to X
# open the dataset dataset_path = './dataset/' datasets = [ 'standardized_dataset.csv', 'not_standardized_dataset.csv', 'standardized_dataset_drop.csv', 'not_standardized_dataset_drop.csv' ] cv_type = 'nested' for n, name in enumerate(datasets): dataset = pd.read_csv(dataset_path + name) scores = [] # list of scores w/ and w/o outliers for i in range(2): # the first loop is with outliers if i == 1: # the second one is without them dataset = f.remove_outliers(dataset) # outliers removal X = dataset.drop( columns=['median_house_value']).to_numpy() # data matrix y = dataset['median_house_value'].to_numpy() # labels vector # add ones column to X ones = np.ones((X.shape[0], 1)) X = np.hstack((ones, X)) if cv_type == 'k-folds': # k-folds cross validation print('K-FOLDS CROSS VALIDATION') k = 10 # number of folds scores.append(f.k_foldsCV(X, y, k,
weather = join_meta_data(weather, meta_data, chosen_building) # include the building ID for joining dataframes later weather['building_id'] = [chosen_building] * weather.shape[0] weather = weather.reset_index().set_index(keys = ["timestamp", "building_id"]) dataframe_list.append(weather) weather_dataframe = pd.concat(dataframe_list) print("\nBUILDING TRAINING DATA") print("Reading dataset...") data = read_timeseries_data(f"{raw_folder}train.csv") print("Processing dataset...") # process outliers of the whole dataset data, q_high, q_low = remove_outliers(data, data_retention=0.999) print(f"Outlier limits: {q_low}, {q_high}") dataframe_list= [] for chosen_building in range(0, meta_data.shape[0]): if chosen_building%50==0: print(f"We're on building #{chosen_building}...") chosen_site = meta_data.loc[meta_data.building_id == chosen_building, "site_id"].values[0] # removing sites as identified in data exploration if chosen_site is 7 or chosen_site == 9: continue building = data.loc[data.building_id == chosen_building].copy() building = electricity_conversion(building)
count = 0
plot_filenames = []

# every regular file in data_path is one time step
onlyfiles = [f for f in listdir(data_path) if isfile(join(data_path, f))]
Nfiles = np.shape(onlyfiles)[0]  # number of files (time steps)
Nfiles = 20   # override: restrict the run to a small subset of files
Noffset = 10  # skip the first Noffset files
Nfiles0 = Nfiles

for j in range(Noffset, Nfiles + Noffset):
    my_file = data_path + '/' + onlyfiles[j]
    print('file =', my_file)
    # read, clean, and depth-filter one file's worth of profiles
    N2j, SAj, CTj, epsj, zj = fn.get_hydro(my_file, count)
    N2j, SAj, CTj, epsj, zj = fn.nanrid(N2j, SAj, CTj, epsj, zj)
    N2j, SAj, CTj, epsj, zj = fn.remove_outliers(N2j, SAj, CTj, epsj, zj)
    N2j, SAj, CTj, epsj, zj = fn.throw_points_in_z(
        N2j, SAj, CTj, epsj, zj, -2500.)
    # append this file's samples to the running arrays
    # (N2, SA, CT, eps, z are presumably initialized above this chunk — verify)
    N2 = np.concatenate((N2, N2j), axis=0)
    SA = np.concatenate((SA, SAj), axis=0)
    CT = np.concatenate((CT, CTj), axis=0)
    eps = np.concatenate((eps, epsj), axis=0)
    z = np.concatenate((z, zj), axis=0)
    count += 1
    #plot_filenames = np.append(plot_filenames,my_file)

#fn.pdf_plot( N2, SA, CT, eps, z )
NSAMPLE = np.shape(N2)[0]

# =============================================================================
# plot training data
# column layout of the wine table; the last column is the target
columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]
feature_cols = columns[:-1]  # every column except 'quality'

# 2.2. Preparing data — two cleaned variants of the raw frame
df_wines = {
    "remove_outliers": remove_outliers(data),
    "replace_outliers": change_outliers_by_median(data),
}

plot_dataframe_columns(data, "Box plots by column with raw data")
plot_dataframe_columns(df_wines["remove_outliers"],
                       "Box plots when we remove the outliers")
plot_dataframe_columns(df_wines["replace_outliers"],
                       "Box plots when we replace the outliers by the median")

plot_scatter_matrix(df_wines["remove_outliers"])
plot_scatter_matrix(df_wines["replace_outliers"])

correlation1 = correlation_table(df_wines["remove_outliers"])
correlation2 = correlation_table(df_wines["replace_outliers"])