def extract_features_training(self):
    '''
    This function creates the dataset using the 'load_data' function from
    'Data.Read_Data'. It operates line-by-line, so it should be able to handle
    an unlimited number of lines in the data files.
    '''
    p0 = time.time()

    ### Calculate features one id at a time
    for dataset in self.datasets:
        load_file_path = Data_Path + '/' + dataset + '.dat'
        save_file_path = self.save_path + '/' + dataset + '_features.dat'

        # Delete existing file
        print('save path: ' + save_file_path)
        if os.path.exists(save_file_path):
            os.remove(save_file_path)

        with open(save_file_path, 'wb') as save_file:
            ## Create the header
            header = ['id', 'label']
            for feature in self.features:
                header += [variable + '_' + feature for variable in self.variables]
            self.header = header
            pickle.dump(header, save_file)

            ## Calculate data
            for id, label, data in load_data(filename=load_file_path, max_row=-1):
                # Extract subset
                data = np.array(data[self.variables])

                # Create data_point
                data_point = [id, label]
                for feature in self.features:
                    feature_func = getattr(self, self.FEATURES[feature])
                    data_point += list(feature_func(ys=data))

                # Save data_point
                pickle.dump(data_point, save_file)
                #save_file.write(str(data_point[1:-1]))

                if (id % 5000) == 0:
                    print('Dataset: ' + dataset +
                          ', Line: ' + str(id) + ' out of ' + str(self.n_lines[dataset]) +
                          ', Time: ' + str(int(time.time() - p0)) + 's')
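# A minimal sketch (not part of the original source) of how a '<dataset>_features.dat'
# file written by extract_features_training can be read back: the file holds one
# pickled header list followed by one pickled data_point list per id, so it is
# consumed by calling pickle.load repeatedly until EOFError. The helper name is
# illustrative only.
import pickle

def iter_feature_rows(path):
    '''Yield the data_point lists stored after the header record.'''
    with open(path, 'rb') as f:
        header = pickle.load(f)  # first record: ['id', 'label', '<variable>_<feature>', ...]
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                break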
def scale_norm_and_pca_test(self):
    '''
    This function scales, normalizes, and PCA-transforms the dataset using the
    'load_data' function from 'Data.Read_Data'. It operates line-by-line, so it
    should be able to handle an unlimited number of lines in the data files.
    '''
    p0 = time.time()

    ### Scale, normalize and PCA one id at a time
    for dataset in self.datasets:
        load_file_path = Data_Path + '/' + dataset + '.dat'
        save_file_path = self.save_path + '/' + dataset + '_pca.dat'
        save_file_path_no_pca = self.save_path + '/' + dataset + '_no_pca.dat'

        # Delete existing file
        print('save path: ' + save_file_path)
        if os.path.exists(save_file_path):
            os.remove(save_file_path)

        # Delete existing file
        print('save path: ' + save_file_path_no_pca)
        if os.path.exists(save_file_path_no_pca):
            os.remove(save_file_path_no_pca)

        with open(save_file_path, 'wb') as save_file, \
             open(save_file_path_no_pca, 'wb') as save_file_no_pca:
            ## Calculate data
            for id, data in load_data(filename=load_file_path, max_row=-1):
                # Scale, normalize and apply PCA
                scaled_data = scale_df(data)
                normalized_data = self._normalize(scaled_data)
                pca_data = self._pca(normalized_data)

                # Save data_point
                pickle.dump((id, normalized_data), save_file_no_pca)
                pickle.dump((id, pca_data), save_file)

                if (id % 5000) == 0:
                    print('Dataset: ' + dataset +
                          ', Line: ' + str(id) + ' out of ' + str(self.n_lines[dataset]) +
                          ', Time: ' + str(int(time.time() - p0)) + 's')
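# _normalize and _pca are instance methods not shown in this section. As a rough,
# assumption-laden stand-in for the scale -> normalize -> PCA step, the sketch below
# fits a StandardScaler and a PCA on stacked training frames and applies them to a
# single per-id frame; the real methods are presumably fitted on the training split
# before scale_norm_and_pca_test runs, and the helper names and n_components value
# here are illustrative only.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def fit_norm_and_pca(train_frames, n_components=10):
    '''Fit a scaler and PCA on the rows of all training frames (illustrative helper).'''
    stacked = np.vstack([frame.values for frame in train_frames])
    scaler = StandardScaler().fit(stacked)
    pca = PCA(n_components=n_components).fit(scaler.transform(stacked))
    return scaler, pca

def apply_norm_and_pca(frame, scaler, pca):
    '''Transform one per-id frame the way _normalize/_pca plausibly do.'''
    normalized = scaler.transform(frame.values)
    return pca.transform(normalized)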
def handle_NA_training(self):
    '''
    This function handles missing values (NAs) in the dataset using the 'load_data'
    function from 'Data.Read_Data'. It operates line-by-line, so it should be able
    to handle an unlimited number of lines in the data files.
    '''
    p0 = time.time()

    ### Handle missing values one id at a time
    for dataset in self.datasets:
        load_file_path = Data_Path + '/' + dataset + '.dat'
        save_file_path = self.save_path + '/' + dataset + '_NA.dat'

        # Delete existing file
        print('save path: ' + save_file_path)
        if os.path.exists(save_file_path):
            os.remove(save_file_path)

        with open(save_file_path, 'wb') as save_file:
            ## Calculate data
            for id, label, data in load_data(filename=load_file_path, max_row=-1):
                # There are 3 types of NA:
                # 1) There is no satellite data (all rows are NA)
                # 2) The SHARP mask is empty (some NAs, the rest are zeros except XR_MAX)
                # 3) The R mask is empty (R is zero)

                # For later use
                non_division_vars = data.columns.difference(['XR_MAX'] + self.division_variables)
                non_XR = data.columns.difference(['XR_MAX'])

                ## Check if satellite data is missing (everything but XR_MAX has NAs)
                NA_satellite_index = data.index[data[non_XR].isnull().all(axis=1)]
                data['NA_satellite'] = 0
                data.loc[NA_satellite_index, 'NA_satellite'] = 1
                '''
                if len(NA_satellite_index) > 0:
                    self.save_set = data
                    print('missing satellite')
                    return
                '''

                ## Check if no SHARP mask (self.division_variables are NaN, the rest are zero)
                indices = ((data[non_division_vars] == 0).all(axis=1) &
                           data[self.division_variables].isna().all(axis=1))
                data['NA_SHARPmask'] = 0
                data.loc[indices, 'NA_SHARPmask'] = 1
                '''
                if len(indices) > 0:
                    self.save_set = data
                    print('missing sharp')
                    return
                '''

                ## Check if no R mask
                data['NA_Rmask'] = 0
                data.loc[data['R_VALUE'] == 0, 'NA_Rmask'] = 1

                ## Check if no XR data
                data['NA_XR_MAX'] = 0
                data.loc[data['XR_MAX'] == -99999, 'NA_XR_MAX'] = 1

                ### Find rows with NA despite having both satellite data and a SHARP mask (likely very few)
                NA_indices = data.isna().any(axis='columns')
                NA_butsharp = (NA_indices &
                               (data['NA_SHARPmask'] == 0) &
                               (data['NA_satellite'] == 0))
                #if sum(NA_butsharp) > 0:
                #    print('id: ' + str(id) + ' has NA')

                ## Replace -99999 with NA in XR_MAX
                data.loc[data['XR_MAX'] == -99999, 'XR_MAX'] = float('NaN')
                data = self._NA_linear_interpolate(data)  # Replaces full NA-rows with 0

                # Save data_point
                pickle.dump((id, label, data), save_file)
                #save_file.write(str(data_point[1:-1]))

                if (id % 5000) == 0:
                    print('Dataset: ' + dataset +
                          ', Line: ' + str(id) + ' out of ' + str(self.n_lines[dataset]) +
                          ', Time: ' + str(int(time.time() - p0)) + 's')
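# _NA_linear_interpolate is not shown in this section. Judging by the trailing comment
# above ("Replaces full NA-rows with 0"), a plausible sketch using pandas interpolation
# is given below; the helper name and exact behaviour are assumptions, not the
# original implementation.
import pandas as pd

def na_linear_interpolate(df: pd.DataFrame) -> pd.DataFrame:
    '''Linearly interpolate NAs along the time axis, then zero out whatever is left
    (e.g. rows/columns that were entirely NA). Illustrative helper only.'''
    out = df.interpolate(method='linear', limit_direction='both')
    return out.fillna(0)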
def scale_df(df):
    # Apply a signed power transform to each heavy-tailed column
    # (the enclosing function and column loop are reconstructed from how
    # scale_df is called elsewhere in this code).
    for col in df.columns:
        if col in ["TOTUSJH", "TOTBSQ", "TOTPOT", "TOTUSJZ",
                   "ABSNJZH", "SAVNCPP", "USFLUX", "XR_MAX"]:
            df[col] = scale_series(df[col], lambda x: np.sign(x) * np.abs(x)**0.2)
        if col in ["MEANPOT"]:
            df[col] = scale_series(df[col], lambda x: np.sign(x) * np.abs(x)**0.1)
        if col in ["TOTFZ", "TOTFY", "TOTFX"]:
            df[col] = scale_series(df[col], lambda x: np.sign(x) * np.abs(x)**0.5)
    return df


def scale_series(series, f):
    return np.fromiter((f(x) for x in series), series.dtype)

# [sign(x) * abs(x)^(1/5)  for x in ["TOTUSJH", "TOTBSQ", "TOTPOT", "TOTUSJZ", "ABSNJZH", "SAVNCPP", "USFLUX", "XR_MAX"]]
# [sign(x) * abs(x)^(1/10) for x in ["MEANPOT"]]
# [sign(x) * abs(x)^(1/2)  for x in ["TOTFZ", "TOTFY", "TOTFX"]]

if __name__ == '__main__':
    from _0_DataCreation.Read_Data import load_data, fn

    id, label, df = next(load_data(filename=fn, max_row=1))
    df2 = scale_df(df)
    print(df)
    print((df == df2).all().all())
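# Quick sanity check of the signed power transform used in scale_df (example values
# are illustrative, not taken from the dataset): the sign is preserved and the
# magnitude is compressed, e.g. |-32|^(1/5) = 2.
if __name__ == '__main__':
    x = np.array([-32.0, 0.0, 32.0])
    print(scale_series(x, lambda v: np.sign(v) * np.abs(v) ** 0.2))  # roughly [-2.  0.  2.]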
rand_id = random.randint(1, 99)
n_ones = 10
n_zeros = 10

# Take a random file and random index (remember the ids are already shuffled,
# so using up to 100 ids should be fine)
time_series_ones = []
time_series_zeros = []
zero_count = 0
one_count = 0
begin_id = 0
while True:
    if one_count == n_ones and zero_count == n_zeros:
        break
    for id, label, data in load_data(filename=file_path, max_row=100):
        if id > begin_id:
            begin_id = id
            if label == 1 and one_count < n_ones:
                one_count += 1
                time_series_ones.append(data['pca_1'])
                break
            elif label == 0 and zero_count < n_zeros:
                zero_count += 1
                time_series_zeros.append(data['pca_1'])
                break

time_series_ones = np.array(time_series_ones)
time_series_zeros = np.array(time_series_zeros)

plt.plot(np.arange(1, time_series_zeros.shape[1] + 1) / 5,
def Load_Main_Dataset(Dataset: str, max_row=-1) -> Generator:
    return load_data(filename=Data_Path + '/' + Dataset + '_NA.dat',
                     max_row=max_row)
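# Hedged usage example: stream the first few (id, label, data) tuples of one split,
# matching the tuples pickled into the '<dataset>_NA.dat' files above. The split
# name 'fold1' is a placeholder, not taken from this section.
if __name__ == '__main__':
    for id, label, data in Load_Main_Dataset('fold1', max_row=5):
        print(id, label, data.shape)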