def reset_val_batches(self):
    self.val_batches = int(np.floor(self.val_input_data.shape[0] / self.batch_size))

    if self.val_batches > 0:
        self.val_input_batches = self.val_input_data
        self.val_output_batches = self.val_output_data
        self.val_input_batches = self.val_input_batches[:self.batch_size * self.val_batches]
        self.val_output_batches = self.val_output_batches[:self.batch_size * self.val_batches]
    else:
        # Validation set is smaller than one batch: zero-pad up to a full batch
        # (original passed the full shape tuple into np.zeros, which raises; use the sample length)
        self.val_input_batches = np.zeros((self.batch_size, self.val_input_data.shape[1]))
        self.val_output_batches = np.zeros((self.batch_size, self.val_output_data.shape[1]))
        self.val_input_batches = data.reshape_1D_input(self.val_input_batches)
        self.val_input_batches[:self.val_input_data.shape[0]] = self.val_input_data
        self.val_output_batches[:self.val_output_data.shape[0]] = self.val_output_data

    if self.frame_work == 'Tensorflow':
        self.val_input_batches, self.val_output_batches = data.convert_to_tensorflow_minbatch(
            self.val_input_batches, self.val_output_batches, self.batch_size)
def reset_train_batches(self, batch_size=None, num_batches=None):
    '''
    Resets the training batches.

    Example: training set = 1000 samples, batch size = 300

    Framework: Keras
        - The output is a single array holding as many training samples as fit into whole batches.
        - train_input_batches.shape = (900, data_size)

    Framework: Tensorflow
        - The Tensorflow implementation requires each mini-batch to be set explicitly.
        - train_input_batches.shape = (# batches,)
        - Each batch is a numpy array of shape (batch_size, data_size)

    :param batch_size: new batch size (optional)
    :param num_batches: explicit number of batches (optional)
    :return:
    '''
    # Update batch size if passed
    if batch_size is not None:
        self.batch_size = batch_size

    # Calc number of batches
    if num_batches is not None:
        self.num_train_batches = int(num_batches)
    else:
        self.num_train_batches = int(np.floor(self.train_input_data.shape[0] / self.batch_size))

    # Copy all training data
    self.train_input_batches = self.train_input_data
    self.train_output_batches = self.train_output_data

    # Shuffle training data
    self.train_input_batches, self.train_output_batches = data.shuffle_input_output(
        self.train_input_batches, self.train_output_batches)

    # Restrict the training data to the number of samples that fits in the number of batches
    self.train_input_batches = self.train_input_batches[:self.batch_size * self.num_train_batches]
    self.train_output_batches = self.train_output_batches[:self.batch_size * self.num_train_batches]

    if self.frame_work == 'Keras':
        return

    if self.frame_work == 'Tensorflow':
        self.train_input_batches, self.train_output_batches = data.convert_to_tensorflow_minbatch(
            self.train_input_batches, self.train_output_batches, self.batch_size)
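# Usage sketch (illustrative only; assumes a populated instance `dc` of this class
# with train_input_data/train_output_data already loaded, e.g. 1000 samples):
#
#     dc.frame_work = 'Keras'
#     dc.reset_train_batches(batch_size=300)
#     # dc.train_input_batches.shape -> (900, data_size)
#
#     dc.frame_work = 'Tensorflow'
#     dc.reset_train_batches(batch_size=300)
#     # dc.train_input_batches.shape -> (3,), each entry an array of shape (300, data_size)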
def shuffle_data_samples(GraphData):
    ''' Shuffle the training, validation and evaluation data '''
    GraphData.train_input_data, GraphData.train_output_data = data.shuffle_input_output(
        GraphData.train_input_data, GraphData.train_output_data)
    GraphData.val_input_data, GraphData.val_output_data = data.shuffle_input_output(
        GraphData.val_input_data, GraphData.val_output_data)
    GraphData.eval_input_data, GraphData.eval_output_data = data.shuffle_input_output(
        GraphData.eval_input_data, GraphData.eval_output_data)
def combined_1D_onehot(DataCenter, folder_paths, samples=4):
    folder_paths = np.array(folder_paths)
    channels = folder_paths.shape[0]

    plot = research.SubPlot(samples, channels)
    update_plot = 1

    # Iterate over all samples
    for sample in range(samples):
        for channel in range(channels):
            one_hot_labels = data.load_data(folder_paths[channel] + 'one_hot_labels.csv')
            max_curves = data.load_data(folder_paths[channel] + 'max_array_record.csv')
            min_curves = data.load_data(folder_paths[channel] + 'min_array_record.csv')

            participants = max_curves[:, 0]
            max_curves = max_curves[:, 1:]
            min_curves = min_curves[:, 1:]

            # Randomly select input data
            rand_idx = np.random.randint(DataCenter.all_input_data.shape[0])

            # Get output idx
            output_idx = np.argmax(DataCenter.all_output_data[rand_idx])

            # Get participant ID from the one-hot labels
            participant = one_hot_labels[output_idx]
            curve_id = np.array(np.where(participants == participant))[0][0]

            max_curve = np.divide(max_curves[curve_id], DataCenter.input_scale[channel])
            min_curve = np.divide(min_curves[curve_id], DataCenter.input_scale[channel])

            # Plot max/min curves
            plot.current_plot = update_plot
            plot.add_subplot_data(max_curve, add_data_to=update_plot, color='black')
            plot.add_subplot_data(min_curve, add_data_to=update_plot, color='black')

            # Plot input data
            input_data = DataCenter.all_input_data[rand_idx][:, channel]
            plot.add_subplot_data(input_data, add_data_to=update_plot, color='blue')

            update_plot += 1

    plot.show_plow()
def load_all_data_multiple(self, data_folder, data_files):
    print('Loading Data from multiple CSV files')
    print('Loading from {}. {} Left'.format(data_folder + data_files[0], len(data_files)))
    self.data_location = data_folder
    self.all_data = data.load_data(data_folder + data_files[0])
    print('Current Samples = {}'.format(self.all_data.shape[0]))

    for i in range(1, len(data_files)):
        print('Loading from {}. {} Left'.format(data_folder + data_files[i], len(data_files) - i))
        new_data = data.load_data(data_folder + data_files[i])
        self.all_data = np.concatenate([self.all_data, new_data], axis=0)
        print('Current Samples = {}'.format(self.all_data.shape[0]))
def augment_1D_squeeze_stretch(self, squeeze=0.98, stretch=1.02, steps=3):
    print('Augmenting - Squeeze & Stretch \n'
          'Inefficient implementation, perform before other augmentation')
    self.train_input_data, self.train_output_data = data.augment_1D_squeeze_stretch(
        self.train_input_data, self.train_output_data, squeeze, stretch, steps)
    self.print_num_samples()
def balance_batch_for_dual_sided_one_hot(self):
    print('Balancing Batches for Dual Sided One Hot Array')
    if self.one_hot_balance_rate is None:
        self.one_hot_balance_rate = 1
    self.load_data()
    self.train_input_data, self.train_output_data = data.balance_batch_for_dual_sided_one_hot(
        self.train_input_data, self.train_output_data)
def calc_eval_siamese_batches(self):
    self.unique_ids = data.calc_unique_ids(self.all_output_data)
    (self.siamese_eval_input_left, self.siamese_eval_input_right,
     self.siamese_eval_output_batches, self.siamese_eval_left_idx,
     self.siamese_eval_right_idx) = data.calc_siamese_batches(
        self.eval_input_data, self.eval_output_data, self.unique_ids,
        self.num_eval_batches, self.batch_size, reshape=False)
def augment_1D_squash_pull(self, squash=0.98, pull=1.02, steps=10, type='multiply'):
    print('Augmenting - Squash & Pull')
    self.train_input_data, self.train_output_data = data.augment_1D_squash_pull(
        self.train_input_data, self.train_output_data, squash, pull, steps, type)
    self.print_num_samples()
def reshape_channel(data):
    channels = data.shape[-1]
    if channels == 1:
        if data.shape[channels + 1] == 1:
            # Drop the trailing singleton channel axis: (samples, length, 1) -> (samples, length)
            data = data.reshape(data.shape[0], data.shape[1])
    else:
        # Raising a bare string is invalid in Python 3; use a proper exception
        raise NotImplementedError('To Do: Code needed for more than 1 dimension')
    return data
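# Usage sketch (illustrative): reshape_channel drops a trailing singleton channel axis, e.g.
#
#     x = np.zeros((32, 100, 1))
#     reshape_channel(x).shape  # -> (32, 100)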
def contin_one_hot_output(self):
    self.all_output_data, self.one_hot_range = data.create_continuous_one_hot_array(
        self.all_output_data, self.one_hot_val_min, self.one_hot_val_max, self.one_hot_length)

    # Save the one-hot range
    np.savetxt(self.data_location + self.file_prefix + 'one_hot_labels.csv',
               self.one_hot_range, delimiter=',')
    np.savetxt(self.folder_path + self.file_prefix + 'one_hot_labels.csv',
               self.one_hot_range, delimiter=',')
def one_hot_output(self, column, concat=False):
    self.all_output_data, self.one_hot_labels = data.one_hot_output(
        self.all_output_data, column, concat=concat)

    # Save the one-hot labels
    np.savetxt(self.data_location + self.file_prefix + 'one_hot_labels.csv',
               self.one_hot_labels, delimiter=',')
    np.savetxt(self.folder_path + self.file_prefix + 'one_hot_labels.csv',
               self.one_hot_labels, delimiter=',')
    print(self.folder_path + self.file_prefix + 'one_hot_labels.csv')
def split_train_val_eval(self, val_split=0.3, eval_split=0, shuffle=False):
    if shuffle:
        self.all_input_data, self.all_output_data = data.shuffle_input_output(
            self.all_input_data, self.all_output_data)

    total_samples = int(self.all_input_data.shape[0])

    # Split into training/validation/evaluation data
    self.eval_samples = int(self.all_input_data.shape[0] * eval_split)
    self.val_samples = int(self.all_input_data.shape[0] * val_split)
    self.train_samples = int(self.all_input_data.shape[0] - self.eval_samples - self.val_samples)

    print('Train Samples = {} ({}%), Val Samples = {} ({}%), Eval Samples = {} ({}%)'.format(
        self.train_samples, np.round(self.train_samples / total_samples * 100, 2),
        self.val_samples, np.round(self.val_samples / total_samples * 100, 2),
        self.eval_samples, np.round(self.eval_samples / total_samples * 100, 2)))

    # Evaluation data comes from the end of the array, validation from the slice
    # just before it, and training from the start
    self.eval_input_data = self.all_input_data[total_samples - self.eval_samples:]
    self.eval_output_data = self.all_output_data[total_samples - self.eval_samples:]

    self.val_input_data = self.all_input_data[
        total_samples - self.eval_samples - self.val_samples:total_samples - self.eval_samples]
    self.val_output_data = self.all_output_data[
        total_samples - self.eval_samples - self.val_samples:total_samples - self.eval_samples]

    self.train_input_data = self.all_input_data[:total_samples - self.eval_samples - self.val_samples]
    self.train_output_data = self.all_output_data[:total_samples - self.eval_samples - self.val_samples]
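# Usage sketch (illustrative only; `dc` is assumed to be an instance with
# all_input_data/all_output_data already populated, e.g. 1000 samples):
#
#     dc.split_train_val_eval(val_split=0.2, eval_split=0.1, shuffle=True)
#     # -> Train Samples = 700 (70%), Val Samples = 200 (20%), Eval Samples = 100 (10%)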
def load_data(self):
    path = self.folder_path + self.file_prefix
    print('Loading data from {}'.format(path))

    self.train_input_data = np.load(path + 'training_input_data.npy')
    self.val_input_data = np.load(path + 'validation_input_data.npy')
    self.eval_input_data = np.load(path + 'evaluation_input_data.npy')

    self.train_output_data = np.load(path + 'training_output_data.npy')
    self.val_output_data = np.load(path + 'validation_output_data.npy')
    self.eval_output_data = np.load(path + 'evaluation_output_data.npy')

    try:
        self.one_hot_labels = data.load_data(self.folder_path + self.file_prefix + 'one_hot_labels.csv')
    except Exception:
        print('No one-hot labels')

    self.print_num_samples()
def shuffle_training_only(self):
    print('Shuffling Training Data')
    self.train_input_data, self.train_output_data = data.shuffle_input_output(
        self.train_input_data, self.train_output_data)
def augment_add_noise(self, std_dev=0.01):
    print('Augmenting - Adding Gaussian Noise')
    self.train_input_data = data.augment_add_noise(self.train_input_data, std_dev)
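# Illustrative sketch only: data.augment_add_noise is assumed to perturb each
# training sample with zero-mean Gaussian noise, roughly equivalent to:
#
#     noisy_inputs = train_input_data + np.random.normal(
#         loc=0.0, scale=std_dev, size=train_input_data.shape)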
def load_all_data_single(self, data_folder, data_file):
    print('Loading Data from CSV file')
    self.data_location = data_folder
    self.all_data = data.load_data(data_folder + data_file)
    self.all_data = np.nan_to_num(self.all_data)
def augment_1D_left_right(self, left=6, right=6, step=1):
    print('Augmenting Data Left and Right. New Samples =')
    self.train_input_data, self.train_output_data = data.augment_1D_left_right(
        self.train_input_data, self.train_output_data, left, right, step)
    self.print_num_samples()
def scale_multi_chan_input(self, scale=None):
    self.all_input_data, self.input_scale = data.scale_multi_chan_input(
        self.all_input_data, scale=scale)
def calc_train_siamese_batches(self):
    self.unique_ids = data.calc_unique_ids(self.all_output_data)
    (self.siamese_train_input_batches_left, self.siamese_train_input_batches_right,
     self.siamese_train_output_batches, self.siamese_train_left_idx,
     self.siamese_train_right_idx) = data.calc_siamese_batches(
        self.train_input_data, self.train_output_data, self.unique_ids,
        self.num_train_batches, self.batch_size)
def cut_input_data_seq_length(self, out_length=None):
    self.all_input_data = data.cut_input_data_seq_length(self.all_input_data, out_length)
def export_val_one_hot_predictions(DataCenter, model):
    val_predictions = predict(DataCenter, model, DataCenter.val_input_batches)
    val_true = DataProcess.combine_batches(DataCenter.val_output_batches)
    val_true_arg_max = np.argmax(val_true, axis=1)
    return np.concatenate([val_true_arg_max.reshape(-1, 1), val_predictions], axis=1)
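# Note on export_val_one_hot_predictions: each row of the returned array stacks the
# true class index in column 0 followed by the model's per-class predictions,
# i.e. row i = [argmax(true_i), pred_i_0, pred_i_1, ...].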
def reshape_1D_input(self):
    self.all_input_data = data.reshape_1D_input(self.all_input_data)
def integrate_input_curve(self, col_start=None, col_end=None):
    self.all_output_data = data.integrate_input_curve(
        self.all_input_data, col_start=col_start, col_end=col_end)
def dynamic_updating_continuous_mse_loss(self):
    self.all_output_data, self.dyn_mse_shift = data.continuous_mse_loss(
        self.all_output_data, self.dyn_mse_base_width, self.dyn_mse_power,
        self.dyn_mse_top_width, self.dyn_mse_offset)
def split_input_output_data(self, num_outputs, output_first=True):
    # Split into input and output data
    self.all_input_data, self.all_output_data = data.split_input_output_data(
        self.all_data, num_outputs, output_first)
    assert self.all_input_data.shape[0] == self.all_output_data.shape[0]
def padd_one_hot_output(self, pad_reduce):
    self.all_output_data = data.padd_one_hot_array(self.all_output_data, pad_reduce)
def restrict_to_ids(self, ids, column=0):
    print(self.all_output_data.shape)
    self.all_input_data, self.all_output_data = data.restrict_to_ids(
        self.all_input_data, self.all_output_data, ids, column)
    print(self.all_output_data.shape)
def scale_outputs(self, scale_type='max'):
    self.all_output_data, self.output_scale = data.scale_outputs(
        self.all_output_data, scale_type)
def export_val_mse_predictions(DataCenter, model):
    print('Exporting Val Predictions')
    val_predictions = predict(DataCenter, model, DataCenter.val_input_batches)
    val_true = DataProcess.combine_batches(DataCenter.val_output_batches)
    return val_predictions, val_true