def data_reader(self, data_filepath, label_filepath, train, should_batch=True, shuffle=True):
    input_data, labels = read_npy(data_filepath), read_npy(label_filepath)

    label_to_use = 0  # Sober samples
    zeros_ids = [idx for idx, x in enumerate(labels) if x == label_to_use]
    input_data = input_data[zeros_ids]

    if train:
        for x in input_data:
            self._min = min(np.min(x), self._min)
            self._max = max(np.max(x), self._max)

        random.shuffle(input_data)

        print('Total data ', len(input_data))
        print('Total data ', len(input_data), file=self.log_file)

    # Normalizing `input_data` on train dataset's min and max values
    if self.normalise:
        input_data = (input_data - self._min) / (self._max - self._min)

    print('Min max values used for normalisation ', self._min, self._max)
    print('Min max values used for normalisation ', self._min, self._max, file=self.log_file)

    if should_batch:
        batched_input = [input_data[pos:pos + self.batch_size]
                         for pos in range(0, len(input_data), self.batch_size)]
        return batched_input
    else:
        return input_data
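# `read_npy` is referenced throughout but not defined in this section. A minimal sketch of the
# assumed helper, simply loading a numpy archive from disk (allow_pickle is an assumption, needed
# only if the saved arrays hold object dtypes such as variable-length clips):
import numpy as np


def read_npy(filepath):
    return np.load(filepath, allow_pickle=True)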
def data_reader(self, data_filepath, label_filepath, train, should_batch=True, shuffle=True):
    input_data, labels = read_npy(data_filepath), read_npy(label_filepath)

    if train:
        # nu declared in init, initialized here based on the number of anomalies.
        # Here intoxicated samples are considered anomalies
        self.nu = sum(labels) / len(labels)
        print('Calculated value of Nu ', self.nu)
        print('Calculated value of Nu ', self.nu, file=self.log_file)

        for x in input_data:
            self._min = min(np.min(x), self._min)
            self._max = max(np.max(x), self._max)

        data = [(x, y) for x, y in zip(input_data, labels)]
        random.shuffle(data)
        input_data, labels = np.array([x[0] for x in data]), [x[1] for x in data]

        print('Total data ', len(input_data))
        print('Event rate', sum(labels) / len(labels))
        print(np.array(input_data).shape, np.array(labels).shape)
        print('Total data ', len(input_data), file=self.log_file)
        print('Event rate', sum(labels) / len(labels), file=self.log_file)
        print(np.array(input_data).shape, np.array(labels).shape, file=self.log_file)

    print('Min max values used for normalisation ', self._min, self._max)
    print('Min max values used for normalisation ', self._min, self._max, file=self.log_file)

    # Normalizing `input_data` on train dataset's min and max values
    if self.normalise:
        input_data = (input_data - self._min) / (self._max - self._min)

    if should_batch:
        batched_input = [input_data[pos:pos + self.batch_size]
                         for pos in range(0, len(input_data), self.batch_size)]
        batched_labels = [labels[pos:pos + self.batch_size]
                          for pos in range(0, len(labels), self.batch_size)]
        return batched_input, batched_labels
    else:
        return input_data, labels
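# The `nu` computed above matches the `nu` hyper-parameter of a one-class model; a minimal
# sketch with scikit-learn's OneClassSVM (the flattening step and the remaining
# hyper-parameters are assumptions, not part of the original code):
import numpy as np
from sklearn.svm import OneClassSVM


def fit_one_class_svm(batched_input, nu):
    train_x = np.concatenate(batched_input)
    train_x = train_x.reshape(train_x.shape[0], -1)  # flatten each spectrogram to a vector
    model = OneClassSVM(kernel='rbf', gamma='scale', nu=nu)
    model.fit(train_x)
    return model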
def data_reader(self, data_filepath, label_filepath, train, should_batch=True, shuffle=True, infer=False):
    if infer:
        pass
    else:
        input_data, labels = read_npy(data_filepath), read_npy(label_filepath)

        if train:
            print('Total data ', len(input_data))
            print('Event rate', sum(labels) / len(labels))
            print(np.array(input_data).shape, np.array(labels).shape)

            # print("Under sampling train data")
            # # Under-sampling train data. Balancing the classes
            # ones_idx = [idx for idx, label in enumerate(labels) if label == 1]
            # zeros_idx = [idx for idx, label in enumerate(labels) if label == 0]
            # zeros_idx = zeros_idx[:len(ones_idx)]
            # ids = ones_idx + zeros_idx
            # input_data, labels = input_data[ids], labels[ids]

            print('Total data ', len(input_data), file=self.log_file)
            print('Event rate', sum(labels) / len(labels), file=self.log_file)
            print(np.array(input_data).shape, np.array(labels).shape, file=self.log_file)

            # Track train statistics used later for normalisation
            for x in input_data:
                self._min = min(np.min(x), self._min)
                self._max = max(np.max(x), self._max)
            self._mean, self._std = np.mean(input_data), np.std(input_data)

            # Shuffle data and labels together
            data = [(x, y) for x, y in zip(input_data, labels)]
            random.shuffle(data)
            input_data, labels = np.array([x[0] for x in data]), [x[1] for x in data]

            # Initialize pos_weight based on the class balance of the training data
            self.pos_weight = len([x for x in labels if x == 0]) / len([x for x in labels if x == 1])
            print('Pos weight for the train data - ', self.pos_weight)
            print('Pos weight for the train data - ', self.pos_weight, file=self.log_file)

            print('Total data ', len(input_data))
            print('Event rate', sum(labels) / len(labels))
            print(np.array(input_data).shape, np.array(labels).shape)
            print('Total data ', len(input_data), file=self.log_file)
            print('Event rate', sum(labels) / len(labels), file=self.log_file)
            print(np.array(input_data).shape, np.array(labels).shape, file=self.log_file)

        print('Min max values used for normalisation ', self._min, self._max)
        print('Min max values used for normalisation ', self._min, self._max, file=self.log_file)

        # Normalising `input_data` with statistics computed on the train dataset
        # (mean/std standardisation; the min-max variant is kept for reference)
        if self.normalise:
            # input_data = (input_data - self._min) / (self._max - self._min)
            input_data = (input_data - self._mean) / self._std

        if should_batch:
            batched_input = [input_data[pos:pos + self.batch_size]
                             for pos in range(0, len(input_data), self.batch_size)]
            batched_labels = [labels[pos:pos + self.batch_size]
                              for pos in range(0, len(labels), self.batch_size)]
            return batched_input, batched_labels
        else:
            return input_data, labels
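# The `pos_weight` computed above matches the `pos_weight` argument of PyTorch's
# BCEWithLogitsLoss, which up-weights the positive (intoxicated) class; a minimal,
# self-contained sketch with placeholder numbers:
import torch
import torch.nn as nn

pos_weight = torch.tensor([4.0])                       # e.g. 4 negatives for every positive
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

logits = torch.tensor([0.2, -1.3, 0.7])                # raw model outputs for 3 samples
targets = torch.tensor([1.0, 0.0, 1.0])
loss = loss_fn(logits, targets)                        # positive terms are scaled by pos_weight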
def data_reader(self, data_filepath, label_filepath, jitter_filepath, train, type, should_batch=True,
                shuffle=True, infer=False):
    if infer:
        pass
    else:
        input_data, labels, jitter = read_npy(data_filepath), read_npy(label_filepath), read_npy(jitter_filepath)

        if train:
            self.logger.info(f'Original data size - before Augmentation')
            self.logger.info(f'Total data {str(len(input_data))}')
            self.logger.info(f'Event rate {str(sum(labels) / len(labels))}')
            self.logger.info(
                f'Input data shape:{np.array(input_data).shape} | Output data shape:{np.array(labels).shape}')

            # Track train statistics for the spectrograms and the jitter/shimmer features
            for x in input_data:
                self._min = min(np.min(x), self._min)
                self._max = max(np.max(x), self._max)
            self._mean, self._std = np.mean(input_data), np.std(input_data)
            self._jmean, self._jstd = np.mean(jitter), np.std(jitter)
            self._jmin, self._jmax = np.min(jitter), np.max(jitter)

            if self.data_augment:
                self.logger.info(f'Data Augmentation starts . . .')
                label_to_augment = 1
                amount_to_augment = 1.3

                # Over-sample the intoxicated class by time/frequency masking random spectrograms
                ones_ids = [idx for idx, x in enumerate(labels) if x == label_to_augment]
                random_idxs = random.choices(ones_ids, k=int(len(ones_ids) * amount_to_augment))
                data_to_augment = input_data[random_idxs]
                augmented_data, jitter_augmented_data = [], []
                augmented_labels = []
                for x in data_to_augment:
                    x = librosaSpectro_to_torchTensor(x)
                    x = random.choice([time_mask, freq_mask])(x)[0].numpy()
                    augmented_data.append(x)
                    augmented_labels.append(label_to_augment)

                # Jitter and shimmer
                # jitter_augmented_data, jitter_labels = BorderlineSMOTE().fit_resample(X=jitter, y=labels)
                # assert np.mean(jitter_labels[len(jitter):][:len(augmented_data)]) == 1, \
                #     'Issue with Jitter Shimmer Augmentation'
                # jitter = np.concatenate((jitter, jitter_augmented_data[len(jitter):][:len(augmented_data)]))

                input_data = np.concatenate((input_data, augmented_data))
                labels = np.concatenate((labels, augmented_labels))

                # Temp fix
                # input_data = input_data[:len(jitter)]
                # labels = labels[:len(jitter)]
                # assert len(jitter) == len(input_data), \
                #     "Input data and Jitter Shimmer augmentations don't match in length"

                self.logger.info(f'Data Augmentation done . . .')

            # Shuffle data and labels together (the jitter-aware variant is kept for reference)
            # data = [(x, y, z) for x, y, z in zip(input_data, labels, jitter)]
            # random.shuffle(data)
            # input_data, labels, jitter = np.array([x[0] for x in data]), [x[1] for x in data], np.array(
            #     [x[2] for x in data])
            data = [(x, y) for x, y in zip(input_data, labels)]
            random.shuffle(data)
            input_data, labels = np.array([x[0] for x in data]), [x[1] for x in data]

            # Initialize pos_weight based on the class balance of the training data
            self.pos_weight = len([x for x in labels if x == 0]) / len([x for x in labels if x == 1])
            self.logger.info(f'Pos weight for the train data - {self.pos_weight}')

            self.logger.info(f'Total data {str(len(input_data))}')
            self.logger.info(f'Event rate {str(sum(labels) / len(labels))}')
            self.logger.info(
                f'Input data shape:{np.array(input_data).shape} | Output data shape:{np.array(labels).shape}')

        self.logger.info(f'Min max values used for normalisation {self._min, self._max}')

        # Normalising `input_data` with statistics computed on the train dataset
        if self.normalise:
            input_data = (input_data - self._min) / (self._max - self._min)
            input_data = (input_data - self._mean) / self._std
            # jitter = (jitter - self._jmin) / (self._jmax - self._jmin)
            # jitter = (jitter - self._jmean) / self._jstd

        self.dataset_sizes[type] = len(input_data)

        # Repeat the single spectrogram channel to three channels for an image-style backbone
        return DataLoader(
            TensorDataset(torch.Tensor(input_data).unsqueeze(1).repeat(1, 3, 1, 1),
                          torch.Tensor(labels)),
            batch_size=self.batch_size
            # , sampler=torch.utils.data.SubsetRandomSampler(list([x for x in range(10)]))
        )
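# A hypothetical consumer of the DataLoader returned above. The 3-channel repeat suggests an
# ImageNet-style CNN backbone, so the sketch uses torchvision's resnet18 as a stand-in; the
# model choice and training-loop names are assumptions, not part of the original code:
import torch.nn as nn
from torchvision.models import resnet18

model = resnet18()                                    # pretrained weights would normally be loaded
model.fc = nn.Linear(model.fc.in_features, 1)         # single-logit binary head

# train_loader = self.data_reader(..., train=True, type='train')  # paths omitted
# for inputs, targets in train_loader:                # inputs: (B, 3, freq, time)
#     logits = model(inputs).squeeze(1)
#     loss = nn.BCEWithLogitsLoss()(logits, targets)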
def data_reader(self, data_filepath, label_filepath, jitter_filepath, train, should_batch=True,
                shuffle=True, infer=False):
    if infer:
        pass
    else:
        input_data, labels, jitter = read_npy(data_filepath), read_npy(label_filepath), read_npy(jitter_filepath)

        # Earlier attempt at concatenating the jitter/shimmer features onto the spectrograms,
        # kept for reference
        # jitter = np.expand_dims(jitter, axis=1)
        # length_to_match = input_data.shape[2]
        # jitter = np.concatenate(
        #     [jitter, np.zeros([jitter.shape[0], jitter.shape[1], length_to_match - jitter.shape[2]])], axis=2)
        # input_data = np.concatenate((input_data, jitter), axis=1)

        if train:
            self.logger.info(f'Original data size - before Augmentation')
            self.logger.info(f'Total data {str(len(input_data))}')
            self.logger.info(f'Event rate {str(sum(labels) / len(labels))}')
            self.logger.info(
                f'Input data shape:{np.array(input_data).shape} | Output data shape:{np.array(labels).shape}')

            # Track train statistics for the spectrograms and the jitter/shimmer features
            for x in input_data:
                self._min = min(np.min(x), self._min)
                self._max = max(np.max(x), self._max)
            self._mean, self._std = np.mean(input_data), np.std(input_data)
            self._jmean, self._jstd = np.mean(jitter), np.std(jitter)
            self._jmin, self._jmax = np.min(jitter), np.max(jitter)

            if self.data_augment:
                self.logger.info(f'Data Augmentation starts . . .')
                label_to_augment = 1
                amount_to_augment = 1.3

                # Over-sample the intoxicated class by time/frequency masking random spectrograms
                ones_ids = [idx for idx, x in enumerate(labels) if x == label_to_augment]
                random_idxs = random.choices(ones_ids, k=int(len(ones_ids) * amount_to_augment))
                data_to_augment = input_data[random_idxs]
                augmented_data, jitter_augmented_data = [], []
                augmented_labels = []
                for x in data_to_augment:
                    x = librosaSpectro_to_torchTensor(x)
                    x = random.choice([time_mask, freq_mask])(x)[0].numpy()
                    augmented_data.append(x)
                    augmented_labels.append(label_to_augment)

                # Jitter and shimmer: over-sample the same class with BorderlineSMOTE
                jitter_augmented_data, jitter_labels = BorderlineSMOTE().fit_resample(X=jitter, y=labels)
                assert np.mean(jitter_labels[len(jitter):][:len(augmented_data)]) == 1, \
                    'Issue with Jitter Shimmer Augmentation'
                jitter = np.concatenate((jitter, jitter_augmented_data[len(jitter):][:len(augmented_data)]))

                input_data = np.concatenate((input_data, augmented_data))
                labels = np.concatenate((labels, augmented_labels))

                # Temp fix: trim so that spectrograms, labels and jitter stay the same length
                input_data = input_data[:len(jitter)]
                labels = labels[:len(jitter)]
                assert len(jitter) == len(input_data), \
                    "Input data and Jitter Shimmer augmentations don't match in length"

                self.logger.info(f'Data Augmentation done . . .')

            # Shuffle spectrograms, labels and jitter/shimmer features together
            data = [(x, y, z) for x, y, z in zip(input_data, labels, jitter)]
            random.shuffle(data)
            input_data, labels, jitter = np.array([x[0] for x in data]), [x[1] for x in data], np.array(
                [x[2] for x in data])

            # Initialize pos_weight based on the class balance of the training data
            self.pos_weight = len([x for x in labels if x == 0]) / len([x for x in labels if x == 1])
            self.logger.info(f'Pos weight for the train data - {self.pos_weight}')

            self.logger.info(f'Total data {str(len(input_data))}')
            self.logger.info(f'Event rate {str(sum(labels) / len(labels))}')
            self.logger.info(
                f'Input data shape:{np.array(input_data).shape} | Output data shape:{np.array(labels).shape}')

        self.logger.info(f'Min max values used for normalisation {self._min, self._max}')

        # Normalising `input_data` and `jitter` with statistics computed on the train dataset
        if self.normalise:
            input_data = (input_data - self._min) / (self._max - self._min)
            input_data = (input_data - self._mean) / self._std
            jitter = (jitter - self._jmin) / (self._jmax - self._jmin)
            jitter = (jitter - self._jmean) / self._jstd

        if should_batch:
            batched_input = [input_data[pos:pos + self.batch_size]
                             for pos in range(0, len(input_data), self.batch_size)]
            batched_labels = [labels[pos:pos + self.batch_size]
                              for pos in range(0, len(labels), self.batch_size)]
            batched_jitter = [jitter[pos:pos + self.batch_size]
                              for pos in range(0, len(jitter), self.batch_size)]
            return batched_input, batched_labels, batched_jitter
        else:
            return input_data, labels, jitter
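# The reader above returns parallel spectrogram and jitter/shimmer batches, which implies a
# two-branch network downstream. A hypothetical fusion sketch; the architecture, layer sizes
# and class name are illustrative assumptions, not part of the original code:
import torch
import torch.nn as nn


class SpectroJitterNet(nn.Module):
    def __init__(self, n_jitter_features):
        super().__init__()
        self.conv = nn.Sequential(nn.Conv2d(1, 16, kernel_size=3, padding=1),
                                  nn.ReLU(),
                                  nn.AdaptiveAvgPool2d(1))       # (B, 16, 1, 1)
        self.head = nn.Linear(16 + n_jitter_features, 1)         # fuse CNN and acoustic features

    def forward(self, spec, jitter):
        emb = self.conv(spec.unsqueeze(1)).flatten(1)            # (B, 16)
        return self.head(torch.cat([emb, jitter], dim=1))        # one logit per sample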
def data_reader(self, data_filepath, label_filepath='', train=False, should_batch=True, shuffle=True, infer=False):
    if infer:
        input_data = read_npy(data_filepath)
        if self.normalise:
            input_data = (input_data - np.min(input_data)) / (np.max(input_data) - np.min(input_data))
        return input_data
    else:
        input_data, labels = read_npy(data_filepath), read_npy(label_filepath)

        if train:
            print('Original data size - before Augmentation')
            print('Original data size - before Augmentation', file=self.log_file)
            print('Total data ', len(input_data))
            print('Event rate', sum(labels) / len(labels))
            print(np.array(input_data).shape, np.array(labels).shape)
            print('Total data ', len(input_data), file=self.log_file)
            print('Event rate', sum(labels) / len(labels), file=self.log_file)
            print(np.array(input_data).shape, np.array(labels).shape, file=self.log_file)

            for x in input_data:
                self._min = min(np.min(x), self._min)
                self._max = max(np.max(x), self._max)

            print('Data Augmentation starts . . .')
            print('Data Augmentation starts . . .', file=self.log_file)
            label_to_augment = 1
            amount_to_augment = 1
            ones_ids = [idx for idx, x in enumerate(labels) if x == label_to_augment]
            random_idxs = random.choices(ones_ids, k=int(len(ones_ids) * amount_to_augment))
            data_to_augment = input_data[random_idxs]
            augmented_data = []
            augmented_labels = []
            for x in data_to_augment:
                x = librosaSpectro_to_torchTensor(x)
                x = random.choice([time_mask, freq_mask])(x)[0].numpy()
                augmented_data.append(x)
                augmented_labels.append(label_to_augment)
            input_data = np.concatenate((input_data, augmented_data))
            labels = np.concatenate((labels, augmented_labels))
            print('Data Augmentation done . . .')
            print('Data Augmentation done . . .', file=self.log_file)

            data = [(x, y) for x, y in zip(input_data, labels)]
            random.shuffle(data)
            input_data, labels = np.array([x[0] for x in data]), [x[1] for x in data]

            # Initialize pos_weight based on training data
            self.pos_weight = len([x for x in labels if x == 0]) / len([x for x in labels if x == 1])
            print('Pos weight for the train data - ', self.pos_weight)
            print('Pos weight for the train data - ', self.pos_weight, file=self.log_file)

            print('Total data ', len(input_data))
            print('Event rate', sum(labels) / len(labels))
            print(np.array(input_data).shape, np.array(labels).shape)
            print('Total data ', len(input_data), file=self.log_file)
            print('Event rate', sum(labels) / len(labels), file=self.log_file)
            print(np.array(input_data).shape, np.array(labels).shape, file=self.log_file)

        print('Min max values used for normalisation ', self._min, self._max)
        print('Min max values used for normalisation ', self._min, self._max, file=self.log_file)

        # Normalizing `input data` on train dataset's min and max values
        if self.normalise:
            input_data = (input_data - self._min) / (self._max - self._min)

        if should_batch:
            batched_input = [input_data[pos:pos + self.batch_size]
                             for pos in range(0, len(input_data), self.batch_size)]
            batched_labels = [labels[pos:pos + self.batch_size]
                              for pos in range(0, len(labels), self.batch_size)]
            return batched_input, batched_labels
        else:
            return input_data, labels
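# The augmentation blocks above call `librosaSpectro_to_torchTensor`, `time_mask` and `freq_mask`,
# which are not defined in this section. A minimal sketch of what such helpers could look like,
# assuming torchaudio's SpecAugment-style transforms; the mask widths are placeholder values:
import torch
import torchaudio


def librosaSpectro_to_torchTensor(spec):
    # Add a channel dimension so the masking transforms see (channel, freq, time);
    # callers index with [0] afterwards to drop it again
    return torch.from_numpy(spec).float().unsqueeze(0)


def time_mask(spec_tensor, time_mask_param=20):
    # Zero out a random block of consecutive time steps
    return torchaudio.transforms.TimeMasking(time_mask_param=time_mask_param)(spec_tensor)


def freq_mask(spec_tensor, freq_mask_param=10):
    # Zero out a random block of consecutive frequency bins
    return torchaudio.transforms.FrequencyMasking(freq_mask_param=freq_mask_param)(spec_tensor)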