def get_data_source_batch_summary(self, data_source_batch_name):
    failed_docs_directory = self.load_config.failed_docs_directory(
        data_source_batch_name)
    loaded_docs_directory = self.load_config.loaded_docs_directory(
        data_source_batch_name)

    updated_ids = []
    indexed_ids = []
    failed_ids = []

    for data_loader_batch_name in os.listdir(failed_docs_directory):
        data_loader_batch_path = os.path.join(failed_docs_directory,
                                              data_loader_batch_name)
        if os.path.isfile(data_loader_batch_path) \
                and data_loader_batch_name.startswith(DATA_LOADER_BATCH_PREFIX):
            batch_failed_docs = file_utils.load_file(failed_docs_directory,
                                                     data_loader_batch_name)
            failed_ids.extend(batch_failed_docs.keys())

    for data_loader_batch_name in os.listdir(loaded_docs_directory):
        data_loader_batch_path = os.path.join(loaded_docs_directory,
                                              data_loader_batch_name)
        if os.path.isfile(data_loader_batch_path) \
                and data_loader_batch_name.startswith(DATA_LOADER_BATCH_PREFIX):
            data_loader_batch = file_utils.load_file(loaded_docs_directory,
                                                     data_loader_batch_name)
            updated_ids.extend(data_loader_batch['updated_ids'])
            indexed_ids.extend(data_loader_batch['indexed_ids'])

    summary = dict()
    summary['updated_ids'] = updated_ids
    summary['indexed_ids'] = indexed_ids
    summary['failed_ids'] = failed_ids
    return summary
def get_loaded_doc_count(self):
    updated_ids = []
    indexed_ids = []
    data_source_directory = self.load_config.data_source_directory()
    for data_source_batch_name in os.listdir(data_source_directory):
        data_source_batch_path = os.path.join(data_source_directory,
                                              data_source_batch_name)
        if (not os.path.isfile(data_source_batch_path)
                and data_source_batch_name.startswith(DATA_SOURCE_BATCH_PREFIX)
                and 'old' not in data_source_batch_name):
            self.load_config.log(LOG_LEVEL_TRACE,
                                 'Processing data source batch:',
                                 data_source_batch_path)
            for data_loader_batch_name in os.listdir(data_source_batch_path):
                data_loader_batch_path = os.path.join(data_source_batch_path,
                                                      data_loader_batch_name)
                if os.path.isfile(data_loader_batch_path) \
                        and data_loader_batch_name.startswith(DATA_LOADER_BATCH_PREFIX):
                    data_loader_batch = file_utils.load_file(
                        data_source_batch_path, data_loader_batch_name)
                    updated_ids.extend(data_loader_batch['updated_ids'])
                    indexed_ids.extend(data_loader_batch['indexed_ids'])

    return len(updated_ids) + len(indexed_ids)
def get_progress(self):
    data_source_directory = self.load_config.data_source_directory()
    stats_file_name = self.data_source_stats_file_name()
    if self.data_source_stats is None:
        self.data_source_stats = file_utils.load_file(data_source_directory,
                                                      stats_file_name)

    row_count = 0
    unique_ids = 0
    # if 'row_count' in self.data_source_stats:
    #     row_count = self.data_source_stats['row_count']
    if 'unique_ids' in self.data_source_stats:
        unique_ids = self.data_source_stats['unique_ids']

    docs_loaded = self.get_loaded_doc_count()
    if unique_ids > 0:
        self.load_config.log(LOG_LEVEL_INFO, 'docs loaded', docs_loaded,
                             'unique_ids', unique_ids)
        progress = (docs_loaded / float(unique_ids)) * 100
        self.data_source_stats['progress'] = progress
        file_utils.save_file(data_source_directory, stats_file_name,
                             self.data_source_stats)
        return progress

    return -1
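# A minimal, standalone sketch of the progress metric used by get_progress:
# "loaded docs / unique ids * 100", falling back to -1 when the total id count
# is unknown. The helper name `compute_progress` is hypothetical and not part
# of the loader classes above.
def compute_progress(docs_loaded, unique_ids):
    """Return percent progress, or -1 if the total id count is unknown."""
    if unique_ids <= 0:
        return -1
    return (docs_loaded / float(unique_ids)) * 100

assert compute_progress(250, 1000) == 25.0
assert compute_progress(0, 0) == -1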
def get_failed_docs_files(self, data_source_name=None):
    data_source_batches = {}
    if data_source_name is not None:
        data_source_directory = self.data_source_directory(
            data_source_name=data_source_name)
        dsb = file_utils.load_file(data_source_directory,
                                   DATA_SOURCE_BATCHES_FILE)
        if data_source_name not in data_source_batches:
            data_source_batches[data_source_name] = []
        for data_source_batch in dsb:
            data_source_batches[data_source_name].append(data_source_batch)
    else:
        generated_files_directory = self.generated_files_directory()
        print 'generated_files_directory', generated_files_directory
        for name in os.listdir(generated_files_directory):
            print 'data_source_name', name
            path = os.path.join(generated_files_directory, name)
            if os.path.isdir(path):
                data_source_directory = self.data_source_directory(
                    data_source_name=name)
                print 'data_source_directory', data_source_directory
                dsb = file_utils.load_file(data_source_directory,
                                           DATA_SOURCE_BATCHES_FILE)
                if name not in data_source_batches:
                    data_source_batches[name] = []
                for data_source_batch in dsb:
                    data_source_batches[name].append(data_source_batch)

    failed_docs_files = []
    for data_source_name in data_source_batches:
        print 'data_source_name', data_source_name
        for data_source_batch in data_source_batches[data_source_name]:
            failed_docs_directory = self.failed_docs_directory(
                data_source_batch_name=data_source_batch,
                data_source_name=data_source_name)
            print 'failed_docs_directory', failed_docs_directory
            for name in os.listdir(failed_docs_directory):
                # print 'failed_doc_file', name
                path = os.path.join(failed_docs_directory, name)
                print 'failed_doc_file', path
                if os.path.isfile(path) and name.endswith('.json'):
                    failed_docs_files.append(path)

    return failed_docs_files
def load_stats(self):
    self.load_config.log(LOG_LEVEL_INFO, 'Loading stats...')
    data_source_directory = self.load_config.data_source_directory()
    stats = file_utils.load_file(data_source_directory, 'stats.json')
    if 'total_rows' in stats:
        self.total_rows = stats['total_rows']
    if 'total_ids' in stats:
        self.total_ids = stats['total_ids']
    return stats
def get_data_source_batch_summary(self, data_source_batch_name):
    data_source_batch_directory = self.load_config.data_source_batch_directory(
        data_source_batch_name)
    data_source_directory = self.load_config.data_source_directory()

    updated_ids = []
    indexed_ids = []
    failed_ids = []
    skipped_ids = []

    for data_loader_batch_name in os.listdir(data_source_batch_directory):
        data_loader_batch_path = os.path.join(data_source_batch_directory,
                                              data_loader_batch_name)
        if os.path.isfile(data_loader_batch_path) \
                and data_loader_batch_name.startswith(DATA_LOADER_BATCH_PREFIX):
            data_loader_batch = file_utils.load_file(
                data_source_batch_directory, data_loader_batch_name)
            updated_ids.extend(data_loader_batch['updated_ids'])
            indexed_ids.extend(data_loader_batch['indexed_ids'])
            failed_ids.extend(data_loader_batch['failed_ids'])
            skipped_ids.extend(data_loader_batch['skipped_ids'])

    summary = dict()
    summary['updated_ids'] = updated_ids
    summary['indexed_ids'] = indexed_ids
    summary['failed_ids'] = failed_ids
    summary['skipped_ids'] = skipped_ids

    data_source_batch_summary = file_utils.load_file(
        data_source_directory, data_source_batch_name + '.json')
    for key in data_source_batch_summary:
        summary[key] = data_source_batch_summary[key]

    return summary
def get_failed_ids(self):
    self.load_config.log(LOG_LEVEL_INFO, 'Fetching failed ids...')
    failed_ids = {}
    data_source_directory = self.load_config.data_source_directory()
    for name in os.listdir(data_source_directory):
        file_path = os.path.join(data_source_directory, name)
        if os.path.isfile(file_path) and name.startswith(DATA_SOURCE_BATCH_PREFIX):
            batch_info = file_utils.load_file(data_source_directory, name)
            batch_failed_ids = batch_info['failed_ids']
            batch_failed_ids = dict.fromkeys(batch_failed_ids, None)
            failed_ids.update(batch_failed_ids)

    self.load_config.log(LOG_LEVEL_INFO, 'Failed ids', len(failed_ids))
    return failed_ids
def get_processed_indices(self):
    self.load_config.log(LOG_LEVEL_INFO, 'Fetching processed ids...')
    processed_indices = {}
    data_source_directory = self.load_config.data_source_directory()
    for name in os.listdir(data_source_directory):
        file_path = os.path.join(data_source_directory, name)
        if os.path.isfile(file_path) and name.startswith(DATA_SOURCE_BATCH_PREFIX):
            batch_info = file_utils.load_file(data_source_directory, name)
            if 'processed_indices' in batch_info:
                batch_processed_indices = batch_info['processed_indices']
                batch_processed_indices = dict.fromkeys(
                    batch_processed_indices, None)
                processed_indices.update(batch_processed_indices)

    self.load_config.log(LOG_LEVEL_INFO, 'Processed indices',
                         len(processed_indices))
    return processed_indices
def get_failed_ids(self, data_loader_batch_directory):
    failed_ids = {}
    for name in os.listdir(data_loader_batch_directory):
        file_path = os.path.join(data_loader_batch_directory, name)
        if os.path.isfile(file_path) and name.startswith("failed_docs_"):
            self.load_config.log(LOG_LEVEL_TRACE, 'processing file:', file_path)
            failed_docs = file_utils.load_file(data_loader_batch_directory, name)
            for _id in failed_docs:
                failed_ids[_id] = failed_docs[_id]

    self.load_config.log(LOG_LEVEL_DEBUG, 'Failed ids', len(failed_ids))
    return failed_ids
def list_failed_docs(self, data_source_batch_directory):
    # print '...Processing', data_loader_batch_directory
    for name in os.listdir(data_source_batch_directory):
        file_path = os.path.join(data_source_batch_directory, name)
        if os.path.isfile(file_path) and name.startswith("failed_docs_"):
            failed_docs = file_utils.load_file(data_source_batch_directory, name)
            print file_path, '- Failed docs', len(failed_docs)
            if len(failed_docs) > 0:
                a = raw_input('List docs? (y/n)')
                if a.lower() in ['y', 'yes']:
                    for _id in failed_docs:
                        reason = failed_docs[_id]['reason']
                        print 'Doc:', _id
                        print 'Reason', reason

                        c = raw_input('Continue?')
                        if c.lower() in ['n', 'no']:
                            break
def get_processed_rows(self):
    data_source_directory = self.load_config.data_source_directory()
    processed_rows = 0
    for name in os.listdir(data_source_directory):
        file_path = os.path.join(data_source_directory, name)
        if os.path.isfile(file_path) and name.startswith(DATA_SOURCE_BATCH_PREFIX):
            self.load_config.log(LOG_LEVEL_DEBUG, 'processing file:', file_path)
            batch_data = file_utils.load_file(data_source_directory, name)
            start_index = batch_data['start_index']
            row_count = batch_data['row_count']
            end_index = start_index + row_count
            if end_index > processed_rows:
                processed_rows = end_index

    return processed_rows
def count_rows(self):
    data_source_directory = self.load_config.data_source_directory()
    stats_file_name = self.data_source_stats_file_name()
    self.data_source_stats = file_utils.load_file(data_source_directory,
                                                  stats_file_name)
    if self.data_source_stats is None or len(self.data_source_stats) == 0:
        self.count = 0
        self.data_source_batch = {}
        self.data_source.process_rows(0, self.count_row)

        self.load_config.log(LOG_LEVEL_INFO, 'Total rows:', self.count)
        self.load_config.log(LOG_LEVEL_INFO, 'Total ids:',
                             len(self.data_source_batch))

        self.data_source_stats = {
            'row_count': self.count,
            'unique_ids': len(self.data_source_batch)
        }
        file_utils.save_file(data_source_directory, stats_file_name,
                             self.data_source_stats)
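# The `count_row` callback passed to process_rows above is defined elsewhere.
# As a standalone illustration of the same bookkeeping (total row count plus
# unique-id count stored in a dict used as a set), a plain-Python sketch over
# an iterable of (row_id, row) pairs might look like this. The helper name
# `count_rows_and_ids` and the pair layout are assumptions, not the real API.
def count_rows_and_ids(rows):
    count = 0
    unique_ids = {}
    for row_id, _row in rows:
        count += 1
        unique_ids[row_id] = 0  # dict-as-set, matching the loaders above
    return {'row_count': count, 'unique_ids': len(unique_ids)}

# Example: three rows, two unique ids
print(count_rows_and_ids([('a', {}), ('a', {}), ('b', {})]))
# {'row_count': 3, 'unique_ids': 2}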
def get_failed_ids(self, data_source_batch_name):
    failed_docs_directory = self.load_config.failed_docs_directory(
        data_source_batch_name)

    failed_ids = {}
    for name in os.listdir(failed_docs_directory):
        file_path = os.path.join(failed_docs_directory, name)
        if os.path.isfile(file_path) and name.startswith(DATA_LOADER_BATCH_PREFIX):
            self.load_config.log(LOG_LEVEL_TRACE, 'processing file:', file_path)
            batch_failed_docs = file_utils.load_file(failed_docs_directory, name)
            for _id in batch_failed_docs:
                failed_ids[_id] = 0

    self.load_config.log(LOG_LEVEL_DEBUG, 'Failed ids', len(failed_ids))
    return failed_ids
def get_combined_data_source_summary(self):
    updated_ids = {}
    indexed_ids = {}
    failed_ids = {}

    data_source_directory = self.load_config.data_source_directory()
    data_source_batches = self.load_data_source_batches()
    for data_source_batch_name in data_source_batches:
        data_source_batch_summary = self.get_data_source_batch_summary(
            data_source_batch_name)

        # Remove duplicates across batches
        for _id in data_source_batch_summary['updated_ids']:
            updated_ids[_id] = 0
        for _id in data_source_batch_summary['indexed_ids']:
            indexed_ids[_id] = 0
        for _id in data_source_batch_summary['failed_ids']:
            failed_ids[_id] = 0

    # failed_ids can be present in the updated_ids or indexed_ids arrays;
    # filter those out
    filtered_failed_ids = {}
    for _id in failed_ids:
        if _id not in updated_ids and _id not in indexed_ids:
            filtered_failed_ids[_id] = 0

    # Load the data source stats
    data_source_summary = file_utils.load_file(data_source_directory,
                                               'stats.json')

    summary = dict()
    summary['total_rows'] = data_source_summary['total_rows']
    summary['total_ids'] = data_source_summary['total_ids']
    summary['updated_ids'] = updated_ids
    summary['indexed_ids'] = indexed_ids
    summary['failed_ids'] = filtered_failed_ids
    return summary
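# The combined summary above deduplicates ids across batches with dicts used as
# sets and drops "failed" ids that eventually succeeded in another batch. The
# same logic expressed with plain sets, as a self-contained sketch (the batch
# dicts below are made-up sample data, not real loader output):
def combine_batches(batches):
    updated, indexed, failed = set(), set(), set()
    for batch in batches:
        updated.update(batch.get('updated_ids', []))
        indexed.update(batch.get('indexed_ids', []))
        failed.update(batch.get('failed_ids', []))
    # An id that was later updated or indexed is not a real failure.
    failed -= updated | indexed
    return {'updated_ids': updated, 'indexed_ids': indexed, 'failed_ids': failed}

example = combine_batches([
    {'updated_ids': ['a'], 'indexed_ids': ['b'], 'failed_ids': ['b', 'c']},
    {'updated_ids': ['a'], 'indexed_ids': [], 'failed_ids': ['d']},
])
# example['failed_ids'] == {'c', 'd'}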
def get_loaded_ids(self, data_loader_batch_directory):
    loaded_ids = {}
    for name in os.listdir(data_loader_batch_directory):
        file_path = os.path.join(data_loader_batch_directory, name)
        if os.path.isfile(file_path) and name.startswith(DATA_LOADER_BATCH_PREFIX):
            self.load_config.log(LOG_LEVEL_TRACE, 'processing file:', file_path)
            batch_data = file_utils.load_file(data_loader_batch_directory, name)
            updated_ids = batch_data['updated_ids']
            indexed_ids = batch_data['indexed_ids']
            for _id in updated_ids:
                loaded_ids[_id] = 0
            for _id in indexed_ids:
                loaded_ids[_id] = 0

    self.load_config.log(LOG_LEVEL_DEBUG, 'Loaded ids', len(loaded_ids))
    return loaded_ids
def __init__(self,
             index_filenames,
             window_length=1,  # in seconds
             shift=0.5,  # in seconds
             timesteps=19,  # number of time steps per sequence
             sampling_rate=256,  # in Hz
             batch_size=10,
             do_standard_scaling=True,
             in_training_mode=False,
             balance_batches=False,
             patient_id=None):
    '''
    Constructor to create objects of the class **RawDataProcessor** and load
    all the data. Future implementations will pave the way to loading data
    from files on the fly, in order to allow working with larger datasets.

    Parameters
    ----------
    :param self: Reference to the current object.
    :param list index_filenames: List of index filenames from which to load data filenames.
    :param int window_length: Length of the window (in seconds) to slide through the signal.
    :param float shift: Number of seconds to shift the window to get each sample.
    :param int timesteps: Number of time steps for the sequences.
    :param int sampling_rate: Sampling rate of the signals, in Hz.
    :param int batch_size: Size of the batch to use.
    :param boolean do_standard_scaling: Flag to indicate whether to scale features to zero mean and unit variance.
    :param boolean in_training_mode: Flag to indicate whether the process is in training mode.
    :param boolean balance_batches: Flag to indicate whether to balance the batches.
    :param string patient_id: String to indicate the patient id. It is used to save the statistics file.
    '''
    #
    if patient_id is None:
        raise Exception('You have to specify a patient id, e.g. "chb01"')

    self.sampling_rate = sampling_rate
    self.batch_size = batch_size
    self.do_standard_scaling = do_standard_scaling
    self.in_training_mode = in_training_mode
    self.balance_batches = balance_batches
    self.patient_id = patient_id
    #
    self.window_length = window_length * sampling_rate
    self.sample_shift = int(sampling_rate * shift)  # Half of the sample length
    self.timesteps = timesteps
    self.num_channels = 23
    #
    self.input_shape = None
    self.num_batches = 0
    #
    self.filenames = list()
    for fname in index_filenames:
        with open(fname, 'r') as f:
            for l in f:
                if l[0] != '#':
                    self.filenames.append(l.strip() + '.edf.pbz2')
    #
    # List to store the data separated by files
    self.data = list()
    # List to store the labels associated with each sample, separated by files
    self.labels = list()
    self.num_seizures = 0
    num_ictal = 0
    num_interictal = 0

    print("Loading EDF signals...")
    for i in tqdm(range(len(self.filenames))):
        d_p = load_file(self.filenames[i],
                        exclude_seizures=False,
                        do_preemphasis=False,
                        separate_seizures=True,
                        verbose=0)
        len_file = 0
        self.data.append([])
        self.labels.append([])
        for p, label in d_p:
            self.data[-1].append(p)  # Append data
            self.labels[-1] += [label] * len(p)
            if label == 1:
                self.num_seizures += 1
                num_ictal += len(p)
            elif label == 0:
                num_interictal += len(p)
            len_file += len(p)
        #
        self.data[-1] = numpy.concatenate(self.data[-1], axis=0)
    #
    # Iterate over the data list and generate indices for the sequences.
    # Each recording is treated as independent from the others.
    self.file_indexes = list()
    for i in range(len(self.data)):
        limit = len(self.data[i]) // self.sample_shift
        limit -= self.timesteps + 1
        limit = limit * self.sample_shift  # - self.sample_shift
        self.file_indexes.append(
            numpy.arange(0, limit, step=self.sample_shift).tolist())
    #
    num_sequences_per_class = [0, 0]

    # Sequence indices
    if self.balance_batches:
        # The batches will have the same number of samples for each class
        self.sequences_indexes_c0 = list()  # indices of class 0
        self.sequences_indexes_c1 = list()  # indices of class 1
        for fi in range(len(self.file_indexes)):
            for t in self.file_indexes[fi]:
                label = self.labels[fi][
                    t + (self.timesteps + 1) * self.sample_shift - 1]
                if label == 0:
                    self.sequences_indexes_c0.append((fi, t))
                elif label == 1:
                    self.sequences_indexes_c1.append((fi, t))
        #
        num_sequences_per_class[0] = len(self.sequences_indexes_c0)
        num_sequences_per_class[1] = len(self.sequences_indexes_c1)
        num_sequences = num_sequences_per_class[0] + num_sequences_per_class[1]
        self.num_batches = num_sequences // self.batch_size
        if num_sequences % self.batch_size != 0:
            self.num_batches += 1
    else:
        # Just one list for all of the sequences, no balancing
        self.sequences_indexes = list()
        for fi in range(len(self.file_indexes)):
            for t in self.file_indexes[fi]:
                label = self.labels[fi][
                    t + (self.timesteps + 1) * self.sample_shift - 1]
                num_sequences_per_class[label] += 1
                self.sequences_indexes.append((fi, t, label))
        #
        num_sequences = len(self.sequences_indexes)
        self.num_batches = num_sequences // self.batch_size
        if num_sequences % self.batch_size != 0:
            self.num_batches += 1
    #
    self.input_shape = (self.window_length, )

    print('Signals loaded!')
    print('\n-----------------------------------------------------------\n')
    print(f'Number of seizures available: {self.num_seizures}')
    #if self.num_seizures < 3:
    #    raise Exception('Not enough seizures, please try other patient.')
    print(f'Number of samples (not sequences): {num_ictal + num_interictal}')
    print(f'Interictal samples: {num_interictal} '
          f'({(num_interictal / (num_ictal + num_interictal) * 100):.2f} %)')
    print(f'Ictal samples: {num_ictal} '
          f'({(num_ictal / (num_ictal + num_interictal) * 100):.2f} %)')
    print(f'\nNumber of sequences: {num_sequences}')
    print(f'Interictal sequences: {num_sequences_per_class[0]} '
          f'({num_sequences_per_class[0] / num_sequences * 100.0:.2f}%)')
    print(f'Ictal sequences: {num_sequences_per_class[1]} '
          f'({num_sequences_per_class[1] / num_sequences * 100.0:.2f}%)')
    print(f'Number of batches: {self.num_batches}')
    print('\n-----------------------------------------------------------\n')

    # Standard scaling
    if self.do_standard_scaling:
        os.makedirs('stats', exist_ok=True)
        if self.in_training_mode:
            print('Calculating statistics to scale the data...')
            means = []
            counts = []
            stddevs = []
            for p in tqdm(self.data):
                means.append(p.mean())
                counts.append(len(p))
                stddevs.append(p.std())
            #
            # Count-weighted average of the per-recording statistics
            self.mean = sum([m * c for m, c in zip(means, counts)])
            self.std = sum([s * c for s, c in zip(stddevs, counts)])
            self.mean /= sum(counts)
            self.std /= sum(counts)
            #
            array = numpy.array([self.mean, self.std])
            numpy.save(f'stats/statistics_detection_raw_{patient_id}.npy',
                       array)
            del means
            del counts
            del stddevs
        else:
            print('Loading statistics to scale the data...')
            array = numpy.load(
                f'stats/statistics_detection_raw_{patient_id}.npy')
            self.mean = array[0]
            self.std = array[1]
            #
            del array

    gc.collect()
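# The standard-scaling branch above pools per-recording statistics into a single
# mean/std pair by count-weighted averaging. A self-contained sketch of that
# pooling step (the sample arrays are made up; note that averaging per-file
# standard deviations, as the constructor does, approximates rather than exactly
# equals the global standard deviation):
import numpy

def pool_statistics(recordings):
    means = [r.mean() for r in recordings]
    stds = [r.std() for r in recordings]
    counts = [len(r) for r in recordings]
    total = float(sum(counts))
    mean = sum(m * c for m, c in zip(means, counts)) / total
    std = sum(s * c for s, c in zip(stds, counts)) / total
    return mean, std

mean, std = pool_statistics([numpy.random.randn(1000),
                             numpy.random.randn(500) + 2.0])
scaled_window = (numpy.random.randn(256) - mean) / std  # how a window would be scaled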
def load_data_source_batches(self):
    data_source_directory = self.load_config.data_source_directory()
    return file_utils.load_file(data_source_directory,
                                DATA_SOURCE_BATCHES_FILE)
number_of_subsets = 20
size_of_subsets = 500
max_iterations = 100  # optimal value: 5000, but extremely slow; keep it at 100 for faster results
hc_count = 30

algorithms = [
    ["Constructive", greedy_constructive, None, 'red'],
    ["Hill Climb (%d iter.)" % max_iterations, hill_climb, [max_iterations], 'green'],
    ["MHC (%d iter. %d runs)" % (max_iterations, hc_count), multi_hc,
     [max_iterations, hc_count], 'blue'],
    # ["Spectrum v1", spectrum_sort, None, 'violet'],
    ["Spectrum v2", spectrum_sort_shake, None, 'purple'],
]

size, colours = load_file()
subsets = list(generate_subsets(colours, number_of_subsets, size_of_subsets))

times = []
for algorithm in algorithms:
    algorithm_times = list(benchmark_algorithm(subsets, algorithm[1], algorithm[2]))
    times.append(algorithm_times)
    print("%s min/avg/max/std = %.2f/%.2f/%.2f/%.2f ms" % (
        algorithm[0],
        np.min(algorithm_times) * 1000,
        np.average(algorithm_times) * 1000,
        np.max(algorithm_times) * 1000,
        np.std(algorithm_times) * 1000,
    ))
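# `benchmark_algorithm` is imported from elsewhere in this project. A minimal
# sketch of a compatible timing helper, assuming each algorithm takes one subset
# plus optional extra arguments and that per-subset wall-clock time is what gets
# reported; the name and signature below are assumptions, not the real code.
import time

def benchmark_algorithm_sketch(subsets, algorithm, args=None):
    args = args or []
    for subset in subsets:
        start = time.perf_counter()
        algorithm(subset, *args)           # run the algorithm on one subset
        yield time.perf_counter() - start  # elapsed seconds for this subset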