Example #1
    def get_data_source_batch_summary(self, data_source_batch_name):
        failed_docs_directory = self.load_config.failed_docs_directory(
            data_source_batch_name)
        loaded_docs_directory = self.load_config.loaded_docs_directory(
            data_source_batch_name)

        updated_ids = []
        indexed_ids = []
        failed_ids = []

        for data_loader_batch_name in os.listdir(failed_docs_directory):
            data_loader_batch_path = os.path.join(
                failed_docs_directory, data_loader_batch_name)
            if os.path.isfile(data_loader_batch_path) and data_loader_batch_name.startswith(DATA_LOADER_BATCH_PREFIX):
                batch_failed_docs = file_utils.load_file(
                    failed_docs_directory, data_loader_batch_name)
                failed_ids.extend(batch_failed_docs.keys())

        for data_loader_batch_name in os.listdir(loaded_docs_directory):
            data_loader_batch_path = os.path.join(
                loaded_docs_directory, data_loader_batch_name)
            if os.path.isfile(data_loader_batch_path) and data_loader_batch_name.startswith(DATA_LOADER_BATCH_PREFIX):
                data_loader_batch = file_utils.load_file(
                    loaded_docs_directory, data_loader_batch_name)

                updated_ids.extend(data_loader_batch['updated_ids'])
                indexed_ids.extend(data_loader_batch['indexed_ids'])

        summary = dict()
        summary['updated_ids'] = updated_ids
        summary['indexed_ids'] = indexed_ids
        summary['failed_ids'] = failed_ids

        return summary
Example #2
    def get_loaded_doc_count(self):
        updated_ids = []
        indexed_ids = []

        data_source_directory = self.load_config.data_source_directory()

        for data_source_batch_name in os.listdir(data_source_directory):
            data_source_batch_path = os.path.join(data_source_directory,
                                                  data_source_batch_name)
            if (not os.path.isfile(data_source_batch_path)
                    and data_source_batch_name.startswith(DATA_SOURCE_BATCH_PREFIX)
                    and 'old' not in data_source_batch_name):
                self.load_config.log(LOG_LEVEL_TRACE,
                                     'Processing data source batch:',
                                     data_source_batch_path)

                for data_loader_batch_name in os.listdir(
                        data_source_batch_path):
                    data_loader_batch_path = os.path.join(
                        data_source_batch_path, data_loader_batch_name)
                    if (os.path.isfile(data_loader_batch_path)
                            and data_loader_batch_name.startswith(DATA_LOADER_BATCH_PREFIX)):
                        data_loader_batch = file_utils.load_file(
                            data_source_batch_path, data_loader_batch_name)

                        updated_ids.extend(data_loader_batch['updated_ids'])
                        indexed_ids.extend(data_loader_batch['indexed_ids'])

        return len(updated_ids) + len(indexed_ids)
Example #3
    def get_progress(self):
        data_source_directory = self.load_config.data_source_directory()
        stats_file_name = self.data_source_stats_file_name()

        if self.data_source_stats is None:
            self.data_source_stats = file_utils.load_file(
                data_source_directory, stats_file_name)

        row_count = 0
        unique_ids = 0
        # if 'row_count' in self.data_source_stats:
        #     row_count = self.data_source_stats['row_count']
        if 'unique_ids' in self.data_source_stats:
            unique_ids = self.data_source_stats['unique_ids']

        docs_loaded = self.get_loaded_doc_count()

        if unique_ids > 0:
            self.load_config.log(LOG_LEVEL_INFO, 'docs loaded', docs_loaded,
                                 'unique_ids', unique_ids)
            progress = (docs_loaded / float(unique_ids)) * 100
            self.data_source_stats['progress'] = progress
            file_utils.save_file(data_source_directory, stats_file_name,
                                 self.data_source_stats)
            return progress

        return -1
Example #4
    def get_failed_docs_files(self, data_source_name=None):
        data_source_batches = {}
        if data_source_name is not None:
            data_source_directory = self.data_source_directory(
                data_source_name=data_source_name)
            dsb = file_utils.load_file(data_source_directory,
                                       DATA_SOURCE_BATCHES_FILE)
            if data_source_name not in data_source_batches:
                data_source_batches[data_source_name] = []

            for data_source_batch in dsb:
                data_source_batches[data_source_name].append(data_source_batch)
        else:
            generated_files_directory = self.generated_files_directory()
            print('generated_files_directory', generated_files_directory)
            for name in os.listdir(generated_files_directory):
                print('data_source_name', name)
                path = os.path.join(generated_files_directory, name)
                if os.path.isdir(path):
                    data_source_directory = self.data_source_directory(
                        data_source_name=name)
                    print('data_source_directory', data_source_directory)
                    dsb = file_utils.load_file(data_source_directory,
                                               DATA_SOURCE_BATCHES_FILE)
                    if name not in data_source_batches:
                        data_source_batches[name] = []

                    for data_source_batch in dsb:
                        data_source_batches[name].append(data_source_batch)

        failed_docs_files = []
        for data_source_name in data_source_batches:
            print('data_source_name', data_source_name)
            for data_source_batch in data_source_batches[data_source_name]:
                failed_docs_directory = self.failed_docs_directory(
                    data_source_batch_name=data_source_batch,
                    data_source_name=data_source_name)
                print('failed_docs_directory', failed_docs_directory)
                for name in os.listdir(failed_docs_directory):
                    # print 'failed_doc_file', name
                    path = os.path.join(failed_docs_directory, name)
                    print('failed_doc_file', path)
                    if os.path.isfile(path) and name.endswith('.json'):
                        failed_docs_files.append(path)

        return failed_docs_files
Example #5
    def load_stats(self):
        self.load_config.log(LOG_LEVEL_INFO, 'Loading stats...')
        data_source_directory = self.load_config.data_source_directory()
        stats = file_utils.load_file(data_source_directory, 'stats.json')

        if 'total_rows' in stats:
            self.total_rows = stats['total_rows']

        if 'total_ids' in stats:
            self.total_ids = stats['total_ids']

        return stats
Example #6
    def get_data_source_batch_summary(self, data_source_batch_name):
        data_source_batch_directory = self.load_config.data_source_batch_directory(
            data_source_batch_name)
        data_source_directory = self.load_config.data_source_directory()

        updated_ids = []
        indexed_ids = []
        failed_ids = []
        skipped_ids = []

        for data_loader_batch_name in os.listdir(data_source_batch_directory):
            data_loader_batch_path = os.path.join(data_source_batch_directory,
                                                  data_loader_batch_name)
            if (os.path.isfile(data_loader_batch_path)
                    and data_loader_batch_name.startswith(DATA_LOADER_BATCH_PREFIX)):
                data_loader_batch = file_utils.load_file(
                    data_source_batch_directory, data_loader_batch_name)

                updated_ids.extend(data_loader_batch['updated_ids'])
                indexed_ids.extend(data_loader_batch['indexed_ids'])
                failed_ids.extend(data_loader_batch['failed_ids'])
                skipped_ids.extend(data_loader_batch['skipped_ids'])

        summary = dict()
        summary['updated_ids'] = updated_ids
        summary['indexed_ids'] = indexed_ids
        summary['failed_ids'] = failed_ids
        summary['skipped_ids'] = skipped_ids

        data_source_batch_summary = file_utils.load_file(
            data_source_directory, data_source_batch_name + '.json')
        for key in data_source_batch_summary:
            summary[key] = data_source_batch_summary[key]

        return summary
Example #7
    def get_failed_ids(self):
        self.load_config.log(LOG_LEVEL_INFO, 'Fetching failed ids...')

        failed_ids = {}
        data_source_directory = self.load_config.data_source_directory()
        for name in os.listdir(data_source_directory):
            file_path = os.path.join(data_source_directory, name)
            if os.path.isfile(file_path) and name.startswith(DATA_SOURCE_BATCH_PREFIX):
                batch_info = file_utils.load_file(data_source_directory, name)
                batch_failed_ids = batch_info['failed_ids']
                batch_failed_ids = dict.fromkeys(batch_failed_ids, None)

                failed_ids.update(batch_failed_ids)

        self.load_config.log(LOG_LEVEL_INFO, 'Failed ids', len(failed_ids))

        return failed_ids
Example #8
    def get_processed_indices(self):
        self.load_config.log(LOG_LEVEL_INFO, 'Fetching processed ids...')
        processed_indices = {}
        data_source_directory = self.load_config.data_source_directory()
        for name in os.listdir(data_source_directory):
            file_path = os.path.join(data_source_directory, name)
            if os.path.isfile(file_path) and name.startswith(DATA_SOURCE_BATCH_PREFIX):
                batch_info = file_utils.load_file(data_source_directory, name)
                if 'processed_indices' in batch_info:
                    batch_processed_indices = batch_info['processed_indices']
                    batch_processed_indices = dict.fromkeys(batch_processed_indices, None)
                    processed_indices.update(batch_processed_indices)

        self.load_config.log(
            LOG_LEVEL_INFO, 'Processed indices', len(processed_indices))

        return processed_indices
Example #9
    def get_failed_ids(self, data_loader_batch_directory):
        failed_ids = {}

        for name in os.listdir(data_loader_batch_directory):
            file_path = os.path.join(data_loader_batch_directory, name)
            if os.path.isfile(file_path) and name.startswith("failed_docs_"):
                self.load_config.log(LOG_LEVEL_TRACE, 'processing file:',
                                     file_path)

                failed_docs = file_utils.load_file(data_loader_batch_directory,
                                                   name)
                for _id in failed_docs:
                    failed_ids[_id] = failed_docs[_id]

        self.load_config.log(LOG_LEVEL_DEBUG, 'Failed ids', len(failed_ids))

        return failed_ids
Example #10
 def list_failed_docs(self, data_source_batch_directory):
     # print '...Processing', data_loader_batch_directory
     for name in os.listdir(data_source_batch_directory):
         file_path = os.path.join(data_source_batch_directory, name)
         if os.path.isfile(file_path) and name.startswith("failed_docs_"):
             failed_docs = file_utils.load_file(data_source_batch_directory,
                                                name)
             print(file_path, '- Failed docs', len(failed_docs))
             if len(failed_docs) > 0:
                 a = input('List docs? (y/n)')
                 if a.lower() in ['y', 'yes']:
                     for _id in failed_docs:
                         reason = failed_docs[_id]['reason']
                         print('Doc:', _id)
                         print('Reason', reason)
                         c = input('Continue?')
                         if c.lower() in ['n', 'no']:
                             break
Example #11
    def get_processed_rows(self):
        data_source_directory = self.load_config.data_source_directory()

        processed_rows = 0
        for name in os.listdir(data_source_directory):
            file_path = os.path.join(data_source_directory, name)
            if os.path.isfile(file_path) and name.startswith(
                    DATA_SOURCE_BATCH_PREFIX):
                self.load_config.log(LOG_LEVEL_DEBUG, 'processing file:',
                                     file_path)
                batch_data = file_utils.load_file(data_source_directory, name)
                start_index = batch_data['start_index']
                row_count = batch_data['row_count']

                end_index = start_index + row_count
                if end_index > processed_rows:
                    processed_rows = end_index

        return processed_rows
Example #12
    def count_rows(self):
        data_source_directory = self.load_config.data_source_directory()
        stats_file_name = self.data_source_stats_file_name()
        self.data_source_stats = file_utils.load_file(data_source_directory,
                                                      stats_file_name)
        if self.data_source_stats is None or len(self.data_source_stats) == 0:
            self.count = 0
            self.data_source_batch = {}
            self.data_source.process_rows(0, self.count_row)
            self.load_config.log(LOG_LEVEL_INFO, 'Total rows:', self.count)
            self.load_config.log(LOG_LEVEL_INFO, 'Total ids:',
                                 len(self.data_source_batch))

            self.data_source_stats = {
                'row_count': self.count,
                'unique_ids': len(self.data_source_batch)
            }
            file_utils.save_file(data_source_directory, stats_file_name,
                                 self.data_source_stats)
Example #13
    def get_failed_ids(self, data_source_batch_name):
        failed_docs_directory = self.load_config.failed_docs_directory(
            data_source_batch_name)

        failed_ids = {}
        for name in os.listdir(failed_docs_directory):
            file_path = os.path.join(failed_docs_directory, name)
            if os.path.isfile(file_path) and name.startswith(
                    DATA_LOADER_BATCH_PREFIX):
                self.load_config.log(LOG_LEVEL_TRACE, 'processing file:',
                                     file_path)
                batch_failed_docs = file_utils.load_file(
                    failed_docs_directory, name)

                for _id in batch_failed_docs:
                    failed_ids[_id] = 0

        self.load_config.log(LOG_LEVEL_DEBUG, 'Failed ids', len(failed_ids))

        return failed_ids
Example #14
    def get_combined_data_source_summary(self):
        updated_ids = {}
        indexed_ids = {}
        failed_ids = {}

        data_source_directory = self.load_config.data_source_directory()
        data_source_batches = self.load_data_source_batches()

        for data_source_batch_name in data_source_batches:
            data_source_batch_summary = self.get_data_source_batch_summary(
                data_source_batch_name)

            # Remove duplicates across batches
            for _id in data_source_batch_summary['updated_ids']:
                updated_ids[_id] = 0
            for _id in data_source_batch_summary['indexed_ids']:
                indexed_ids[_id] = 0
            for _id in data_source_batch_summary['failed_ids']:
                failed_ids[_id] = 0

        # failed_ids can be present in updated_ids or indexed_ids arrays.
        # filter those out
        filtered_failed_ids = {}
        for _id in failed_ids:
            if _id not in updated_ids and _id not in indexed_ids:
                filtered_failed_ids[_id] = 0

        # Load the data source stats
        data_source_summary = file_utils.load_file(
            data_source_directory, 'stats.json')

        summary = dict()
        summary['total_rows'] = data_source_summary['total_rows']
        summary['total_ids'] = data_source_summary['total_ids']

        summary['updated_ids'] = updated_ids
        summary['indexed_ids'] = indexed_ids
        summary['failed_ids'] = filtered_failed_ids

        return summary
Example #15
    def get_loaded_ids(self, data_loader_batch_directory):
        loaded_ids = {}
        for name in os.listdir(data_loader_batch_directory):
            file_path = os.path.join(data_loader_batch_directory, name)
            if os.path.isfile(file_path) and name.startswith(
                    DATA_LOADER_BATCH_PREFIX):
                self.load_config.log(LOG_LEVEL_TRACE, 'processing file:',
                                     file_path)
                batch_data = file_utils.load_file(data_loader_batch_directory,
                                                  name)
                updated_ids = batch_data['updated_ids']
                indexed_ids = batch_data['indexed_ids']

                for _id in updated_ids:
                    loaded_ids[_id] = 0

                for _id in indexed_ids:
                    loaded_ids[_id] = 0

        self.load_config.log(LOG_LEVEL_DEBUG, 'Loaded ids', len(loaded_ids))

        return loaded_ids
Example #16
    def __init__(
            self,
            index_filenames,
            window_length=1,  # in seconds
            shift=0.5,  # in seconds
            timesteps=19,  # in seconds
            sampling_rate=256,  # in Hz
            batch_size=10,
            do_standard_scaling=True,
            in_training_mode=False,
            balance_batches=False,
            patient_id=None):
        '''

            Constructor to create objects of the class **RawDataProcessor** and load all the data.
            Future implementations will pave the way to loading data from files on the fly, in order
            to support working with larger datasets.

            Parameters
            ----------

            :param self:
                Reference to the current object.

            :param list index_filenames:
                List of index filenames from which to load data filenames.

            :param int window_length:
                Length of the window (in seconds) to slide through the signal.

            :param float shift:
                Number of seconds to shift the window to get each sample.

            :param int timesteps:
                Number of time steps for the sequences.

            :param int sampling_rate:
                Sampling rate of the signals, in Hz.

            :param int batch_size:
                Size of the batch to use.

            :param boolean do_standard_scaling:
                Flag to indicate whether to scale features to zero
                mean and unit variance.

            :param boolean in_training_mode:
                Flag to indicate whether the process is in training mode.

            :param boolean balance_batches:
                Flag to indicate whether to balance the batches.
            
            :param str patient_id:
                Patient identifier (e.g. "chb01"); it is used to name the statistics file.

        '''
        #

        if patient_id is None:
            raise Exception('You have to specify a patient id, e.g. "chb01"')

        self.sampling_rate = sampling_rate
        self.batch_size = batch_size
        self.do_standard_scaling = do_standard_scaling
        self.in_training_mode = in_training_mode
        self.balance_batches = balance_batches
        self.patient_id = patient_id
        #
        self.window_length = window_length * sampling_rate
        self.sample_shift = int(sampling_rate * shift)  # window shift in samples (half a window with the defaults)
        self.timesteps = timesteps
        self.num_channels = 23
        #
        self.input_shape = None
        self.num_batches = 0
        #
        self.filenames = list()
        for fname in index_filenames:
            with open(fname, 'r') as f:
                for line in f:
                    if line[0] != '#':
                        self.filenames.append(line.strip() + '.edf.pbz2')
        #
        # List to store the data separated by files
        self.data = list()
        # List to store the labels associated at each sample separated by files
        self.labels = list()

        self.num_seizures = 0

        num_ictal = 0
        num_interictal = 0

        print("Loading EDF signals...")
        for i in tqdm(range(len(self.filenames))):
            d_p = load_file(self.filenames[i],
                            exclude_seizures=False,
                            do_preemphasis=False,
                            separate_seizures=True,
                            verbose=0)

            len_file = 0

            self.data.append([])
            self.labels.append([])
            for p, label in d_p:
                self.data[-1].append(p)  # p.tolist() # Append data
                self.labels[-1] += [label] * len(p)
                if label == 1:
                    self.num_seizures += 1
                    num_ictal += len(p)
                elif label == 0:
                    num_interictal += len(p)
                len_file += len(p)
            #
            #print(self.data[-1][0].shape)
            self.data[-1] = numpy.concatenate(self.data[-1], axis=0)
            #print(self.data[-1].shape)
        #
        # Iterate over the data list and generate indices for the sequences
        # We will treat each recording as independent from the others
        self.file_indexes = list()
        for i in range(len(self.data)):
            limit = len(self.data[i]) // self.sample_shift
            limit -= self.timesteps + 1
            limit = (limit * self.sample_shift)  # - self.sample_shift
            self.file_indexes.append(
                numpy.arange(0, limit, step=self.sample_shift).tolist())
        #

        num_sequences_per_class = [0, 0]
        # Sequences indexes
        if self.balance_batches:
            # The batches will have the same samples for each class
            # List for the indices of class 0
            self.sequences_indexes_c0 = list()
            # List for the indices of class 1
            self.sequences_indexes_c1 = list()

            for fi in range(len(self.file_indexes)):
                for t in self.file_indexes[fi]:

                    label = self.labels[fi][t + (self.timesteps + 1) *
                                            self.sample_shift - 1]

                    if label == 0:
                        self.sequences_indexes_c0.append((fi, t))
                    elif label == 1:
                        self.sequences_indexes_c1.append((fi, t))
                #
            #

            num_sequences_per_class[0] = len(self.sequences_indexes_c0)
            num_sequences_per_class[1] = len(self.sequences_indexes_c1)
            num_sequences = num_sequences_per_class[
                0] + num_sequences_per_class[1]
            self.num_batches = num_sequences // self.batch_size
            if num_sequences % self.batch_size != 0:
                self.num_batches += 1
            #

        else:
            # Just one list for all of the sequences. No balancing
            self.sequences_indexes = list()

            for fi in range(len(self.file_indexes)):
                for t in self.file_indexes[fi]:

                    label = self.labels[fi][t + (self.timesteps + 1) *
                                            self.sample_shift - 1]
                    num_sequences_per_class[label] += 1

                    self.sequences_indexes.append((fi, t, label))
                #
            #
            num_sequences = len(self.sequences_indexes)
            self.num_batches = num_sequences // self.batch_size
            if num_sequences % self.batch_size != 0:
                self.num_batches += 1
        #

        self.input_shape = (self.window_length, )

        print('Signals loaded!')
        print(
            '\n-----------------------------------------------------------\n')
        print(f'Number of seizures available: {self.num_seizures}')
        #if self.num_seizures < 3:
        #    raise Exception('Not enough seizures, please try other patient.')

        print(
            f'Number of samples (not sequences): {num_ictal + num_interictal}')
        print(
            f'Interictal samples: {num_interictal} ({(num_interictal / (num_ictal + num_interictal) * 100):.2f} %)'
        )
        print(
            f'Ictal samples: {num_ictal} ({(num_ictal / (num_ictal + num_interictal) * 100):.2f} %)'
        )

        print(f'\nNumber of sequences: {num_sequences}')
        print(
            f'Interictal sequences: {num_sequences_per_class[0]} ({num_sequences_per_class[0] / num_sequences * 100.0 :.2f}%)'
        )
        print(
            f'Ictal sequences: {num_sequences_per_class[1]} ({num_sequences_per_class[1] / num_sequences * 100.0 :.2f}%)'
        )
        print(f'Number of batches: {self.num_batches}')
        print(
            '\n-----------------------------------------------------------\n')

        # Standard scaling
        if self.do_standard_scaling:

            os.makedirs('stats', exist_ok=True)

            if self.in_training_mode:
                print('Calculating statistics to scale the data...')
                means = []
                counts = []
                stddevs = []
                for p in tqdm(self.data):
                    #p = numpy.array(p)
                    #print(p.shape)
                    means.append(p.mean())
                    counts.append(len(p))
                    stddevs.append(p.std())
                    #
                #
                self.mean = sum([m * c for m, c in zip(means, counts)])
                self.std = sum([s * c for s, c in zip(stddevs, counts)])
                self.mean /= sum(counts)
                self.std /= sum(counts)
                #
                array = numpy.array([self.mean, self.std])
                numpy.save(f'stats/statistics_detection_raw_{patient_id}.npy',
                           array)
                del means
                del counts
                del stddevs

            #
            else:
                print('Loading statistics to scale the data...')
                array = numpy.load(
                    f'stats/statistics_detection_raw_{patient_id}.npy')
                self.mean = array[0]
                self.std = array[1]
            #
            del array
            gc.collect()
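A minimal usage sketch for this constructor follows; the index file path and parameter values are assumptions for illustration and are not part of the original example. The constructor expects each index file to list one recording name per line (lines starting with '#' are ignored) and appends '.edf.pbz2' to every entry.

# Hypothetical usage sketch (assumed file name and parameter values).
train_generator = RawDataProcessor(
    index_filenames=['indexes/chb01_train.txt'],  # assumed path
    window_length=1,        # seconds
    shift=0.5,              # seconds
    timesteps=19,
    sampling_rate=256,      # Hz
    batch_size=10,
    do_standard_scaling=True,
    in_training_mode=True,  # computes mean/std and saves them under stats/
    balance_batches=True,
    patient_id='chb01')

print('Batches per epoch:', train_generator.num_batches)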
Example #17
 def load_data_source_batches(self):
     data_source_directory = self.load_config.data_source_directory()
     return file_utils.load_file(data_source_directory, DATA_SOURCE_BATCHES_FILE)
Example #18
number_of_subsets = 20
size_of_subsets = 500

max_iterations = 100  # optimal value: 5000, but that is extremely slow; keep it at 100 for faster results
hc_count = 30

algorithms = [
    ["Constructive", greedy_constructive, None, 'red'],
    ["Hill Climb (%d iter.)" % max_iterations, hill_climb, [max_iterations], 'green'],
    ["MHC (%d iter. %d runs)" % (max_iterations, hc_count), multi_hc, [max_iterations, hc_count], 'blue'],
    # ["Spectrum v1", spectrum_sort, None, 'violet'],
    ["Spectrum v2", spectrum_sort_shake, None, 'purple']
]

size, colours = load_file()

subsets = list(generate_subsets(colours, number_of_subsets, size_of_subsets))

times = []
for algorithm in algorithms:
    algorithm_times = list(benchmark_algorithm(subsets, algorithm[1], algorithm[2]))
    times.append(algorithm_times)
    print("%s min/avg/max/std = %.2f/%.2f/%.2f/%.2f ms" % (
        algorithm[0],
        np.min(algorithm_times) * 1000,
        np.average(algorithm_times) * 1000,
        np.max(algorithm_times) * 1000,
        np.std(algorithm_times) * 1000,
    ))
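
The helper benchmark_algorithm is not shown in this listing; a minimal sketch of what it might look like, assuming it times one run of the given algorithm per subset and yields the elapsed time in seconds (matching the millisecond conversion above), is:

import time

def benchmark_algorithm(subsets, algorithm, args=None):
    # Hypothetical sketch, not the original implementation: run the algorithm
    # once per subset and yield the elapsed wall-clock time in seconds.
    for subset in subsets:
        start = time.perf_counter()
        if args is None:
            algorithm(subset)
        else:
            algorithm(subset, *args)
        yield time.perf_counter() - start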