Exemple #1
0
    def get_data_paths(self, path, folds=5):
        '''
        Split the data files found under `path` into `folds` groups.

        All files of one patient are placed in the same fold. Patients are
        visited in random order (via `np.random`), and each patient goes to
        the fold that is currently furthest below the target fold size.

        Args:
            path: location handed to `Files` to list the data files.
            folds: number of folds to create.

        Returns:
            None. The result is stored in `self.fold_itens`, a NumPy object
            array with one entry (the fold's file names) per fold.
        '''

        # Get data files.
        brains_data = Files(path).get_file_names()
        total_brains = len(brains_data)

        # Target number of files per fold, rounded to the nearest integer.
        # The builtin int() replaces np.int, which NumPy 1.24 removed.
        fold_size = int(total_brains * 1.0 / folds + 0.5)

        # Group files by patient and visit the patients in random order.
        # The keys are copied into a list because np.random.shuffle needs a
        # mutable sequence and cannot shuffle a dict view on Python 3.
        brains_by_patient = group_brains_by_patient_id(brains_data)
        patient_ids = list(brains_by_patient.keys())
        np.random.shuffle(patient_ids)

        fold_itens = [[] for _ in range(folds)]
        total_folds_itens = np.zeros(folds)

        # Split data, avoiding brains of the same patient being moved to
        # different sets.
        for patient in patient_ids:
            # Pick the fold with the largest remaining capacity.
            selected_fold = np.argmax(fold_size - total_folds_itens)

            patient_brains = brains_by_patient[patient]
            fold_itens[selected_fold].extend(patient_brains)
            total_folds_itens[selected_fold] += len(patient_brains)

        # Folds usually differ in length; dtype=object is required for such
        # ragged input on NumPy >= 1.24 (it raises ValueError otherwise).
        self.fold_itens = np.array(fold_itens, dtype=object)
Exemple #2
0
    def split_train_data(self, paths, ratio=None):
        '''
        Split `paths` into a training set and a validation set.

        A validation quota is computed per category, proportional to the
        category's share of the data. Patients are then moved, with all of
        their files, into the validation set until each quota is met.
        Files not selected for validation form the training set.

        Args:
            paths: data locations assigned to `Files.paths` to list files.
            ratio: fraction of the listed files to use for validation.
                When None, it defaults to
                `0.05 / (1 - 1.0 / len(self.fold_itens))`.
                NOTE(review): presumably this makes validation about 5% of
                the whole dataset when `paths` holds all folds but one --
                confirm against the caller.

        Returns:
            Tuple `(train_paths, validation_paths)` of shuffled lists.
        '''

        if ratio is None:
            ratio = 0.05 / (1 - 1.0 / len(self.fold_itens))

        # Get data files.
        brains_data = Files("")
        brains_data.paths = paths
        brains_data = brains_data.get_file_names()
        total_brains = len(brains_data)
        # The builtin int() replaces np.int, which NumPy 1.24 removed.
        validation_size = int(total_brains * ratio)

        # Get total of brains by category.
        brains_by_patient = group_brains_by_patient_id(brains_data)
        brains_by_category = group_brains_by_category(brains_data)

        # Remaining number of validation items wanted per category.
        statistic = {}
        for label in brains_by_category:
            statistic[label] = np.round(
                len(brains_by_category[label]) * validation_size * 1.0 /
                total_brains)

        # Create the validation set, one whole patient at a time.
        validation_paths = []
        selected_patients = set()
        for label in brains_by_category:
            category_brains = brains_by_category[label]
            index_brains = 0

            # The index bound stops the scan at the end of the category
            # instead of raising IndexError when the quota cannot be met.
            while (statistic[label] > 0 and
                   index_brains < len(category_brains)):
                patient_id = get_patient_id(category_brains[index_brains])
                index_brains += 1

                # A patient already moved to validation (through an earlier
                # file or another category) must not be added again, or its
                # files would be duplicated and the quotas counted twice.
                if patient_id in selected_patients:
                    continue
                selected_patients.add(patient_id)

                for brain_patient in brains_by_patient[patient_id]:
                    validation_paths.append(brain_patient)
                    statistic[get_category(brain_patient)] -= 1

        # Everything not selected for validation is training data. The set
        # gives O(1) membership tests (assumes file names are hashable,
        # e.g. strings).
        validation_lookup = set(validation_paths)
        train_paths = [brain for brain in brains_data
                       if brain not in validation_lookup]

        # Shuffle data in place. Shuffling np.array(...) copies would leave
        # the returned lists in their original order.
        np.random.shuffle(validation_paths)
        np.random.shuffle(train_paths)

        return train_paths, validation_paths