    def test_keywords_params_combine(self):
        matcher = KeywordsMatching.from_dict({
            'path_to_search': 'testing_data/images2d',
            'filename_contains': '_g',
            'filename_removefromid': 'img|_g'
        })
        f_list, s_list = matcher.matching_subjects_and_filenames()
        self.assertEqual(len(f_list), 10)
        self.assertEqual(len(s_list), 10)

        matcher_comp = KeywordsMatching.from_dict({
            'path_to_search': 'testing_data/images2d',
            'filename_not_contains': ('_m', '_u'),
            'filename_removefromid': 'img|_g'
        })
        f_comp, s_comp = matcher_comp.matching_subjects_and_filenames()
        self.assertEqual(f_comp, f_list)
        self.assertEqual(s_comp, s_list)

        matcher = KeywordsMatching.from_dict({
            'path_to_search': 'testing_data/images2d',
            'filename_removefromid': 'img|_g|_m|_u'
        })
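        # removing 'img', '_g', '_m' and '_u' from all 30 filenames
        # presumably leaves colliding subject ids across the three
        # groups, so matching is expected to fail: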
        with self.assertRaisesRegexp(ValueError, ""):
            matcher.matching_subjects_and_filenames()

    def test_keywords_not_contain(self):
        matcher = KeywordsMatching.from_dict({
            'path_to_search': 'testing_data/images2d',
            'filename_not_contains': 'img'
        })
        with self.assertRaisesRegexp(ValueError, ""):
            # no filename matches (every file contains 'img')
            matcher.matching_subjects_and_filenames()

        matcher = KeywordsMatching.from_dict({
            'path_to_search': 'testing_data/images2d',
            'filename_not_contains': ('_m', '_u')
        })
        f_list, s_list = matcher.matching_subjects_and_filenames()
        self.assertEqual(len(f_list), 10)
        self.assertEqual(len(s_list), 10)

        matcher_comp = KeywordsMatching.from_dict({
            'path_to_search': 'testing_data/images2d',
            'filename_contains': '_g'
        })
        f_comp, s_comp = matcher_comp.matching_subjects_and_filenames()
        self.assertEqual(len(f_comp), 10)
        self.assertEqual(len(s_comp), 10)
        self.assertEqual(f_comp, f_list)
Example #3
    def test_from_dict(self):
        with self.assertRaisesRegexp(ValueError, ""):
            KeywordsMatching.from_dict({'path_to_search': 'wrong_folder'})
        matcher = KeywordsMatching.from_dict(
            {'path_to_search': 'testing_data/images2d'})
        f_list, s_list = matcher.matching_subjects_and_filenames()
        self.assertEqual(len(f_list), 30)
        self.assertEqual(len(s_list), 30)
        self.assertEqual(s_list[0][0], 'img0_g')
Example #4
def match_and_write_filenames_to_csv(list_constraints, csv_file):
    """
    Combines all file-searching constraints and writes the resulting
    list of filenames to a csv file.
    :param list_constraints: list of constraints (each defined by a list of
        paths to search, a list of strings the filename must contain, and a
        list of strings that are forbidden)
    :param csv_file: file to which the final list of files is written.
    :return:
    """
    name_tot = []
    list_tot = []
    if list_constraints is None or len(list_constraints) == 0:
        return
    for c in list_constraints:
        list_files, name_list = \
            KeywordsMatching.matching_subjects_and_filenames(c)
        name_list = remove_duplicated_names(name_list)
        name_tot.append(name_list)
        list_tot.append(list_files)
    list_combined = join_subject_id_and_filename_list(name_tot, list_tot)
    touch_folder(os.path.dirname(csv_file))

    # csv writer has different behaviour in python 2/3
    if sys.version_info[0] >= 3:
        with open(csv_file, 'w', newline='', encoding='utf8') as csvfile:
            file_writer = csv.writer(csvfile)
            for list_temp in list_combined:
                file_writer.writerow(list_temp)
    else:
        with open(csv_file, 'wb') as csvfile:
            file_writer = csv.writer(csvfile, delimiter=',')
            for list_temp in list_combined:
                file_writer.writerow(list_temp)
    return
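
A minimal usage sketch for the function above, reusing the test data folder
from the earlier snippets; the output path is hypothetical:

constraint = KeywordsMatching.from_dict({
    'path_to_search': 'testing_data/images2d',
    'filename_contains': 'img'
})
# writes one row per subject: subject_id, filename
match_and_write_filenames_to_csv([constraint], 'output/file_list.csv')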
Example #5
def match_and_write_filenames_to_csv(list_constraints, csv_file):
    """
    Combines all file-searching constraints and writes the resulting
    list of filenames to a csv file.
    :param list_constraints: list of constraints (each defined by a list of
        paths to search, a list of strings the filename must contain, and a
        list of strings that are forbidden)
    :param csv_file: file to which the final list of files is written.
    :return: the combined list of subject ids and filenames.
    """
    name_tot = []
    list_tot = []
    if list_constraints is None or len(list_constraints) == 0:
        return
    for c in list_constraints:
        list_files, name_list = \
            KeywordsMatching.matching_subjects_and_filenames(c)
        name_list = remove_duplicated_names(name_list)
        name_tot.append(name_list)
        list_tot.append(list_files)
    list_combined = join_subject_id_and_filename_list(name_tot, list_tot)
    list_combined = filter(lambda names: '' not in names, list_combined)
    list_combined = list(list_combined)
    touch_folder(os.path.dirname(csv_file))
    write_csv(csv_file, list_combined)

    return list_combined
Beispiel #6
0
def match_and_write_filenames_to_csv(list_constraints, csv_file):
    """
    Combines all file-searching constraints and writes the resulting
    list of filenames to a csv file.
    :param list_constraints: list of constraints (each defined by a list of
        paths to search, a list of strings the filename must contain, and a
        list of strings that are forbidden)
    :param csv_file: file to which the final list of files is written.
    :return: the combined list of subject ids and filenames.
    :raises IOError: if no filenames were matched.
    """
    name_tot = []
    list_tot = []
    if list_constraints is None or len(list_constraints) == 0:
        return
    for c in list_constraints:
        list_files, name_list = \
            KeywordsMatching.matching_subjects_and_filenames(c)
        name_list = remove_duplicated_names(name_list)
        name_tot.append(name_list)
        list_tot.append(list_files)
    list_combined = join_subject_id_and_filename_list(name_tot, list_tot)
    list_combined = filter(lambda names: '' not in names, list_combined)
    list_combined = list(list_combined)
    if not list_combined:
        raise IOError('Nothing to write to {}'.format(csv_file))
    touch_folder(os.path.dirname(csv_file))
    write_csv(csv_file, list_combined)

    return list_combined
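
The filter step above drops every joined row that contains an empty cell,
i.e. subjects missing a file under at least one constraint. A standalone
illustration with made-up filenames:

rows = [['img0', 'img0_g.nii', 'img0_seg.nii'],
        ['img1', 'img1_g.nii', '']]  # no match for img1 in one constraint
rows = list(filter(lambda names: '' not in names, rows))
# only the complete row survives: [['img0', 'img0_g.nii', 'img0_seg.nii']]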
Beispiel #7
def load_and_merge_csv_files(data_param, default_folder=None):
    """
    Converts the list of csv_files in ``data_param``
    into a joint list of file names (by matching on the first column).
    This function returns a ``pandas.core.frame.DataFrame`` of the
    joint list.
    """
    if not data_param:
        tf.logging.fatal('nothing to load, please check reader.names')
        raise ValueError
    _file_list = None
    for modality_name in data_param:
        try:
            csv_file = data_param[modality_name].csv_file
        except AttributeError:
            tf.logging.fatal('unrecognised parameter format')
            raise
        if hasattr(data_param[modality_name], 'path_to_search') and \
                len(data_param[modality_name].path_to_search):
            tf.logging.info('[%s] search file folders, writing csv file %s',
                            modality_name, csv_file)
            section_tuple = data_param[modality_name].__dict__.items()
            matcher = KeywordsMatching.from_tuple(section_tuple,
                                                  default_folder)
            match_and_write_filenames_to_csv([matcher], csv_file)
        else:
            tf.logging.info(
                '[%s] using existing csv file %s, skipped folder search',
                modality_name, csv_file)
        if not os.path.isfile(csv_file):
            tf.logging.fatal("[%s] csv file %s not found.", modality_name,
                             csv_file)
            raise IOError
        csv_list = pandas.read_csv(csv_file,
                                   header=None,
                                   names=['subject_id', modality_name])
        if _file_list is None:
            _file_list = csv_list
            continue

        # merge _file_list based on subject_ids (first column of each csv)
        n_rows = _file_list.shape[0]
        _file_list = pandas.merge(_file_list, csv_list, on='subject_id')
        if _file_list.shape[0] != n_rows:
            tf.logging.warning("rows not matched in %s", csv_file)

    if _file_list is None or _file_list.size == 0:
        tf.logging.fatal(
            "empty filename lists, please check the csv "
            "files. (remove csv_file keyword if it is in the config file "
            "to automatically search folders and generate new csv "
            "files again)\n\n"
            "Please note in the matched file names, each subject name are "
            "matched by removing any keywords listed `filename_contains` "
            "in the config.\n\n")
        raise IOError
    return _file_list
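
The merge above is an inner join on ``subject_id``, so subjects missing from
any csv are silently dropped (hence the row-count warning). A small
self-contained illustration; the file names are made up:

import pandas
t1 = pandas.DataFrame({'subject_id': ['img0', 'img1'],
                       'T1': ['img0_t1.nii', 'img1_t1.nii']})
t2 = pandas.DataFrame({'subject_id': ['img0'],
                       'T2': ['img0_t2.nii']})
merged = pandas.merge(t1, t2, on='subject_id')
# one row remains: img1 has no T2 entry and is dropped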
Example #8
    def test_keywords_grep(self):
        matcher = KeywordsMatching.from_dict(
            {'path_to_search': 'testing_data/images2d',
             'filename_contains': 'img'})
        f_list, s_list = matcher.matching_subjects_and_filenames()
        self.assertEqual(len(f_list), 30)
        self.assertEqual(len(s_list), 30)
        # filenames matching 'img' are returned, and the matched
        # string is removed from the subject_id
        self.assertEqual(s_list[0][0], '0_g')

    def grep_files_by_data_section(self, modality_name):
        """
        list all files by a given input data section::
            if the ``csv_file`` property of the section corresponds to a file,
                read the list from the file;
            otherwise
                write the list to ``csv_file``.

        :return: a table with two columns,
                 the column names are ``(COLUMN_UNIQ_ID, modality_name)``.
        """
        if modality_name not in self.data_param:
            tf.logging.fatal('unknown section name [%s], '
                             'current input section names: %s.',
                             modality_name, list(self.data_param))
            raise ValueError

        # input data section must have a ``csv_file`` section for loading
        # or writing filename lists
        try:
            csv_file = self.data_param[modality_name].csv_file
            if not os.path.isfile(csv_file):
                # writing to the same folder as data_split_file
                csv_file = os.path.join(os.path.dirname(self.data_split_file),
                                        '{}.csv'.format(modality_name))

        except (AttributeError, TypeError):
            tf.logging.fatal('Missing `csv_file` field in the config file, '
                             'unknown configuration format.')
            raise

        if hasattr(self.data_param[modality_name], 'path_to_search') and \
                self.data_param[modality_name].path_to_search:
            tf.logging.info('[%s] search file folders, writing csv file %s',
                            modality_name, csv_file)
            section_properties = self.data_param[modality_name].__dict__.items()
            # grep files by section properties and write csv
            try:
                matcher = KeywordsMatching.from_tuple(
                    section_properties,
                    self.default_image_file_location)
                match_and_write_filenames_to_csv([matcher], csv_file)
            except (IOError, ValueError) as reading_error:
                tf.logging.warning('Ignoring input section: [%s], '
                                   'due to the following error:',
                                   modality_name)
                tf.logging.warning(repr(reading_error))
                return pandas.DataFrame(
                    columns=[COLUMN_UNIQ_ID, modality_name])
        else:
            tf.logging.info(
                '[%s] using existing csv file %s, skipped filenames search',
                modality_name, csv_file)

        if not os.path.isfile(csv_file):
            tf.logging.fatal(
                '[%s] csv file %s not found.', modality_name, csv_file)
            raise IOError
        try:
            csv_list = pandas.read_csv(
                csv_file,
                header=None,
                dtype=(str, str),
                names=[COLUMN_UNIQ_ID, modality_name],
                skipinitialspace=True)
        except Exception as csv_error:
            tf.logging.fatal(repr(csv_error))
            raise
        return csv_list
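
For reference, a standalone sketch of the final ``read_csv`` call; it writes a
two-line csv first so it runs on its own, uses ``dtype=str`` in place of the
tuple form above, and assumes ``COLUMN_UNIQ_ID`` is the string 'subject_id':

import pandas
with open('T1.csv', 'w') as f:
    f.write('img0,img0_t1.nii\nimg1, img1_t1.nii\n')
csv_list = pandas.read_csv('T1.csv',
                           header=None,
                           dtype=str,
                           names=['subject_id', 'T1'],
                           skipinitialspace=True)
# skipinitialspace strips the stray space before 'img1_t1.nii'
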
    def grep_files_by_data_section(self, modality_name):
        """
        list all files by a given input data section::
            if the ``csv_file`` property of ``data_param[modality_name]``
            corresponds to a file, read the list from the file;
            otherwise
                write the list to ``csv_file``.

        :return: a table with two columns,
                 the column names are ``(COLUMN_UNIQ_ID, modality_name)``.
        """
        if modality_name not in self.data_param:
            tf.logging.fatal(
                'unknown section name [%s], '
                'current input section names: %s.', modality_name,
                list(self.data_param))
            raise ValueError

        # input data section must have a ``csv_file`` section for loading
        # or writing filename lists
        if isinstance(self.data_param[modality_name], dict):
            mod_spec = self.data_param[modality_name]
        else:
            mod_spec = vars(self.data_param[modality_name])

        #########################
        # guess the csv_file path
        #########################
        temp_csv_file = None
        try:
            csv_file = os.path.expanduser(mod_spec.get('csv_file', None))
            if not os.path.isfile(csv_file):
                # writing to the same folder as data_split_file
                default_csv_file = os.path.join(
                    os.path.dirname(self.data_split_file),
                    '{}.csv'.format(modality_name))
                tf.logging.info(
                    '`csv_file = %s` not found, '
                    'writing to "%s" instead.', csv_file, default_csv_file)
                csv_file = default_csv_file
                if os.path.isfile(csv_file):
                    tf.logging.info('Overwriting existing: "%s".', csv_file)
            csv_file = os.path.abspath(csv_file)
        except (AttributeError, KeyError, TypeError):
            tf.logging.debug('`csv_file` not specified, writing the list of '
                             'filenames to a temporary file.')
            import tempfile
            temp_csv_file = os.path.join(tempfile.mkdtemp(),
                                         '{}.csv'.format(modality_name))
            csv_file = temp_csv_file

        #############################################
        # writing csv file if path_to_search specified
        ##############################################
        if mod_spec.get('path_to_search', None):
            if not temp_csv_file:
                tf.logging.info(
                    '[%s] search file folders, writing csv file %s',
                    modality_name, csv_file)
            # grep files by section properties and write csv
            try:
                matcher = KeywordsMatching.from_dict(
                    input_dict=mod_spec,
                    default_folder=self.default_image_file_location)
                match_and_write_filenames_to_csv([matcher], csv_file)
            except (IOError, ValueError) as reading_error:
                tf.logging.warning(
                    'Ignoring input section: [%s], '
                    'due to the following error:', modality_name)
                tf.logging.warning(repr(reading_error))
                return pandas.DataFrame(
                    columns=[COLUMN_UNIQ_ID, modality_name])
        else:
            tf.logging.info(
                '[%s] using existing csv file %s, skipped filenames search',
                modality_name, csv_file)

        if not os.path.isfile(csv_file):
            tf.logging.fatal('[%s] csv file %s not found.', modality_name,
                             csv_file)
            raise IOError
        ###############################
        # loading the file as dataframe
        ###############################
        try:
            csv_list = pandas.read_csv(csv_file,
                                       header=None,
                                       dtype=(str, str),
                                       names=[COLUMN_UNIQ_ID, modality_name],
                                       skipinitialspace=True)
        except Exception as csv_error:
            tf.logging.fatal(repr(csv_error))
            raise

        if temp_csv_file:
            shutil.rmtree(os.path.dirname(temp_csv_file), ignore_errors=True)

        return csv_list
Example #11
    def test_default(self):
        matcher = KeywordsMatching()
        with self.assertRaisesRegexp(ValueError, ""):
            matcher.matching_subjects_and_filenames()
        with self.assertRaisesRegexp(AttributeError, ""):
            KeywordsMatching.from_dict('wrong_argument')

    def grep_files_by_data_section(self, modality_name):
        """
        list all files by a given input data section::
            if the ``csv_file`` property of ``data_param[modality_name]``
            corresponds to a file, read the list from the file;
            otherwise
                write the list to ``csv_file``.

        :return: a table with two columns,
                 the column names are ``(COLUMN_UNIQ_ID, modality_name)``.
        """
        if modality_name not in self.data_param:
            tf.logging.fatal('unknown section name [%s], '
                             'current input section names: %s.',
                             modality_name, list(self.data_param))
            raise ValueError

        # input data section must have a ``csv_file`` section for loading
        # or writing filename lists
        if isinstance(self.data_param[modality_name], dict):
            mod_spec = self.data_param[modality_name]
        else:
            mod_spec = vars(self.data_param[modality_name])

        #########################
        # guess the csv_file path
        #########################
        temp_csv_file = None
        try:
            csv_file = os.path.expanduser(mod_spec.get('csv_file', None))
            if not os.path.isfile(csv_file):
                # writing to the same folder as data_split_file
                default_csv_file = os.path.join(
                    os.path.dirname(self.data_split_file),
                    '{}.csv'.format(modality_name))
                tf.logging.info('`csv_file = %s` not found, '
                                'writing to "%s" instead.',
                                csv_file, default_csv_file)
                csv_file = default_csv_file
                if os.path.isfile(csv_file):
                    tf.logging.info('Overwriting existing: "%s".', csv_file)
            csv_file = os.path.abspath(csv_file)
        except (AttributeError, KeyError, TypeError):
            tf.logging.debug('`csv_file` not specified, writing the list of '
                             'filenames to a temporary file.')
            import tempfile
            temp_csv_file = os.path.join(
                tempfile.mkdtemp(), '{}.csv'.format(modality_name))
            csv_file = temp_csv_file

        #############################################
        # writing csv file if path_to_search specified
        ##############################################
        if mod_spec.get('path_to_search', None):
            if not temp_csv_file:
                tf.logging.info(
                    '[%s] search file folders, writing csv file %s',
                    modality_name, csv_file)
            # grep files by section properties and write csv
            try:
                matcher = KeywordsMatching.from_dict(
                    input_dict=mod_spec,
                    default_folder=self.default_image_file_location)
                match_and_write_filenames_to_csv([matcher], csv_file)
            except (IOError, ValueError) as reading_error:
                tf.logging.warning('Ignoring input section: [%s], '
                                   'due to the following error:',
                                   modality_name)
                tf.logging.warning(repr(reading_error))
                return pandas.DataFrame(
                    columns=[COLUMN_UNIQ_ID, modality_name])
        else:
            tf.logging.info(
                '[%s] using existing csv file %s, skipped filenames search',
                modality_name, csv_file)

        if not os.path.isfile(csv_file):
            tf.logging.fatal(
                '[%s] csv file %s not found.', modality_name, csv_file)
            raise IOError
        ###############################
        # loading the file as dataframe
        ###############################
        try:
            csv_list = pandas.read_csv(
                csv_file,
                header=None,
                dtype=(str, str),
                names=[COLUMN_UNIQ_ID, modality_name],
                skipinitialspace=True)
        except Exception as csv_error:
            tf.logging.fatal(repr(csv_error))
            raise
        finally:
            if temp_csv_file:
                os.remove(temp_csv_file)
                os.rmdir(os.path.dirname(temp_csv_file))
        return csv_list
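
The temporary-file handling above (``mkdtemp`` in the except branch, cleanup
in ``finally``) can be exercised on its own; a minimal sketch with a
hypothetical modality name:

import os
import tempfile

temp_csv_file = os.path.join(tempfile.mkdtemp(), 'T1.csv')
try:
    with open(temp_csv_file, 'w') as f:
        f.write('img0,img0_t1.nii\n')
    # ... read the csv back here ...
finally:
    # remove both the file and the directory mkdtemp created
    os.remove(temp_csv_file)
    os.rmdir(os.path.dirname(temp_csv_file))

The earlier variant cleans up with shutil.rmtree(..., ignore_errors=True)
instead, which additionally tolerates the file never having been written.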
Example #13
    def grep_files_by_data_section(self, modality_name):
        """
        list all files by a given input data section::
            if the ``csv_file`` property of the section corresponds to a file,
                read the list from the file;
            otherwise
                write the list to ``csv_file``.

        :return: a table with two columns,
                 the column names are ``(COLUMN_UNIQ_ID, modality_name)``.
        """
        if modality_name not in self.data_param:
            tf.logging.fatal(
                'unknown section name [%s], '
                'current input section names: %s.', modality_name,
                list(self.data_param))
            raise ValueError

        # input data section must have a ``csv_file`` section for loading
        # or writing filename lists
        try:
            csv_file = self.data_param[modality_name].csv_file
            if not os.path.isfile(csv_file):
                # writing to the same folder as data_split_file
                csv_file = os.path.join(os.path.dirname(self.data_split_file),
                                        '{}.csv'.format(modality_name))

        except (AttributeError, TypeError):
            tf.logging.fatal('Missing `csv_file` field in the config file, '
                             'unknown configuration format.')
            raise

        if hasattr(self.data_param[modality_name], 'path_to_search') and \
                self.data_param[modality_name].path_to_search:
            tf.logging.info('[%s] search file folders, writing csv file %s',
                            modality_name, csv_file)
            section_properties = \
                self.data_param[modality_name].__dict__.items()
            # grep files by section properties and write csv
            try:
                matcher = KeywordsMatching.from_tuple(
                    section_properties, self.default_image_file_location)
                match_and_write_filenames_to_csv([matcher], csv_file)
            except (IOError, ValueError) as reading_error:
                tf.logging.warning(
                    'Ignoring input section: [%s], '
                    'due to the following error:', modality_name)
                tf.logging.warning(repr(reading_error))
                return pandas.DataFrame(
                    columns=[COLUMN_UNIQ_ID, modality_name])
        else:
            tf.logging.info(
                '[%s] using existing csv file %s, skipped filenames search',
                modality_name, csv_file)

        if not os.path.isfile(csv_file):
            tf.logging.fatal('[%s] csv file %s not found.', modality_name,
                             csv_file)
            raise IOError
        try:
            csv_list = pandas.read_csv(csv_file,
                                       header=None,
                                       dtype=(str, str),
                                       names=[COLUMN_UNIQ_ID, modality_name],
                                       skipinitialspace=True)
        except Exception as csv_error:
            tf.logging.fatal(repr(csv_error))
            raise
        return csv_list