Example 1
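A test fixture that builds an input Tree by hand: a Session is created for every subject/visit combination, each holding two text-format Filesets, and the sessions are then grouped into Subject and Visit containers.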
 def input_tree(self):
     sessions = []
     for subj_id in self.SUBJECT_IDS:
         for visit_id in self.VISIT_IDS:
             sessions.append(
                 Session(subj_id,
                         visit_id,
                         filesets=[
                             Fileset('one_input',
                                     text_format,
                                     subject_id=subj_id,
                                     visit_id=visit_id),
                             Fileset('ten_input',
                                     text_format,
                                     subject_id=subj_id,
                                     visit_id=visit_id)
                         ]))
     subjects = [
         Subject(i, sessions=[s for s in sessions if s.subject_id == i])
         for i in self.SUBJECT_IDS
     ]
     visits = [
         Visit(i, sessions=[s for s in sessions if s.visit_id == i])
         for i in self.VISIT_IDS
     ]
     return Tree(subjects=subjects, visits=visits)
Example 2
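A more compact variant of the same fixture: rather than assembling Session objects manually, it collects flat Fileset instances and lets Tree.construct derive the grouping from their subject and visit IDs.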
 def input_tree(self):
     filesets = []
     for subj_id in self.SUBJECT_IDS:
         for visit_id in self.VISIT_IDS:
             filesets.append(
                 Fileset('one_input',
                         text_format,
                         subject_id=subj_id,
                         visit_id=visit_id))
             filesets.append(
                 Fileset('ten_input',
                         text_format,
                         subject_id=subj_id,
                         visit_id=visit_id))
     return Tree.construct(self.dataset.repository, filesets=filesets)
Example 3
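A heuristic that walks a directory repository and guesses how many directory levels (subject and/or visit) separate the root from the session data, returning early when data files, a provenance directory, or the maximum depth is encountered.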
    def guess_depth(cls, root_dir):
        """
        Try to guess the depth of a directory repository (i.e. whether it has
        sub-folders for multiple subjects or visits), depending on where files
        and/or derived label files are found in the hierarchy of
        sub-directories under the root dir.

        Parameters
        ----------
        root_dir : str
            Path to the root directory of the repository
        """
        deepest = -1
        for path, dirs, files in os.walk(root_dir):
            depth = cls.path_depth(root_dir, path)
            filtered_files = cls._filter_files(files, path)
            if filtered_files:
                logger.info(
                    "Guessing depth of directory repository at '{}' is"
                    " {} due to unfiltered files ('{}') in '{}'".format(
                        root_dir, depth, "', '".join(filtered_files), path))
                return depth
            if cls.PROV_DIR in dirs:
                depth_to_return = max(depth - 1, 0)
                logger.info("Guessing depth of directory repository at '{}' is"
                            " {} due to \"Derived label file\" in '{}'".format(
                                root_dir, depth_to_return, path))
                return depth_to_return
            if depth >= cls.MAX_DEPTH:
                logger.info("Guessing depth of directory repository at '{}' is"
                            " {} as '{}' is already at maximum depth".format(
                                root_dir, cls.MAX_DEPTH, path))
                return cls.MAX_DEPTH
            try:
                for fpath in chain(filtered_files,
                                   cls._filter_dirs(dirs, path)):
                    Fileset.from_path(fpath)
            except ArcanaError:
                pass
            else:
                if depth > deepest:
                    deepest = depth
        if deepest == -1:
            raise ArcanaRepositoryError(
                "Could not guess depth of '{}' repository as did not find "
                "a valid session directory within sub-directories.".format(
                    root_dir))
        return deepest
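The path_depth and _filter_files helpers are not shown above. As a rough, hypothetical reconstruction (not the library's actual implementation), path_depth is assumed to count the directory levels between the root and a given path:

    import os.path as op

    def path_depth(root_dir, path):
        # Depth of 'path' below 'root_dir'; the root itself is depth 0
        relpath = op.relpath(path, root_dir)
        return 0 if relpath == '.' else len(relpath.split(op.sep))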
Example 4
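Like Example 1, but driven by a PROJECT_STRUCTURE mapping so that subjects may have different visits; every fileset except 'one' is tagged as a study output via from_study.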
 def input_tree(self):
     sessions = []
     visit_ids = set()
     for subj_id, visits in list(self.PROJECT_STRUCTURE.items()):
         for visit_id, filesets in list(visits.items()):
             sessions.append(
                 Session(subj_id,
                         visit_id,
                         filesets=[
                             Fileset(d,
                                     text_format,
                                     subject_id=subj_id,
                                     visit_id=visit_id,
                                     from_study=((self.STUDY_NAME
                                                  if d != 'one' else None)))
                             for d in filesets
                         ]))
             visit_ids.add(visit_id)
     subjects = [
         Subject(i, sessions=[s for s in sessions if s.subject_id == i])
         for i in self.PROJECT_STRUCTURE
     ]
     visits = [
         Visit(i, sessions=[s for s in sessions if s.visit_id == i])
         for i in visit_ids
     ]
     return Tree(subjects=subjects, visits=visits)
Example 5
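A lazily-created test fixture: on first access it creates a directory containing a single dummy text file, then wraps it in a Fileset with directory_format.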
 def input_directory(self):
     path = op.join(self.tempdir, 'directory')
     if not op.exists(path):
         # Create directory
         os.makedirs(path)
         with open(op.join(path, 'dummy.txt'), 'w') as f:
             f.write('blah')
     return Fileset.from_path(path, format=directory_format)
Example 6
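Builds on the previous fixture: on first access it zips the input directory using a ZipDir interface (apparently Nipype-style, configured through .inputs and executed with .run()) and returns the archive as a zip_format Fileset.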
 def input_zip(self):
     path = op.join(self.tempdir, 'zip.zip')
     if not op.exists(path):
         # Create zip file
         zipper = ZipDir()
         zipper.inputs.dirname = self.input_directory.path
         zipper.inputs.zipped = path
         zipper.run()
     return Fileset.from_path(path, format=zip_format)
Example 7
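The same STRUCTURE-driven pattern as Example 4, reduced to a flat list of Filesets handed straight to Tree.construct.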
 def input_tree(self):
     filesets = []
     for subj_id, visits in list(self.STRUCTURE.items()):
         for visit_id, fileset_names in list(visits.items()):
             filesets.extend(
                 Fileset(
                     d, text_format, subject_id=subj_id, visit_id=visit_id)
                 for d in fileset_names)
     return Tree.construct(self.dataset.repository, filesets=filesets)
Example 8
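Another Tree.construct variant, this time using the newer from_analysis keyword: every fileset except 'one' is marked as an output of the named analysis.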
 def input_tree(self):
     filesets = []
     for subj_id, visit_ids in list(self.PROJECT_STRUCTURE.items()):
         for visit_id, fileset_names in list(visit_ids.items()):
             # Create filesets
             for name in fileset_names:
                 from_analysis = self.STUDY_NAME if name != 'one' else None
                 filesets.append(
                     Fileset(name,
                             text_format,
                             subject_id=subj_id,
                             visit_id=visit_id,
                             from_analysis=from_analysis))
     return Tree.construct(self.dataset.repository, filesets=filesets)
Example 9
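A test setUp that registers input data from a reference directory, unpacking the accompanying .tar.gz first if only the archive is present and skipping hidden files, or falls back to an INPUT_DATASETS attribute.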
 def setUp(self):
     self.reset_dirs()
     if self.INPUTS_FROM_REF_DIR:
         filesets = {}
         # Unzip reference directory if required
         if not os.path.exists(self.ref_dir) and os.path.exists(
                 self.ref_dir + '.tar.gz'):
             sp.check_call(
                 'tar xzf {}.tar.gz'.format(self.ref_dir),
                 shell=True, cwd=os.path.dirname(self.ref_dir))
         for fname in os.listdir(self.ref_dir):
             if fname.startswith('.'):
                 continue
             fileset = Fileset.from_path(op.join(self.ref_dir,
                                                 fname))
             filesets[fileset.name] = fileset
     else:
         filesets = getattr(self, 'INPUT_DATASETS', None)
     self.add_session(filesets=filesets,
                      fields=getattr(self, 'INPUT_FIELDS', None))
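The call to the external tar binary ties this setup to Unix-like systems. If portability mattered, the same unpacking step could use the standard library instead (a sketch; extract_ref_archive is a hypothetical helper name, assuming a plain gzipped tarball):

    import os
    import tarfile

    def extract_ref_archive(ref_dir):
        # Unpack '<ref_dir>.tar.gz' alongside ref_dir without relying
        # on an external 'tar' binary
        with tarfile.open(ref_dir + '.tar.gz', 'r:gz') as archive:
            archive.extractall(path=os.path.dirname(ref_dir))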
Example 10
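Extracts Fileset objects from the scans of an XNAT session, guessing each scan's file format and skipping any scan whose format cannot be determined.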
    def _get_filesets(self, xsession, **kwargs):
        """
        Returns a list of filesets within an XNAT session

        Parameters
        ----------
        xsession : xnat.classes.MrSessionData
            The XNAT session to extract the filesets from
        **kwargs
            Keyword arguments (e.g. frequency, subject and visit IDs)
            passed through to the Fileset constructor

        Returns
        -------
        filesets : list(arcana.data.Fileset)
            List of filesets within an XNAT session
        """
        filesets = []
        for xfileset in xsession.scans.values():
            try:
                file_format = self._guess_file_format(xfileset)
            except ArcanaFileFormatError as e:
                logger.warning(
                    "Ignoring '{}' as couldn't guess its file format:\n{}"
                    .format(xfileset.type, e))
                continue  # Skip scans whose format couldn't be guessed
            filesets.append(
                Fileset(
                    xfileset.type,
                    format=file_format,  # @ReservedAssignment @IgnorePep8
                    id=xfileset.id,
                    uri=xfileset.uri,
                    repository=self,
                    **kwargs))
        return sorted(filesets)
Example 11
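The directory-repository implementation of find_data: it walks the dataset root, derives subject and visit IDs (and an optional from_analysis label) from each path, classifies the node's frequency, and collects filesets, fields (from a JSON side-car file) and provenance records.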
    def find_data(self, dataset, subject_ids=None, visit_ids=None, **kwargs):
        """
        Find all data within a repository, registering filesets, fields and
        provenance with the found_fileset, found_field and found_provenance
        methods, respectively

        Parameters
        ----------
        subject_ids : list(str)
            List of subject IDs with which to filter the tree. If
            None, all are returned
        visit_ids : list(str)
            List of visit IDs with which to filter the tree. If
            None, all are returned
        root_dir : str
            The root dir to use instead of the 'name' (path) of the dataset.
            Only for use in sub-classes (e.g. BIDS)
        all_from_analysis : str
            Global 'from_analysis' to be applied to every found item.
            Only for use in sub-classes (e.g. BIDS)

        Returns
        -------
        filesets : list[Fileset]
            All the filesets found in the repository
        fields : list[Field]
            All the fields found in the repository
        records : list[Record]
            The provenance records found in the repository
        """
        all_filesets = []
        all_fields = []
        all_records = []
        root_dir = dataset.name
        for session_path, dirs, files in os.walk(root_dir):
            relpath = op.relpath(session_path, root_dir)
            path_parts = relpath.split(op.sep) if relpath != '.' else []
            ids = self._extract_ids_from_path(dataset.depth, path_parts, dirs,
                                              files)
            if ids is None:
                continue
            subj_id, visit_id, from_analysis = ids
            # Check for summaries and filtered IDs
            if subj_id == self.SUMMARY_NAME:
                subj_id = None
            elif subject_ids is not None and subj_id not in subject_ids:
                continue
            if visit_id == self.SUMMARY_NAME:
                visit_id = None
            elif visit_ids is not None and visit_id not in visit_ids:
                continue
            # Map IDs into ID space of analysis
            subj_id = dataset.map_subject_id(subj_id)
            visit_id = dataset.map_visit_id(visit_id)
            # Determine frequency of session|summary
            if (subj_id, visit_id) == (None, None):
                frequency = 'per_dataset'
            elif subj_id is None:
                frequency = 'per_visit'
            elif visit_id is None:
                frequency = 'per_subject'
            else:
                frequency = 'per_session'
            filtered_files = self._filter_files(files, session_path)
            for fname in filtered_files:
                basename = split_extension(fname)[0]
                all_filesets.append(
                    Fileset.from_path(op.join(session_path, fname),
                                      frequency=frequency,
                                      subject_id=subj_id,
                                      visit_id=visit_id,
                                      dataset=dataset,
                                      from_analysis=from_analysis,
                                      potential_aux_files=[
                                          f for f in filtered_files
                                          if (split_extension(f)[0] == basename
                                              and f != fname)
                                      ],
                                      **kwargs))
            for fname in self._filter_dirs(dirs, session_path):
                all_filesets.append(
                    Fileset.from_path(op.join(session_path, fname),
                                      frequency=frequency,
                                      subject_id=subj_id,
                                      visit_id=visit_id,
                                      dataset=dataset,
                                      from_analysis=from_analysis,
                                      **kwargs))
            if self.FIELDS_FNAME in files:
                with open(op.join(session_path, self.FIELDS_FNAME), 'r') as f:
                    dct = json.load(f)
                all_fields.extend(
                    Field(name=k,
                          value=v,
                          frequency=frequency,
                          subject_id=subj_id,
                          visit_id=visit_id,
                          dataset=dataset,
                          from_analysis=from_analysis,
                          **kwargs) for k, v in list(dct.items()))
            if self.PROV_DIR in dirs:
                if from_analysis is None:
                    raise ArcanaRepositoryError(
                        "Found provenance directory in session directory (i.e."
                        " not in analysis-specific sub-directory)")
                base_prov_dir = op.join(session_path, self.PROV_DIR)
                for fname in os.listdir(base_prov_dir):
                    all_records.append(
                        Record.load(
                            split_extension(fname)[0], frequency, subj_id,
                            visit_id, from_analysis,
                            op.join(base_prov_dir, fname)))
        return all_filesets, all_fields, all_records
Example 12
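A fixture that enumerates a reference dataset's filesets and fields at every frequency (per_session, per_subject, per_visit and per_dataset) and assembles them with Tree.construct, optionally syncing URIs and IDs with the backing repository first.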
 def get_tree(self, dataset, sync_with_repo=False):
     filesets = [
         # Subject1
         Fileset('ones', text_format,
                 frequency='per_subject',
                 subject_id='subject1',
                 resource_name='text',
                 dataset=dataset),
         Fileset('tens', text_format,
                 frequency='per_subject',
                 subject_id='subject1',
                 resource_name='text',
                 dataset=dataset),
         # subject1/visit1
         Fileset('hundreds', text_format,
                 subject_id='subject1', visit_id='visit1',
                 resource_name='text',
                 dataset=dataset),
         Fileset('ones', text_format,
                 subject_id='subject1', visit_id='visit1',
                 resource_name='text',
                 dataset=dataset),
         Fileset('tens', text_format,
                 subject_id='subject1', visit_id='visit1',
                 resource_name='text',
                 dataset=dataset),
         Fileset('with_header', text_format,
                 frequency='per_session',
                 subject_id='subject1', visit_id='visit1',
                 resource_name='text',
                 dataset=dataset),
         # subject1/visit2
         Fileset('ones', text_format,
                 subject_id='subject1', visit_id='visit2',
                 resource_name='text',
                 dataset=dataset),
         Fileset('tens', text_format,
                 subject_id='subject1', visit_id='visit2',
                 resource_name='text',
                 dataset=dataset),
         # Subject 2
         Fileset('ones', text_format,
                 frequency='per_subject',
                 subject_id='subject2',
                 resource_name='text',
                 dataset=dataset),
         Fileset('tens', text_format,
                 frequency='per_subject',
                 subject_id='subject2',
                 resource_name='text',
                 dataset=dataset),
         # subject2/visit1
         Fileset('ones', text_format,
                 subject_id='subject2', visit_id='visit1',
                 resource_name='text',
                 dataset=dataset),
         Fileset('tens', text_format,
                 subject_id='subject2', visit_id='visit1',
                 resource_name='text',
                 dataset=dataset),
         # subject2/visit2
         Fileset('ones', text_format,
                 subject_id='subject2', visit_id='visit2',
                 resource_name='text',
                 dataset=dataset),
         Fileset('tens', text_format,
                 subject_id='subject2', visit_id='visit2',
                 resource_name='text',
                 dataset=dataset),
         # Visit 1
         Fileset('ones', text_format,
                 frequency='per_visit',
                 visit_id='visit1',
                 resource_name='text',
                 dataset=dataset),
         # Analysis
         Fileset('ones', text_format,
                 frequency='per_dataset',
                 resource_name='text',
                 dataset=dataset)]
     fields = [
         # Subject 2
         Field('e', value=3.33333,
               frequency='per_subject',
               subject_id='subject2',
               dataset=dataset),
         # subject2/visit2
         Field('a', value=22,
               subject_id='subject2', visit_id='visit2',
               dataset=dataset),
         Field('b', value=220,
               subject_id='subject2', visit_id='visit2',
               dataset=dataset),
         Field('c', value='buggy',
               subject_id='subject2', visit_id='visit2',
               dataset=dataset),
         # Subject1
         Field('e', value=4.44444,
               frequency='per_subject',
               subject_id='subject1',
               dataset=dataset),
         # subject1/visit1
         Field('a', value=1,
               subject_id='subject1', visit_id='visit1',
               dataset=dataset),
         Field('b', value=10,
               subject_id='subject1', visit_id='visit1',
               dataset=dataset),
         Field('d', value=42.42,
               subject_id='subject1', visit_id='visit1',
               dataset=dataset),
         # subject1/visit2
         Field('a', value=2,
               subject_id='subject1', visit_id='visit2',
               dataset=dataset),
         Field('c', value='van',
               subject_id='subject1', visit_id='visit2',
               dataset=dataset),
         # Visit 1
         Field('f', value='dog',
               frequency='per_visit',
               visit_id='visit1',
               dataset=dataset),
         # Visit 2
         Field('f', value='cat',
               frequency='per_visit',
               visit_id='visit2',
               dataset=dataset),
         # Analysis
         Field('g', value=100,
               frequency='per_dataset',
               dataset=dataset)]
     # Set URI and IDs if necessary for repository type
     if sync_with_repo:
         for fileset in filesets:
             fileset.get()
         for field in fields:
             field.get()
     tree = Tree.construct(dataset, filesets, fields)
     return tree
Example 13
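The BIDS-specific tree method: filesets discovered through the BIDS layout are merged, session by session, with the derived tree produced by the parent class.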
    def tree(self, subject_ids=None, visit_ids=None):
        """
        Return subject and session information for a project in the local
        repository

        Parameters
        ----------
        subject_ids : list(str)
            List of subject IDs with which to filter the tree. If None,
            all are returned
        visit_ids : list(str)
            List of visit IDs with which to filter the tree. If None,
            all are returned

        Returns
        -------
        project : arcana.repository.Tree
            A hierarchical tree of subject, session and fileset information for
            the repository
        """
        bids_filesets = defaultdict(lambda: defaultdict(list))
        derived_tree = super(BidsRepository, self).tree(subject_ids=None,
                                                        visit_ids=None)
        for bids_obj in self.layout.get(return_type='object'):
            subj_id = bids_obj.entities['subject']
            if subject_ids is not None and subj_id not in subject_ids:
                continue
            visit_id = bids_obj.entities['session']
            if visit_ids is not None and visit_id not in visit_ids:
                continue
            bids_filesets[subj_id][visit_id].append(Fileset.from_path(
                bids_obj.path,
                frequency='per_session',
                subject_id=subj_id,
                visit_id=visit_id,
                repository=self,
                bids_attrs=bids_obj))
        # Need to pull out all filesets and fields
        all_sessions = defaultdict(dict)
        all_visit_ids = set()
        for subj_id, visits in bids_filesets.items():
            for visit_id, filesets in visits.items():
                session = Session(subject_id=subj_id,
                                  visit_id=visit_id,
                                  filesets=filesets)
                try:
                    session.derived = derived_tree.subject(subj_id).visit(
                        visit_id)
                except ArcanaNameError:
                    pass  # No matching derived session
                all_sessions[subj_id][visit_id] = session
                all_visit_ids.add(visit_id)

        subjects = []
        for subj_id, subj_sessions in list(all_sessions.items()):
            try:
                derived_subject = derived_tree.subject(subj_id)
            except ArcanaNameError:
                filesets = []
                fields = []
            else:
                filesets = derived_subject.filesets
                fields = derived_subject.fields
            subjects.append(
                Subject(subj_id, sorted(subj_sessions.values()), filesets,
                        fields))
        visits = []
        for visit_id in all_visit_ids:
            try:
                derived_visit = derived_tree.visit(visit_id)
            except ArcanaNameError:
                filesets = []
                fields = []
            else:
                filesets = derived_visit.filesets
                fields = derived_visit.fields
            visit_sessions = [sess[visit_id]
                              for sess in all_sessions.values()
                              if visit_id in sess]
            visits.append(
                Visit(visit_id, sorted(visit_sessions), filesets, fields))
        return Tree(sorted(subjects), sorted(visits), derived_tree.filesets,
                    derived_tree.fields)
Example 14
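The directory-repository tree method: a single os.walk pass that validates the layout against the expected depth, classifies each node's frequency, loads fields from a JSON file, and finally assembles Session, Subject and Visit objects together with any per-subject, per-visit and per-study summaries.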
    def tree(self, subject_ids=None, visit_ids=None, **kwargs):
        """
        Return subject and session information for a project in the local
        repository

        Parameters
        ----------
        subject_ids : list(str)
            List of subject IDs with which to filter the tree. If None,
            all are returned
        visit_ids : list(str)
            List of visit IDs with which to filter the tree. If None,
            all are returned

        Returns
        -------
        project : arcana.repository.Tree
            A hierarchical tree of subject, session and fileset information for
            the repository
        """
        all_data = defaultdict(dict)
        all_visit_ids = set()
        for session_path, dirs, files in os.walk(self.root_dir):
            relpath = op.relpath(session_path, self.root_dir)
            if relpath == '.':
                path_parts = []
            else:
                path_parts = relpath.split(op.sep)
            depth = len(path_parts)
            if depth == self._depth:
                # Load input data
                from_study = None
            elif (depth == (self._depth + 1)
                  and self.DERIVED_LABEL_FNAME in files):
                # Load study output
                from_study = path_parts.pop()
            elif (depth < self._depth
                  and any(not f.startswith('.') for f in files)):
                # Check to see if there are files in upper level
                # directories, which shouldn't be there (ignoring
                # "hidden" files that start with '.')
                raise ArcanaBadlyFormattedDirectoryRepositoryError(
                    "Files ('{}') not permitted at {} level in local "
                    "repository".format("', '".join(files),
                                        ('subject' if depth else 'project')))
            else:
                # Not a directory that contains data files or directories
                continue
            if len(path_parts) == 2:
                subj_id, visit_id = path_parts
            elif len(path_parts) == 1:
                subj_id = path_parts[0]
                visit_id = self.DEFAULT_VISIT_ID
            else:
                subj_id = self.DEFAULT_SUBJECT_ID
                visit_id = self.DEFAULT_VISIT_ID
            subj_id = subj_id if subj_id != self.SUMMARY_NAME else None
            visit_id = visit_id if visit_id != self.SUMMARY_NAME else None
            if (subject_ids is not None and subj_id is not None
                    and subj_id not in subject_ids):
                continue
            if (visit_ids is not None and visit_id is not None
                    and visit_id not in visit_ids):
                continue
            if (subj_id, visit_id) == (None, None):
                frequency = 'per_study'
            elif subj_id is None:
                frequency = 'per_visit'
                all_visit_ids.add(visit_id)
            elif visit_id is None:
                frequency = 'per_subject'
            else:
                frequency = 'per_session'
                all_visit_ids.add(visit_id)
            try:
                # Retrieve filesets and fields from other study directories
                # or root acquired directory
                filesets, fields = all_data[subj_id][visit_id]
            except KeyError:
                filesets = []
                fields = []
            for fname in chain(self._filter_files(files, session_path),
                               self._filter_dirs(dirs, session_path)):
                filesets.append(
                    Fileset.from_path(op.join(session_path, fname),
                                      frequency=frequency,
                                      subject_id=subj_id,
                                      visit_id=visit_id,
                                      repository=self,
                                      from_study=from_study))
            if self.FIELDS_FNAME in files:
                with open(op.join(session_path, self.FIELDS_FNAME), 'r') as f:
                    dct = json.load(f)
                fields.extend(
                    Field(name=k,
                          value=v,
                          frequency=frequency,
                          subject_id=subj_id,
                          visit_id=visit_id,
                          repository=self,
                          from_study=from_study)
                    for k, v in list(dct.items()))
            filesets = sorted(filesets)
            fields = sorted(fields)
            all_data[subj_id][visit_id] = (filesets, fields)
        all_sessions = defaultdict(dict)
        for subj_id, subj_data in all_data.items():
            if subj_id is None:
                continue  # Create Subject summaries later
            for visit_id, (filesets, fields) in subj_data.items():
                if visit_id is None:
                    continue  # Create Visit summaries later
                all_sessions[subj_id][visit_id] = Session(subject_id=subj_id,
                                                          visit_id=visit_id,
                                                          filesets=filesets,
                                                          fields=fields)
        subjects = []
        for subj_id, subj_sessions in list(all_sessions.items()):
            try:
                filesets, fields = all_data[subj_id][None]
            except KeyError:
                filesets = []
                fields = []
            subjects.append(
                Subject(subj_id, sorted(subj_sessions.values()), filesets,
                        fields))
        visits = []
        for visit_id in all_visit_ids:
            visit_sessions = [sess[visit_id]
                              for sess in all_sessions.values()
                              if visit_id in sess]
            try:
                filesets, fields = all_data[None][visit_id]
            except KeyError:
                filesets = []
                fields = []
            visits.append(
                Visit(visit_id, sorted(visit_sessions), filesets, fields))
        try:
            filesets, fields = all_data[None][None]
        except KeyError:
            filesets = []
            fields = []
        return Tree(sorted(subjects), sorted(visits), filesets, fields,
                    **kwargs)
Example 15
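A fully hand-built reference Tree mirroring the data of Example 12, with explicit Session, Subject and Visit objects; set_ids copies each fileset's name into its private _id, as expected for the XNAT repository.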
 def get_tree(self, repository, set_ids=False):
     sessions = [
         Session('subject1',
                 'visit1',
                 filesets=[
                     Fileset('hundreds',
                             text_format,
                             subject_id='subject1',
                             visit_id='visit1',
                             repository=repository),
                     Fileset('ones',
                             text_format,
                             subject_id='subject1',
                             visit_id='visit1',
                             repository=repository),
                     Fileset('tens',
                             text_format,
                             subject_id='subject1',
                             visit_id='visit1',
                             repository=repository)
                 ],
                 fields=[
                     Field('a',
                           value=1,
                           subject_id='subject1',
                           visit_id='visit1',
                           repository=repository),
                     Field('b',
                           value=10,
                           subject_id='subject1',
                           visit_id='visit1',
                           repository=repository),
                     Field('d',
                           value=42.42,
                           subject_id='subject1',
                           visit_id='visit1',
                           repository=repository)
                 ]),
         Session('subject1',
                 'visit2',
                 filesets=[
                     Fileset('ones',
                             text_format,
                             subject_id='subject1',
                             visit_id='visit2',
                             repository=repository),
                     Fileset('tens',
                             text_format,
                             subject_id='subject1',
                             visit_id='visit2',
                             repository=repository)
                 ],
                 fields=[
                     Field('a',
                           value=2,
                           subject_id='subject1',
                           visit_id='visit2',
                           repository=repository),
                     Field('c',
                           value='van',
                           subject_id='subject1',
                           visit_id='visit2',
                           repository=repository)
                 ]),
         Session('subject2',
                 'visit1',
                 filesets=[
                     Fileset('ones',
                             text_format,
                             subject_id='subject2',
                             visit_id='visit1',
                             repository=repository),
                     Fileset('tens',
                             text_format,
                             subject_id='subject2',
                             visit_id='visit1',
                             repository=repository)
                 ],
                 fields=[]),
         Session('subject2',
                 'visit2',
                 filesets=[
                     Fileset('ones',
                             text_format,
                             subject_id='subject2',
                             visit_id='visit2',
                             repository=repository),
                     Fileset('tens',
                             text_format,
                             subject_id='subject2',
                             visit_id='visit2',
                             repository=repository)
                 ],
                 fields=[
                     Field('a',
                           value=22,
                           subject_id='subject2',
                           visit_id='visit2',
                           repository=repository),
                     Field('b',
                           value=220,
                           subject_id='subject2',
                           visit_id='visit2',
                           repository=repository),
                     Field('c',
                           value='buggy',
                           subject_id='subject2',
                           visit_id='visit2',
                           repository=repository)
                 ])
     ]
     project = Tree(
         subjects=[
             Subject('subject1',
                     sessions=[
                         s for s in sessions if s.subject_id == 'subject1'
                     ],
                     filesets=[
                         Fileset('ones',
                                 text_format,
                                 frequency='per_subject',
                                 subject_id='subject1',
                                 repository=repository),
                         Fileset('tens',
                                 text_format,
                                 frequency='per_subject',
                                 subject_id='subject1',
                                 repository=repository)
                     ],
                     fields=[
                         Field('e',
                               value=4.44444,
                               frequency='per_subject',
                               subject_id='subject1',
                               repository=repository)
                     ]),
             Subject('subject2',
                     sessions=[
                         s for s in sessions if s.subject_id == 'subject2'
                     ],
                     filesets=[
                         Fileset('ones',
                                 text_format,
                                 frequency='per_subject',
                                 subject_id='subject2',
                                 repository=repository),
                         Fileset('tens',
                                 text_format,
                                 frequency='per_subject',
                                 subject_id='subject2',
                                 repository=repository)
                     ],
                     fields=[
                         Field('e',
                               value=3.33333,
                               frequency='per_subject',
                               subject_id='subject2',
                               repository=repository)
                     ])
         ],
         visits=[
             Visit('visit1',
                   sessions=[s for s in sessions if s.visit_id == 'visit1'],
                   filesets=[
                       Fileset('ones',
                               text_format,
                               frequency='per_visit',
                               visit_id='visit1',
                               repository=repository)
                   ],
                   fields=[
                       Field('f',
                             value='dog',
                             frequency='per_visit',
                             visit_id='visit1',
                             repository=repository)
                   ]),
             Visit('visit2',
                   sessions=[s for s in sessions if s.visit_id == 'visit2'],
                   filesets=[],
                   fields=[
                       Field('f',
                             value='cat',
                             frequency='per_visit',
                             visit_id='visit2',
                             repository=repository)
                   ])
         ],
         filesets=[
             Fileset('ones',
                     text_format,
                     frequency='per_study',
                     repository=repository)
         ],
         fields=[
             Field('g',
                   value=100,
                   frequency='per_study',
                   repository=repository)
         ])
     if set_ids:  # For xnat repository
         for fileset in project.filesets:
             fileset._id = fileset.name
         for visit in project.visits:
             for fileset in visit.filesets:
                 fileset._id = fileset.name
         for subject in project.subjects:
             for fileset in subject.filesets:
                 fileset._id = fileset.name
             for session in subject.sessions:
                 for fileset in session.filesets:
                     fileset._id = fileset.name
     return project
Example 16
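The XNAT implementation of find_data: it traverses the project through raw REST calls, recovers subject and visit IDs and analysis names from session labels and the DERIVED_FROM_FIELD, and collects fields, filesets (one per scan resource) and provenance records downloaded as a zip of JSON files.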
    def find_data(self, dataset, subject_ids=None, visit_ids=None, **kwargs):
        """
        Find all filesets, fields and provenance records within an XNAT project

        Parameters
        ----------
        subject_ids : list(str)
            List of subject IDs with which to filter the tree. If
            None, all are returned
        visit_ids : list(str)
            List of visit IDs with which to filter the tree. If
            None, all are returned

        Returns
        -------
        filesets : list[Fileset]
            All the filesets found in the repository
        fields : list[Field]
            All the fields found in the repository
        records : list[Record]
            The provenance records found in the repository
        """
        subject_ids = self.convert_subject_ids(subject_ids)
        # Add derived visit IDs to list of visit ids to filter
        all_filesets = []
        all_fields = []
        all_records = []
        project_id = dataset.name
        # Note: raw REST API calls are used here instead of XnatPy's data
        # structures for performance reasons.
        with self:
            # Get map of internal subject IDs to subject labels in project
            subject_xids_to_labels = {
                s['ID']: s['label'] for s in self._login.get_json(
                    '/data/projects/{}/subjects'.format(project_id))[
                        'ResultSet']['Result']}
            # Get list of all sessions within project
            session_xids = [
                s['ID'] for s in self._login.get_json(
                    '/data/projects/{}/experiments'.format(project_id))[
                        'ResultSet']['Result']
                if (self.session_filter is None
                    or self.session_filter.match(s['label']))]
            for session_xid in tqdm(session_xids,
                                    "Scanning sessions in '{}' project"
                                    .format(project_id)):
                session_json = self._login.get_json(
                    '/data/projects/{}/experiments/{}'.format(
                        project_id, session_xid))['items'][0]
                subject_xid = session_json['data_fields']['subject_ID']
                subject_id = subject_xids_to_labels[subject_xid]
                session_label = session_json['data_fields']['label']
                session_uri = (
                    '/data/archive/projects/{}/subjects/{}/experiments/{}'
                    .format(project_id, subject_xid, session_xid))
                # Get field values. We do this first so we can check for the
                # DERIVED_FROM_FIELD to determine the correct session label and
                # analysis name
                field_values = {}
                try:
                    fields_json = next(
                        c['items'] for c in session_json['children']
                        if c['field'] == 'fields/field')
                except StopIteration:
                    pass
                else:
                    for js in fields_json:
                        try:
                            value = js['data_fields']['field']
                        except KeyError:
                            pass
                        else:
                            field_values[js['data_fields']['name']] = value
                # Extract analysis name and derived-from session
                if self.DERIVED_FROM_FIELD in field_values:
                    df_sess_label = field_values.pop(self.DERIVED_FROM_FIELD)
                    from_analysis = session_label[len(df_sess_label) + 1:]
                    session_label = df_sess_label
                else:
                    from_analysis = None
                # Strip subject ID from session label if required
                if session_label.startswith(subject_id + '_'):
                    visit_id = session_label[len(subject_id) + 1:]
                else:
                    visit_id = session_label
                # Strip project ID from subject ID if required
                if subject_id.startswith(project_id + '_'):
                    subject_id = subject_id[len(project_id) + 1:]
                # Check subject is summary or not and whether it is to be
                # filtered
                if subject_id == XnatRepo.SUMMARY_NAME:
                    subject_id = None
                elif not (subject_ids is None or subject_id in subject_ids):
                    continue
                # Check visit is summary or not and whether it is to be
                # filtered
                if visit_id == XnatRepo.SUMMARY_NAME:
                    visit_id = None
                elif not (visit_ids is None or visit_id in visit_ids):
                    continue
                # Determine frequency
                if (subject_id, visit_id) == (None, None):
                    frequency = 'per_dataset'
                elif visit_id is None:
                    frequency = 'per_subject'
                elif subject_id is None:
                    frequency = 'per_visit'
                else:
                    frequency = 'per_session'
                # Append fields
                for name, value in field_values.items():
                    value = value.replace('&quot;', '"')
                    all_fields.append(Field(
                        name=name, value=value,
                        dataset=dataset,
                        frequency=frequency,
                        subject_id=subject_id,
                        visit_id=visit_id,
                        from_analysis=from_analysis,
                        **kwargs))
                # Extract part of JSON relating to files
                try:
                    scans_json = next(
                        c['items'] for c in session_json['children']
                        if c['field'] == 'scans/scan')
                except StopIteration:
                    scans_json = []
                for scan_json in scans_json:
                    scan_id = scan_json['data_fields']['ID']
                    scan_type = scan_json['data_fields'].get('type', '')
                    scan_quality = scan_json['data_fields'].get('quality',
                                                                None)
                    scan_uri = '{}/scans/{}'.format(session_uri, scan_id)
                    try:
                        resources_json = next(
                            c['items'] for c in scan_json['children']
                            if c['field'] == 'file')
                    except StopIteration:
                        resources = {}
                    else:
                        resources = {js['data_fields']['label']:
                                     js['data_fields'].get('format', None)
                                     for js in resources_json}
                    # Remove auto-generated snapshots directory
                    resources.pop('SNAPSHOTS', None)
                    if scan_type == self.PROV_SCAN:
                        # Download provenance JSON files and parse into
                        # records
                        temp_dir = tempfile.mkdtemp()
                        try:
                            with tempfile.TemporaryFile() as temp_zip:
                                self._login.download_stream(
                                    scan_uri + '/files', temp_zip,
                                    format='zip')
                                with ZipFile(temp_zip) as zip_file:
                                    zip_file.extractall(temp_dir)
                            for base_dir, _, fnames in os.walk(temp_dir):
                                for fname in fnames:
                                    if fname.endswith('.json'):
                                        pipeline_name = fname[:-len('.json')]
                                        json_path = op.join(base_dir, fname)
                                        all_records.append(
                                            Record.load(
                                                pipeline_name, frequency,
                                                subject_id, visit_id,
                                                from_analysis, json_path))
                        finally:
                            shutil.rmtree(temp_dir, ignore_errors=True)
                    else:
                        for resource in resources:
                            all_filesets.append(Fileset(
                                scan_type, id=scan_id, uri=scan_uri,
                                dataset=dataset, frequency=frequency,
                                subject_id=subject_id, visit_id=visit_id,
                                from_analysis=from_analysis,
                                quality=scan_quality,
                                resource_name=resource, **kwargs))
                logger.debug("Found node {}:{} on {}:{}".format(
                    subject_id, visit_id, self.server, project_id))
        return all_filesets, all_fields, all_records
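One small note on the field values: the value.replace call above only undoes the &quot; entity. The standard library's html.unescape handles all XML/HTML entities, as this self-contained snippet shows:

    import html

    raw = 'say &quot;hello&quot; &amp; goodbye'
    print(html.unescape(raw))  # prints: say "hello" & goodbye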