Example #1
    def load(self,
             eid,
             dataset_types=None,
             dclass_output=False,
             dry_run=False,
             cache_dir=None,
             download_only=False,
             clobber=False):
        """
        From a Session ID and dataset types, queries the Alyx database, downloads the data
        from Globus, and loads it into numpy arrays.

        :param eid: Experiment ID, for IBL this is the UUID of the Session as per Alyx
         database. Could be a full Alyx URL:
         'http://localhost:8000/sessions/698361f6-b7d0-447d-a25d-42afdef7a0da' or only the UUID:
         '698361f6-b7d0-447d-a25d-42afdef7a0da'
        :type eid: str
        :param dataset_types: [None]: Alyx dataset types to be returned.
        :type dataset_types: list
        :param dclass_output: [False]: forces the output to be a dataclass providing context.
         If dataset_types is None or empty, the output defaults to a dataclass.
        :type dclass_output: bool
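        :param dry_run: [False]: skip the file downloads
        :type dry_run: bool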
        :param cache_dir: temporarily overrides the cache_dir from the parameter file
        :type cache_dir: str
        :param download_only: do not attempt to load the data into memory, just download the files
        :type download_only: bool
        :param clobber: force downloading even if the files exist locally
        :type clobber: bool

        :return: List of numpy arrays matching the size of the dataset_types parameter, OR
         a dataclass containing arrays and context data.
        :rtype: list, dataclass SessionDataInfo
        """
        # if the input is a UUID, add the beginning of the URL to it
        cache_dir = self._get_cache_dir(cache_dir)
        if is_uuid_string(eid):
            eid = '/sessions/' + eid
        eid_str = eid[-36:]
        # get session json information as a dictionary from the alyx API
        ses = self.alyx.get('/sessions?id=' + eid_str)
        if not ses:
            raise FileNotFoundError('Session ' + eid_str + ' does not exist')
        ses = ses[0]
        # if no dataset_type is provided:
        # a) force the output to be a dataclass that provides context to the data
        # b) download all types that have a data url specified
        dataset_types = [dataset_types] if isinstance(dataset_types,
                                                      str) else dataset_types
        if not dataset_types:
            dclass_output = True
            dataset_types = [
                d['dataset_type'] for d in ses['data_dataset_session_related']
                if d['data_url']
            ]
        dc = SessionDataInfo.from_session_details(ses,
                                                  dataset_types=dataset_types)
        # loop over each dataset and download if necessary
        for ind in range(len(dc)):
            if dc.url[ind] and not dry_run:
                relpath = PurePath(dc.url[ind].replace(
                    self._par.HTTP_DATA_SERVER, '.')).parents[0]
                cache_dir_file = PurePath(cache_dir, relpath)
                Path(cache_dir_file).mkdir(parents=True, exist_ok=True)
                dc.local_path[ind] = self._download_file(
                    dc.url[ind], str(cache_dir_file), clobber)
        # load the files content in variables if requested
        if not download_only:
            for ind, fil in enumerate(dc.local_path):
                dc.data[ind] = _load_file_content(fil)
        # parse output arguments
        if dclass_output:
            return dc
        # if required, parse the output as a list that matches dataset_types requested
        list_out = []
        for dt in dataset_types:
            if dt not in dc.dataset_type:
                list_out.append(None)
                continue
            for i, x in enumerate(dc.dataset_type):
                if dt == x:
                    list_out.append(dc.data[i])
        return list_out
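
A minimal usage sketch of the variant above, assuming a client class (here called ONE) that exposes this load() method; the instantiation, the eid value and the dataset type names are illustrative placeholders, not taken from the source:

# Hypothetical usage sketch; ONE(), the eid and the dataset type names below
# are assumptions for illustration only.
one = ONE()
eid = '698361f6-b7d0-447d-a25d-42afdef7a0da'

# The return is a list aligned with dataset_types; a missing type yields None.
wheel_pos, wheel_t = one.load(eid, dataset_types=['wheel.position', 'wheel.timestamps'])

# With no dataset_types, dclass_output is forced to True and a SessionDataInfo
# dataclass is returned, exposing dataset_type, url, local_path and data.
dc = one.load(eid)

# download_only=True fetches the files into the cache without loading them.
one.load(eid, dataset_types=['wheel.position'], download_only=True)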
Example #2
 def _load(self,
           eid,
           dataset_types=None,
           dclass_output=False,
           dry_run=False,
           cache_dir=None,
           download_only=False,
           clobber=False,
           offline=False):
     """
      From a Session ID and dataset types, queries the Alyx database, downloads the data
      from Globus, and loads it into numpy arrays. Single session only.
     """
      # if the input is a UUID, add the beginning of the URL to it
     cache_dir = self._get_cache_dir(cache_dir)
     if is_uuid_string(eid):
         eid = '/sessions/' + eid
     eid_str = eid[-36:]
     # get session json information as a dictionary from the alyx API
     ses = self.alyx.get('/sessions?id=' + eid_str)
     if not ses:
         raise FileNotFoundError('Session ' + eid_str + ' does not exist')
     ses = ses[0]
     # if no dataset_type is provided:
      # a) force the output to be a dataclass that provides context to the data
     # b) download all types that have a data url specified
     dataset_types = [dataset_types] if isinstance(dataset_types,
                                                   str) else dataset_types
     if not dataset_types:
         dclass_output = True
         dataset_types = [
             d['dataset_type'] for d in ses['data_dataset_session_related']
             if d['data_url']
         ]
     dc = SessionDataInfo.from_session_details(ses,
                                               dataset_types=dataset_types,
                                               eid=eid_str)
     # loop over each dataset and download if necessary
     for ind in range(len(dc)):
         if dc.url[ind] and not dry_run:
             relpath = PurePath(dc.url[ind].replace(
                 self._par.HTTP_DATA_SERVER, '.')).parents[0]
             cache_dir_file = PurePath(cache_dir, relpath)
             Path(cache_dir_file).mkdir(parents=True, exist_ok=True)
             dc.local_path[ind] = self._download_file(dc.url[ind],
                                                      str(cache_dir_file),
                                                      clobber=clobber,
                                                      offline=offline)
     # load the files content in variables if requested
     if not download_only:
         for ind, fil in enumerate(dc.local_path):
             dc.data[ind] = load_file_content(fil)
     # parse output arguments
     if dclass_output:
         return dc
     # if required, parse the output as a list that matches dataset_types requested
     list_out = []
     for dt in dataset_types:
         if dt not in dc.dataset_type:
             logger_.warning('dataset ' + dt + ' not found for session: ' +
                             eid_str)
             list_out.append(None)
             continue
          for i, x in enumerate(dc.dataset_type):
             if dt == x:
                 list_out.append(dc.data[i])
     return list_out
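
Because this private variant handles a single session only, a public entry point could loop it over several sessions; a minimal sketch assuming it is added to the same class (the method name and its list handling are illustrative, not from the source):

# Hypothetical multi-session wrapper around the single-session _load() above;
# the name load_sessions and the list-of-eids behavior are assumptions.
def load_sessions(self, eids, dataset_types=None, **kwargs):
    """Call _load() once per session and collect the per-session outputs."""
    if isinstance(eids, str):
        eids = [eids]
    out = []
    for eid in eids:
        # offline=True can be forwarded through kwargs to skip re-downloading
        # and rely on files already present in the cache directory.
        out.append(self._load(eid, dataset_types=dataset_types, **kwargs))
    return out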
Example #3
    def load(self,
             eid,
             dataset_types=None,
             dclass_output=False,
             dry_run=False):
        """
        From a Session ID and dataset types, queries the Alyx database, downloads the data
        from Globus, and loads it into numpy arrays.

        :param eid: Experiment ID, for IBL this is the UUID of the Session as per Alyx
         database. Could be a full Alyx URL:
         'http://localhost:8000/sessions/698361f6-b7d0-447d-a25d-42afdef7a0da' or only the UUID:
         '698361f6-b7d0-447d-a25d-42afdef7a0da'
        :type eid: str
        :param dataset_types: [None]: Alyx dataset types to be returned.
        :type dataset_types: list
        :param dclass_output: [False]: forces the output to be a dataclass providing context.
         If dataset_types is None or empty, the output defaults to a dataclass.
        :type dclass_output: bool
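        :param dry_run: [False]: skip the file downloads
        :type dry_run: bool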

        :return: List of numpy arrays matching the size of the dataset_types parameter, OR
         a dataclass containing arrays and context data.
        :rtype: list, dataclass SessionInfo
        """
        # TODO: feature that downloads a list of datasets from a list of sessions;
        #  in this case, force dictionary output
        # if the input is a UUID, add the beginning of the URL to it
        if is_uuid_string(eid):
            eid = '/sessions/' + eid
        eid_str = eid[-36:]
        # get session json information as a dictionary from the alyx API
        ses = self._alyxClient.get('/sessions?id=' + eid_str)
        if not ses:
            raise FileNotFoundError('Session ' + eid_str + ' does not exist')
        ses = ses[0]
        # if no dataset_type is provided:
        # a) force the output to be a dataclass that provides context to the data
        # b) download all types that have a data url specified
        dataset_types = [dataset_types] if isinstance(dataset_types,
                                                      str) else dataset_types
        if not dataset_types:
            dclass_output = True
            dataset_types = [
                d['dataset_type'] for d in ses['data_dataset_session_related']
                if d['data_url']
            ]
        # loop over each dataset related to the session ID and get list of files urls
        session_dtypes = [
            d['dataset_type'] for d in ses['data_dataset_session_related']
        ]
        out = SessionInfo()
        # this first loop only downloads the file to ease eventual refactoring
        for ind, dt in enumerate(dataset_types):
            for i, sdt in enumerate(session_dtypes):
                if sdt == dt:
                    urlstr = ses['data_dataset_session_related'][i]['data_url']
                    if not dry_run:
                        fil = wc.http_download_file(
                            urlstr,
                            username=par.HTTP_DATA_SERVER_LOGIN,
                            password=par.HTTP_DATA_SERVER_PWD,
                            cache_dir=par.CACHE_DIR)
                    else:
                        fil = ''
                    out.eid.append(eid_str)
                    out.dataset_type.append(dt)
                    out.url.append(urlstr)
                    out.local_path.append(fil)
                    out.dataset_id.append(
                        ses['data_dataset_session_related'][i]['id'])
                    out.data.append([])
        # then another loop over files and load them in numpy. If not npy, just pass empty list
        for ind, fil in enumerate(
                out.local_path):  # this is where I miss switch case
            if fil and os.path.splitext(fil)[1] == '.npy':
                out.data[ind] = np.load(file=fil)
            if fil and os.path.splitext(fil)[1] == '.json':
                pass  # FIXME would be nice to implement json read but param from matlab RIG fails
            if fil and os.path.splitext(fil)[1] == '.tsv':
                pass  # TODO: implement tsv reads as well
            if fil and os.path.splitext(fil)[1] == '.csv':
                pass  # TODO: implement csv reads as well
        if dclass_output:
            return out
        # if required, parse the output as a list that matches dataset types provided
        list_out = []
        for dt in dataset_types:
            if dt not in out.dataset_type:
                list_out.append(None)
                continue
            for i, x in enumerate(out.dataset_type):
                if dt == x:
                    list_out.append(out.data[i])
        return list_out
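
The extension handling in this variant only reads .npy files and leaves the .json/.tsv/.csv branches as TODOs; a minimal sketch of how that dispatch could be factored into a helper, assuming pandas for the tabular formats (the helper name and the pandas readers are assumptions, not part of the source):

# Hypothetical helper illustrating the per-extension dispatch sketched by the
# if-chain above; robust handling of the malformed JSON noted in the FIXME is
# out of scope here.
import json
import os

import numpy as np
import pandas as pd


def _read_by_extension(file_path):
    """Return the file content based on its extension, or None if unsupported."""
    ext = os.path.splitext(file_path)[1]
    if ext == '.npy':
        return np.load(file=file_path)
    if ext == '.json':
        with open(file_path) as f:
            return json.load(f)
    if ext in ('.tsv', '.csv'):
        return pd.read_csv(file_path, sep='\t' if ext == '.tsv' else ',')
    return None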