def load(self, eid, dataset_types=None, dclass_output=False, dry_run=False, cache_dir=None,
         download_only=False, clobber=False):
    """
    From a Session ID and dataset types, queries the Alyx database, downloads the data
    from Globus, and loads it into numpy arrays.

    :param eid: Experiment ID, for IBL this is the UUID of the Session as per the Alyx
     database. Can be a full Alyx URL:
     'http://localhost:8000/sessions/698361f6-b7d0-447d-a25d-42afdef7a0da'
     or only the UUID: '698361f6-b7d0-447d-a25d-42afdef7a0da'
    :type eid: str
    :param dataset_types: [None]: Alyx dataset types to be returned.
    :type dataset_types: list
    :param dclass_output: [False]: forces the output as dataclass to provide context.
     If None or an empty dataset_types is specified, the output defaults to the dataclass form.
    :type dclass_output: bool
    :param cache_dir: temporarily overrides the cache_dir from the parameter file
    :type cache_dir: str
    :param download_only: do not attempt to load data in memory, just download the files
    :type download_only: bool
    :param clobber: force downloading even if files exist locally
    :type clobber: bool

    :return: List of numpy arrays matching the size of the dataset_types parameter, OR
     a dataclass containing arrays and context data.
    :rtype: list, dict, dataclass SessionDataInfo
    """
    # if the input is a UUID, prepend the URL path to it
    cache_dir = self._get_cache_dir(cache_dir)
    if is_uuid_string(eid):
        eid = '/sessions/' + eid
    eid_str = eid[-36:]
    # get session json information as a dictionary from the alyx API
    ses = self.alyx.get('/sessions?id=' + eid_str)
    if not ses:
        raise FileNotFoundError('Session ' + eid_str + ' does not exist')
    ses = ses[0]
    # if no dataset_type is provided:
    # a) force the output to be a dataclass that provides context to the data
    # b) download all types that have a data url specified
    dataset_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
    if not dataset_types:
        dclass_output = True
        dataset_types = [d['dataset_type'] for d in ses['data_dataset_session_related']
                         if d['data_url']]
    dc = SessionDataInfo.from_session_details(ses, dataset_types=dataset_types)
    # loop over each dataset and download if necessary
    for ind in range(len(dc)):
        if dc.url[ind] and not dry_run:
            relpath = PurePath(dc.url[ind].replace(self._par.HTTP_DATA_SERVER, '.')).parents[0]
            cache_dir_file = PurePath(cache_dir, relpath)
            Path(cache_dir_file).mkdir(parents=True, exist_ok=True)
            dc.local_path[ind] = self._download_file(dc.url[ind], str(cache_dir_file), clobber)
    # load the file contents into variables if requested
    if not download_only:
        for ind, fil in enumerate(dc.local_path):
            dc.data[ind] = _load_file_content(fil)
    # parse output arguments
    if dclass_output:
        return dc
    # if required, parse the output as a list that matches the dataset_types requested
    list_out = []
    for dt in dataset_types:
        if dt not in dc.dataset_type:
            list_out.append(None)
            continue
        for i, x in enumerate(dc.dataset_type):
            if dt == x:
                list_out.append(dc.data[i])
    return list_out
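# Usage sketch for load() above (not part of the original code). It assumes a connected
# client instance exposing this method as `one.load`; the UUID is the docstring example
# and the dataset type names are hypothetical placeholders for types registered in Alyx.
#
#   eid = '698361f6-b7d0-447d-a25d-42afdef7a0da'
#   # list output: one numpy array per requested dataset type, None if a type is missing
#   st, sd = one.load(eid, dataset_types=['spikes.times', 'spikes.depths'])
#   # dataclass output: dataset_type, url, local_path and data fields; files only downloaded
#   dc = one.load(eid, dclass_output=True, download_only=True)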
def _load(self, eid, dataset_types=None, dclass_output=False, dry_run=False, cache_dir=None,
          download_only=False, clobber=False, offline=False):
    """
    From a Session ID and dataset types, queries the Alyx database, downloads the data
    from Globus, and loads it into numpy arrays. Single session only.
    """
    # if the input is a UUID, prepend the URL path to it
    cache_dir = self._get_cache_dir(cache_dir)
    if is_uuid_string(eid):
        eid = '/sessions/' + eid
    eid_str = eid[-36:]
    # get session json information as a dictionary from the alyx API
    ses = self.alyx.get('/sessions?id=' + eid_str)
    if not ses:
        raise FileNotFoundError('Session ' + eid_str + ' does not exist')
    ses = ses[0]
    # if no dataset_type is provided:
    # a) force the output to be a dataclass that provides context to the data
    # b) download all types that have a data url specified
    dataset_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
    if not dataset_types:
        dclass_output = True
        dataset_types = [d['dataset_type'] for d in ses['data_dataset_session_related']
                         if d['data_url']]
    dc = SessionDataInfo.from_session_details(ses, dataset_types=dataset_types, eid=eid_str)
    # loop over each dataset and download if necessary
    for ind in range(len(dc)):
        if dc.url[ind] and not dry_run:
            relpath = PurePath(dc.url[ind].replace(self._par.HTTP_DATA_SERVER, '.')).parents[0]
            cache_dir_file = PurePath(cache_dir, relpath)
            Path(cache_dir_file).mkdir(parents=True, exist_ok=True)
            dc.local_path[ind] = self._download_file(dc.url[ind], str(cache_dir_file),
                                                     clobber=clobber, offline=offline)
    # load the file contents into variables if requested
    if not download_only:
        for ind, fil in enumerate(dc.local_path):
            dc.data[ind] = load_file_content(fil)
    # parse output arguments
    if dclass_output:
        return dc
    # if required, parse the output as a list that matches the dataset_types requested
    list_out = []
    for dt in dataset_types:
        if dt not in dc.dataset_type:
            logger_.warning('dataset ' + dt + ' not found for session: ' + eid_str)
            list_out.append(None)
            continue
        for i, x in enumerate(dc.dataset_type):
            if dt == x:
                list_out.append(dc.data[i])
    return list_out
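# _load() handles exactly one session; a multi-session entry point could simply iterate
# over it. A minimal sketch, not part of the original code (method name and signature
# are hypothetical):
#
#   def _load_many(self, eids, dataset_types=None, **kwargs):
#       # one SessionDataInfo per eid, preserving the order of the input list
#       return [self._load(eid, dataset_types=dataset_types, dclass_output=True, **kwargs)
#               for eid in eids]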
def load(self, eid, dataset_types=None, dclass_output=False, dry_run=False):
    """
    From a Session ID and dataset types, queries the Alyx database, downloads the data
    from Globus, and loads it into numpy arrays.

    :param eid: Experiment ID, for IBL this is the UUID of the Session as per the Alyx
     database. Can be a full Alyx URL:
     'http://localhost:8000/sessions/698361f6-b7d0-447d-a25d-42afdef7a0da'
     or only the UUID: '698361f6-b7d0-447d-a25d-42afdef7a0da'
    :type eid: str
    :param dataset_types: [None]: Alyx dataset types to be returned.
    :type dataset_types: list
    :param dclass_output: [False]: forces the output as dataclass to provide context.
     If None or an empty dataset_types is specified, the output defaults to the dataclass form.
    :type dclass_output: bool

    :return: List of numpy arrays matching the size of the dataset_types parameter, OR
     a dataclass containing arrays and context data.
    :rtype: list, dict, dataclass SessionInfo
    """
    # TODO: feature that downloads a list of datasets from a list of sessions,
    # TODO: in this case force dictionary output
    # if the input is a UUID, prepend the URL path to it
    if is_uuid_string(eid):
        eid = '/sessions/' + eid
    eid_str = eid[-36:]
    # get session json information as a dictionary from the alyx API
    ses = self._alyxClient.get('/sessions?id=' + eid_str)
    if not ses:
        raise FileNotFoundError('Session ' + eid_str + ' does not exist')
    ses = ses[0]
    # if no dataset_type is provided:
    # a) force the output to be a dataclass that provides context to the data
    # b) download all types that have a data url specified
    dataset_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
    if not dataset_types:
        dclass_output = True
        dataset_types = [d['dataset_type'] for d in ses['data_dataset_session_related']
                         if d['data_url']]
    # loop over each dataset related to the session ID and get the list of file urls
    session_dtypes = [d['dataset_type'] for d in ses['data_dataset_session_related']]
    out = SessionInfo()
    # this first loop only downloads the files to ease eventual refactoring
    for ind, dt in enumerate(dataset_types):
        for i, sdt in enumerate(session_dtypes):
            if sdt == dt:
                urlstr = ses['data_dataset_session_related'][i]['data_url']
                if not dry_run:
                    fil = wc.http_download_file(urlstr,
                                                username=par.HTTP_DATA_SERVER_LOGIN,
                                                password=par.HTTP_DATA_SERVER_PWD,
                                                cache_dir=par.CACHE_DIR)
                else:
                    fil = ''
                out.eid.append(eid_str)
                out.dataset_type.append(dt)
                out.url.append(urlstr)
                out.local_path.append(fil)
                out.dataset_id.append(ses['data_dataset_session_related'][i]['id'])
                out.data.append([])
    # then another loop over the files to load them into numpy; if not npy, leave an empty list
    for ind, fil in enumerate(out.local_path):  # this is where I miss switch case
        if fil and os.path.splitext(fil)[1] == '.npy':
            out.data[ind] = np.load(file=fil)
        if fil and os.path.splitext(fil)[1] == '.json':
            pass  # FIXME: would be nice to implement json read but param from matlab RIG fails
        if fil and os.path.splitext(fil)[1] == '.tsv':
            pass  # TODO: implement tsv reads as well
        if fil and os.path.splitext(fil)[1] == '.csv':
            pass  # TODO: implement csv reads as well
    if dclass_output:
        return out
    # if required, parse the output as a list that matches the dataset types provided
    list_out = []
    for dt in dataset_types:
        if dt not in out.dataset_type:
            list_out.append(None)
            continue
        for i, x in enumerate(out.dataset_type):
            if dt == x:
                list_out.append(out.data[i])
    return list_out
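# Usage sketch for this earlier load() variant (not part of the original code). It assumes
# a connected client exposing the method as `one.load`; the UUID is the docstring example
# and the dataset type name is a hypothetical placeholder.
#
#   eid = '698361f6-b7d0-447d-a25d-42afdef7a0da'
#   # dry_run=True resolves urls and dataset ids without downloading any file
#   info = one.load(eid, dataset_types=['wheel.position'], dclass_output=True, dry_run=True)
#   # info.url and info.dataset_id are filled; info.local_path is '' and info.data stays []
#   # since nothing was downloaded, and only .npy files are ever loaded into numpy anyway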