def _patch_file(self, download=True):
    """
    Downloads a file from FlatIron, modifies it locally, patches it and downloads it again
    """
    dataset_id = '04abb580-e14b-4716-9ff2-f7b95740b99f'
    dataset = self.one.alyx.rest('datasets', 'read', id=dataset_id)
    # download
    local_file_path = self.one.load(dataset['session'],
                                    dataset_types=dataset['dataset_type'],
                                    download_only=True, clobber=True)[0]
    # change it
    np.save(local_file_path, ~np.load(local_file_path))
    new_check_sum = hashfile.md5(local_file_path)
    # try once with dry
    self.patcher.patch_dataset(local_file_path, dset_id=dataset['url'][-36:], dry=True)
    self.patcher.patch_dataset(local_file_path, dset_id=dataset['url'][-36:], dry=False)
    # the dataset hash should have been updated
    dataset = self.one.alyx.rest('datasets', 'read', id=dataset_id)
    self.assertEqual(uuid.UUID(dataset['hash']), uuid.UUID(new_check_sum))
    self.assertEqual(dataset['version'], version.ibllib())
    if download:
        # download again and check the hash
        local_file_path.unlink()
        local_file_path = self.one.load(dataset['session'],
                                        dataset_types=dataset['dataset_type'],
                                        download_only=True, clobber=True)[0]
        self.assertEqual(hashfile.md5(local_file_path), new_check_sum)
def test_create_and_delete_file(self):
    session_path = Path(self.par.LOCAL_PATH).joinpath('flowers', '2018-07-13', '001')
    alf_path = session_path.joinpath('alf')
    alf_path.mkdir(parents=True, exist_ok=True)
    new_files = [alf_path.joinpath('spikes.amps.npy'),
                 alf_path.joinpath('spikes.times.npy')]
    for nf in new_files:
        np.save(nf, np.random.rand(500, 1))
    # try a dry run first
    self.patcher.create_dataset(new_files, dry=True)
    # creates it on the database
    self.patcher.create_dataset(new_files, repository='flatiron_zadorlab')
    self.patcher.launch_transfers(self.gtc, wait=True)
    # download through ONE and check hashes
    eid = self.one.search(subjects='flowers', dataset_types=['spikes.amps'])[0]
    download0 = self.one.load(eid, dataset_types=['spikes.amps'], download_only=True,
                              dclass_output=True, clobber=True)[0]
    self.assertEqual(download0.dataset_id, download0.dataset_id)
    self.assertTrue(hashfile.md5(download0.local_path) == hashfile.md5(new_files[0]))
    # deletes the file
    self.patcher.delete_dataset(dset_id=download0.dataset_id, dry=False)
    self.patcher.launch_transfers(self.gtc, wait=True)
    # makes sure it's not in the database anymore
    session = self.one.search(subjects='flowers', dataset_types=[download0.dataset_type])
    self.assertEqual(len(session), 0)
def register_dataset(file_list, one=None, created_by=None, repository=None, server_only=False,
                     versions=None, dry=False, max_md5_size=None):
    """
    Registers a set of files belonging to a session only on the server
    :param file_list: (list of pathlib.Path or pathlib.Path)
    :param one: optional (oneibl.ONE), current one object, will create an instance if not provided
    :param created_by: (string) name of the user in Alyx (defaults to the current Alyx login)
    :param repository: optional: (string) name of the repository in Alyx
    :param server_only: optional: (bool) if True only creates on the Flatiron (defaults to False)
    :param versions: optional (list of strings): versions tags (defaults to ibllib version)
    :param dry: (bool) False by default
    :param max_md5_size: (int) maximum file size in bytes for which to compute the md5 sum
     (always computed if None); defaults to None
    :return:
    """
    if created_by is None:
        created_by = one._par.ALYX_LOGIN
    if file_list is None or file_list == '' or file_list == []:
        return
    elif not isinstance(file_list, list):
        file_list = [Path(file_list)]
    assert len(set([alf.io.get_session_path(f) for f in file_list])) == 1
    assert all([Path(f).exists() for f in file_list])
    if versions is None:
        versions = version.ibllib()
    if isinstance(versions, str):
        versions = [versions for _ in file_list]
    assert isinstance(versions, list) and len(versions) == len(file_list)
    # computing the md5 can be very long, so this is an option to skip if the file is bigger
    # than a certain threshold
    if max_md5_size:
        hashes = [hashfile.md5(p) if p.stat().st_size < max_md5_size else None
                  for p in file_list]
    else:
        hashes = [hashfile.md5(p) for p in file_list]
    session_path = alf.io.get_session_path(file_list[0])
    # first register the file
    r = {'created_by': created_by,
         'path': session_path.relative_to((session_path.parents[2])).as_posix(),
         'filenames': [p.relative_to(session_path).as_posix() for p in file_list],
         'name': repository,
         'server_only': server_only,
         'hashes': hashes,
         'filesizes': [p.stat().st_size for p in file_list],
         'versions': versions}
    if not dry:
        if one is None:
            one = ONE()
        response = one.alyx.rest('register-file', 'create', data=r)
        for p in file_list:
            _logger.info(f"ALYX REGISTERED DATA: {p}")
        return response
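# Usage sketch for register_dataset above (illustrative only, not part of the library code).
# Assumes a reachable Alyx instance and an existing session folder on disk; the
# subject/date/number path below is hypothetical and the ONE import path follows the
# oneibl package referenced in the docstring.
from pathlib import Path

from oneibl.one import ONE

one = ONE()
session_path = Path('/data/Subjects/flowers/2018-07-13/001')  # hypothetical session folder
files_to_register = [session_path / 'alf' / 'spikes.amps.npy',
                     session_path / 'alf' / 'spikes.times.npy']
# dry run: assembles the payload (including hashes and file sizes) but does not post it to Alyx
register_dataset(files_to_register, one=one, dry=True)
# actual registration; skip the md5 computation for files larger than 1 GB
register_dataset(files_to_register, one=one, max_md5_size=1024 ** 3)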
def _download_file(self, url, cache_dir, clobber=False, offline=False, keep_uuid=False,
                   file_size=None, hash=None):
    local_path = cache_dir + os.sep + os.path.basename(url)
    if not keep_uuid:
        local_path = remove_uuid_file(local_path, dry=True)
    if Path(local_path).exists():
        # overwrites the file if the expected filesize is different from the cached filesize
        if file_size and Path(local_path).stat().st_size != file_size:
            clobber = True
        # overwrites the file if the expected hash is different from the cached hash
        if hash and hashfile.md5(Path(local_path)) != hash:
            clobber = True
    else:
        # if there is no cached file, download
        clobber = True
    if clobber:
        local_path = wc.http_download_file(url,
                                           username=self._par.HTTP_DATA_SERVER_LOGIN,
                                           password=self._par.HTTP_DATA_SERVER_PWD,
                                           cache_dir=str(cache_dir), clobber=clobber,
                                           offline=offline)
    if keep_uuid:
        return local_path
    else:
        return remove_uuid_file(local_path)
def register_dataset(self, file_list, created_by='root', server_repository=None, dry=False):
    """
    Registers a set of files belonging to a session, only on the server
    :param file_list: (list of pathlib.Path or pathlib.Path) files to register
    :param created_by: (string) name of the user in Alyx
    :param server_repository: (string) name of the repository in Alyx
    :param dry: (bool) if True, only prints the REST payload without posting it
    :return: Alyx REST response (None if dry)
    """
    if not isinstance(file_list, list):
        file_list = [Path(file_list)]
    assert len(set([alf.io.get_session_path(f) for f in file_list])) == 1
    assert all([Path(f).exists() for f in file_list])
    session_path = alf.io.get_session_path(file_list[0])
    # first register the file
    r = {'created_by': created_by,
         'path': str(session_path.relative_to((session_path.parents[2]))),
         'filenames': [str(p.relative_to(session_path)) for p in file_list],
         'name': server_repository,
         'server_only': True,
         'hashes': [md5(p) for p in file_list],
         'filesizes': [p.stat().st_size for p in file_list],
         'versions': [version.ibllib() for _ in file_list]}
    if not dry:
        return self.one.alyx.rest('register-file', 'create', data=r)
    else:
        print(r)
def _download_file(self, url, target_dir, clobber=False, offline=False, keep_uuid=False,
                   file_size=None, hash=None):
    """
    Downloads a single file from an HTTP webserver
    :param url:
    :param target_dir:
    :param clobber: (bool: False) overwrites local dataset if any
    :param offline:
    :param keep_uuid:
    :param file_size:
    :param hash:
    :return:
    """
    Path(target_dir).mkdir(parents=True, exist_ok=True)
    local_path = str(target_dir) + os.sep + os.path.basename(url)
    if not keep_uuid:
        local_path = alfio.remove_uuid_file(local_path, dry=True)
    if Path(local_path).exists() and not offline:
        # the local file hash doesn't match the dataset table cached hash
        hash_mismatch = hash and hashfile.md5(Path(local_path)) != hash
        file_size_mismatch = file_size and Path(local_path).stat().st_size != file_size
        if hash_mismatch or file_size_mismatch:
            clobber = True
            _logger.warning(f" local md5 or size mismatch, re-downloading {local_path}")
    else:
        # if there is no cached file, download
        clobber = True
    if clobber:
        local_path, md5 = wc.http_download_file(
            url, username=self._par.HTTP_DATA_SERVER_LOGIN,
            password=self._par.HTTP_DATA_SERVER_PWD,
            cache_dir=str(target_dir), clobber=clobber, offline=offline, return_md5=True)
        # post download, if there is a mismatch between Alyx and the newly downloaded file size
        # or hash, flag the offending file record in Alyx for database maintenance
        hash_mismatch = hash and md5 != hash
        file_size_mismatch = file_size and Path(local_path).stat().st_size != file_size
        if hash_mismatch or file_size_mismatch:
            self._tag_mismatched_file_record(url)
    if keep_uuid:
        return local_path
    else:
        return alfio.remove_uuid_file(local_path)
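# Illustrative sketch (not library code) of the cache-invalidation rule applied above:
# a cached file is re-downloaded whenever the expected size or md5 from the Alyx dataset
# record disagrees with the file on disk. Uses hashlib directly to mimic hashfile.md5;
# the helper name is hypothetical.
import hashlib
from pathlib import Path


def needs_redownload(local_path, expected_size=None, expected_hash=None):
    """Hypothetical helper: return True when the cached file is absent or stale."""
    local_path = Path(local_path)
    if not local_path.exists():
        return True
    if expected_size is not None and local_path.stat().st_size != expected_size:
        return True
    if expected_hash is not None:
        md5 = hashlib.md5()
        with open(local_path, 'rb') as fid:
            for chunk in iter(lambda: fid.read(2 ** 20), b''):
                md5.update(chunk)
        if md5.hexdigest() != expected_hash:
            return True
    return False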
def test_check_ephys_file(self):
    self.tdir = tempfile.TemporaryDirectory(prefix='glx_test')
    self.addCleanup(self.tdir.cleanup)
    bin_3b = spikeglx._mock_spikeglx_file(
        Path(self.tdir.name).joinpath('sample3B_g0_t0.imec1.ap.bin'),
        self.workdir / 'sample3B_g0_t0.imec1.ap.meta',
        ns=32, nc=385, sync_depth=16)
    self.assertEqual(hashfile.md5(bin_3b['bin_file']),
                     "207ba1666b866a091e5bb8b26d19733f")
    self.assertEqual(hashfile.sha1(bin_3b['bin_file']),
                     '1bf3219c35dea15409576f6764dd9152c3f8a89c')
    sr = spikeglx.Reader(bin_3b['bin_file'])
    self.assertTrue(sr.verify_hash())
def test_download_hash(self):
    eid = self.eid
    # get the original file from the server
    file = one.load(eid, dataset_types=['channels.localCoordinates'], download_only=True,
                    clobber=True)[0]
    fsize = file.stat().st_size
    hash = hashfile.md5(file)
    data_server = np.load(file)
    # overwrite the local file
    np.save(file, np.zeros([25, 0]))
    # here we patch the dataset with the server filesize and hash
    dset = one.alyx.rest('datasets', 'list',
                         dataset_type='channels.localCoordinates', session=eid)
    one.alyx.rest('datasets', 'partial_update', id=dset[0]['url'][-36:],
                  data={'file_size': fsize, 'hash': hash})
    data = one.load(eid, dataset_types=['channels.localCoordinates'])[0]
    self.assertTrue(data.shape == data_server.shape)
    # Verify new hash / filesizes added to cache table
    rec, = one._make_dataclass_offline(eid, dataset_types='channels.localCoordinates')
    self.assertEqual(rec.file_size, fsize)
    self.assertEqual(rec.hash, hash)
    # here we patch a dataset and make sure it overwrites if the checksum is different
    np.save(file, data_server * 2)
    data = one.load(eid, dataset_types=['channels.localCoordinates'])[0]
    self.assertTrue(data.shape == data_server.shape)
    self.assertTrue(np.all(np.equal(data, data_server)))
    # here we corrupt the md5 hash on the database, the file will get downloaded again,
    # but on checking the file one.load should have labeled the json field for database
    # maintenance
    one.alyx.rest('datasets', 'partial_update', id=dset[0]['url'][-36:],
                  data={'file_size': fsize, 'hash': "5d1d13589934440a9947c2477b2e61ea"})
    one.load(eid, dataset_types=['channels.localCoordinates'])[0]
    fr = one.alyx.rest('files', 'list', django=f"dataset,{dset[0]['url'][-36:]},"
                                               f"data_repository__globus_is_personal,False")
    self.assertTrue(fr[0]['json'] == {'mismatch_hash': True})
def _test_create_and_delete_file(self):
    """
    Creates a file, uploads it to FlatIron twice and then removes it
    """
    with tempfile.TemporaryDirectory() as td:
        # creates the local file
        session_path = Path(td).joinpath('flowers', '2018-07-13', '001')
        alf_path = session_path.joinpath('alf')
        alf_path.mkdir(parents=True)
        new_file = alf_path.joinpath('spikes.amps.npy')
        np.save(new_file, np.random.rand(500, 1))
        # try a dry run first
        self.patcher.create_dataset(new_file, dry=True)
        # creates it on the database
        self.patcher.create_dataset(new_file, repository='flatiron_zadorlab')
        # download through ONE and check hashes
        eid = self.one.search(subjects='flowers', dataset_types=['spikes.amps'])[0]
        download0 = self.one.load(eid, dataset_types=['spikes.amps'], download_only=True,
                                  dclass_output=True, clobber=True)[0]
        # creates it a second time and makes sure it's not duplicated (also tests automatic repo)
        self.patcher.create_dataset(new_file)
        download = self.one.load(eid, dataset_types=['spikes.amps'], download_only=True,
                                 dclass_output=True, clobber=True)[0]
        self.assertEqual(download.dataset_id, download0.dataset_id)
        self.assertTrue(hashfile.md5(download.local_path) == hashfile.md5(new_file))
        # deletes the file
        self.patcher.delete_dataset(dset_id=download.dataset_id, dry=False)
        # makes sure it's not in the database anymore
        session = self.one.search(subjects='flowers', dataset_types=['spikes.amps'])
        self.assertEqual(len(session), 0)
def patch_dataset(self, path, dset_id=None, dry=False):
    """
    Uploads a dataset from an arbitrary location to FlatIron.
    :param path: (pathlib.Path) full path of the local file to upload
    :param dset_id: (uuid string) Alyx id of the dataset to patch
    :param dry: (bool) if True, does not upload nor update the Alyx record
    :return:
    """
    status = self._patch_dataset(path, dset_id=dset_id, dry=dry)
    if not dry and status == 0:
        self.one.alyx.rest('datasets', 'partial_update', id=dset_id,
                           data={'hash': md5(path),
                                 'file_size': path.stat().st_size,
                                 'version': version.ibllib()})
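# Usage sketch for patch_dataset above (illustrative only). It mirrors the _patch_file test:
# modify a dataset locally, then push it back to FlatIron so Alyx picks up the new hash,
# size and version. `one` and `patcher` are assumed to be already-constructed ONE and
# patcher instances (construction not shown in these snippets); the dataset uuid is the
# example one from the test above.
dset_id = '04abb580-e14b-4716-9ff2-f7b95740b99f'
dset = one.alyx.rest('datasets', 'read', id=dset_id)
local_file = one.load(dset['session'], dataset_types=dset['dataset_type'],
                      download_only=True, clobber=True)[0]
patcher.patch_dataset(local_file, dset_id=dset_id, dry=True)   # rehearse the transfer
patcher.patch_dataset(local_file, dset_id=dset_id, dry=False)  # upload and update hash / size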
def _download_file(self, url, target_dir, clobber=False, offline=False, keep_uuid=False,
                   file_size=None, hash=None):
    """
    Downloads a single file from an HTTP webserver
    :param url:
    :param target_dir:
    :param clobber: (bool: False) overwrites local dataset if any
    :param offline:
    :param keep_uuid:
    :param file_size:
    :param hash:
    :return:
    """
    Path(target_dir).mkdir(parents=True, exist_ok=True)
    local_path = str(target_dir) + os.sep + os.path.basename(url)
    if not keep_uuid:
        local_path = remove_uuid_file(local_path, dry=True)
    if Path(local_path).exists():
        # overwrites the file if the expected filesize is different from the cached filesize
        if file_size and Path(local_path).stat().st_size != file_size:
            clobber = True
        # overwrites the file if the expected hash is different from the cached hash
        if hash and hashfile.md5(Path(local_path)) != hash:
            clobber = True
    else:
        # if there is no cached file, download
        clobber = True
    if clobber:
        local_path = wc.http_download_file(url,
                                           username=self._par.HTTP_DATA_SERVER_LOGIN,
                                           password=self._par.HTTP_DATA_SERVER_PWD,
                                           cache_dir=str(target_dir), clobber=clobber,
                                           offline=offline)
    if keep_uuid:
        return local_path
    else:
        return remove_uuid_file(local_path)
def test_load_newversion(self):
    eid = self.eid
    # get the original file from the server
    file = one.load(eid, dataset_types=['channels.localCoordinates'], download_only=True,
                    clobber=True)[0]
    fsize = file.stat().st_size
    hash = hashfile.md5(file)
    data_server = np.load(file)
    # overwrite the local file
    np.save(file, np.zeros([25, 0]))
    # here we patch the dataset with the server filesize and hash
    dset = one.alyx.rest('datasets', 'list',
                         dataset_type='channels.localCoordinates', session=eid)
    one.alyx.rest('datasets', 'partial_update', id=dset[0]['url'][-36:],
                  data={'file_size': fsize, 'hash': hash})
    data = one.load(eid, dataset_types=['channels.localCoordinates'])[0]
    self.assertTrue(data.shape == data_server.shape)
    # here we patch a dataset and make sure it overwrites if the checksum is different
    np.save(file, data_server * 2)
    data = one.load(eid, dataset_types=['channels.localCoordinates'])[0]
    self.assertTrue(data.shape == data_server.shape)
    self.assertTrue(np.all(np.equal(data, data_server)))
def test_download_datasets(self):
    # test downloading a single file
    full_link_to_file = (r'http://ibl.flatironinstitute.org/mainenlab/Subjects/clns0730'
                         '/2018-08-24/1/licks.times.51852a2f-c76e-4c0c-95cb-9c7ba54be0f9.npy')
    file_name, md5 = wc.http_download_file(full_link_to_file,
                                           username=par.HTTP_DATA_SERVER_LOGIN,
                                           password=par.HTTP_DATA_SERVER_PWD,
                                           return_md5=True, clobber=True)
    a = np.load(file_name)
    self.assertTrue(hashfile.md5(file_name) == md5)
    self.assertTrue(len(a) > 0)
    # test downloading a list of files
    links = [r'http://ibl.flatironinstitute.org/mainenlab/Subjects/clns0730'
             '/2018-08-24/1/licks.times.51852a2f-c76e-4c0c-95cb-9c7ba54be0f9.npy',
             r'http://ibl.flatironinstitute.org/mainenlab/Subjects/clns0730'
             '/2018-08-24/1/probes.sitePositions.3ddd45be-7d24-4fc7-9dd3-a98717342af6.npy']
    file_list = wc.http_download_file_list(links, username=par.HTTP_DATA_SERVER_LOGIN,
                                           password=par.HTTP_DATA_SERVER_PWD)
    a = np.load(file_list[0])
    b = np.load(file_list[1])
    self.assertTrue(len(a) > 0)
    self.assertTrue(len(b) > 0)
def http_download_file(full_link_to_file, chunks=None, *, clobber=False, username='',
                       password='', cache_dir='', return_md5=False, headers=None):
    """
    :param full_link_to_file: http link to the file.
    :type full_link_to_file: str
    :param chunks: (tuple of ints) optional byte range (first_byte, n_bytes) for a partial
     download.
    :param clobber: [False] If True, force overwrite the existing file.
    :type clobber: bool
    :param username: [''] authentication for password protected file server.
    :type username: str
    :param password: [''] authentication for password protected file server.
    :type password: str
    :param cache_dir: [''] directory in which files are cached; defaults to user's
     Download directory.
    :type cache_dir: str
    :param return_md5: [False] if True, also returns the md5 checksum of the downloaded file.
    :param headers: [{}] additional headers to add to the request (auth tokens etc..)
    :return: (str) the local full path of the downloaded file, or a (path, md5) tuple if
     return_md5 is True.
    """
    from ibllib.io import hashfile
    if not full_link_to_file:
        return ''
    # default cache directory is the home dir
    if not cache_dir:
        cache_dir = str(Path.home().joinpath("Downloads"))
    # This is the local file name
    file_name = str(cache_dir) + os.sep + os.path.basename(full_link_to_file)
    # do not overwrite an existing file unless specified
    if not clobber and os.path.exists(file_name):
        return (file_name, hashfile.md5(file_name)) if return_md5 else file_name
    # This should be the base url you wanted to access.
    baseurl = os.path.split(str(full_link_to_file))[0]
    # Create a password manager
    manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    if (len(password) != 0) & (len(username) != 0):
        manager.add_password(None, baseurl, username, password)
    # Create an authentication handler using the password manager
    auth = urllib.request.HTTPBasicAuthHandler(manager)
    # Create an opener that will replace the default urlopen method on further calls
    opener = urllib.request.build_opener(auth)
    urllib.request.install_opener(opener)
    # Support for partial download.
    req = urllib.request.Request(full_link_to_file)
    if chunks is not None:
        first_byte, n_bytes = chunks
        req.add_header("Range", "bytes=%d-%d" % (first_byte, first_byte + n_bytes - 1))
    # add additional headers
    if headers is not None:
        for k in headers:
            req.add_header(k, headers[k])
    # Open the url and get the length
    try:
        u = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        _logger.error(f"{str(e)} {full_link_to_file}")
        raise e
    file_size = int(u.getheader('Content-length'))
    print(f"Downloading: {file_name} Bytes: {file_size}")
    file_size_dl = 0
    block_sz = 8192 * 64 * 8
    md5 = hashlib.md5()
    f = open(file_name, 'wb')
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        file_size_dl += len(buffer)
        f.write(buffer)
        if return_md5:
            md5.update(buffer)
        print_progress(file_size_dl, file_size, prefix='', suffix='')
    f.close()
    return (file_name, md5.hexdigest()) if return_md5 else file_name
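# Usage sketch for http_download_file above (illustrative only). The URL is the one used in
# test_download_datasets, and `par` stands for the same parameter object used there; the
# chunked call exercises the Range-header path of the function, fetching only the first
# kilobyte of the remote file.
example_url = ('http://ibl.flatironinstitute.org/mainenlab/Subjects/clns0730'
               '/2018-08-24/1/licks.times.51852a2f-c76e-4c0c-95cb-9c7ba54be0f9.npy')
# full download, returning the md5 computed while streaming
local_file, checksum = http_download_file(example_url, return_md5=True, clobber=True,
                                          username=par.HTTP_DATA_SERVER_LOGIN,
                                          password=par.HTTP_DATA_SERVER_PWD)
# partial download of the first 1024 bytes (first_byte=0, n_bytes=1024)
partial_file = http_download_file(example_url, chunks=(0, 1024), clobber=True,
                                  username=par.HTTP_DATA_SERVER_LOGIN,
                                  password=par.HTTP_DATA_SERVER_PWD)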
def register_session(self, ses_path, file_list=True):
    """
    Register session in Alyx

    :param ses_path: path to the session
    :param file_list: bool. Set to False to only create the session and skip file registration
    :return: Status string on error
    """
    if isinstance(ses_path, str):
        ses_path = Path(ses_path)
    # read meta data from the rig for the session from the task settings file
    settings_json_file = list(ses_path.glob('**/_mflab_taskSettings.raw*.json'))
    if not settings_json_file:
        settings_json_file = list(ses_path.glob('**/_mflab_taskSettings.raw*.json'))
        if not settings_json_file:
            _logger.error('could not find _mflab_taskSettings.raw.json. Abort.')
            return
        _logger.warning(f'Settings found in a strange place: {settings_json_file}')
    else:
        settings_json_file = settings_json_file[0]
    md = _read_settings_json_compatibility_enforced(settings_json_file)
    # query alyx endpoints for subject, error if not found
    try:
        subject = self.one.alyx.rest('subjects?nickname=' + md['SUBJECT_NAME'], 'list')[0]
    except IndexError:
        _logger.error(f"Subject: {md['SUBJECT_NAME']} doesn't exist in Alyx. ABORT.")
        raise ibllib.exceptions.AlyxSubjectNotFound(md['SUBJECT_NAME'])
    # look for a session from the same subject, same number on the same day
    session_id, session = self.one.search(subjects=subject['nickname'],
                                          date_range=md['SESSION_DATE'],
                                          number=md['SESSION_NUMBER'],
                                          details=True)
    print('session if exists', session_id, session)
    try:
        user = self.one.alyx.rest('users', 'read', id=md["PROTOCOL_CREATOR"][0])
    except Exception as e:
        _logger.error(f"User: {md['PROTOCOL_CREATOR'][0]} doesn't exist in Alyx. ABORT")
        raise e
    username = user['username'] if user else subject['responsible_user']
    # load the trials data to get information about session duration and performance
    ses_data = raw.load_data(ses_path)
    start_time, end_time = _get_session_times(ses_path, md, ses_data)
    n_trials, n_correct_trials = _get_session_performance(md, ses_data)
    # this is the generic relative path: subject/yyyy-mm-dd/NNN
    gen_rel_path = Path(subject['nickname'], md['SESSION_DATE'],
                        '{0:03d}'.format(int(md['SESSION_NUMBER'])))
    # if nothing found create a new session in Alyx
    task_protocol = md['PROTOCOL'] + md['VERSION_TAG']
    dset_types = md['DATASET_TYPES']
    alyx_procedure = _alyx_procedure_from_task(task_protocol)
    if not session:
        ses_ = {'subject': subject['nickname'],
                'users': [username],
                'procedures': [] if alyx_procedure is None else [alyx_procedure],
                'lab': subject['lab'],
                'project': md['PROJECT'],
                'type': 'Experiment',
                'task_protocol': task_protocol,
                'number': md['SESSION_NUMBER'],
                'start_time': ibllib.time.date2isostr(start_time),
                'end_time': ibllib.time.date2isostr(end_time) if end_time else None,
                'n_correct_trials': n_correct_trials,
                'n_trials': n_trials,
                'data_dataset_session_related': md['DATASET_TYPES'],
                'dset_types': md['DATASET_TYPES'],
                'json': md,
                }
        session = self.one.alyx.rest('sessions', 'create', data=ses_)
        # if md['SUBJECT_WEIGHT']:
        if 'SUBJECT_WEIGHT' in md.keys():
            wei_ = {'subject': subject['nickname'],
                    'date_time': ibllib.time.date2isostr(start_time),
                    'weight': md['SUBJECT_WEIGHT'],
                    'user': username}
            self.one.alyx.rest('weighings', 'create', data=wei_)
    else:
        # TODO: if session exists and no json, partial_update it
        print('session exists, lets update it')
        ses_ = {'subject': subject['nickname'],
                'users': [username],
                'procedures': [] if alyx_procedure is None else [alyx_procedure],
                'lab': subject['lab'],
                'project': md['PROJECT'],
                'type': 'Experiment',
                'task_protocol': task_protocol,
                'number': md['SESSION_NUMBER'],
                'start_time': ibllib.time.date2isostr(start_time),
                'end_time': ibllib.time.date2isostr(end_time) if end_time else None,
                'n_correct_trials': n_correct_trials,
                'n_trials': n_trials,
                'data_dataset_session_related': md['DATASET_TYPES'],
                'dset_types': md['DATASET_TYPES'],
                'json': md,
                }
        print('ses', ses_)
        # session = self.one.alyx.rest('sessions', 'read', id=session_id[0])
        print('session_id', session_id[0])
        # can try update as well
        session = self.one.alyx.rest('sessions', 'partial_update', id=session_id[0], data=ses_)
    _logger.info(session['url'] + ' ')
    # create associated water administration if not found
    if not session['wateradmin_session_related'] and ses_data:
        wa_ = {'subject': subject['nickname'],
               'date_time': ibllib.time.date2isostr(end_time),
               'water_administered': ses_data[-1]['water_delivered'] / 1000,
               'water_type': md.get('REWARD_TYPE') or 'Water',
               'user': username,
               'session': session['url'][-36:],
               'adlib': False}
        self.one.alyx.rest('water-administrations', 'create', data=wa_)
    # at this point the session has been created. If create only, exit
    if not file_list:
        return
    # register all files that match the Alyx patterns, warn user when files are encountered
    rename_files_compatibility(ses_path, md['VERSION_TAG'])
    F = []  # file names to register, relative to the generic session path
    md5s = []
    file_sizes = []
    for fn in _glob_session(ses_path):
        if fn.suffix in EXCLUDED_EXTENSIONS:
            _logger.debug('Excluded: ' + str(fn))
            continue
        if not _check_filename_for_registration(fn, self.registration_patterns):
            _logger.warning('No matching dataset type for: ' + str(fn))
            continue
        if fn.suffix not in self.file_extensions:
            _logger.warning('No matching dataformat (ie. file extension) for: ' + str(fn))
            continue
        if not _register_bool(fn.name, file_list):
            _logger.debug('Not in filelist: ' + str(fn))
            continue
        try:
            assert (str(gen_rel_path) in str(fn))
        except AssertionError as e:
            strerr = 'ALF folder mismatch: data is in wrong subject/date/number folder. \n'
            strerr += ' Expected ' + str(gen_rel_path) + ' actual was ' + str(fn)
            _logger.error(strerr)
            raise e
        # extract the relative path of the file
        rel_path = Path(str(fn)[str(fn).find(str(gen_rel_path)):])
        F.append(str(rel_path.relative_to(gen_rel_path)))
        file_sizes.append(fn.stat().st_size)
        md5s.append(hashfile.md5(fn) if fn.stat().st_size < 1024 ** 3 else None)
        _logger.info('Registering ' + str(fn))
    r_ = {'created_by': username,
          'path': str(gen_rel_path),
          'filenames': F,
          'hashes': md5s,
          'filesizes': file_sizes,
          'versions': [version.ibllib() for _ in F]}
    self.one.alyx.post('/register-file', data=r_)
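# Usage sketch for register_session above (illustrative only). `rc` stands for an instance of
# the registration client class this method belongs to, bound to a connected ONE instance
# (its constructor is not shown in these snippets); the session path is hypothetical.
ses_path = '/data/Subjects/flowers/2018-07-13/001'
# create or update the Alyx session only, skipping file registration
rc.register_session(ses_path, file_list=False)
# create or update the session and register every file matching the Alyx patterns
rc.register_session(ses_path, file_list=True)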