def test_locks(self):
    """Exercise lockfile creation/removal, waiting, expiration, object
    reuse, and group assignment under no contention."""

    # Make sure the lock can be created and deleted with no contention
    # and in non-blocking mode.
    with lockfile.LockFile(self.lock_path):
        # Make sure the lock was created.
        self.assertTrue(self.lock_path.exists())

    # Make sure the lock is deleted after close.
    self.assertFalse(self.lock_path.exists())

    lockfile.LockFile._create_lockfile(self.lock_path, 100, '1234')

    # Remove the lockfile after 1 second of trying.
    sp.call("sleep 1; rm {}".format(self.lock_path), shell=True)
    # Test waiting for the lockfile.
    with lockfile.LockFile(self.lock_path, timeout=2):
        pass

    # Making sure that we can automatically acquire and delete an
    # expired lockfile.
    lockfile.LockFile._create_lockfile(path=self.lock_path,
                                       expires=-100,
                                       lock_id='1234')
    with lockfile.LockFile(self.lock_path, timeout=1):
        pass

    # Lock objects are reusable.
    lock = lockfile.LockFile(self.lock_path)
    with lock:
        pass
    with lock:
        pass

    # Make sure we can set the group on the lockfile.
    # We need a group other than our default.
    groups = os.getgroups()
    if os.getuid() != 0:
        # This is only valid for non-root users.
        # os.getgroups() returns *group* ids, so remove our default
        # group id. (The old code removed os.getuid() - a user id -
        # which never actually excludes the default group.)
        if os.getgid() in groups:
            groups.remove(os.getgid())
        if groups:
            group = groups.pop()
            with lockfile.LockFile(self.lock_path,
                                   group=grp.getgrgid(group).gr_name):
                stat = self.lock_path.stat()
                self.assertEqual(stat.st_gid, group)
                self.assertEqual(stat.st_mode & 0o777,
                                 lockfile.LockFile.LOCK_PERMS)
def test_lock_contention(self):
    """Fight over a lockfile with several subprocesses, verifying that
    we can always acquire it cleanly and that it is never corrupted."""

    proc_count = 6
    procs = []

    fight_path = pathlib.Path(__file__).parent/'lock_fight.py'

    try:
        for p in range(proc_count):
            procs.append(
                sp.Popen(['python3', str(fight_path),
                          str(self.lock_path)]))
        # Give the procs a chance to start.
        time.sleep(0.5)

        # Get the lock 5 times, hold it a sec, and verify that it's
        # uncorrupted.
        for i in range(5):
            with lockfile.LockFile(self.lock_path, timeout=2) as lock:
                time.sleep(1)
                host, user, expires, lock_id = lock.read_lockfile()
                self.assertTrue(host is not None)
                self.assertTrue(user is not None)
                self.assertTrue(expires is not None)
                self.assertEqual(lock_id, lock._id)
            # Let the other procs get the lock this time.
            time.sleep(0.2)

    finally:
        # Make sure we kill all the subprocesses, and wait on them so
        # they are reaped rather than left as zombies.
        for proc in procs:
            proc.terminate()
            proc.kill()
            proc.wait()
def test_lock_contention(self):
    """Fight over a lockfile with several subprocesses, making sure the
    lock is always acquired cleanly and its contents stay intact."""

    proc_count = 6
    procs = []

    fight_path = os.path.join(os.path.dirname(__file__), 'lock_fight.py')

    try:
        for p in range(proc_count):
            procs.append(sp.Popen(['python', fight_path, self.lock_path]))
        # Give the procs a chance to start.
        time.sleep(0.5)

        # Get the lock 5 times, hold it a sec, and verify that it's
        # uncorrupted.
        for i in range(5):
            with lockfile.LockFile(self.lock_path, timeout=2) as lock:
                time.sleep(1)
                host, user, expires, lock_id = lock.read_lockfile()
                self.assertTrue(host is not None)
                self.assertTrue(user is not None)
                self.assertTrue(expires is not None)
                self.assertEqual(lock_id, lock._id)
            # Let the other procs get the lock this time.
            time.sleep(0.2)
    finally:
        # Make sure we kill all the subprocesses, and wait on them so
        # they are reaped rather than left as zombies.
        for proc in procs:
            proc.terminate()
            proc.kill()
            proc.wait()
def create_id_dir(id_dir):
    """In the given directory, create the lowest numbered (positive integer)
    directory that doesn't already exist.

    :param str id_dir: Path to the directory that contains these 'id'
        directories
    :returns: The id and path to the created directory.
    :raises OSError: on directory creation failure.
    :raises TimeoutError: If we couldn't get the lock in time.
    """

    lockfile_path = os.path.join(id_dir, '.lockfile')
    with lockfile.LockFile(lockfile_path, timeout=1):
        # Gather the ids of existing integer-named directories.
        used_ids = sorted(
            int(name) for name in os.listdir(id_dir)
            if name.isdigit()
            and os.path.isdir(os.path.join(id_dir, name)))

        # Find the first unused id.
        id_ = 1
        while id_ in used_ids:
            id_ += 1

        path = make_id_path(id_dir, id_)
        os.mkdir(path)

    return id_, path
def delete(id_dir: Path, filter_func: Callable[[Path], bool] = default_filter,
           transform: Callable[[Path], Any] = None,
           verbose: bool = False):
    """Delete all id directories in a given path that match the given filter.

    :param id_dir: The directory to iterate through.
    :param filter_func: A passed filter function, to be passed to select.
    :param transform: As per 'select_from'
    :param verbose: Verbose output.
    :return int count: The number of directories removed.
    :return list msgs: Any messages generated during removal.
    """

    removed = 0
    messages = []

    # All removals happen under the directory's lock.
    lock_path = id_dir.with_suffix('.lock')
    with lockfile.LockFile(lock_path, timeout=1):
        matched = select(id_dir=id_dir, filter_func=filter_func,
                         transform=transform).paths
        for path in matched:
            try:
                shutil.rmtree(path.as_posix())
            except OSError as err:
                messages.append("Could not remove {} {}: {}"
                                .format(id_dir.name, path.as_posix(), err))
                continue

            removed += 1
            if verbose:
                messages.append("Removed {} {}.".format(id_dir.name,
                                                        path.name))

    # The id numbering may have holes now; reset the 'next id' cache.
    reset_pkey(id_dir)
    return removed, messages
def _load_config(cls, test_path):
    """Load a saved test configuration.

    :param test_path: The test run directory containing the saved config.
    :returns: The loaded config dictionary.
    :raises TestRunError: If the config is missing, unreadable, or holds
        bad values.
    """

    config_path = test_path/'config'

    # The config is written under this lock, so take it before reading.
    lock_path = test_path/'config.lockfile'

    if not config_path.is_file():
        raise TestRunError("Could not find config file for test at {}."
                           .format(test_path))

    # Using the lock as a context manager ensures we only ever unlock a
    # lock we actually acquired. (The old try/finally called unlock()
    # even when lock() itself had failed.)
    with lockfile.LockFile(lock_path):
        try:
            with config_path.open('r') as config_file:
                # Because only string keys are allowed in test configs,
                # this is a reasonable way to load them.
                return json.load(config_file)
        except TypeError as err:
            raise TestRunError("Bad config values for config '{}': {}"
                               .format(config_path, err))
        except (IOError, OSError) as err:
            raise TestRunError("Error reading config file '{}': {}"
                               .format(config_path, err))
def _save_config(self):
    """Save the configuration for this test to the test config file.

    :raises TestRunError: If the config can't be written, or contains
        values that can't be serialized.
    """

    config_path = self.path/'config'

    # Readers take the same lock before loading the config.
    lock_path = self.path/'config.lockfile'

    # Using the lock as a context manager ensures we only ever unlock a
    # lock we actually acquired. (The old try/finally called unlock()
    # even when lock() itself had failed.)
    with lockfile.LockFile(lock_path, group=self._pav_cfg.shared_group):
        try:
            with config_path.open('w') as json_file:
                pavilion.output.json_dump(self.config, json_file)
        except (OSError, IOError) as err:
            raise TestRunError(
                "Could not save TestRun ({}) config at {}: {}"
                .format(self.name, self.path, err))
        except TypeError as err:
            raise TestRunError(
                "Invalid type in config for ({}): {}"
                .format(self.name, err))
def create_id_dir(id_dir):
    """In the given directory, create the lowest numbered (positive integer)
    directory that doesn't already exist.

    :param Path id_dir: Path to the directory that contains these 'id'
        directories
    :returns: The id and path to the created directory.
    :rtype: list(int, Path)
    :raises OSError: on directory creation failure.
    :raises TimeoutError: If we couldn't get the lock in time.
    """

    lockfile_path = id_dir/'.lockfile'
    with lockfile.LockFile(lockfile_path, timeout=1):
        # Collect the ids of the existing integer-named directories.
        existing = sorted(
            int(name) for name in os.listdir(str(id_dir))
            if name.isdigit() and (id_dir/name).is_dir())

        # Find the first unused id.
        id_ = 1
        while id_ in existing:
            id_ += 1

        path = utils.make_id_path(id_dir, id_)
        path.mkdir()

    return id_, path
def create_id_dir(id_dir: Path, group: str, umask: int) -> (int, Path):
    """In the given directory, create the lowest numbered (positive integer)
    directory that doesn't already exist.

    :param id_dir: Path to the directory that contains these 'id'
        directories
    :param group: The group owner for this path.
    :param umask: The umask to apply to this path.
    :returns: The id and path to the created directory.
    :raises OSError: on directory creation failure.
    :raises TimeoutError: If we couldn't get the lock in time.
    """

    lockfile_path = id_dir/'.lockfile'
    with lockfile.LockFile(lockfile_path, timeout=1):
        # The pkey file caches the next expected free id, letting us skip
        # a full directory scan in the common case.
        next_fn = id_dir/PKEY_FN

        next_valid = True

        if next_fn.exists():
            try:
                with next_fn.open() as next_file:
                    next_id = int(next_file.read())

                next_id_path = make_id_path(id_dir, next_id)

                # The cached id is stale if that directory already exists.
                if next_id_path.exists():
                    next_valid = False

            except (OSError, ValueError):
                # In either case, on failure, invalidate the next file.
                next_valid = False
        else:
            next_valid = False

        if not next_valid:
            # If the next file's id wasn't valid, then find the next available
            # id directory the hard way.
            ids = list(os.listdir(str(id_dir)))
            # Only return the test directories that could be integers.
            ids = [id_ for id_ in ids if id_.isdigit()]
            ids = [int(id_) for id_ in ids]
            ids.sort()

            # Find the first unused id.
            next_id = 1
            while next_id in ids:
                next_id += 1

            next_id_path = make_id_path(id_dir, next_id)

        # Create the id directory and refresh the pkey cache, applying the
        # given group/umask to both so permissions stay consistent.
        with permissions.PermissionsManager(next_id_path, group, umask), \
                permissions.PermissionsManager(next_fn, group, umask):
            next_id_path.mkdir()

            # Cache the next id for the next caller.
            with next_fn.open('w') as next_file:
                next_file.write(str(next_id + 1))

    return next_id, next_id_path
def test_locks(self):
    """Exercise lockfile creation/removal, waiting, expiration, object
    reuse, and group assignment under no contention."""

    # Make sure the lock can be created and deleted with no contention
    # and in non-blocking mode.
    with lockfile.LockFile(self.lock_path):
        # Make sure the lock was created.
        self.assertTrue(os.path.exists(self.lock_path))

    # Make sure the lock is deleted after close.
    self.assertFalse(os.path.exists(self.lock_path))

    lockfile.LockFile._create_lockfile(self.lock_path, 100, '1234')

    # Remove the lockfile after 1 second of trying.
    sp.call("sleep 1; rm {}".format(self.lock_path), shell=True)
    # Test waiting for the lockfile.
    with lockfile.LockFile(self.lock_path, timeout=2):
        pass

    # Making sure that we can automatically acquire and delete an
    # expired lockfile.
    lockfile.LockFile._create_lockfile(self.lock_path, -100, '1234')
    with lockfile.LockFile(self.lock_path, timeout=1):
        pass

    # Lock objects are reusable.
    lock = lockfile.LockFile(self.lock_path)
    with lock:
        pass
    with lock:
        pass

    # Make sure we can set the group on the lockfile.
    # We need a group other than our default.
    groups = os.getgroups()
    # os.getgroups() returns *group* ids. The old code unconditionally
    # removed os.getuid() (a user id), which raises ValueError whenever
    # the uid doesn't happen to appear in the group list. Remove our
    # default group id instead, and only if it's present.
    if os.getgid() in groups:
        groups.remove(os.getgid())
    if not groups:
        print(
            "Could not test group permissions with lockfile, no suitable alternate group "
            "found.",
            file=sys.stderr)
    else:
        group = groups.pop()
        with lockfile.LockFile(self.lock_path,
                               group=grp.getgrgid(group).gr_name):
            stat = os.stat(self.lock_path)
            self.assertEqual(stat.st_gid, group)
            self.assertEqual(stat.st_mode & 0o777,
                             lockfile.LockFile.LOCK_PERMS)
def reset_pkey(id_dir: Path) -> None:
    """Reset the 'next_id' for the given directory by deleting the pkey
    file ('next_id') if present."""

    with lockfile.LockFile(id_dir/'.lockfile', timeout=1):
        pkey_path = id_dir/PKEY_FN
        try:
            pkey_path.unlink()
        except OSError:
            # Best effort - the pkey file may simply not exist.
            pass
def test_lock_errors(self):
    """Check lock timeout handling and behavior when the lockfile is
    deleted or replaced out from under us."""

    def _take_lock(*args, **kwargs):
        with lockfile.LockFile(self.lock_path, *args, **kwargs):
            pass

    # The lock should time out properly.
    lockfile.LockFile._create_lockfile(self.lock_path, 100, '1234')
    with self.assertRaises(TimeoutError):
        _take_lock(timeout=0.2)
    self.lock_path.unlink()

    # This shouldn't cause an error, but should get logged.
    with lockfile.LockFile(self.lock_path):
        self.lock_path.unlink()

    with lockfile.LockFile(self.lock_path):
        self.lock_path.unlink()
        lockfile.LockFile._create_lockfile(self.lock_path, 100, 'abcd')

    # Remove our bad lockfile
    self.lock_path.unlink()
def test_lock_errors(self):
    """Check lock error cases: double acquisition, timeouts, and
    lockfiles that disappear or get replaced underneath us."""

    def _take_lock(*args, **kwargs):
        with lockfile.LockFile(self.lock_path, *args, **kwargs):
            pass

    # We can't acquire the lock more than once at a time.
    with lockfile.LockFile(self.lock_path):
        with self.assertRaises(RuntimeError):
            _take_lock()

    # The lock should time out properly.
    lockfile.LockFile._create_lockfile(self.lock_path, 100, '1234')
    with self.assertRaises(lockfile.TimeoutError):
        _take_lock(timeout=0.2)
    os.unlink(self.lock_path)

    # This shouldn't cause an error, but should get logged.
    with lockfile.LockFile(self.lock_path):
        os.unlink(self.lock_path)

    with lockfile.LockFile(self.lock_path):
        os.unlink(self.lock_path)
        lockfile.LockFile._create_lockfile(self.lock_path, 100, 'abcd')

    # Remove our bad lockfile
    os.unlink(self.lock_path)
def delete_unused(tests_dir: Path, builds_dir: Path, verbose: bool = False) \
        -> (int, List[str]):
    """Delete all the build directories, that are unused by any test run.

    :param tests_dir: The test_runs directory path object.
    :param builds_dir: The builds directory path object.
    :param verbose: Print
    :return int count: The number of builds that were removed.
    """

    used_build_paths = _get_used_build_paths(tests_dir)

    def _unused(build_path: Path) -> bool:
        """Return whether a build is not used."""
        return build_path.name not in used_build_paths

    removed = 0
    messages = []
    lock_path = builds_dir.with_suffix('.lock')

    with lockfile.LockFile(lock_path) as lock:
        candidates = dir_db.select(builds_dir, _unused, fn_base=16)[0]
        for path in candidates:
            # Deleting builds can take a while; keep the lock alive.
            lock.renew()
            try:
                shutil.rmtree(path.as_posix())
                path.with_suffix(TestBuilder.FINISHED_SUFFIX).unlink()
            except OSError as err:
                messages.append("Could not remove build {}: {}"
                                .format(path, err))
                continue

            removed += 1
            if verbose:
                messages.append('Removed build {}.'.format(path.name))

    return removed, messages
def prune_result_log(log_path: Path, ids: List[str]) -> List[dict]:
    """Remove records corresponding to the given test ids. Ids can be
    either an test run id or a test run uuid.

    :param log_path: The result log path.
    :param ids: A list of test run ids and/or uuids.
    :returns: A list of the pruned result dictionaries.
    :raises ResultError: When we can't overwrite the log file.
    """

    pruned = []
    rewrite_log_path = log_path.with_suffix('.rewrite')
    lockfile_path = log_path.with_suffix(log_path.suffix + '.lock')

    # Hold the log lock while we filter the log into a rewrite file and
    # swap it into place.
    with _lockfile.LockFile(lockfile_path) as lock, \
            log_path.open() as result_log, \
            rewrite_log_path.open('w') as rewrite_log:

        for line in result_log:
            # Rewriting a big log can take a while; keep the lock fresh.
            lock.renew()

            try:
                result = json.loads(line)
            except json.JSONDecodeError:
                # If we can't parse the line, just rewrite it as is.
                rewrite_log.write(line)
                continue

            if str(result.get('id')) in ids or result.get('uuid') in ids:
                pruned.append(result)
            else:
                rewrite_log.write(line)

        log_path.unlink()
        rewrite_log_path.rename(log_path)

    return pruned
def build(self): """Perform the build if needed, do a soft-link copy of the build directory into our test directory, and note that we've used the given build. Returns True if these steps completed successfully. """ # Only try to do the build if it doesn't already exist. if not os.path.exists(self.build_origin): # Make sure another test doesn't try to do the build at # the same time. # Note cleanup of failed builds HAS to occur under this lock to # avoid a race condition, even though it would be way simpler to # do it in .build() lock_path = '{}.lock'.format(self.build_origin) with lockfile.LockFile(lock_path, group=self._pav_cfg.shared_group): # Make sure the build wasn't created while we waited for # the lock. if not os.path.exists(self.build_origin): build_dir = self.build_origin + '.tmp' # Attempt to perform the actual build, this shouldn't # raise an exception unless # something goes terribly wrong. if not self._build(build_dir): # The build failed. The reason should already be set # in the status file. def handle_error(_, path, exc_info): self.LOGGER.error("Error removing temporary build " "directory '{}': {}" .format(path, exc_info)) # Cleanup the temporary build tree. shutil.rmtree(path=build_dir, onerror=handle_error) return False # Rename the build to it's final location. os.rename(build_dir, self.build_origin) # Perform a symlink copy of the original build directory into our test # directory. try: shutil.copytree(self.build_origin, self.build_path, symlinks=True, copy_function=utils.symlink_copy) except OSError as err: msg = "Could not perform the build directory copy: {}".format(err) self.status.set(STATES.BUILD_ERROR, msg) self.LOGGER.error(msg) return False # Touch the original build directory, so that we know it was used # recently. try: now = time.time() os.utime(self.build_origin, (now, now)) except OSError as err: self.LOGGER.warning("Could not update timestamp on build directory " "'{}': {}" .format(self.build_origin, err)) return True
def build(self, cancel_event=None):
    """Perform the build if needed, do a soft-link copy of the build
    directory into our test directory, and note that we've used the
    given build.

    :param threading.Event cancel_event: Allows builds to tell each
        other to die.
    :return: True if these steps completed successfully.
    """

    # Only try to do the build if it doesn't already exist and is finished.
    if not self.finished_path.exists():
        # Make sure another test doesn't try to do the build at
        # the same time.
        # Note cleanup of failed builds HAS to occur under this lock to
        # avoid a race condition, even though it would be way simpler to
        # do it in .build()
        self.tracker.update(state=STATES.BUILD_WAIT,
                            note="Waiting on lock for build {}.".format(
                                self.name))
        lock_path = self.path.with_suffix('.lock')
        with lockfile.LockFile(lock_path, group=self._pav_cfg.shared_group)\
                as lock:
            # Make sure the build wasn't created while we waited for
            # the lock.
            if not self.finished_path.exists():
                self.tracker.update(state=STATES.BUILDING,
                                    note="Starting build {}.".format(
                                        self.name))

                # If the build directory exists, we're assuming there was
                # an incomplete build at this point.
                if self.path.exists():
                    self.tracker.warn(
                        "Build lock acquired, but build exists that was "
                        "not marked as finished. Deleting...")
                    try:
                        shutil.rmtree(self.path)
                    except OSError as err:
                        self.tracker.error(
                            "Could not remove unfinished build.\n{}".
                            format(err.args[0]))
                        return False

                # Attempt to perform the actual build, this shouldn't
                # raise an exception unless something goes terribly
                # wrong.
                # This will also set the test status for
                # non-catastrophic cases.
                with PermissionsManager(self.path, self._group,
                                        self._umask):
                    if not self._build(self.path, cancel_event, lock=lock):
                        # On failure, move the partial build aside so it
                        # can be inspected, and cancel sibling builds.
                        try:
                            self.path.rename(self.fail_path)
                        except FileNotFoundError as err:
                            self.tracker.error(
                                "Failed to move build {} from {} to "
                                "failure path {}: {}".format(
                                    self.name, self.path,
                                    self.fail_path, err))
                            self.fail_path.mkdir()
                        if cancel_event is not None:
                            cancel_event.set()

                        return False

                # Make a file with the test id of the building test.
                built_by_path = self.path / '.built_by'
                try:
                    with PermissionsManager(built_by_path, self._group,
                                            self._umask | 0o222), \
                            built_by_path.open('w') as built_by:
                        built_by.write(str(self.test.id))
                except OSError:
                    self.tracker.warn("Could not create built_by file.")

                # Mark the build as complete; other runs check this
                # before reusing the build.
                try:
                    with PermissionsManager(self.finished_path,
                                            self._group, self._umask):
                        self.finished_path.touch()
                except OSError:
                    self.tracker.warn("Could not touch '<build>.finished' "
                                      "file.")

            else:
                self.tracker.update(
                    state=STATES.BUILD_REUSED,
                    note="Build {s.name} created while waiting for build "
                         "lock.".format(s=self))
    else:
        self.tracker.update(
            note=("Test {s.name} run {s.test.id} reusing build.".format(
                s=self)),
            state=STATES.BUILD_REUSED)

    return True
def build(self):
    """Perform the build if needed, do a soft-link copy of the build
    directory into our test directory, and note that we've used the
    given build.

    :return: True if these steps completed successfully.
    """

    # Only try to do the build if it doesn't already exist.
    if not self.build_origin.exists():
        fprint("Test {s.name} run {s.id} building {s.build_hash}".format(
            s=self), file=sys.stderr)
        self.status.set(STATES.BUILDING,
                        "Starting build {}.".format(self.build_hash))
        # Make sure another test doesn't try to do the build at
        # the same time.
        # Note cleanup of failed builds HAS to occur under this lock to
        # avoid a race condition, even though it would be way simpler to
        # do it in .build()
        lock_path = self.build_origin.with_suffix('.lock')
        with lockfile.LockFile(lock_path, group=self._pav_cfg.shared_group):
            # Make sure the build wasn't created while we waited for
            # the lock.
            if not self.build_origin.exists():
                # Build under a '.tmp' name, then rename into place, so
                # other processes never see a partial build.
                build_dir = self.build_origin.with_suffix('.tmp')

                # Attempt to perform the actual build, this shouldn't
                # raise an exception unless something goes terribly
                # wrong.
                # This will also set the test status for
                # non-catastrophic cases.
                if not self._build(build_dir):
                    # If the build didn't succeed, copy the attempted build
                    # into the test run, and set the run as complete.
                    if build_dir.exists():
                        build_dir.rename(self.build_path)
                    self.set_run_complete()
                    return False

                # Rename the build to it's final location.
                build_dir.rename(self.build_origin)
            else:
                self.status.set(
                    STATES.BUILDING,
                    "Build {} created while waiting for build lock."
                    .format(self.build_hash))

            # Make a symlink in the build directory that points to
            # the original test that built it
            try:
                dst = self.build_origin / '.built_by'
                src = self.path
                dst.symlink_to(src, True)
                dst.resolve()
            except OSError:
                self.logger.warning("Could not create symlink to test")
    else:
        fprint(
            "Test {s.name} run {s.id} reusing build {s.build_hash}".format(
                s=self), file=sys.stderr)
        self.status.set(STATES.BUILDING,
                        "Build {} already exists.".format(self.build_hash))

    # Perform a symlink copy of the original build directory into our test
    # directory.
    try:
        shutil.copytree(self.build_origin.as_posix(),
                        self.build_path.as_posix(),
                        symlinks=True,
                        copy_function=utils.symlink_copy)
    except OSError as err:
        msg = "Could not perform the build directory copy: {}".format(err)
        self.status.set(STATES.BUILD_ERROR, msg)
        self.logger.error(msg)
        self.set_run_complete()
        return False

    # Touch the original build directory, so that we know it was used
    # recently.
    try:
        now = time.time()
        os.utime(self.build_origin.as_posix(), (now, now))
    except OSError as err:
        self.logger.warning(
            "Could not update timestamp on build directory '%s': %s",
            self.build_origin, err)

    return True
def _acquire_lock(*args, **kwargs):
    # Helper: briefly acquire (and immediately release) the test's
    # lockfile, forwarding any extra LockFile arguments.
    # NOTE(review): references `self` and `lockfile` from an enclosing
    # scope -- presumably defined inside a test method; confirm against
    # the surrounding code.
    with lockfile.LockFile(self.lock_path, *args, **kwargs):
        pass
def build(self, cancel_event=None):
    """Perform the build if needed, do a soft-link copy of the build
    directory into our test directory, and note that we've used the
    given build.

    :param threading.Event cancel_event: Allows builds to tell each
        other to die.
    :return: True if these steps completed successfully.
    """

    # Only try to do the build if it doesn't already exist.
    if not self.path.exists():
        # Make sure another test doesn't try to do the build at
        # the same time.
        # Note cleanup of failed builds HAS to occur under this lock to
        # avoid a race condition, even though it would be way simpler to
        # do it in .build()
        self.tracker.update(state=STATES.BUILD_WAIT,
                            note="Waiting on lock for build {}.".format(
                                self.name))
        lock_path = self.path.with_suffix('.lock')
        with lockfile.LockFile(lock_path, group=self._pav_cfg.shared_group):
            # Make sure the build wasn't created while we waited for
            # the lock.
            if not self.path.exists():
                self.tracker.update(state=STATES.BUILDING,
                                    note="Starting build {}.".format(
                                        self.name))
                # Build under a '.tmp' name, then rename into place, so
                # other processes never see a partial build.
                build_dir = self.path.with_suffix('.tmp')

                # Attempt to perform the actual build, this shouldn't
                # raise an exception unless something goes terribly
                # wrong.
                # This will also set the test status for
                # non-catastrophic cases.
                if not self._build(build_dir, cancel_event):
                    # On failure, move the partial build aside so it can
                    # be inspected, and cancel sibling builds.
                    try:
                        build_dir.rename(self.fail_path)
                    except FileNotFoundError as err:
                        self.tracker.error(
                            "Failed to move build {} from {} to "
                            "failure path {}: {}".format(
                                self.name, build_dir,
                                self.fail_path, err))
                        self.fail_path.mkdir()
                    if cancel_event is not None:
                        cancel_event.set()

                    # If the build didn't succeed, copy the attempted build
                    # into the test run, and set the run as complete.
                    return False

                # Rename the build to it's final location.
                build_dir.rename(self.path)

                # Make a file with the test id of the building test.
                try:
                    dst = self.path / '.built_by'
                    with dst.open('w') as built_by:
                        built_by.write(str(self.test.id))
                except OSError:
                    self.tracker.warn("Could not create built_by file.")
            else:
                self.tracker.update(
                    state=STATES.BUILD_REUSED,
                    note="Build {s.name} created while waiting for build "
                         "lock.".format(s=self))
    else:
        self.tracker.update(
            note=("Test {s.name} run {s.test.id} reusing build.".format(
                s=self)),
            state=STATES.BUILD_REUSED)

    return True
def get_login():
    """Return the current user's login name, preferring $USER and falling
    back to `id -un`.

    (The `def` header was evidently lost when this file's formatting was
    mangled: the bare `return` statements below can't exist at module
    level, and `get_login()` is called further down.)

    :raises RuntimeError: If the username can't be determined.
    """
    if 'USER' in os.environ:
        return os.environ['USER']

    try:
        name = subprocess.check_output(['id', '-un'],
                                       stderr=subprocess.DEVNULL)
        return name.decode('utf8').strip()
    except Exception:
        raise RuntimeError(
            "Could not get the name of the current user.")


# Log to a per-user directory under /tmp.
log_dir = '/tmp/{}'.format(get_login())
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
logging.basicConfig(filename=os.path.join(log_dir, 'pavilion_tests.log'))

# Make the pavilion libraries importable when run as a bare script.
package_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
sys.path.append(os.path.join(package_root, 'lib'))

from pavilion import lockfile

# Fight over the lockfile given as argv[1] until killed (the parent
# test terminates this process when it's done).
while True:
    try:
        with lockfile.LockFile(sys.argv[1], timeout=0.5) as lock:
            time.sleep(0.01)
        # If we don't sleep, the sem proc will probably get the lock
        # right back.
        time.sleep(0.2)
    except TimeoutError:
        continue
# Stress script: repeatedly acquire a shared lockfile and report how long
# each acquisition took. Meant to be started on multiple systems at once;
# the start is aligned to the next minute boundary so the runs overlap.

from pavilion import lockfile
from pathlib import Path  # NOTE(review): appears unused here - confirm.
import time

# Total acquisitions to perform, and how many we've done so far.
acquires = 500
acquired = 0

now = time.time()
# Delay for the rest of the minute
go_time = (now - now%60 + 60)

while time.time() < go_time:
    time.sleep(.01)

acquire_times = []

print('starting', time.time(), flush=True)

# Acquire a bunch of locks to give plenty of chances for things to break.
# More locking attempts also mean more time for runs on multiple systems
# to overlap.
while acquired < acquires:
    start = time.time()
    with lockfile.LockFile('/usr/projects/hpctest/.locktest'):
        # Record how long it took to get the lock this time.
        acquire_times.append(time.time() - start)
        print(".", end="", flush=True)
        acquired += 1

print('finished', time.time())
print('avg acquire', sum(acquire_times)/len(acquire_times))