def run_indri(args, output, overwrite_threads=False):
    """Run an external command and store its standard output in ``output``.

    The shared dask ``Variable`` named ``'cancel'`` is consulted before the
    process is started and then again every 1000 lines of output, so a
    cancellation request can stop a long-running process.

    Args:
        args: Command line as a sequence; ``args[0]`` is the executable.
        output: Path of the file that receives the captured stdout (binary).
        overwrite_threads: When true, a ``-threads=N`` option is inserted
            right after the executable, with ``N`` being the number of CPUs
            this process may run on minus one.

    Returns:
        A 4-tuple ``(status, worker_address, elapsed_seconds, loadinfo)``
        where ``status`` is ``'canceled'`` (never started, elapsed is 0),
        ``'killed'`` (stopped while running) or ``'completed'``.
        ``loadinfo`` is whatever ``get_loadinfo()`` returns.
    """
    from subprocess import DEVNULL, PIPE, Popen
    import os

    cancel = Variable('cancel', get_client())
    if cancel.get():
        return ('canceled', get_worker().address, 0, get_loadinfo())

    start = time.time()
    if overwrite_threads:
        processes = len(os.sched_getaffinity(0)) - 1
        args = (args[0], '-threads={}'.format(processes), *args[1:])

    # stderr is never consumed here. Leaving it attached to a pipe lets a
    # chatty child fill the pipe buffer and block forever while we are still
    # reading stdout, so it is discarded explicitly instead.
    with Popen(args, stdout=PIPE, stderr=DEVNULL) as proc:
        content = []
        for line in proc.stdout:
            content.append(line)
            # Only poll the (remote) cancel flag once per 1000 lines.
            if len(content) % 1000 != 0:
                continue
            if cancel.get():
                proc.kill()
                return ('killed', get_worker().address,
                        time.time() - start, get_loadinfo())
        with open(output, 'wb') as f:
            f.writelines(content)
    return ('completed', get_worker().address,
            time.time() - start, get_loadinfo())
class ClusterShareMemory(ShareMemory):
    """Shared value for a dask cluster, held in a distributed ``Variable``.

    Values are stored as their ``str()`` form and parsed back with
    ``ast.literal_eval`` when read.
    """

    def __init__(self, name):
        """Bind to the distributed ``Variable`` called ``name``."""
        from dask.distributed import Variable

        shared_client = ShareMemoryClient().client
        self.var = Variable(name, client=shared_client)

    def put(self, value):
        """Store ``value`` (as a string) in the shared variable."""
        serialized = str(value)
        self.var.set(serialized)

    def get(self):
        """Read the shared variable and return the parsed Python literal."""
        # TODO: block issue when var no data.
        raw = self.var.get(timeout=2)
        return ast.literal_eval(raw)

    def delete(self):
        """Delete the shared variable bound to this object."""
        self.var.delete()

    def close(self):
        """Close the shared-memory client."""
        ShareMemoryClient().close()
def run_indri_cluster(scheduler, indri, params, runs, overwrite):
    """Prepare Indri jobs for a dask cluster and hand them to ``schedule_loop``.

    Connects to ``scheduler``, logs the load of each worker and the task and
    worker counts, creates the shared ``'cancel'`` Variable (initially
    ``False``) and installs a SIGINT handler that flips it to ``True``.

    Args:
        scheduler: Address passed to ``Client``.
        indri: Path-like object (must provide ``resolve()``) of the binary.
        params: Path-like parameter files, one per task.
        runs: Path-like output files, one per task.
        overwrite: Flag replicated once per entry of ``runs``.
    """
    client = Client(scheduler)
    available_workers = get_worker_load(client)
    ntasks = len(params)

    for worker in available_workers:
        address = worker[0]
        load = worker[1:]
        logging.info('{:<27} {:<22}'.format(address, format_loadavg(load)))
    logging.info('{} tasks in total'.format(len(params)))
    logging.info('{} workers in total'.format(len(available_workers)))

    cancel = Variable('cancel', client)
    cancel.set(False)

    def signal_handler(sig, frame):
        # Ask running tasks to stop; they poll the shared flag themselves.
        cancel.set(True)
        logging.info(
            'CTRL-C received. It may take a while to kill running tasks.')

    signal.signal(signal.SIGINT, signal_handler)

    indri_args = []
    for param in params:
        indri_args.append((str(indri.resolve()), str(param.resolve())))

    fp_runs = []
    for run in runs:
        fp_runs.append(str(run.resolve()))

    overwrite_flags = [overwrite] * len(runs)
    schedule_loop(client, ntasks, cancel, runs, indri_args, fp_runs,
                  overwrite_flags)
def __init__(self, Client, Ssize, rank, arrays, deisa_arrays_dtype):
    """Pick the workers for this rank and normalise the array descriptions.

    Args:
        Client: Client object, stored as ``self.client``.
        Ssize: Number of processes sharing the worker list.
        rank: Index of this process.
        arrays: Mapping from array name to a description dict; the dicts are
            updated in place (``"dtype"`` and ``"timedim"`` are rewritten).
        deisa_arrays_dtype: Mapping from array name to its dtype.
    """
    self.client = Client
    self.rank = rank

    worker_list = Variable("workers").get()
    n_workers = len(worker_list)
    if Ssize > n_workers:
        # more processes than workers
        self.workers = [worker_list[rank % n_workers]]
    else:
        # more workers than processes
        per_rank = n_workers // Ssize
        first = rank * per_rank
        self.workers = worker_list[first:first + per_rank]

    self.arrays = arrays
    for ele in self.arrays:
        spec = self.arrays[ele]
        spec["dtype"] = str(deisa_arrays_dtype[ele])
        spec["timedim"] = spec["timedim"][0]
        ndim = len(np.array(spec["sizes"]))
        # Block index along each dimension: start offset over block size.
        self.position = [
            spec["starts"][i] // spec["subsizes"][i] for i in range(ndim)
        ]

    if rank == 0:
        # If and only if I have a perfect domain decomposition
        Queue("Arrays").put(self.arrays)
def __init__(self, Sworker, scheduler_info):
    """Connect to the scheduler and publish the list of its workers.

    Args:
        Sworker: Number of workers to wait for.
        scheduler_info: Path of a JSON file with an ``"address"`` entry.

    The constructor polls the scheduler until exactly ``Sworker`` workers are
    registered, then stores their addresses in the ``"workers"`` Variable.
    """
    with open(scheduler_info) as f:
        info = json.load(f)
    self.adr = info["address"]

    # msgpack does not serialize large messages
    self.client = Client(self.adr, serializers=['dask', 'pickle'])

    # NOTE(review): dask documents the memory thresholds under
    # ``distributed.worker.memory.*`` (singular "worker") - confirm whether
    # the ``workers`` spelling below is intentional.
    settings = {
        "distributed.deploy.lost-worker-timeout": 60,
        "distributed.workers.memory.spill": 0.97,
        "distributed.workers.memory.target": 0.95,
        "distributed.workers.memory.terminate": 0.99,
    }
    dask.config.set(settings)

    # Re-query the scheduler until the expected number of workers is present.
    while True:
        registered = self.client.scheduler_info()["workers"].keys()
        self.workers = [
            comm.get_address_host_port(address, strict=False)
            for address in registered
        ]
        if len(self.workers) == Sworker:
            break

    Variable("workers").set(self.workers)
def run_test_with_timeout(
    test_config: TestConfig,
    incoming_state: dict,
    hostnames: List[str],
    duration: int = 15,
) -> dict:
    """
    Run a test as a dask task next to a timer task and stop it on timeout

    When ``duration`` is ``None`` or negative the test is run directly with
    no timeout. Otherwise a uniquely named Variable is shared with the test;
    it is set to ``False`` as soon as either task finishes so the other one
    can return.

    Args:
        test_config: Config of test to run
        incoming_state: Initial state to run actions/asserts in
        hostnames: List of runner hostnames
        duration: Optional timeout in seconds to run test within

    Returns:
        New state after running actions and asserts
    """
    no_timeout = duration is None or duration < 0
    if no_timeout:
        return run_test(test_config, incoming_state, hostnames)

    # NOTE: Use a dask cluster scheduler?
    client = get_client()

    # NOTE: may improve way of doing this
    timeout_signal_name = f"keep-going-{uuid.uuid4()}"
    keep_going = Variable(timeout_signal_name)
    keep_going.set(True)

    run_test_task = client.submit(
        run_test,
        test_config=test_config,
        incoming_state=incoming_state,
        hostnames=hostnames,
        timeout_signal_name=timeout_signal_name,
    )

    LOGGER.debug("Test duration config: %d seconds", duration)

    def distributed_timeout():
        # If a timeout from a previous test did not complete, it will keep
        # running (it cannot be canceled). However, if it keeps running, it
        # can end another test early. This means it needs to receive a
        # signal to return.
        deadline = datetime.now() + timedelta(seconds=duration)
        while True:
            if datetime.now() > deadline:
                return
            if not keep_going.get():
                return
            time.sleep(test_config.get("secondsBetweenCycles", 1))

    timeout_task = client.submit(distributed_timeout)

    # Wait for either test or timeout to finish.
    started = datetime.now()
    wait([run_test_task, timeout_task], return_when="FIRST_COMPLETED")
    finished = datetime.now()

    LOGGER.debug(
        "Test %s took %d seconds",
        test_config["name"],
        (finished - started).seconds,
    )

    if not run_test_task.done():
        if not timeout_task.done():
            # Neither task reports completion: nothing to return.
            return None
        LOGGER.debug(timeout_task)
        LOGGER.info("Test %s timed out", test_config["name"])
        # NOTE: add timed out to summary?

    # Release whichever task is still running, then hand back the test state.
    keep_going.set(False)
    return run_test_task.result()
def run_test(
    test_config: TestConfig,
    incoming_state: dict,
    hostnames: List[str],
    timeout_signal_name: str = None,
) -> dict:
    """
    Validate a test, run its actions and asserts in cycles, return the state

    Unnamed actions/asserts get a generated name, which is written back into
    the action/assert dict. The loop runs while ``continue_running`` allows
    it and, when ``timeout_signal_name`` is given, while the dask Variable of
    that name is truthy.

    Args:
        test_config: test configuration to run
        incoming_state: Initial state of test (copied into a new mapping)
        hostnames: Addresses of runners to run actions/asserts on
        timeout_signal_name: Optional Dask variable name checked each cycle
            so the test can end gracefully after a timeout

    Returns:
        New state after running actions and asserts, including a
        ``"summary"`` entry for this test
    """
    actions = test_config.get("actions", [])
    asserts = test_config.get("asserts", [])

    remaining_cycles = test_config.get(
        "cycles", get_default_cycles(actions, asserts)
    )
    completed_cycles = 0

    # NOTE: possibly use infinite default dict
    state = defaultdict(dict, incoming_state)

    # Validate test before running
    action_names = []
    for action in actions:
        assert (
            "type" in action
        ), f"Action in test '{test_config['name']}' is missing property 'type'"

        name = action.get("name")
        if name is None:
            name = create_item_name(action["type"], action_names)
            # NOTE: sets action name if not set
            action["name"] = name
        action_names.append(name)

    assert_names = []
    for asrt in asserts:
        assert (
            "type" in asrt
        ), f"Assert in test '{test_config['name']}' is missing property 'type'"

        name = asrt.get("name")
        if name is None:
            name = create_item_name(asrt["type"], assert_names)
            # NOTE: sets assert name if not set
            asrt["name"] = name
        assert_names.append(name)

    assert hostnames, "Must have at least one host to run tests"
    assert len(set(action_names)) == len(
        action_names
    ), "Action names if specified must be unique"
    assert len(set(assert_names)) == len(
        assert_names
    ), "Assert names if specified must be unique"

    start_time = datetime.now()
    test_name = test_config["name"]

    # stop if remaining_cycles == 0 or had asserts and no asserts remain
    while continue_running(
        asserts, remaining_cycles, state[test_name].get("asserts", {})
    ):
        # Check if running with a timeout and break if timeout has signaled
        if (
            timeout_signal_name is not None
            and not Variable(timeout_signal_name, client=get_client()).get()
        ):
            break

        # NOTE: exceptions thrown in actions/asserts cause rest of test to exit
        action_distribution_strategy = test_config.get(
            "actionDistributionStrategy", "parallel"
        )

        if actions:
            assert action_distribution_strategy in [
                "parallel",
                "series",
            ], f"actionDistributionStrategy must be 'parallel' or 'series', got '{action_distribution_strategy}'"

            run_actions_func = (
                run_actions_series
                if action_distribution_strategy == "series"
                else run_actions_parallel
            )
            state[test_name]["actions"] = run_actions_func(
                actions,
                state,
                test_name,
                hostnames,
                test_config.get("secondsBetweenActions", 0),
            )

        assert_distribution_strategy = test_config.get(
            "assertDistributionStrategy", "series"
        )

        if asserts:
            assert assert_distribution_strategy in [
                "parallel",
                "series",
            ], f"assertDistributionStrategy must be 'parallel' or 'series', got '{assert_distribution_strategy}'"

            run_asserts_func = (
                run_asserts_parallel
                if assert_distribution_strategy == "parallel"
                else run_asserts_series
            )
            state[test_name]["asserts"] = run_asserts_func(
                asserts,
                state,
                test_name,
                hostnames,
                test_config.get("secondsBetweenAsserts", 0),
            )

        remaining_cycles -= 1
        completed_cycles += 1

        # Wait between cycles if test is to continue running
        another_cycle = continue_running(
            asserts, remaining_cycles, state[test_name].get("asserts", {})
        )
        if another_cycle:
            time.sleep(test_config.get("secondsBetweenCycles", 1))

    remaining_asserts = get_remaining_asserts(
        asserts, state[test_name].get("asserts", {})
    )
    unfinished_names = [asrt["name"] for asrt in remaining_asserts]

    state[test_name]["summary"] = TestSummary(
        description=test_config.get("description"),
        completed_cycles=completed_cycles,
        remaining_asserts=unfinished_names,
        error=None,
        duration=(datetime.now() - start_time).seconds,
    )

    return state
brake.set(False) return None #######start the sankof algo here ####################### print('starting sankof') #scale cluster #scatter the blank tree and row index for each process #remote_tree = client.scatter(tree) remote_index = client.scatter(IDindex) inq = Queue('inq') outq = Queue('outq') lock = Lock('x') stopiter = Variable(False) brake = Variable(True) saver_started = False workers_started = False #start workers for workers in range(NCORE*ncpu ): w = client.submit( calculate_small_parsimony , inq= None ,outq = None ,stopiter= stopiter , treefile=treefile , bootstrap_replicates = bootstrap_replicates, matfile= alnfile+'.h5' , row_index= remote_index , iolock = lock, verbose = False ) fire_and_forget(w) s = client.submit( collect_futures , queue= None , stopiter=stopiter , brake = brake, runName= runName , nucleotides_only =False ) saver_started = True fire_and_forget(s)
# Script fragment: sets up the dask client, the input bag and the shared
# Variables, then starts the outer loop over M iterations.
# NOTE(review): P, T, M, epsilon, file_s and args are defined outside this
# fragment - confirm their meaning against the surrounding script.
v = np.zeros(P)
max_iterations = P * 10

# In cluster mode, pass the address:port of the Scheduler
client = Client()

# Read a text file into a Dask bag ~ Spark RDD
# NOTE(review): the name `input` shadows the builtin of the same name.
input = db.read_text(file_s)

# Apparently in Bags, you need another Bag which has the indices to zip with.
# Hence, creating indices for our data
l = db.from_sequence(range(input.count()), npartitions = 1)
S = input_to_rowmatrix(input, l, True)

# Global/Broadcast Variables
_U_ = Variable('_U_')
_UU_ = Variable('_UU_')
_I_ = Variable('_I_')
_VI_ = Variable('_VI_')

# Output paths built from the 'dictionary' / 'output' directories and the prefix
file_D = os.path.join(args['dictionary'], "{}_D.txt".format(args["prefix"]))
file_z = os.path.join(args['output'], "{}_z.txt".format(args["prefix"]))

# Start the loop!
for m in range(M):
    print ('M: '+str(m))
    # Draw a fresh seed for this iteration and re-seed NumPy's global RNG with it
    seed = np.random.randint(max_iterations + 1, high = 4294967295)
    np.random.seed(seed)
    # Random starting vector of length T
    u_old = np.random.random(T)
    num_iterations = 0
    delta = 2 * epsilon
def __init__(self, name):
    """Bind this object to the distributed ``Variable`` called ``name``."""
    from dask.distributed import Variable

    shared_client = ShareMemoryClient().client
    self.var = Variable(name, client=shared_client)