def testmain(self):
    # Same specification as before
    generation_specification = {"seed": [1, 2, 3, 4, 5, 6, 7, 8], "num_calls": [[10, 20, 30]]}
    specifications = SpecificationGenerator().generate(generation_specification)

    output_generation_specification = {"seed": [1, 2, 3, 4, 5, 6, 7, 8], "num_calls": [10, 20, 30]}
    output_specifications = SpecificationGenerator().generate(output_generation_specification)

    name = "test"

    # This time we will run them all in parallel
    runner = ExperimentRunner()
    expr = SimpleExperiment()
    runner.run(name, specifications, expr, specification_runner=MultiprocessingRunner(),
               use_dashboard=True, propagate_exceptions=True, context_type="spawn")

    # Every log file written for this run should be non-empty
    log_base = os.path.join("experiment_runs", name, "logs")
    for root, dirs, files in os.walk(log_base):
        for file in files:
            with open(os.path.join(root, file), "r") as f:
                lines = f.readlines()
            self.assertNotEqual([], lines)

    # Every output specification should have produced a result
    for result in experiment_iterator(name):
        if result["result"] != []:
            output_specifications.remove(result["specification"])
    self.assertEqual([], output_specifications)
def testmain(self):
    # Same specification as before
    generation_specification = {
        "seed": [1, 2, 3, 4, 5, 6, 7, 8],
        "num_calls": [[10, 20, 30]]
    }
    specifications = SpecificationGenerator().generate(generation_specification)

    output_generation_specification = {
        "seed": [1, 2, 3, 4, 5, 6, 7, 8],
        "num_calls": [10, 20, 30]
    }
    output_specifications = SpecificationGenerator().generate(output_generation_specification)

    name = "test"

    # This time we will run them all in parallel
    runner = ExperimentRunner()
    runner.run(name, specifications, SimpleExperiment(), specification_runner=MultiprocessingRunner(),
               use_dashboard=False, propagate_exceptions=True)

    # Every output specification should have produced a result
    for result in experiment_iterator(name):
        if result["result"] != []:
            output_specifications.remove(result["specification"])
    self.assertEqual([], output_specifications)
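# Why the nested list above: SpecificationGenerator expands each list value into the
# cross product of its choices, so wrapping num_calls as [[10, 20, 30]] keeps the whole
# list inside every generated specification, while the flat [10, 20, 30] in the output
# specification expands into one specification per value. A minimal sketch of that
# behavior (values chosen purely for illustration):
from smallab.specification_generator import SpecificationGenerator

specs = SpecificationGenerator().generate({"seed": [1, 2], "num_calls": [[10, 20, 30]]})
# -> [{"seed": 1, "num_calls": [10, 20, 30]}, {"seed": 2, "num_calls": [10, 20, 30]}]

expanded = SpecificationGenerator().generate({"seed": [1, 2], "num_calls": [10, 20, 30]})
# -> 6 specifications, one per (seed, num_calls) pair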
)

for spec in specifications:
    if spec["seed"] == 0:
        print(spec)

runner = ExperimentRunner()
map_memory(base_specs["file"], base_specs["state_space_dimensionality"])

DEBUG = False
if DEBUG:
    runner.run(name, specifications, PlanningExperiment(), propagate_exceptions=True,
               specification_runner=MainRunner(), use_dashboard=False, force_pickle=True,
               context_type="fork")
else:
    # Grid of (gpu, job) slots; note that as written, resources is never passed to the runner
    gpus = 4
    jobs_per_gpu = 2
    resources = list(product(list(range(gpus)), list(range(jobs_per_gpu))))
    runner.run(name, specifications, PlanningExperiment(), propagate_exceptions=False,
               specification_runner=MultiprocessingRunner(), context_type="fork",
               use_dashboard=True, force_pickle=True)
def run(self, name: typing.AnyStr, specifications: typing.List[Specification], experiment: ExperimentBase,
        continue_from_last_run=True, propagate_exceptions=False, force_pickle=False,
        specification_runner: SimpleAbstractRunner = MultiprocessingRunner(), use_dashboard=True,
        context_type="fork", multiprocessing_lib=None, save_every_k=None) -> None:
    """
    The method called to run an experiment
    :param name: The name of this experiment batch
    :param specifications: The list of specifications to run. Should be a list of dictionaries. Each dictionary is passed to the experiment run method
    :param experiment: The experiment object to run
    :param continue_from_last_run: If True, will not redo already completed experiments. Defaults to True
    :param propagate_exceptions: If True, exceptions won't be caught and logged as failed experiments but will cause the program to crash (like normal), useful for debugging experiments
    :param force_pickle: If True, don't attempt to json serialize results and default to pickling
    :param specification_runner: An instance of ``AbstractRunner`` that will be used to run the specifications
    :param use_dashboard: If True, use the terminal monitoring dashboard. If False, just stream logs to stdout.
    :param context_type: The multiprocessing context to use ("fork" or "spawn"). File logging is only attached under "fork".
    :param multiprocessing_lib: The multiprocessing module to use; defaults to the standard library's multiprocessing
    :return: None
    """
    if multiprocessing_lib is None:
        import multiprocessing as mp
    else:
        mp = multiprocessing_lib
    ctx = mp.get_context(context_type)

    # Substitute the default runner before configuring it, not after
    if specification_runner is None:
        specification_runner = JoblibRunner(None)
    specification_runner.set_multiprocessing_context(ctx)

    dashboard_process = None
    try:
        manager = ctx.Manager()
        eventQueue = manager.Queue(maxsize=2000)
        put_in_event_queue(eventQueue, StartExperimentEvent(name))

        # Set up the root smallab logger
        folder_loc = os.path.join("experiment_runs", name, "logs", str(datetime.datetime.now()))
        file_loc = os.path.join(folder_loc, "main.log")
        if not os.path.exists(folder_loc):
            os.makedirs(folder_loc)
        logger = logging.getLogger("smallab")
        logger.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        # Can't do this with non-fork multiprocessing
        if context_type == "fork":
            fh = logging.FileHandler(file_loc)
            fh.setFormatter(formatter)
            logger.addHandler(fh)
        if not use_dashboard:
            sh = logging.StreamHandler()
            sh.setFormatter(formatter)
            logger.addHandler(sh)
        else:
            # fq = LogToEventQueue(eventQueue)
            # sh = logging.StreamHandler(fq)
            # sh.setFormatter(formatter)
            # Add to root so all logging appears in dashboard, not just smallab:
            # logging.getLogger().addHandler(sh)
            dashboard_process = ctx.Process(target=start_dashboard, args=(eventQueue,))
            dashboard_process.start()

        experiment.set_logging_folder(folder_loc)

        self.force_pickle = force_pickle
        if not os.path.exists(get_save_directory(name)):
            os.makedirs(get_save_directory(name))

        if continue_from_last_run:
            need_to_run_specifications = self._find_uncompleted_specifications(name, specifications)
        else:
            need_to_run_specifications = specifications

        for callback in self.callbacks:
            callback.set_experiment_name(name)

        for specification in need_to_run_specifications:
            put_in_event_queue(eventQueue, RegisterEvent(specification_hash(specification), specification))

        if isinstance(specification_runner, SimpleAbstractRunner):
            specification_runner.run(need_to_run_specifications,
                                     lambda specification: run_and_save(name, experiment, specification,
                                                                        propagate_exceptions, self.callbacks,
                                                                        self.force_pickle, eventQueue))
        elif isinstance(specification_runner, ComplexAbstractRunner):
            specification_runner.run(need_to_run_specifications, name, experiment, propagate_exceptions,
                                     self.callbacks, self.force_pickle, eventQueue)

        self._write_to_completed_json(name, specification_runner.get_completed(),
                                      specification_runner.get_failed_specifications())

        # Call batch complete/failure callbacks
        if specification_runner.get_exceptions() != []:
            for callback in self.callbacks:
                callback.on_batch_failure(specification_runner.get_exceptions(),
                                          specification_runner.get_failed_specifications())

        if specification_runner.get_completed() != []:
            for callback in self.callbacks:
                callback.on_batch_complete(specification_runner.get_completed())
    finally:
        # Always tear the dashboard down, even if the run raised
        if dashboard_process is not None:
            dashboard_process.terminate()
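# A minimal usage sketch of run() under the defaults above. The batch name and
# MyExperiment are placeholders; MultiprocessingRunner and the dashboard are already
# the defaults, so only deviations from them need to be spelled out.
runner = ExperimentRunner()
runner.run("demo",
           SpecificationGenerator().generate({"seed": [1, 2, 3]}),
           MyExperiment(),
           use_dashboard=False,    # stream logs to stdout instead of the dashboard
           context_type="spawn")   # note: the file log handler is only attached under "fork"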
        # time.sleep(int(.5 * self.r))
        # This experiment can have a random transient failure!
        # Since it's checkpointed, it will likely succeed after running it again
        if random.randint(0, 100) > 90:
            raise Exception("Something bad happened, a moth flew into the computer!")
        if self.i >= self.num_calls:
            # Done with the experiment, return the results dictionary like normal
            return {"number": self.r}
        else:
            # This experiment isn't done, return the progress as a tuple to update the dashboard
            return (self.i, self.num_calls)

    # Tells the experiment framework how many iterations this experiment will run for
    def max_iterations(self, specification):
        return specification["num_calls"]


# Same specification as before
generation_specification = {"seed": [1, 2, 3, 4, 5, 6, 7, 8], "num_calls": [100, 200, 300]}
specifications = SpecificationGenerator().generate(generation_specification)

name = "checkpointed_run"

# This time we will run them all in parallel
runner = ExperimentRunner()
runner.run(name, specifications, SimpleExperiment(), specification_runner=MultiprocessingRunner())

# Some of our experiments may have failed; call run again to retry the failures
runner.run(name, specifications, SimpleExperiment(), specification_runner=MultiprocessingRunner())

# Cleanup example
delete_experiments_folder(name)
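# For context, a sketch of the class the step() fragment above belongs to. The base
# class import path and the initialize()/step() split are assumptions based on
# smallab's checkpointed-experiment pattern: initialize() sets up state from the
# specification, and step() is called repeatedly, returning either a results dict
# (finished) or a (progress, max) tuple (keep going).
import random

from smallab.experiment_types.checkpointed_experiment import CheckpointedExperiment  # path assumed


class SimpleExperiment(CheckpointedExperiment):
    def initialize(self, specification):
        random.seed(specification["seed"])
        self.num_calls = specification["num_calls"]
        self.i = 0
        self.r = random.random()

    def step(self):
        # One unit of resumable work per call; state is checkpointed between calls
        self.i += 1
        self.r = random.random()
        if self.i >= self.num_calls:
            return {"number": self.r}
        return (self.i, self.num_calls)

    def max_iterations(self, specification):
        return specification["num_calls"]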
    print(results["specification"]["seed"])
    print(results["result"]["number"])

# If you have an experiment you want to run on a lot of computers, you can use the MultiComputerGenerator.
# You assign each computer a number from 0..number_of_computers-1 and it gives each computer
# every number_of_computers-th specification
from smallab.specification_generator import MultiComputerGenerator

all_specifications = SpecificationGenerator().from_json_file('test.json')
g1 = MultiComputerGenerator(0, 2)
g2 = MultiComputerGenerator(1, 2)
specifications_1 = g1.from_json_file("test.json")
specifications_2 = g2.from_json_file("test.json")

assert len(specifications_1) + len(specifications_2) == len(all_specifications)

# Need to freeze the sets in order to do set manipulation on dictionaries
specifications_1 = set([frozenset(sorted(x.items())) for x in specifications_1])
specifications_2 = set([frozenset(sorted(x.items())) for x in specifications_2])
all_specifications = set([frozenset(sorted(x.items())) for x in all_specifications])

# The two generators produce disjoint sets of specifications...
assert specifications_1.isdisjoint(specifications_2)
# ...that together make up the whole specification
assert specifications_1.union(specifications_2) == all_specifications

# You can use the provided logging callbacks to log completion and failure of specific specifications
runner.run('with_logging', SpecificationGenerator().from_json_file("test.json"), SimpleExperiment(),
           continue_from_last_run=True, specification_runner=MultiprocessingRunner())
specification["num_calls"] = self.i result = {"r": self.r} progress = self.i max_iterations = self.num_calls return OverlappingOutputCheckpointedExperimentReturnValue(should_continue, specification, result, progress, max_iterations) else: # This experiment isn't done, return the progress as a tuple to update the dashboard return (self.i, self.num_calls) #Tells the dashboard how many iterations this experiment will run for def max_iterations(self,specification): return specification["num_calls"] # Same specification as before generation_specification = {"seed": [1, 2, 3, 4, 5, 6, 7, 8], "num_calls": (10, 20, 30)} specifications = SpecificationGenerator().generate(generation_specification) name = "overlapping_checkpointed_run" # This time we will run them all in parallel runner = ExperimentRunner() runner.run(name, specifications, SimpleExperiment(), specification_runner=MultiprocessingRunner(), use_dashboard=False, propagate_exceptions=True) # Some of our experiments may have failed, let's call run again to hopefully solve that runner.run(name, specifications, SimpleExperiment(), specification_runner=MultiprocessingRunner(), use_dashboard=False, propagate_exceptions=True) # Cleanup example delete_experiments_folder(name)
"seed": list(range(5)), "alpha_param": 6, "beta_param":1, "epsilon": 10, "delta": 0.1, "plan_commitment_algorithm": "n_steps", "plan_threshold": [1], "sample_observations": False, "use_expected_improvement":False, "planning_steps": 200 } ##Create shared memory map_memory(generation_specifications["file"], generation_specifications["state_space_dimensionality"]) specifications = SpecificationGenerator().generate(generation_specifications) runner = ExperimentRunner() DEBUG = False if DEBUG: runner.run(name, specifications, PlanningExperiment(), propagate_exceptions=True, specification_runner=MainRunner(), use_dashboard=False, force_pickle=True, context_type="fork") else: runner.run(name, specifications, PlanningExperiment(), propagate_exceptions=False, specification_runner=MultiprocessingRunner(), context_type="fork", use_dashboard=True, force_pickle=True)