def main():
    """Worker loop: optionally run a self-play round, then always evaluate
    with the latest trained model, promoting it when it wins."""
    sys.setrecursionlimit(2000)
    init_directories()
    clean_up_empty()
    gpus = conf['GPUs']
    phase = "EVALUATING"
    while True:
        if phase != "EVALUATING":
            # SELF-PLAY round with the current best model.
            init_predicting_workers(gpus)
            play_workers = [NoModelSelfPlayWorker(i)
                            for i in range(conf['N_GAME_PROCESS'])]
            for worker in play_workers:
                worker.start()
            for worker in play_workers:
                worker.join()
            destroy_predicting_workers(gpus)
        # EVALUATE: re-init predicting workers so they pick up the latest
        # trained model (sent from the train server).
        init_predicting_workers(gpus)
        eval_workers = [NoModelEvaluateWorker(i)
                        for i in range(conf['N_GAME_PROCESS'])]
        for worker in eval_workers:
            worker.start()
        for worker in eval_workers:
            worker.join()
        eval_workers.clear()
        destroy_predicting_workers(gpus)
        if promote_best_model():
            # A new best model was promoted, so self-play on the next pass.
            phase = ""
def main():
    """Keep self-playing against the server's best model, stopping once the
    server reports no newer best model than the one we already played."""
    sys.setrecursionlimit(10000)
    init_directories()
    clean_up_empty()
    gpus = conf['GPUs']
    last_played_name = None
    while True:
        init_predicting_workers(gpus)
        # Ask the server which model is currently best; stop once we have
        # already produced self-play games for it.
        best_name = put_name_request("BEST")
        if best_name == last_played_name:
            print("No new best model for self-playing. Stopping..")
            destroy_predicting_workers(gpus)
            break
        last_played_name = best_name
        print("SELF-PLAYING BEST MODEL ", best_name)
        game_workers = [NoModelSelfPlayWorker(i)
                        for i in range(conf['N_GAME_PROCESS'])]
        for worker in game_workers:
            worker.start()
        for worker in game_workers:
            worker.join()
        destroy_predicting_workers(gpus)
def create_initial_model(name, self_play=True):
    """Return the model called *name*.

    If ``MODEL_DIR/<name>.h5`` already exists it is loaded and returned as-is.
    Otherwise a fresh model is built, its graph is logged to TensorBoard,
    it optionally generates self-play games, and it is saved both under its
    own name and as the initial ``best_model.h5``.
    """
    from utils import init_directories
    init_directories()
    full_filename = os.path.join(conf['MODEL_DIR'], name) + ".h5"
    if os.path.isfile(full_filename):
        return load_model(full_filename, custom_objects={'loss': loss})

    model = build_model(name)

    # Save graph in tensorboard. This graph has the name scopes making it look
    # good in tensorboard, the loaded models will not have the scopes.
    tf_callback = TensorBoard(log_dir=os.path.join(conf['LOG_DIR'], name),
                              histogram_freq=0,
                              batch_size=1,
                              write_graph=True,
                              write_grads=False)
    tf_callback.set_model(model)
    tf_callback.on_epoch_end(0)
    tf_callback.on_train_end(0)

    if self_play:
        from self_play import self_play
        self_play(model,
                  n_games=conf['N_GAMES'],
                  mcts_simulations=conf['MCTS_SIMULATIONS'])

    model.save(full_filename)
    model.save(os.path.join(conf['MODEL_DIR'], 'best_model.h5'))
    return model
def main():
    """Entry point: prepare directories and launch multi-GPU training.

    Dead commented-out worker-process code removed; training is driven by a
    single call that owns all configured GPUs.
    """
    init_directories()
    gpus = conf['GPUs']
    train_multi_gpus(n_gpu=len(gpus))
def main():
    """Bootstrap the first model, then loop forever: train the latest model
    against games of the best model and evaluate the two against each other."""
    print("Starting run (v{})".format(__version__))
    init_directories()
    if conf['THREAD_SIMULATION']:
        init_simulation_workers()
    # Build (or load) the very first model before entering the pipeline loop;
    # its return value is superseded by load_latest_model() each iteration.
    create_initial_model(name="model_1")
    while True:
        model = load_latest_model()
        best_model = load_best_model()
        train(model, game_model_name=best_model.name)
        evaluate(best_model, model)
        K.clear_session()
def main():
    """Remote job runner: poll the manager for self-play / evaluation jobs
    and run one worker per assigned output directory until told otherwise.

    Fixes over the previous version:
    - the two "FINISHED ... JOBS %" log calls used a bare ``%`` instead of
      ``%s``, so the job id was never interpolated;
    - the EVALUATING branch iterated ``for i in GPUs`` while reusing the stale
      loop variable ``dir`` left over from the self-play comprehension
      (a NameError on the first pass, the wrong directory afterwards); it now
      pairs each worker with its own out_dir exactly like the self-play branch.
    """
    init_directories()
    clean_up_empty()
    # Deep MCTS recursion needs a large C stack and Python recursion limit.
    resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1))
    sys.setrecursionlimit(10**6)
    GPUs = conf['GPUs']
    mgr = registerRemoteFunc()
    while True:
        jobs = mgr.get_job(concurency=len(GPUs))._getvalue()
        logger.info("GOT JOBS %s", jobs)
        out_dirs = jobs['out_dirs']
        # The manager must never hand us more concurrent games than GPUs.
        assert len(out_dirs) <= len(GPUs)
        state = jobs['state']
        model_check_update(jobs['latest_model_name'], jobs['best_model_name'],
                           mgr)
        if state == ASYNC_PIPELINE_STATE.SELF_PLAYING.name:
            logger.info("STARTING REMOTE SELF_PLAY PHASE WITH %s GPUs",
                        len(GPUs))
            workers = [
                SelfPlayWorker(i, one_game_only=extract_game_number(out_dir))
                for i, out_dir in enumerate(out_dirs)
            ]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            workers.clear()
            send_finish_jobs(jobs, mgr)
            logger.info("FINISHED SELF_PLAY JOBS %s", jobs['id'])
        elif state == ASYNC_PIPELINE_STATE.EVALUATING.name:
            logger.info("STARTING REMOTE EVALUATION PHASE WITH %s GPUs",
                        len(GPUs))
            workers = [
                EvaluateWorker(i, one_game_only=extract_game_number(out_dir))
                for i, out_dir in enumerate(out_dirs)
            ]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            workers.clear()
            send_finish_jobs(jobs, mgr)
            logger.info("FINISHED EVALUATION JOBS %s", jobs["id"])
        else:
            print("Unhandled state %s. Sleep 5 to wait for new state" % state)
            time.sleep(5)
            continue
def main():
    """Master pipeline: cycle self-play -> training -> evaluation forever.

    The first lap may skip ahead to START_PHASE; once any phase has run,
    every subsequent phase (and lap) runs unconditionally.
    """
    init_directories()
    clean_up_empty()
    resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1))
    sys.setrecursionlimit(10**6)
    gpus = conf['GPUs']
    start_phase = "SELF-PLAY"
    started = False
    while True:
        if started or start_phase == "SELF-PLAY":
            started = True
            logger.info("STARTING SELF_PLAY PHASE WITH %s GPUs", len(gpus))
            turn_on_event(ASYNC_PIPELINE_STATE.SELF_PLAYING)
            init_predicting_workers(gpus)
            workers = [NoModelSelfPlayWorker(gpu) for gpu in gpus]
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()
            # Wait until remote slaves have drained their queues as well.
            while is_slave_working():
                time.sleep(2)
            destroy_predicting_workers(gpus)
            workers.clear()
        if started or start_phase == "TRAINING":
            started = True
            logger.info("STARTING TRAINING PHASE with %s GPUs", len(gpus))
            turn_on_event(ASYNC_PIPELINE_STATE.TRAINING)
            trainer = TrainWorker(list(gpus))
            trainer.start()
            trainer.join()
        if started or start_phase == "EVALUATION":
            started = True
            logger.info("STARTING EVALUATION PHASE WITH %s GPUs", len(gpus))
            turn_on_event(ASYNC_PIPELINE_STATE.EVALUATING)
            init_predicting_workers(gpus)
            workers = [NoModelEvaluateWorker(gpu) for gpu in gpus]
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()
            while is_slave_working():
                time.sleep(2)
            workers.clear()
            destroy_predicting_workers(gpus)
            promote_best_model()
def main():
    """Continuously (re)train the latest model on KGS data across all GPUs,
    checkpointing to ``backup.h5`` every EPOCHS_PER_BACKUP epochs.

    Fixes over the previous version:
    - CUDA_VISIBLE_DEVICES was built with ``str(GPUs).strip(...)`` which left
      interior spaces (e.g. "0, 1"); it is now a clean comma-joined list;
    - unused locals (``smallest_loss``, ``new_name`` and the model-name split
      feeding it) and dead commented-out code removed.
    """
    init_directories()
    clean_up_empty()
    GPUs = conf['GPUs']
    EPOCHS_PER_SAVE = conf['EPOCHS_PER_SAVE']
    BATCH_SIZE = conf['TRAIN_BATCH_SIZE']
    NUM_WORKERS = conf['NUM_WORKERS']
    SIZE = conf['SIZE']
    n_gpu = len(GPUs)
    if n_gpu <= 1:
        raise EnvironmentError(
            "Number of GPU need > 1 for multi-gpus training")
    logger.info("STARTING TRAINING PHASE with %s GPUs", len(GPUs))
    # Pin the visible CUDA devices to the configured GPU ids.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in GPUs)
    global model
    model = load_latest_model()
    opt = SGD(lr=1e-2, momentum=0.9, clipnorm=0.9)
    model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])
    params = {
        'dim': (SIZE, SIZE, 17),
        # Scale the effective batch size with the number of GPUs.
        'batch_size': BATCH_SIZE * n_gpu,
        'shuffle': True
    }
    while True:
        training_generator = KGSDataGenerator([], None, **params)
        reduce_lr = ReduceLROnPlateau(monitor='policy_out_acc',
                                      factor=0.1,
                                      patience=3,
                                      verbose=1,
                                      mode='auto',
                                      min_lr=0)
        callbacks_list = [reduce_lr]
        EPOCHS_PER_BACKUP = conf['EPOCHS_PER_BACKUP']
        cycle = EPOCHS_PER_SAVE // EPOCHS_PER_BACKUP
        for i in range(cycle):
            logger.info("CYCLE {}/{}".format(i + 1, cycle))
            model.fit_generator(generator=training_generator,
                                use_multiprocessing=True,
                                workers=NUM_WORKERS,
                                epochs=EPOCHS_PER_BACKUP,
                                verbose=1,
                                callbacks=callbacks_list)
            model.save(os.path.join(conf['MODEL_DIR'], "backup.h5"))
            logger.info('Auto save model backup.h5')