def main():
    sys.setrecursionlimit(2000)
    init_directories()
    clean_up_empty()
    GPUs = conf['GPUs']
    START_PHASE = "EVALUATING"
    while True:
        # Skip self-play on the first iteration when starting in the EVALUATING phase.
        if START_PHASE != "EVALUATING":
            # SELF-PLAY
            init_predicting_workers(GPUs)
            workers = [NoModelSelfPlayWorker(i)
                       for i in range(conf['N_GAME_PROCESS'])]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            destroy_predicting_workers(GPUs)

        # EVALUATE
        # Re-init predicting workers so they run with the latest trained model
        # (sent from the train server).
        init_predicting_workers(GPUs)
        workers = [NoModelEvaluateWorker(i)
                   for i in range(conf['N_GAME_PROCESS'])]
        for p in workers:
            p.start()
        for p in workers:
            p.join()
        workers.clear()
        destroy_predicting_workers(GPUs)
        if promote_best_model():
            # A new best model was promoted, so run self-play in the next loop iteration.
            START_PHASE = ""
def main():
    sys.setrecursionlimit(10000)
    init_directories()
    clean_up_empty()
    GPUs = conf['GPUs']
    finished_best_model_name = None
    while True:
        init_predicting_workers(GPUs)
        # Check whether we already did self-play on this best model.
        curr_best_model_name = put_name_request("BEST")
        if curr_best_model_name != finished_best_model_name:
            finished_best_model_name = curr_best_model_name
        else:
            print("No new best model for self-playing. Stopping..")
            destroy_predicting_workers(GPUs)
            break
        print("SELF-PLAYING BEST MODEL ", curr_best_model_name)
        workers = [NoModelSelfPlayWorker(i)
                   for i in range(conf['N_GAME_PROCESS'])]
        for p in workers:
            p.start()
        for p in workers:
            p.join()
        destroy_predicting_workers(GPUs)
def main():
    init_directories()
    clean_up_empty()
    resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1))
    sys.setrecursionlimit(10**6)
    GPUs = conf['GPUs']
    mgr = registerRemoteFunc()
    while True:
        # Pull a batch of jobs from the remote manager, at most one per local GPU.
        jobs = mgr.get_job(concurency=len(GPUs))._getvalue()
        logger.info("GOT JOBS %s", jobs)
        out_dirs = jobs['out_dirs']
        assert len(out_dirs) <= len(GPUs)
        state = jobs['state']
        model_check_update(jobs['latest_model_name'], jobs['best_model_name'], mgr)
        if state == ASYNC_PIPELINE_STATE.SELF_PLAYING.name:
            logger.info("STARTING REMOTE SELF_PLAY PHASE WITH %s GPUs", len(GPUs))
            workers = [SelfPlayWorker(i, one_game_only=extract_game_number(out_dir))
                       for i, out_dir in enumerate(out_dirs)]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            workers.clear()
            send_finish_jobs(jobs, mgr)
            logger.info("FINISHED SELF_PLAY JOBS %s", jobs['id'])
        elif state == ASYNC_PIPELINE_STATE.EVALUATING.name:
            logger.info("STARTING REMOTE EVALUATION PHASE WITH %s GPUs", len(GPUs))
            workers = [EvaluateWorker(i, one_game_only=extract_game_number(out_dir))
                       for i, out_dir in enumerate(out_dirs)]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            workers.clear()
            send_finish_jobs(jobs, mgr)
            logger.info("FINISHED EVALUATION JOBS %s", jobs['id'])
        else:
            print("Unhandled state %s. Sleep 5 to wait for new state" % state)
            time.sleep(5)
            continue
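# The dispatcher above passes each job's out_dir through extract_game_number() to pin a
# worker to a single game. The repo's real helper is not shown here; the sketch below is
# only an assumption about its behaviour: it pulls the trailing integer out of a directory
# name such as ".../game_00042". The path layout is hypothetical.
import os
import re


def extract_game_number(out_dir):
    """Return the integer suffix of an output directory name, e.g. 42 for 'game_00042'."""
    match = re.search(r'(\d+)$', os.path.basename(os.path.normpath(out_dir)))
    if match is None:
        raise ValueError("No game number found in %s" % out_dir)
    return int(match.group(1))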
def main():
    init_directories()
    clean_up_empty()
    resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1))
    sys.setrecursionlimit(10**6)
    GPUs = conf['GPUs']
    START_PHASE = "SELF-PLAY"
    STARTED = False
    while True:
        # START_PHASE only selects the entry point; once STARTED, every phase runs in order.
        if STARTED or START_PHASE == "SELF-PLAY":
            STARTED = True
            logger.info("STARTING SELF_PLAY PHASE WITH %s GPUs", len(GPUs))
            turn_on_event(ASYNC_PIPELINE_STATE.SELF_PLAYING)
            init_predicting_workers(GPUs)
            workers = [NoModelSelfPlayWorker(i) for i in GPUs]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            while is_slave_working():
                time.sleep(2)
            destroy_predicting_workers(GPUs)
            workers.clear()
        if STARTED or START_PHASE == "TRAINING":
            STARTED = True
            logger.info("STARTING TRAINING PHASE with %s GPUs", len(GPUs))
            turn_on_event(ASYNC_PIPELINE_STATE.TRAINING)
            trainer = TrainWorker([i for i in GPUs])
            trainer.start()
            trainer.join()
        if STARTED or START_PHASE == "EVALUATION":
            STARTED = True
            logger.info("STARTING EVALUATION PHASE WITH %s GPUs", len(GPUs))
            turn_on_event(ASYNC_PIPELINE_STATE.EVALUATING)
            init_predicting_workers(GPUs)
            workers = [NoModelEvaluateWorker(i) for i in GPUs]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            while is_slave_working():
                time.sleep(2)
            workers.clear()
            destroy_predicting_workers(GPUs)
            promote_best_model()
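# promote_best_model() above gates which network the self-play workers use next. The
# project's implementation (called with no arguments) is not shown; the sketch below only
# illustrates the usual gating idea: promote the candidate when it wins more than a fixed
# share of the evaluation games. The signature, the 0.55 threshold and the file-copy step
# are assumptions for illustration, not the repo's actual code.
import shutil


def promote_best_model_sketch(eval_results, latest_path, best_path, threshold=0.55):
    """Copy the latest model over the best model if its evaluation win rate beats threshold."""
    if not eval_results:
        return False
    wins = sum(1 for r in eval_results if r == "WIN")
    win_rate = wins / float(len(eval_results))
    if win_rate > threshold:
        shutil.copyfile(latest_path, best_path)
        return True
    return False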
def main():
    init_directories()
    clean_up_empty()
    GPUs = conf['GPUs']
    EPOCHS_PER_SAVE = conf['EPOCHS_PER_SAVE']
    BATCH_SIZE = conf['TRAIN_BATCH_SIZE']
    NUM_WORKERS = conf['NUM_WORKERS']
    SIZE = conf['SIZE']
    n_gpu = len(GPUs)
    if n_gpu <= 1:
        raise EnvironmentError(
            "Number of GPUs must be greater than 1 for multi-GPU training")
    logger.info("STARTING TRAINING PHASE with %s GPUs", len(GPUs))
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in GPUs)

    global model
    model = load_latest_model()
    base_name, index = model.name.split('_')
    smallest_loss = float("inf")

    # try:
    #     model = multi_gpu_model(model, cpu_relocation=True)
    #     print("Training using multiple GPUs..")
    # except:
    #     print("Training using single GPU or CPU..")
    opt = SGD(lr=1e-2, momentum=0.9, clipnorm=0.9)
    model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])

    params = {
        'dim': (SIZE, SIZE, 17),
        'batch_size': BATCH_SIZE * n_gpu,
        'shuffle': True,
    }
    while True:
        new_name = "_".join([base_name, str(int(index) + 1)]) + ".h5"
        # partition = get_KGS_training_desc()  # get_training_desc()
        training_generator = KGSDataGenerator([], None, **params)
        # validation_generator = KGSDataGenerator(partition['validation'], None, **params)
        reduce_lr = ReduceLROnPlateau(monitor='policy_out_acc',
                                      factor=0.1,
                                      patience=3,
                                      verbose=1,
                                      mode='auto',
                                      min_lr=0)
        callbacks_list = [reduce_lr]
        EPOCHS_PER_BACKUP = conf['EPOCHS_PER_BACKUP']
        cycle = EPOCHS_PER_SAVE // EPOCHS_PER_BACKUP
        # Train in short cycles, writing a backup checkpoint after each one.
        for i in range(cycle):
            logger.info("CYCLE {}/{}".format(i + 1, cycle))
            model.fit_generator(
                generator=training_generator,
                # validation_data=validation_generator,
                use_multiprocessing=True,
                workers=NUM_WORKERS,
                epochs=EPOCHS_PER_BACKUP,
                verbose=1,
                callbacks=callbacks_list)
            model.save(os.path.join(conf['MODEL_DIR'], "backup.h5"))
            logger.info('Auto save model backup.h5')
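# KGSDataGenerator is the repo's own generator and is not shown here. The sketch below is
# only a minimal keras.utils.Sequence honouring the same constructor kwargs (dim,
# batch_size, shuffle) so the fit_generator() call above is easier to read. The two-headed
# targets ("policy_out", "value_out") and the random data are assumptions for illustration,
# not the project's real parsing of KGS game records.
import numpy as np
from keras.utils import Sequence


class RandomBoardGenerator(Sequence):
    """Toy stand-in for KGSDataGenerator: yields random SIZE x SIZE x 17 feature stacks."""

    def __init__(self, n_samples=1024, dim=(19, 19, 17), batch_size=32, shuffle=True):
        self.n_samples = n_samples
        self.dim = dim
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        # Number of batches per epoch.
        return self.n_samples // self.batch_size

    def __getitem__(self, idx):
        size = self.dim[0]
        boards = np.random.random((self.batch_size,) + self.dim).astype("float32")
        policy = np.random.random((self.batch_size, size * size + 1))
        policy /= policy.sum(axis=1, keepdims=True)  # valid move distribution incl. pass
        value = np.random.uniform(-1, 1, (self.batch_size, 1))
        return boards, {"policy_out": policy, "value_out": value}

    def on_epoch_end(self):
        # A real generator would reshuffle its sample order here when shuffle=True.
        pass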