Example #1
def main():
    sys.setrecursionlimit(2000)
    init_directories()
    clean_up_empty()
    GPUs = conf['GPUs']
    START_PHASE = "EVALUATING"
    while True:
        if START_PHASE != "EVALUATING":
            # SELF-PLAY
            init_predicting_workers(GPUs)
            workers = [
                NoModelSelfPlayWorker(i) for i in range(conf['N_GAME_PROCESS'])
            ]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            destroy_predicting_workers(GPUs)

        # EVALUATE
        init_predicting_workers(
            GPUs
        )  # re-init the predicting workers so they serve the latest trained model (sent from the train server)
        workers = [
            NoModelEvaluateWorker(i) for i in range(conf['N_GAME_PROCESS'])
        ]
        for p in workers:
            p.start()
        for p in workers:
            p.join()
        workers.clear()
        destroy_predicting_workers(GPUs)

        if promote_best_model():
            START_PHASE = ""  # there are new best model so we doing self-play in next loop
Example #2
def main():
    sys.setrecursionlimit(10000)
    init_directories()
    clean_up_empty()
    GPUs = conf['GPUs']
    finished_best_model_name = None
    while True:
        init_predicting_workers(GPUs)
        # Check whether we already ran self-play on this best model
        curr_best_model_name = put_name_request("BEST")
        if curr_best_model_name != finished_best_model_name:
            finished_best_model_name = curr_best_model_name
        else:
            print("No new best model for self-playing. Stopping..")
            destroy_predicting_workers(GPUs)
            break
        print("SELF-PLAYING BEST MODEL ", curr_best_model_name)
        workers = [
            NoModelSelfPlayWorker(i) for i in range(conf['N_GAME_PROCESS'])
        ]
        for p in workers:
            p.start()
        for p in workers:
            p.join()
        destroy_predicting_workers(GPUs)
Example #3
def main():
    init_directories()
    clean_up_empty()
    resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1))
    sys.setrecursionlimit(10**6)
    GPUs = conf['GPUs']

    mgr = registerRemoteFunc()

    while True:
        jobs = mgr.get_job(concurency=len(GPUs))._getvalue()
        logger.info("GOT JOBS %s", jobs)
        out_dirs = jobs['out_dirs']
        assert len(out_dirs) <= len(GPUs)
        state = jobs['state']
        model_check_update(jobs['latest_model_name'], jobs['best_model_name'],
                           mgr)
        if state == ASYNC_PIPELINE_STATE.SELF_PLAYING.name:
            logger.info("STARTING REMOTE SELF_PLAY PHASE WITH %s GPUs",
                        len(GPUs))
            workers = [
                SelfPlayWorker(i, one_game_only=extract_game_number(out_dir))
                for i, out_dir in enumerate(out_dirs)
            ]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            workers.clear()
            send_finish_jobs(jobs, mgr)
            logger.info("FINISHED SELF_PLAY JOBS %", jobs['id'])
        elif state == ASYNC_PIPELINE_STATE.EVALUATING.name:
            logger.info("STARTING REMOTE EVALUATION PHASE WITH %s GPUs",
                        len(GPUs))
            workers = [
                EvaluateWorker(i, one_game_only=extract_game_number(out_dir))
                for i, out_dir in enumerate(out_dirs)
            ]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            workers.clear()
            send_finish_jobs(jobs, mgr)
            logger.info("FINISHED EVALUATION JOBS %", jobs["id"])
        else:
            print("Unhandled state %s. Sleep 5 to wait for new state" % state)
            time.sleep(5)
            continue
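
`registerRemoteFunc()` and the `mgr` object it returns are project code that is not shown in this example. One plausible way to build such a client is with the standard-library `multiprocessing.managers.BaseManager`, sketched below with placeholder host, port, and authkey; under this assumption, `get_job` returns a proxy object, which is why the loop above calls `._getvalue()` to copy the result back into a plain dictionary.

from multiprocessing.managers import BaseManager

class JobManager(BaseManager):
    """Client-side manager class; the remote train server registers the
    real get_job callable under the same name."""
    pass

def registerRemoteFunc():
    # Hypothetical client-side registration: only the name is registered here,
    # so calls to mgr.get_job() are forwarded to the remote server.
    JobManager.register('get_job')
    mgr = JobManager(address=('train-server-host', 50000), authkey=b'placeholder')
    mgr.connect()
    return mgr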
Example #4
def main():
    init_directories()
    clean_up_empty()
    resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1))
    sys.setrecursionlimit(10**6)
    GPUs = conf['GPUs']
    START_PHASE = "SELF-PLAY"
    STARTED = False

    while True:
        if STARTED or START_PHASE == "SELF-PLAY":
            STARTED = True
            logger.info("STARTING SELF_PLAY PHASE WITH %s GPUs", len(GPUs))
            turn_on_event(ASYNC_PIPELINE_STATE.SELF_PLAYING)
            init_predicting_workers(GPUs)
            workers = [NoModelSelfPlayWorker(i) for i in GPUs]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            while is_slave_working():
                time.sleep(2)
            destroy_predicting_workers(GPUs)
            workers.clear()
        if STARTED or START_PHASE == "TRAINING":
            STARTED = True
            logger.info("STARTING TRAINING PHASE with %s GPUs", len(GPUs))
            turn_on_event(ASYNC_PIPELINE_STATE.TRAINING)
            trainer = TrainWorker(list(GPUs))
            trainer.start()
            trainer.join()
        if STARTED or START_PHASE == "EVALUATION":
            STARTED = True
            logger.info("STARTING EVALUATION PHASE WITH %s GPUs", len(GPUs))
            turn_on_event(ASYNC_PIPELINE_STATE.EVALUATING)
            init_predicting_workers(GPUs)
            workers = [NoModelEvaluateWorker(i) for i in GPUs]
            for p in workers:
                p.start()
            for p in workers:
                p.join()
            while is_slave_working():
                time.sleep(2)
            workers.clear()
            destroy_predicting_workers(GPUs)

            promote_best_model()
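
`ASYNC_PIPELINE_STATE` and `turn_on_event()` are defined elsewhere in the project. A minimal sketch of what they could look like, assuming the states are plain `enum.Enum` members and each one is backed by a `multiprocessing.Event` flag that the worker processes poll:

import multiprocessing
from enum import Enum

class ASYNC_PIPELINE_STATE(Enum):
    SELF_PLAYING = 1
    TRAINING = 2
    EVALUATING = 3

# One shared flag per state; created before the workers are forked so that
# every process sees the same Event objects.
_state_events = {state: multiprocessing.Event() for state in ASYNC_PIPELINE_STATE}

def turn_on_event(state):
    # Activate the requested phase and clear the others so workers can tell
    # which part of the pipeline is currently running.
    for s, event in _state_events.items():
        if s is state:
            event.set()
        else:
            event.clear()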
Example #5
def main():
    init_directories()
    clean_up_empty()
    GPUs = conf['GPUs']
    EPOCHS_PER_SAVE = conf['EPOCHS_PER_SAVE']
    BATCH_SIZE = conf['TRAIN_BATCH_SIZE']
    NUM_WORKERS = conf['NUM_WORKERS']
    SIZE = conf['SIZE']
    n_gpu = len(GPUs)
    if n_gpu <= 1:
        raise EnvironmentError(
            "At least two GPUs are required for multi-GPU training")

    logger.info("STARTING TRAINING PHASE with %s GPUs", len(GPUs))
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(GPUs).strip('[').strip(']').strip(
        ' ')

    global model
    model = load_latest_model()

    base_name, index = model.name.split('_')
    smallest_loss = float("inf")

    # try:
    #     model = multi_gpu_model(model, cpu_relocation=True)
    #     print("Training using multiple GPUs..")
    # except:
    #     print("Training using single GPU or CPU..")
    opt = SGD(lr=1e-2, momentum=0.9, clipnorm=0.9)
    model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])

    params = {
        'dim': (SIZE, SIZE, 17),
        'batch_size': BATCH_SIZE * n_gpu,
        'shuffle': True
    }
    while True:
        new_name = "_".join([base_name, str(int(index) + 1)]) + ".h5"
        # partition = get_KGS_training_desc()  # get_training_desc()
        training_generator = KGSDataGenerator([], None, **params)
        # validation_generator = KGSDataGenerator(partition['validation'], None, **params)
        reduce_lr = ReduceLROnPlateau(monitor='policy_out_acc',
                                      factor=0.1,
                                      patience=3,
                                      verbose=1,
                                      mode='auto',
                                      min_lr=0)

        callbacks_list = [reduce_lr]

        EPOCHS_PER_BACKUP = conf['EPOCHS_PER_BACKUP']
        cycle = EPOCHS_PER_SAVE // EPOCHS_PER_BACKUP
        for i in range(cycle):
            logger.info("CYCLE {}/{}".format(i + 1, cycle))
            model.fit_generator(
                generator=training_generator,
                # validation_data=validation_generator,
                use_multiprocessing=True,
                workers=NUM_WORKERS,
                epochs=EPOCHS_PER_BACKUP,
                verbose=1,
                callbacks=callbacks_list)
            model.save(os.path.join(conf['MODEL_DIR'], "backup.h5"))
            logger.info('Auto save model backup.h5')
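
`KGSDataGenerator` is project code; judging from the `use_multiprocessing=True` and `workers=NUM_WORKERS` arguments to `fit_generator`, it is presumably built on Keras's `Sequence` interface. The class below is only a structural sketch with placeholder data so that the `params` dictionary above has something concrete to map onto; the policy and value output shapes are assumptions, and the real generator reads KGS game records instead of returning zero tensors.

import numpy as np
from keras.utils import Sequence

class KGSDataGenerator(Sequence):
    def __init__(self, samples, labels, dim=(19, 19, 17), batch_size=32,
                 shuffle=True):
        self.samples = samples
        self.labels = labels
        self.dim = dim
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        # Batches per epoch; at least one so the sketch runs with an empty list.
        return max(1, len(self.samples) // self.batch_size)

    def __getitem__(self, index):
        # Placeholder batch: input planes plus policy/value targets.
        boards = np.zeros((self.batch_size,) + self.dim)
        policy = np.zeros((self.batch_size, self.dim[0] * self.dim[1] + 1))
        value = np.zeros((self.batch_size, 1))
        return boards, [policy, value]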