Example #1
def train_by_multi_gpus(n_gpu=1, epochs=None):
    import tensorflow as tf
    logger.info("Training with %s GPUs", n_gpu)
    if n_gpu > 1:
        with tf.device('/cpu:0'):
            model = load_latest_model()
    else:
        model = load_latest_model()

    with tf.device('/cpu:0'):
        best_model = load_best_model()

    base_name, index = model.name.split('_')
    new_name = "_".join([base_name, str(int(index) + 1)]) + ".h5"

    all_data_file_names = get_file_names_data_dir(os.path.join(SELF_PLAY_DATA, best_model.name))
    tf_callback = TensorBoard(log_dir=os.path.join(conf['LOG_DIR'], new_name),
                              histogram_freq=conf['HISTOGRAM_FREQ'], batch_size=BATCH_SIZE, write_graph=False,
                              write_grads=False)
    nan_callback = TerminateOnNaN()

    if n_gpu > 1:
        pmodel = multi_gpu_model(model, gpus=n_gpu)
        opt = SGD(lr=1e-2, momentum=0.9)
        pmodel.compile(loss=loss, optimizer=opt, metrics=["accuracy"])
    else:
        pmodel = model

    if epochs is None:
        epochs = EPOCHS_PER_SAVE
    for epoch in tqdm.tqdm(range(epochs), desc="Epochs"):
        for worker in tqdm.tqdm(range(NUM_WORKERS), desc="Iteration"):
            files = sample(all_data_file_names, BATCH_SIZE)  # random sample because we use SGD (Stochastic Gradient Descent)

            X = np.zeros((BATCH_SIZE, SIZE, SIZE, 17))
            policy_y = np.zeros((BATCH_SIZE, SIZE * SIZE + 1))
            value_y = np.zeros((BATCH_SIZE, 1))
            for j, filename in enumerate(files):
                with h5py.File(filename, 'r') as f:
                    X[j] = f['board'][:]
                    policy_y[j] = f['policy_target'][:]
                    value_y[j] = f['value_target'][()]

            fake_epoch = epoch * NUM_WORKERS + worker  # used as initial_epoch; `epochs` below is the final epoch index, so each fit() call trains for exactly one epoch

            pmodel.fit(X, [policy_y, value_y],
                       initial_epoch=fake_epoch,
                       epochs=fake_epoch + 1,
                       validation_split=VALIDATION_SPLIT,  # Needed for TensorBoard histograms and gradients
                       callbacks=[tf_callback, nan_callback],
                       verbose=0, batch_size=BATCH_SIZE)

    model.name = new_name.split('.')[0]
    model.save(os.path.join(conf['MODEL_DIR'], new_name))
    logger.info("Finished training with multi GPUs. New model %s saved", new_name)
    return model
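
All of these examples call load_latest_model() and load_best_model() without showing them. Below is a minimal sketch of how such helpers could be written with keras.models.load_model, assuming the conf dictionary, the MODEL_DIR/BEST_MODEL keys, the custom loss object from Example #5, and the model_<index>.h5 naming scheme from Example #1; the directory scan is an assumption, not the project's actual implementation:

import os
from keras.models import load_model

def load_best_model():
    # Assumption: the best network is stored under conf['BEST_MODEL'] in conf['MODEL_DIR']
    # and was compiled with the project's custom `loss`, as in Example #5.
    path = os.path.join(conf['MODEL_DIR'], conf['BEST_MODEL'])
    return load_model(path, custom_objects={'loss': loss})

def load_latest_model():
    # Assumption: trained models are saved as model_<index>.h5 (see Example #1),
    # so the newest one is the file with the highest index.
    candidates = [f for f in os.listdir(conf['MODEL_DIR'])
                  if f.startswith('model_') and f.endswith('.h5')]
    latest = max(candidates, key=lambda f: int(f.split('_')[1].split('.')[0]))
    return load_model(os.path.join(conf['MODEL_DIR'], latest),
                      custom_objects={'loss': loss})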
Example #2
    def load_model(self):
        best_model = load_best_model()
        logger.info("Loaded best model %s", best_model.name)

        latest_model = load_latest_model()
        logger.info("Loaded latest %s", latest_model.name)
        return latest_model, best_model
Example #3
def main():
    print("Starting run (v{})".format(__version__))
    init_directories()
    model_name = "model_1"
    model = create_initial_model(name=model_name)

    while True:
        model = load_latest_model()
        best_model = load_best_model()
        train(model, game_model_name=best_model.name)
        evaluate(best_model, model)
        K.clear_session()
Example #4
def main():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    K.set_session(tf.Session(config=config))
    init_directories()

    while True:
        model = load_latest_model()
        best_model = load_best_model()
        evaluate(best_model, model)
        train(model, game_model_name=best_model.name)
        K.clear_session()
Example #5
def main():

    init_directories()
    model_name = "model_1"
    model = create_initial_model(name=model_name)

    while True:
        model = load_latest_model()
        best_model = load_model(os.path.join(conf['MODEL_DIR'],
                                             conf['BEST_MODEL']),
                                custom_objects={'loss': loss})
        train(model, game_model_name=best_model.name)
        evaluate(best_model, model)
Example #6
def main():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    K.set_session(tf.Session(config=config))
    init_directories()

    model_name = "model_1"
    model = create_initial_model(name=model_name)

    while True:
        model = load_latest_model()
        best_model = load_best_model()
        self_play(best_model, n_games=conf['N_GAMES'], mcts_simulations=conf['MCTS_SIMULATIONS'])
        train(model, game_model_name=best_model.name)
        evaluate(best_model, model)

        K.clear_session()
Example #7
def main():
    model = load_latest_model()
    best_model = load_best_model()
    evaluate(best_model, model)
    K.clear_session()
    
    event_handler = MyHandler()
    observer = Observer()
    observer.schedule(event_handler, path=os.path.join(conf['MODEL_DIR']), recursive=False)
    observer.start()

    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        observer.stop()

    observer.join()
Example #8
    def load_model(self):
        self.best_model = load_best_model()
        self.latest_model = load_latest_model()
Example #9
    def on_created(self, event):
        time.sleep(30)
        model = load_latest_model()
        best_model = load_best_model()
        evaluate(best_model, model)
        K.clear_session()
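
Example #7 schedules a MyHandler instance with watchdog but does not show the class; the on_created hook in Example #9 is presumably its body. A minimal sketch tying the two together, assuming MyHandler subclasses watchdog's FileSystemEventHandler:

import time
from watchdog.events import FileSystemEventHandler

class MyHandler(FileSystemEventHandler):
    def on_created(self, event):
        # Give the writer time to finish saving the new .h5 file before loading it
        # (assumption: 30 seconds is enough, as in Example #9).
        time.sleep(30)
        model = load_latest_model()
        best_model = load_best_model()
        evaluate(best_model, model)
        K.clear_session()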
Example #10
def main():
    init_directories()
    clean_up_empty()
    GPUs = conf['GPUs']
    EPOCHS_PER_SAVE = conf['EPOCHS_PER_SAVE']
    BATCH_SIZE = conf['TRAIN_BATCH_SIZE']
    NUM_WORKERS = conf['NUM_WORKERS']
    SIZE = conf['SIZE']
    n_gpu = len(GPUs)
    if n_gpu <= 1:
        raise EnvironmentError(
            "Multi-GPU training requires more than one GPU")

    logger.info("STARTING TRAINING PHASE with %s GPUs", len(GPUs))
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(GPUs).strip('[').strip(']').strip(
        ' ')

    global model
    model = load_latest_model()

    base_name, index = model.name.split('_')
    smallest_loss = Inf

    # try:
    #     model = multi_gpu_model(model, cpu_relocation=True)
    #     print("Training using multiple GPUs..")
    # except:
    #     print("Training using single GPU or CPU..")
    opt = SGD(lr=1e-2, momentum=0.9, clipnorm=0.9)
    model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])

    params = {
        'dim': (SIZE, SIZE, 17),
        'batch_size': BATCH_SIZE * n_gpu,
        'shuffle': True
    }
    while True:
        new_name = "_".join([base_name, str(int(index) + 1)]) + ".h5"
        # partition = get_KGS_training_desc()  # get_training_desc()
        training_generator = KGSDataGenerator([], None, **params)
        # validation_generator = KGSDataGenerator(partition['validation'], None, **params)
        reduce_lr = ReduceLROnPlateau(monitor='policy_out_acc',
                                      factor=0.1,
                                      patience=3,
                                      verbose=1,
                                      mode='auto',
                                      min_lr=0)

        callbacks_list = [reduce_lr]

        EPOCHS_PER_BACKUP = conf['EPOCHS_PER_BACKUP']
        cycle = EPOCHS_PER_SAVE // EPOCHS_PER_BACKUP
        for i in range(cycle):
            logger.info("CYCLE {}/{}".format(i + 1, cycle))
            model.fit_generator(
                generator=training_generator,
                # validation_data=validation_generator,
                use_multiprocessing=True,
                workers=NUM_WORKERS,
                epochs=EPOCHS_PER_BACKUP,
                verbose=1,
                callbacks=callbacks_list)
            model.save(os.path.join(conf['MODEL_DIR'], "backup.h5"))
            logger.info('Auto save model backup.h5')
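
Example #10 drives fit_generator with a KGSDataGenerator built from the params dict, but the class itself is not shown. A minimal sketch of a compatible generator based on keras.utils.Sequence follows; the constructor signature mirrors the params keys, and the zero-filled batches are placeholders, since the real class reads KGS game records:

import numpy as np
from keras.utils import Sequence

class KGSDataGenerator(Sequence):
    def __init__(self, list_IDs, labels, dim=(19, 19, 17), batch_size=32, shuffle=True):
        self.list_IDs = list_IDs      # sample identifiers (Example #10 passes an empty list)
        self.labels = labels
        self.dim = dim
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        # Number of batches per epoch.
        return max(1, len(self.list_IDs) // self.batch_size)

    def __getitem__(self, index):
        # Placeholder batch: board planes plus [policy, value] targets,
        # matching the two-headed network used in Example #1.
        X = np.zeros((self.batch_size,) + self.dim)
        policy_y = np.zeros((self.batch_size, self.dim[0] * self.dim[1] + 1))
        value_y = np.zeros((self.batch_size, 1))
        return X, [policy_y, value_y]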
Example #11
    def load_model(self):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu_id)
        self.best_model = load_best_model()
        self.latest_model = load_latest_model()