def train():
    """Run the customer's training script inside the training environment.

    Reads the checkpoint location and step counts from the hyperparameters, downloads,
    installs and imports the user module, wraps it in a ``Trainer``, starts a parameter
    server when more than one host takes part, and runs training.  Afterwards the master
    exports the saved model if checkpoints were written outside the model directory, while
    every other task waits for the master to go down.  The process always terminates
    through ``os._exit(0)``.
    """
    training_env = cs.TrainingEnvironment()
    hyperparameters = training_env.hyperparameters

    model_path = hyperparameters.get("checkpoint_path", training_env.model_dir)
    steps_to_train = hyperparameters.get('training_steps', 1000)
    steps_to_eval = hyperparameters.get('evaluation_steps', 100)

    # https://github.com/tensorflow/tensorflow/issues/15868
    # The C++ SDK's default S3 request timeout is 3 seconds, which is too short when
    # saving larger checkpoints, so a longer one is set here.
    s3_timeout_msec = hyperparameters.get('s3_checkpoint_save_timeout', 60000)
    os.environ['S3_REQUEST_TIMEOUT_MSEC'] = str(s3_timeout_msec)

    training_env.download_user_module()
    training_env.pip_install_requirements()
    user_module = training_env.import_user_module()

    trainer = Trainer(
        customer_script=user_module,
        current_host=training_env.current_host,
        hosts=training_env.hosts,
        train_steps=steps_to_train,
        eval_steps=steps_to_eval,
        training_path=training_env.channel_dirs[CHANNEL_DIR],
        model_path=model_path,
        output_path=training_env.output_dir,
        customer_params=hyperparameters,
    )

    cluster_config = trainer.build_tf_config()

    # A parameter server is only started for distributed runs.
    is_distributed = len(training_env.hosts) > 1
    if is_distributed:
        _run_ps_server(training_env.current_host, training_env.hosts, cluster_config)

    save_tf_config_env_var(cluster_config)

    try:
        run.train_and_log_exceptions(trainer, training_env.output_dir)

        if trainer.task_type == 'master':
            # Only the master exports the model at the end of the execution, and only
            # when the checkpoints are not already in the model directory.
            if model_path != training_env.model_dir:
                serve.export_saved_model(model_path, training_env.model_dir)
        else:
            _wait_until_master_is_down(_get_master(cluster_config))
    finally:
        # Python threads cannot be stopped, so a hard exit is the only way to end the
        # application.
        # https://stackoverflow.com/questions/9591350/what-is-difference-between-sys-exit0-and-os-exit0
        os._exit(0)
def train():
    """Run the customer's training script inside the training environment.

    Resolves the checkpoint directory, downloads the user module when its archive is an
    ``s3://`` location, installs requirements, imports the module and wraps it in the
    class returned by ``_get_trainer_class()``.  A parameter server is started when more
    than one host takes part.  After training, the master exports the saved model if the
    checkpoints live outside the model directory and the trainer reports that it saves
    training; every other task waits for the master to go down.
    """
    training_env = cs.TrainingEnvironment()
    hyperparameters = training_env.hyperparameters

    model_path = _get_checkpoint_dir(training_env)
    steps_to_train = hyperparameters.get('training_steps', 1000)
    steps_to_eval = hyperparameters.get('evaluation_steps', 100)

    # https://github.com/tensorflow/tensorflow/issues/15868
    # The C++ SDK's default S3 request timeout is 3 seconds, which is too short when
    # saving larger checkpoints, so a longer one is set here.
    s3_timeout_msec = hyperparameters.get('s3_checkpoint_save_timeout', 60000)
    os.environ['S3_REQUEST_TIMEOUT_MSEC'] = str(s3_timeout_msec)

    # The user module is only downloaded when its archive is an S3 location.
    archive_location = training_env.user_script_archive.lower()
    if archive_location.startswith('s3://'):
        training_env.download_user_module()
    training_env.pip_install_requirements()

    user_module = training_env.import_user_module()

    make_trainer = _get_trainer_class()
    trainer = make_trainer(
        customer_script=user_module,
        current_host=training_env.current_host,
        hosts=training_env.hosts,
        train_steps=steps_to_train,
        eval_steps=steps_to_eval,
        input_channels=training_env.channel_dirs,
        model_path=model_path,
        output_path=training_env.output_dir,
        customer_params=hyperparameters,
    )

    cluster_config = trainer.build_tf_config()

    # A parameter server is only started for distributed runs.
    is_distributed = len(training_env.hosts) > 1
    if is_distributed:
        _run_ps_server(training_env.current_host, training_env.hosts, cluster_config)

    save_tf_config_env_var(cluster_config)

    configure_mkl()

    trainer.train()

    # Only the master exports the model at the end of the execution.
    should_export = (
        model_path != training_env.model_dir
        and trainer.task_type == 'master'
        and trainer.saves_training()
    )
    if should_export:
        serve.export_saved_model(model_path, training_env.model_dir)

    if trainer.task_type != 'master':
        _wait_until_master_is_down(_get_master(cluster_config))
# Example #3
# 0
def train():
    """Train a 15-class steering-angle classifier on recorded images and export it.

    Steps, in order:
      1. Extract every ``.zip`` archive found under ``/opt/ml/input/data/train``.
      2. Load every ``record*.json`` file found there through ``get_data``.
      3. Load the referenced images and one-hot encode the angles into 15 bins.
      4. Fit a convolutional Keras model, checkpointing the model with the best
         ``val_loss`` to ``/opt/ml/model/model_cat``.
      5. Save the TensorFlow session as a SavedModel under ``/opt/ml/model/tfModel``
         with the tag ``myTag``.

    Hyperparameters read:
        with_slide: when truthy, each image is paired with the angle recorded two
            records later (the last two images and the first two angles are dropped).

    Raises:
        ValueError: if no ``record*.json`` file is found in the training directory.
    """
    env = cs.TrainingEnvironment()

    print(device_lib.list_local_devices())
    # Create the TensorBoard log directory directly instead of shelling out to `mkdir -p`,
    # so a failure surfaces here rather than being silently ignored.
    os.makedirs('logs', exist_ok=True)

    train_dir = '/opt/ml/input/data/train'

    # ### Loading the files ###
    # Archives are extracted in a first pass so that the second walk also sees the
    # records they contain.
    for root, _dirs, files in os.walk(train_dir):
        for f in files:
            if f.endswith('.zip'):
                unzip_file(root, f)

    data = []
    for root, _dirs, files in os.walk(train_dir):
        data.extend(
            [get_data(root, f) for f in sorted(files, key=str.lower)
             if f.startswith('record') and f.endswith('.json')])

    if not data:
        raise ValueError('no record*.json files found under ' + train_dir)

    # ### Loading angle ###
    # NOTE(review): assumes get_data() rows are laid out as
    # [angle, image directory, image file name, ...] — confirm against get_data.
    angle_array = np.array([d[0] for d in data])

    # ### Loading images ###
    images = np.array([img_to_array(load_img(os.path.join(d[1], d[2]))) for d in data], 'f')

    # slide images vs orders: pair image i with the angle of record i + 2
    if env.hyperparameters.get('with_slide', False):
        images = images[:-2]
        angle_array = angle_array[2:]

    # ### Start training ###
    num_bins = 15

    def linear_bin(a):
        """One-hot encode an angle from [-1, 1] into ``num_bins`` evenly spaced bins."""
        b = int(round((a + 1) / (2.0 / (num_bins - 1))))
        # Clamp to the edge bins: without this an angle above the range raises
        # IndexError and an angle below it wraps around to the last bin through a
        # negative index.
        b = min(max(b, 0), num_bins - 1)
        arr = np.zeros(num_bins)
        arr[b] = 1
        return arr

    logs = callbacks.TensorBoard(log_dir='logs', histogram_freq=0, write_graph=True, write_images=True)
    save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat', monitor='val_loss', verbose=1,
                                          save_best_only=True, mode='min')
    early_stop = callbacks.EarlyStopping(monitor='val_loss',
                                         min_delta=.0005,
                                         patience=10,
                                         verbose=1,
                                         mode='auto')
    callbacks_list = [save_best, early_stop, logs]

    # An explicit session is kept so the graph can be exported to TensorFlow afterwards.
    sess = tf.Session()
    K.set_session(sess)

    # First layer, input layer. Shape comes from camera.py resolution, RGB
    img_in = Input(shape=(128, 160, 3),
                   name='img_in')
    x = img_in
    # 24 features, 5px x 5px kernel (convolution, feature) window, 2w x 2h stride, relu activation
    x = Convolution2D(24, (5, 5), strides=(2, 2), activation='relu')(x)
    # 32 features, 5px x 5px kernel window, 2w x 2h stride, relu activation
    x = Convolution2D(32, (5, 5), strides=(2, 2), activation='relu')(x)
    # 64 features, 5px x 5px kernel window, 2w x 2h stride, relu
    x = Convolution2D(64, (5, 5), strides=(2, 2), activation='relu')(x)
    # 64 features, 3px x 3px kernel window, 2w x 2h stride, relu
    x = Convolution2D(64, (3, 3), strides=(2, 2), activation='relu')(x)
    # 64 features, 3px x 3px kernel window, 1w x 1h stride, relu
    x = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(x)

    # Possibly add MaxPooling (would make it less sensitive to position in the image).
    # The camera angle is fixed, so it may not be needed.

    x = Flatten(name='flattened')(x)  # Flatten to 1D (fully connected)
    x = Dense(100, activation='relu')(x)  # Classify the data into 100 features, make all negatives 0
    # Randomly drop out 10% of the neurons (prevent overfitting)
    x = Dropout(.1)(x)
    x = Dense(50, activation='relu')(x)
    x = Dropout(.1)(x)

    # Categorical output of the angle: connect every input with every output and output
    # 15 units. Softmax gives a 0.0-1.0 score per category; the best one is the prediction.
    angle_out = Dense(num_bins, activation='softmax', name='angle_out')(x)

    angle_cat_array = np.array([linear_bin(a) for a in angle_array])
    model = Model(inputs=[img_in], outputs=[angle_out])
    model.compile(optimizer='adam',
                  loss={'angle_out': 'categorical_crossentropy', },
                  loss_weights={'angle_out': 0.9 })
    model.fit({'img_in': images}, {'angle_out': angle_cat_array, }, batch_size=32,
              epochs=100, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)

    # Save the model in TensorFlow's SavedModel format
    builder = tf.saved_model.builder.SavedModelBuilder("/opt/ml/model/tfModel")

    # Tag the model, required for Go
    builder.add_meta_graph_and_variables(sess, ["myTag"])
    builder.save()
    sess.close()
# Example #4
# 0
def train():
    """Train a model that predicts a steering-angle bin and a throttle from images.

    Steps, in order:
      1. Extract every ``.zip`` archive found under ``/opt/ml/input/data/train``.
      2. Load every ``record*.json`` file found there, in numeric file-name order.
      3. Keep only records whose user throttle exceeds 0.1 and raise the remaining
         throttles to at least 0.2.
      4. Load the referenced images and one-hot encode the angles into 15 bins.
      5. Fit a convolutional Keras model with two outputs (``angle_out`` and
         ``throttle_out``), checkpointing the model with the best ``angle_out_loss``
         to ``/opt/ml/model/model_cat``.

    Hyperparameters read:
        with_slide: when truthy, each image is paired with the angle and throttle
            recorded two records later (the last two images and the first two
            angle/throttle values are dropped).

    Raises:
        ValueError: if no record is left to train on after loading and filtering.
    """
    env = cs.TrainingEnvironment()

    print(device_lib.list_local_devices())
    # Create the TensorBoard log directory directly instead of shelling out to `mkdir -p`,
    # so a failure surfaces here rather than being silently ignored.
    os.makedirs('logs', exist_ok=True)

    train_dir = '/opt/ml/input/data/train'
    digit_runs = re.compile(r'(\d+)')

    def get_data(root, f):
        """Load one record file as a list row.

        Row layout: [user/mode, user/throttle, user/angle, root, cam/image_array],
        followed by [pilot/throttle, pilot/angle] when the record has 'pilot/throttle'.
        """
        # The context manager closes the file even if the JSON is malformed.
        with open(os.path.join(root, f)) as record_file:
            d = json.load(record_file)
        row = [
            d['user/mode'], d['user/throttle'], d['user/angle'], root,
            d['cam/image_array']
        ]
        if 'pilot/throttle' in d:
            row.extend([d['pilot/throttle'], d['pilot/angle']])
        return row

    def numerical_sort(value):
        """Sort key that compares the digit runs of a file name as integers."""
        parts = digit_runs.split(value)
        parts[1::2] = [int(p) for p in parts[1::2]]
        return parts

    def unzip_file(root, f):
        """Extract the archive ``f`` located in ``root`` into ``root``."""
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(os.path.join(root, f), 'r') as zip_ref:
            zip_ref.extractall(root)

    # ### Loading the files ###
    # Archives are extracted in a first pass so that the second walk also sees the
    # records they contain.
    for root, _dirs, files in os.walk(train_dir):
        for f in files:
            if f.endswith('.zip'):
                unzip_file(root, f)

    data = []
    for root, _dirs, files in os.walk(train_dir):
        data.extend([
            get_data(root, f) for f in sorted(files, key=numerical_sort)
            if f.startswith('record') and f.endswith('.json')
        ])

    # Normalize / correct data: keep records with a user throttle above 0.1 and raise
    # the remaining throttles to at least 0.2.
    data = [d for d in data if d[1] > 0.1]
    for d in data:
        if d[1] < 0.2:
            d[1] = 0.2

    if not data:
        raise ValueError(
            'no usable record*.json files (user throttle > 0.1) found under ' + train_dir)

    # ### Loading throttle and angle ###
    # Rows may also carry pilot throttle/angle at indices 5 and 6; training does not
    # use them.
    angle_array = np.array([d[2] for d in data])
    throttle_array = np.array([d[1] for d in data])

    # ### Loading images ###
    images = np.array(
        [img_to_array(load_img(os.path.join(d[3], d[4]))) for d in data], 'f')

    # slide images vs orders: pair image i with the angle/throttle of record i + 2
    if env.hyperparameters.get('with_slide', False):
        images = images[:-2]
        angle_array = angle_array[2:]
        throttle_array = throttle_array[2:]

    # ### Start training ###
    num_bins = 15

    def linear_bin(a):
        """One-hot encode an angle from [-1, 1] into ``num_bins`` evenly spaced bins."""
        b = int(round((a + 1) / (2.0 / (num_bins - 1))))
        # Clamp to the edge bins: without this an angle above the range raises
        # IndexError and an angle below it wraps around to the last bin through a
        # negative index.
        b = min(max(b, 0), num_bins - 1)
        arr = np.zeros(num_bins)
        arr[b] = 1
        return arr

    logs = callbacks.TensorBoard(log_dir='logs',
                                 histogram_freq=0,
                                 write_graph=True,
                                 write_images=True)
    save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat',
                                          monitor='angle_out_loss',
                                          verbose=1,
                                          save_best_only=True,
                                          mode='min')
    early_stop = callbacks.EarlyStopping(monitor='angle_out_loss',
                                         min_delta=.0005,
                                         patience=10,
                                         verbose=1,
                                         mode='auto')
    callbacks_list = [save_best, early_stop, logs]

    # First layer, input layer. Shape comes from camera.py resolution, RGB
    img_in = Input(shape=(120, 160, 3), name='img_in')
    x = img_in
    # 24 features, 5px x 5px kernel (convolution, feature) window, 2w x 2h stride, relu activation
    x = Convolution2D(24, (5, 5), strides=(2, 2), activation='relu')(x)
    # 32 features, 5px x 5px kernel window, 2w x 2h stride, relu activation
    x = Convolution2D(32, (5, 5), strides=(2, 2), activation='relu')(x)
    # 64 features, 5px x 5px kernel window, 2w x 2h stride, relu
    x = Convolution2D(64, (5, 5), strides=(2, 2), activation='relu')(x)
    # 64 features, 3px x 3px kernel window, 2w x 2h stride, relu
    x = Convolution2D(64, (3, 3), strides=(2, 2), activation='relu')(x)
    # 64 features, 3px x 3px kernel window, 1w x 1h stride, relu
    x = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(x)

    # Possibly add MaxPooling (would make it less sensitive to position in the image).
    # The camera angle is fixed, so it may not be needed.

    x = Flatten(name='flattened')(x)  # Flatten to 1D (fully connected)
    # Classify the data into 100 features, make all negatives 0
    x = Dense(100, activation='relu')(x)
    # Randomly drop out 10% of the neurons (prevent overfitting)
    x = Dropout(.1)(x)
    x = Dense(50, activation='relu')(x)
    x = Dropout(.1)(x)

    # Categorical output of the angle: connect every input with every output and output
    # 15 units. Softmax gives a 0.0-1.0 score per category; the best one is the prediction.
    angle_out = Dense(num_bins, activation='softmax', name='angle_out')(x)

    # Continuous output of throttle: reduce to 1 number, positive only
    throttle_out = Dense(1, activation='relu', name='throttle_out')(x)

    angle_cat_array = np.array([linear_bin(a) for a in angle_array])
    model = Model(inputs=[img_in], outputs=[angle_out, throttle_out])
    model.compile(optimizer='adam',
                  loss={
                      'angle_out': 'categorical_crossentropy',
                      'throttle_out': 'mean_absolute_error'
                  },
                  loss_weights={
                      'angle_out': 0.9,
                      'throttle_out': .001
                  })
    model.fit({'img_in': images}, {
        'angle_out': angle_cat_array,
        'throttle_out': throttle_array
    },
              batch_size=32,
              epochs=100,
              verbose=1,
              validation_split=0.2,
              shuffle=True,
              callbacks=callbacks_list)