Code Example #1
def main(*args):
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    steps_per_epoch = int(
        math.ceil(
            float(NUM_SAMPLES_TRAIN) /
            (flags.batch_size * num_gpus * num_workers)))

    if flags.is_training:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                optimizer_fn=mox.get_optimizer_fn(name='adam',
                                                  learning_rate=0.001),
                run_mode=mox.ModeKeys.TRAIN,
                batch_size=flags.batch_size,
                log_dir=flags.train_url,
                max_number_of_steps=steps_per_epoch * 150,
                log_every_n_steps=20,
                save_summary_steps=50,
                save_model_secs=120,
                export_model=mox.ExportKeys.TF_SERVING)
    else:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.EVAL,
                batch_size=5,
                log_every_n_steps=1,
                max_number_of_steps=int(NUM_SAMPLES_EVAL / 5),
                checkpoint_path=flags.train_url)
        mox.run(input_fn=input_fn,
                output_fn=output_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.PREDICT,
                batch_size=24,
                max_number_of_steps=int(NUM_SAMPLES_TEST / 24),
                log_every_n_steps=50,
                output_every_n_steps=int(NUM_SAMPLES_TEST / 24),
                checkpoint_path=flags.train_url)

        # Write results to a file. tf.gfile allows writing files to EBS/S3.
        submission_file = os.path.join(flags.train_url, 'submission.csv')
        result = submission.to_csv(path_or_buf=None, index=False)
        with tf.gfile.Open(submission_file, 'w') as f:
            f.write(result)
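
The step arithmetic above is easy to sanity-check by hand. A minimal sketch, assuming a single GPU on a single worker and the iceberg defaults that appear later on this page (NUM_SAMPLES_TRAIN = 1176, batch_size = 16):

import math

NUM_SAMPLES_TRAIN = 1176
batch_size, num_gpus, num_workers = 16, 1, 1

steps_per_epoch = int(math.ceil(
    float(NUM_SAMPLES_TRAIN) / (batch_size * num_gpus * num_workers)))
assert steps_per_epoch == 74           # ceil(1176 / 16)
assert steps_per_epoch * 150 == 11100  # max_number_of_steps for 150 epochs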
Code Example #2
def config_bs_ims(strategy):
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    res = []
    if ":" not in strategy:
        image_size, batch_size = strategy.split('-')
        return [(float(image_size), float(batch_size))]
    else:
        stags = strategy.split(",")
        last_steps, last_epoch = 0, 0
        for i in range(len(stags)):
            cur_epoch, value = stags[i].strip().split(':')
            image_size, batch_size = value.strip().split('-')
            cur_epoch, image_size, batch_size = float(cur_epoch), float(
                image_size), float(batch_size)
            cur_batch_tot = batch_size * num_gpus * num_workers
            cur_steps = int(
                round(math.ceil(flags.num_samples / float(cur_batch_tot)))) * (
                    cur_epoch - last_epoch) + last_steps
            res.append((int(cur_steps), int(image_size), int(batch_size)))
            last_steps, last_epoch = cur_steps, cur_epoch
    return res
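
The bookkeeping in config_bs_ims is easiest to follow with concrete numbers. A minimal re-implementation of the loop above with made-up inputs (1000 samples, one GPU, one worker, strategy '2:128-100,4:224-50'):

import math

num_samples, num_gpus, num_workers = 1000, 1, 1
res, last_steps, last_epoch = [], 0, 0
for stage in '2:128-100,4:224-50'.split(','):
    cur_epoch, value = stage.strip().split(':')
    image_size, batch_size = value.strip().split('-')
    cur_epoch, image_size, batch_size = (
        float(cur_epoch), float(image_size), float(batch_size))
    cur_batch_tot = batch_size * num_gpus * num_workers
    cur_steps = int(round(math.ceil(num_samples / float(cur_batch_tot)))) * (
        cur_epoch - last_epoch) + last_steps
    res.append((int(cur_steps), int(image_size), int(batch_size)))
    last_steps, last_epoch = cur_steps, cur_epoch

# Each entry is (cumulative step boundary, image size, batch size):
assert res == [(20, 128, 100), (60, 224, 50)]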
Code Example #3
def convert_ps_to_controller():
    # ps0 -> worker0
    # ps1 -> worker1
    # worker0 -> controller
    # worker1 -> sleep
    job_name = mox.get_flag('job_name')
    task_index = mox.get_flag('task_index')
    ps_hosts = mox.get_flag('ps_hosts')
    worker_hosts = mox.get_flag('worker_hosts')

    mox.set_flag('ps_hosts', '')
    mox.set_flag('worker_hosts', ps_hosts)
    mox.set_flag('controller_host', worker_hosts.split(',')[0])

    if job_name == 'ps':
        tf.logging.info('convert ps to worker')
        mox.set_flag('job_name', 'worker')
    elif job_name == 'worker' and task_index == 0:
        tf.logging.info('convert worker-0 to controller')
        mox.set_flag('job_name', 'controller')
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    else:
        tf.logging.info('sleep unused server')
        time.sleep(9999999)
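
The flag shuffle above is clearer with concrete addresses. A minimal sketch with made-up hosts, mirroring the set_flag calls:

# Hypothetical cluster before conversion:
ps_hosts = '10.0.0.1:2222,10.0.0.2:2222'
worker_hosts = '10.0.0.3:2222,10.0.0.4:2222'

# After the set_flag calls above:
new_worker_hosts = ps_hosts                    # ps0/ps1 are re-labelled as workers
controller_host = worker_hosts.split(',')[0]   # former worker0 becomes the controller

assert controller_host == '10.0.0.3:2222'
# The former worker1 (10.0.0.4:2222) falls through to the final branch and sleeps.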
Code Example #4
def main(*args):
    _data_url = flags.data_url
    _train_url = flags.train_url
    if not mox.file.is_directory(_train_url):
        mox.file.make_dirs(_train_url)
    mox.file.make_dirs('/cache/data_url')
    mox.file.make_dirs('/cache/train_url')
    mox.file.copy_parallel(_data_url, '/cache/data_url')
    mox.file.copy_parallel(_train_url, '/cache/train_url')
    flags.data_url = '/cache/data_url'
    flags.train_url = '/cache/train_url'
    atexit.register(
        lambda: mox.file.copy_parallel('/cache/train_url', _train_url))
    logger = logging.getLogger()
    while logger.handlers:
        logger.handlers.pop()

    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    steps_per_epoch = int(
        math.ceil(
            float(NUM_SAMPLES_TRAIN) /
            (flags.batch_size * num_gpus * num_workers)))
    submission = pd.DataFrame(columns=['id', 'is_iceberg'])

    def input_fn(run_mode, **kwargs):
        if run_mode == mox.ModeKeys.TRAIN:
            num_samples = NUM_SAMPLES_TRAIN
            num_epochs = None
            shuffle = True
            file_pattern = 'iceberg-train-*.tfrecord'
        else:
            num_epochs = 1
            shuffle = False
            if run_mode == mox.ModeKeys.EVAL:
                num_samples = NUM_SAMPLES_EVAL
                file_pattern = 'iceberg-eval-*.tfrecord'
            else:
                num_samples = NUM_SAMPLES_TEST
                file_pattern = 'iceberg-test-*.tfrecord'
        keys_to_features = {
            'band_1':
            tf.FixedLenFeature((75 * 75, ), tf.float32, default_value=None),
            'band_2':
            tf.FixedLenFeature((75 * 75, ), tf.float32, default_value=None),
            'angle':
            tf.FixedLenFeature([1], tf.float32, default_value=None),
        }
        items_to_handlers = {
            'band_1': slim.tfexample_decoder.Tensor('band_1', shape=[75, 75]),
            'band_2': slim.tfexample_decoder.Tensor('band_2', shape=[75, 75]),
            'angle': slim.tfexample_decoder.Tensor('angle', shape=[])
        }
        if run_mode == mox.ModeKeys.PREDICT:
            keys_to_features['id'] = tf.FixedLenFeature([1],
                                                        tf.string,
                                                        default_value=None)
            items_to_handlers['id'] = slim.tfexample_decoder.Tensor('id',
                                                                    shape=[])
        else:
            keys_to_features['label'] = tf.FixedLenFeature([1],
                                                           tf.int64,
                                                           default_value=None)
            items_to_handlers['label'] = slim.tfexample_decoder.Tensor(
                'label', shape=[])
        dataset = mox.get_tfrecord(dataset_dir=flags.data_url,
                                   file_pattern=file_pattern,
                                   num_samples=num_samples,
                                   keys_to_features=keys_to_features,
                                   items_to_handlers=items_to_handlers,
                                   num_epochs=num_epochs,
                                   shuffle=shuffle)
        if run_mode == mox.ModeKeys.PREDICT:
            band_1, band_2, id_or_label, angle = dataset.get(
                ['band_1', 'band_2', 'id', 'angle'])
            # A non-DMA-safe string tensor may not be copied to a GPU,
            # so we encode the string as a list of integers.
            id_or_label = tf.py_func(
                lambda str: np.array([ord(ch) for ch in str]), [id_or_label],
                tf.int64)
            # We know `id` is a string of 8 characters.
            id_or_label = tf.reshape(id_or_label, shape=(8, ))
        else:
            band_1, band_2, id_or_label, angle = dataset.get(
                ['band_1', 'band_2', 'label', 'angle'])
        band_3 = band_1 + band_2

        # Rescale the input image to [0, 1]
        def rescale(*args):
            ret_images = []
            for image in args:
                image = tf.cast(image, tf.float32)
                image_min = tf.reduce_min(image)
                image_max = tf.reduce_max(image)
                image = (image - image_min) / (image_max - image_min)
                ret_images.append(image)
            return ret_images

        band_1, band_2, band_3 = rescale(band_1, band_2, band_3)
        image = tf.stack([band_1, band_2, band_3], axis=2)
        # Data augmentation
        if run_mode == mox.ModeKeys.TRAIN:
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_flip_up_down(image)
            image = tf.image.rot90(image,
                                   k=tf.random_uniform(shape=(),
                                                       maxval=3,
                                                       minval=0,
                                                       dtype=tf.int32))
        return image, id_or_label, angle

    def model_v1(images, angles, run_mode):
        is_training = (run_mode == mox.ModeKeys.TRAIN)
        # Conv Layer 1
        x = Conv2D(64,
                   kernel_size=(3, 3),
                   activation='relu',
                   input_shape=(75, 75, 3))(images)
        x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv Layer 2
        x = Conv2D(128, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv Layer 3
        x = Conv2D(128, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv Layer 4
        x = Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Flatten the data for upcoming dense layers
        x = Flatten()(x)
        x = Concatenate()([x, angles])
        # Dense Layers
        x = Dense(512)(x)
        x = Activation('relu')(x)
        x = Dropout(0.2)(x, training=is_training)
        # Dense Layer 2
        x = Dense(256)(x)
        x = Activation('relu')(x)
        x = Dropout(0.2)(x, training=is_training)
        # Output layer (2-class logits)
        logits = Dense(2)(x)
        return logits

    def model_fn(inputs, run_mode, **kwargs):
        # In train or eval, id_or_labels represents labels. In predict, id_or_labels represents id.
        images, id_or_labels, angles = inputs
        # Reshape angles from [batch_size] to [batch_size, 1]
        angles = tf.expand_dims(angles, 1)
        # Apply your version of model
        logits = model_v1(images, angles, run_mode)
        if run_mode == mox.ModeKeys.PREDICT:
            logits = tf.nn.softmax(logits)
            # Clip the probabilities to bound the log loss.
            logits = tf.clip_by_value(logits,
                                      clip_value_min=0.05,
                                      clip_value_max=0.95)
            model_spec = mox.ModelSpec(output_info={
                'id': id_or_labels,
                'logits': logits
            })
        else:
            labels_one_hot = slim.one_hot_encoding(id_or_labels, 2)
            loss = tf.losses.softmax_cross_entropy(
                logits=logits,
                onehot_labels=labels_one_hot,
                label_smoothing=0.0,
                weights=1.0)
            model_spec = mox.ModelSpec(loss=loss, log_info={'loss': loss})
        return model_spec

    def output_fn(outputs):
        global submission
        for output in outputs:
            for id, logits in zip(output['id'], output['logits']):
                # Decode id from integer list to string.
                id = ''.join([chr(ch) for ch in id])
                # Get the probability of label==1
                is_iceberg = logits[1]
                df = pd.DataFrame([[id, is_iceberg]],
                                  columns=['id', 'is_iceberg'])
                submission = submission.append(df)

    if flags.is_training:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                optimizer_fn=mox.get_optimizer_fn(name='adam',
                                                  learning_rate=0.001),
                run_mode=mox.ModeKeys.TRAIN,
                batch_size=flags.batch_size,
                log_dir=flags.train_url,
                max_number_of_steps=steps_per_epoch * 150,
                log_every_n_steps=20,
                save_summary_steps=50,
                save_model_secs=120)
    else:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.EVAL,
                batch_size=5,
                log_every_n_steps=1,
                max_number_of_steps=int(NUM_SAMPLES_EVAL / 5),
                checkpoint_path=flags.train_url)
        mox.run(input_fn=input_fn,
                output_fn=output_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.PREDICT,
                batch_size=24,
                max_number_of_steps=int(NUM_SAMPLES_TEST / 24),
                log_every_n_steps=50,
                output_every_n_steps=int(NUM_SAMPLES_TEST / 24),
                checkpoint_path=flags.train_url)
        # Write results to a file. tf.gfile allows writing files to EBS/S3.
        submission_file = os.path.join(flags.train_url, 'submission.csv')
        result = submission.to_csv(path_or_buf=None, index=False)
        with tf.gfile.Open(submission_file, 'w') as f:
            f.write(result)
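
The tf.py_func call in input_fn and the chr() loop in output_fn form an encode/decode pair around the GPU copy. A minimal pure-Python sketch of that round trip (the id value is hypothetical; only its 8-character length matters):

import numpy as np

sample_id = 'a1b2c3d4'                              # hypothetical 8-character id
encoded = np.array([ord(ch) for ch in sample_id])   # what tf.py_func produces
decoded = ''.join(chr(ch) for ch in encoded)        # what output_fn reconstructs
assert decoded == sample_id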
Code Example #5
_data_url = flags.data_url
_train_url = flags.train_url
if not mox.file.is_directory(_train_url):
    mox.file.make_dirs(_train_url)
mox.file.make_dirs('/cache/data_url')
mox.file.make_dirs('/cache/train_url')
mox.file.copy_parallel(_data_url, '/cache/data_url')
mox.file.copy_parallel(_train_url, '/cache/train_url')
flags.data_url = '/cache/data_url'
flags.train_url = '/cache/train_url'
atexit.register(lambda: mox.file.copy_parallel('/cache/train_url', _train_url))
logger = logging.getLogger()
while logger.handlers:
    logger.handlers.pop()

num_gpus = mox.get_flag('num_gpus')
num_workers = len(mox.get_flag('worker_hosts').split(','))
steps_per_epoch = int(
    math.ceil(
        float(NUM_SAMPLES_TRAIN) /
        (flags.batch_size * num_gpus * num_workers)))
submission = pd.DataFrame(columns=['id', 'is_iceberg'])


def input_fn(run_mode, **kwargs):
    if run_mode == mox.ModeKeys.TRAIN:
        num_samples = NUM_SAMPLES_TRAIN
        num_epochs = None
        shuffle = True
        file_pattern = 'iceberg-train-*.tfrecord'
    else:
Code Example #6
def main(*args, **kwargs):
    if flags.use_controller:
        convert_ps_to_controller()

    job_name = mox.get_flag('job_name')
    task_index = mox.get_flag('task_index')

    if flags.local_cache == 'hard':
        if flags.use_controller:
            # In all-reduce mode, worker-0 does not download dataset (controller-0 will download).
            imagenet_data, imagenet_160_data = download_dataset(
                flags.data_url,
                flags.data_url_160,
                skip_download=(job_name == 'worker' and task_index == 0))
        else:
            # The PS does not download the dataset.
            imagenet_data, imagenet_160_data = download_dataset(
                flags.data_url,
                flags.data_url_160,
                skip_download=(job_name == 'ps'))

        log_dir = '/cache/cache-outputs'
    else:
        imagenet_data = flags.data_url
        imagenet_160_data = flags.data_url_160
        log_dir = flags.train_url

    print('Dataset download finished at %s' % time.time())

    if (not job_name or
        (job_name == 'worker' and task_index == 0)) and flags.train_url:
        if not mox.file.is_directory(log_dir):
            mox.file.make_dirs(log_dir)
    else:
        log_dir = None

    model_meta = mox.get_model_meta(flags.model_name)
    labels_offset = model_meta.default_labels_offset
    num_workers = len(mox.get_flag('worker_hosts').split(','))

    assert flags.bs_and_ims_strategy is not None
    scheduler = config_bs_ims(flags.bs_and_ims_strategy)
    max_step = int(scheduler[-1][0])

    def input_fn(mode, **kwargs):

        if not flags.synthetic:
            ds_strategy_spec = []
            ds_switch_steps = []

            if flags.split_dataset_like_mxnet and mox.get_flag('job_name'):
                if num_workers == 4:
                    file_pattern = 'train-*-of-*-node-%d-*-*' % task_index
                elif num_workers == 8:
                    file_pattern = 'train-*-of-*-node-*-%d-*' % task_index
                elif num_workers == 16:
                    file_pattern = 'train-*-of-*-node-*-*-%d' % task_index
                else:
                    raise ValueError('num_workers should be 4, 8, 16')

            else:
                file_pattern = flags.file_pattern

            for step, ims, bs in scheduler:
                # Switch to the next dataset 2 steps early because there are
                # 2 pipeline prefetch queues.
                ds_switch_steps.append(step - 2)
                if ims == 128:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_160_data,
                                      file_pattern), bs, ims, 0.08))
                elif ims == 224:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_data,
                                      file_pattern), bs, ims, 0.087))
                elif ims == 288:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_data,
                                      file_pattern), bs, ims, 0.5))
                else:
                    raise ValueError('image size is not in [128, 224, 288]')

            # The last dataset stage does not need a switch step.
            ds_switch_steps.pop(-1)
            tf.logging.info('Dataset will be switched at step: %s' %
                            ds_switch_steps)

            dataset = ProgressiveImagenetDataset(
                num_samples=flags.num_samples,
                strategy_spec=ds_strategy_spec,
                ds_switch_steps=ds_switch_steps,
                shuffle=True,
                num_parallel=flags.num_readers,
                labels_offset=labels_offset,
                private_num_threads=flags.private_num_threads,
                shuffle_buffer_size=512 * 8 * 2)

            image, label = dataset.get(['image', 'label'])

            image_shape = tf.shape(image)[2]
            batch_size = tf.shape(label)[0]
            tf.summary.scalar(name='image_shape', tensor=image_shape)
            tf.summary.scalar(name='batch_size', tensor=batch_size)

        else:

            import numpy as np
            image = tf.constant(
                np.random.randint(low=0,
                                  high=255,
                                  size=[flags.batch_size, 128, 128, 3],
                                  dtype=np.uint8))
            label = tf.constant(
                np.random.randint(low=0,
                                  high=999,
                                  size=[flags.batch_size],
                                  dtype=np.int64))

        if flags.split_to_device:
            input_spec = mox.InputSpec(split_to_device=True)
            input_spec.new_input([image, label])
            return input_spec
        else:
            return image, label

    def model_fn(inputs, mode, **kwargs):
        if not flags.gpu_synthetic:
            if flags.split_to_device:
                images, labels = inputs.get_input(0)
            else:
                images, labels = inputs
        else:
            import numpy as np
            images = tf.constant(
                np.random.randint(low=0,
                                  high=255,
                                  size=[flags.batch_size, 128, 128, 3],
                                  dtype=np.uint8))
            labels = tf.constant(
                np.random.randint(low=0,
                                  high=999,
                                  size=[flags.batch_size],
                                  dtype=np.int64))

        if flags.fp16:
            images = tf.cast(images, tf.float16)

        def preprocess_fn(images, run_mode, *args):
            images = images / 255.0
            channels = tf.split(axis=3, num_or_size_splits=3, value=images)
            for i in range(3):
                channels[i] = (channels[i] - mean[i]) / std[i]
            images = tf.concat(axis=3, values=channels)
            if flags.data_format == 'NCHW':
                images = tf.transpose(images, perm=(0, 3, 1, 2))
            return images

        model_kwargs = {}
        if flags.model_name == 'resnet_v1_50_8k':
            if flags.official_stride:
                model_kwargs['official'] = True
            if flags.fastai_initializer:
                model_kwargs['weights_initializer_params'] = {
                    'factor': 2.0 / 1.3,
                    'mode': 'FAN_OUT'
                }

        mox_model_fn = mox.get_model_fn(name=flags.model_name,
                                        run_mode=mode,
                                        num_classes=1000,
                                        preprocess_fn=preprocess_fn,
                                        weight_decay=flags.weight_decay,
                                        data_format=flags.data_format,
                                        batch_norm_fused=True,
                                        batch_renorm=False,
                                        **model_kwargs)

        logits, end_points = mox_model_fn(images)

        labels_one_hot = slim.one_hot_encoding(labels, 1000)
        loss = tf.losses.softmax_cross_entropy(logits=logits,
                                               onehot_labels=labels_one_hot,
                                               label_smoothing=0.0,
                                               weights=1.0)

        logits_fp32 = tf.cast(logits, tf.float32)
        accuracy_top_1 = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 1), tf.float32))
        accuracy_top_5 = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 5), tf.float32))

        log_info = {
            'ent_loss': loss,
            'top-1': accuracy_top_1,
            'top-5': accuracy_top_5
        }

        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        if len(regularization_losses
               ) > 0 and flags.use_optimizer != 'dymomentumw':
            regularization_loss = tf.add_n(regularization_losses)
            log_info['reg_loss'] = regularization_loss
            loss = loss + regularization_loss
            log_info['total_loss'] = loss

        return mox.ModelSpec(loss=loss, log_info=log_info)

    if flags.strict_sync_replicas:
        mox.set_flag('sync_replicas', False)
        mox.set_flag('chief_inc_global_step', True)

    def optimizer_fn():
        global_step = tf.train.get_or_create_global_step()
        decay_end = 1.0 - flags.cooldown

        if flags.use_lr_schedule == 'lcd':
            lr = linear_cosine_decay(flags.max_lr, flags.min_lr, global_step,
                                     max_step, flags.warmup, decay_end)
            print("Using Linear Cosine Decay Schedule")
        elif flags.use_lr_schedule == 'poly':
            lr = polynomial_decay(flags.max_lr, flags.min_lr, global_step,
                                  max_step, flags.warmup, decay_end)
            print("Using Polynomial Decay Schedule")
        else:
            raise ValueError("lr schedule not provided")

        if flags.use_optimizer == 'dymomentum':
            opt = DyMomentumOptimizer(lr,
                                      flags.max_lr,
                                      flags.min_lr,
                                      max_mom=flags.max_mom,
                                      min_mom=flags.min_mom,
                                      global_step=global_step,
                                      max_iteration=max_step,
                                      use_nesterov=flags.use_nesterov,
                                      cooldown=flags.cooldown,
                                      use_lars=flags.use_lars,
                                      weight_decay=flags.weight_decay)
            print("Using Dynamic Momentum Optimizer")
        elif flags.use_optimizer == 'dymomentumw':
            opt = DyMomentumWOptimizer(lr,
                                       flags.max_lr,
                                       flags.min_lr,
                                       max_mom=flags.max_mom,
                                       min_mom=flags.min_mom,
                                       global_step=global_step,
                                       max_iteration=max_step,
                                       use_nesterov=flags.use_nesterov,
                                       cooldown=flags.cooldown,
                                       use_lars=flags.use_lars,
                                       weight_decay=flags.weight_decay)
            print("Using Dynamic MomentumW Optimizer")
        else:
            raise ValueError("Optimizer not provided")

        tf.summary.scalar(name='momentum', tensor=opt.get_momentum())

        if flags.strict_sync_replicas:
            from moxing.tensorflow.optimizer.simple_sync_optimizer import SimpleSyncOptimizer
            opt = SimpleSyncOptimizer(opt,
                                      num_workers=num_workers,
                                      task_index=task_index)

        return opt

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=flags.run_mode,
            batch_size=flags.batch_size,
            max_number_of_steps=max_step,
            log_every_n_steps=flags.log_every_n_steps,
            log_dir=log_dir,
            auto_batch=False,
            save_summary_steps=flags.save_summary_steps,
            checkpoint_path=flags.checkpoint_url,
            save_model_secs=flags.save_model_secs)

    print('Model upload finished at %s' % time.time())

    if flags.local_cache == 'hard' and log_dir:
        mox.file.copy_parallel(log_dir, flags.train_url)

    print('Training job finished at: %s' % time.time())
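
preprocess_fn in model_fn above normalizes per channel and optionally transposes to NCHW; the excerpt never defines `mean` and `std`. A minimal numpy sketch, assuming they are the usual per-channel ImageNet statistics (an assumption, not taken from the excerpt):

import numpy as np

mean = [0.485, 0.456, 0.406]   # assumed ImageNet channel means
std = [0.229, 0.224, 0.225]    # assumed ImageNet channel stddevs

images = np.random.randint(0, 255, size=(2, 128, 128, 3)).astype(np.float32)
images = images / 255.0
for i in range(3):
    images[..., i] = (images[..., i] - mean[i]) / std[i]
images_nchw = np.transpose(images, (0, 3, 1, 2))  # NHWC -> NCHW
assert images_nchw.shape == (2, 3, 128, 128)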
Code Example #7
def main(_):
    # Get the number of GPUs in use and the number of worker nodes
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    data_meta = mox.ImageClassificationRawMetadata(base_dir=flags.data_url)

    def input_fn(mode):
        # Create a data augmentation function based on the ResNet-50 paper
        augmentation_fn = mox.get_data_augmentation_fn(name='resnet_v1_50',
                                                       run_mode=mode,
                                                       output_height=224,
                                                       output_width=224)

        # Create the dataset reader and pass in the augmentation function; read at most 20 epochs
        dataset = mox.ImageClassificationRawDataset(
            data_meta,
            batch_size=flags.batch_size,
            num_epochs=20,
            augmentation_fn=augmentation_fn)
        image, label = dataset.get(['image', 'label'])
        return image, label

    def model_fn(inputs, mode):
        images, labels = inputs

        # Get a ResNet-50 model that maps images to logits and end_points;
        # we ignore end_points and keep only the logits
        logits, _ = mox.get_model_fn(name='resnet_v1_50',
                                     run_mode=mode,
                                     num_classes=data_meta.num_classes,
                                     weight_decay=0.00004)(images)

        # Compute the cross-entropy loss
        labels_one_hot = slim.one_hot_encoding(labels, data_meta.num_classes)
        loss = tf.losses.softmax_cross_entropy(logits=logits,
                                               onehot_labels=labels_one_hot)

        # Get the regularization losses and add them to the loss;
        # mox.get_collection must be used here instead of tf.get_collection
        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        regularization_loss = tf.add_n(regularization_losses)
        loss = loss + regularization_loss

        # Compute the classification accuracy
        accuracy = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits, labels, 1), tf.float32))

        # Return ModelSpec, the class MoXing-TensorFlow uses to define the model
        return mox.ModelSpec(loss=loss,
                             log_info={
                                 'loss': loss,
                                 'accuracy': accuracy
                             })

    def optimizer_fn():
        # Use a piecewise learning rate: 0.01 for epochs 0-10 and 0.001 for epochs 10-20
        lr = learning_rate_scheduler.piecewise_lr(
            '10:0.01,20:0.001',
            num_samples=data_meta.total_num_samples,
            global_batch_size=flags.batch_size * num_gpus * num_workers)
        return tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=mox.ModeKeys.TRAIN,
            max_number_of_steps=sys.maxsize,
            log_dir=flags.train_url)
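
The schedule string '10:0.01,20:0.001' is expressed in epochs, so the actual step boundaries depend on the global batch size. A minimal sketch of the boundary arithmetic with made-up numbers (3200 samples, batch size 32, one GPU, one worker); the exact boundary handling inside learning_rate_scheduler.piecewise_lr may differ:

import math

num_samples, global_batch_size = 3200, 32   # hypothetical values
steps_per_epoch = int(math.ceil(num_samples / float(global_batch_size)))

def lr_at(step):
    # 0.01 until epoch 10, then 0.001 until epoch 20
    return 0.01 if step < 10 * steps_per_epoch else 0.001

assert steps_per_epoch == 100
assert lr_at(999) == 0.01     # still inside epochs 0-10
assert lr_at(1000) == 0.001   # the epoch-10 boundary has been crossed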
Code Example #8
File: train_iceberg.py  Project: jisaber/dls-example
import math
import numpy as np
import pandas as pd
import tensorflow as tf
import moxing.tensorflow as mox
from tensorflow.python.keras.layers import Conv2D, MaxPooling2D, Dense
from tensorflow.python.keras.layers import Dropout, Flatten, Activation, Concatenate
slim = tf.contrib.slim
NUM_SAMPLES_TRAIN = 1176 
NUM_SAMPLES_EVAL = 295 
NUM_SAMPLES_TEST = 8424 
tf.flags.DEFINE_integer('batch_size', 16, 'Mini-batch size') 
tf.flags.DEFINE_string('data_url', 's3://zxy/model/zzy', 'Dir of dataset') 
tf.flags.DEFINE_string('log_dir', 's3://zxy/model/zzy/log', 'Dir of log') 
tf.flags.DEFINE_boolean('is_training', True, 'True for train. False for eval and predict.') 
flags = tf.flags.FLAGS 
num_gpus = mox.get_flag('num_gpus') 
num_workers = len(mox.get_flag('worker_hosts').split(',')) 
steps_per_epoch = int(math.ceil(float(NUM_SAMPLES_TRAIN) / (flags.batch_size * num_gpus * num_workers))) 
submission = pd.DataFrame(columns=['id', 'is_iceberg']) 
def input_fn(run_mode, **kwargs): 
  if run_mode == mox.ModeKeys.TRAIN: 
    num_samples = NUM_SAMPLES_TRAIN 
    num_epochs = None 
    shuffle = True 
    file_pattern = 'iceberg-train-*.tfrecord' 
  else: 
    num_epochs = 1 
    shuffle = False 
    if run_mode == mox.ModeKeys.EVAL: 
      num_samples = NUM_SAMPLES_EVAL 
      file_pattern = 'iceberg-eval-*.tfrecord' 
Code Example #9
def main(*args, **kwargs):
  import time
  st = time.time()
  num_gpus = mox.get_flag('num_gpus')
  num_workers = len(mox.get_flag('worker_hosts').split(','))

  exclude_list = ['global_step']
  model_meta = mox.get_model_meta(flags.model_name)
  exclude_list.append(model_meta.default_logits_pattern)
  checkpoint_exclude_patterns = ','.join(exclude_list)
  mox.set_flag('checkpoint_exclude_patterns', checkpoint_exclude_patterns)

  data_meta = mox.ImageClassificationRawMetadata(base_dir=flags.data_url)
  labels_list = data_meta.labels_list

  mox.set_flag('loss_scale', 1024.0)

  def input_fn(mode, **kwargs):
    data_augmentation_fn = mox.get_data_augmentation_fn(name=flags.model_name,
                                                        run_mode=mode)

    dataset = mox.ImageClassificationRawDataset(data_meta,
                                                batch_size=flags.batch_size,
                                                num_epochs=20,
                                                augmentation_fn=data_augmentation_fn,
                                                reader_class=mox.AsyncRawGenerator)

    images, labels = dataset.get(['image', 'label'])

    return images, labels

  def model_fn(inputs, mode, **kwargs):
    images, labels = inputs

    # CPU inference does not support `NCHW`; GPU supports both data formats
    if mode == mox.ModeKeys.EXPORT:
      data_format = 'NHWC'
    else:
      data_format = 'NCHW'
    mox_model_fn = mox.get_model_fn(
      name=flags.model_name,
      run_mode=mode,
      num_classes=data_meta.num_classes,
      weight_decay=0.00004,
      data_format=data_format,
      batch_norm_fused=True)

    images_fp16 = tf.cast(images, tf.float16)
    with mox.var_scope(force_dtype=tf.float32):
      logits, _ = mox_model_fn(images_fp16)

    labels_one_hot = slim.one_hot_encoding(labels, data_meta.num_classes)
    loss = tf.losses.softmax_cross_entropy(labels_one_hot, logits=logits)

    regularization_losses = mox.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    regularization_loss = tf.add_n(regularization_losses)
    loss = loss + regularization_loss

    logits_fp32 = tf.cast(logits, tf.float32)
    accuracy = tf.reduce_mean(tf.cast(tf.nn.in_top_k(logits_fp32, labels, 1), tf.float32))
    export_spec = mox.ExportSpec(inputs_dict={'images': images},
                                 outputs_dict={'logits': logits_fp32},
                                 version='model')

    return mox.ModelSpec(loss=loss,
                         log_info={'loss': loss, 'accuracy': accuracy},
                         export_spec=export_spec)

  def optimizer_fn():
    lr = learning_rate_scheduler.piecewise_lr('10:0.01,20:0.001',
                                              num_samples=data_meta.total_num_samples,
                                              global_batch_size=flags.batch_size * num_gpus * num_workers)
    opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
    return opt

  mox.run(input_fn=input_fn,
          model_fn=model_fn,
          optimizer_fn=optimizer_fn,
          run_mode=mox.ModeKeys.TRAIN,
          log_dir=flags.train_url,
          checkpoint_path=flags.checkpoint_url,
          max_number_of_steps=sys.maxsize,
          export_model=mox.ExportKeys.TF_SERVING)

  # For model inference in the ModelArts console
  with mox.file.File(os.path.join(flags.train_url, 'model', 'labels.txt'), 'w') as f:
    f.write('\n'.join(labels_list))

  print(time.time() - st)
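
The mox.set_flag('loss_scale', 1024.0) call pairs with the fp16 cast in model_fn: without scaling, small fp16 gradients flush to zero. A minimal numpy sketch of the rationale (loss scaling in general, not MoXing's internals):

import numpy as np

tiny_grad = 1e-8                      # below fp16's smallest subnormal (~6e-8)
assert np.float16(tiny_grad) == 0.0   # underflows when stored as fp16

scaled = np.float16(tiny_grad * 1024.0)   # ~1.0e-5, representable in fp16
assert scaled > 0.0
unscaled = np.float32(scaled) / 1024.0    # the optimizer divides the scale back out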