Example #1
def main(*args, **kwargs):
    check_dataset()
    mnist = input_data.read_data_sets(flags.data_url, one_hot=True)

    # Define the input dataset; returns an (image, label) batch.
    def input_fn(run_mode, **kwargs):
        def gen():
            while True:
                yield mnist.train.next_batch(flags.batch_size)

        ds = tf.data.Dataset.from_generator(
            gen,
            output_types=(tf.float32, tf.int64),
            output_shapes=(tf.TensorShape([None,
                                           784]), tf.TensorShape([None, 10])))
        return ds.make_one_shot_iterator().get_next()

    # Define the model for training or evaluation.
    def model_fn(inputs, run_mode, **kwargs):
        x, y_ = inputs

        y = tf.keras.layers.Dense(128, activation='relu')(x)
        y = tf.keras.layers.Dense(10)(y)
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        export_spec = mox.ExportSpec(inputs_dict={'images': x},
                                     outputs_dict={'logits': y},
                                     version='model')
        return mox.ModelSpec(loss=cross_entropy,
                             log_info={
                                 'loss': cross_entropy,
                                 'accuracy': accuracy
                             },
                             export_spec=export_spec)

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=mox.get_optimizer_fn('adam', learning_rate=0.01),
            run_mode=mox.ModeKeys.TRAIN,
            batch_size=flags.batch_size,
            auto_batch=False,
            log_dir=flags.train_url,
            max_number_of_steps=1000,
            log_every_n_steps=10,
            export_model=mox.ExportKeys.TF_SERVING)
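These snippets reference `mox`, `flags`, and `input_data` without showing their setup. A minimal preamble sketch that would make this example self-contained, assuming the usual MoXing-for-TensorFlow 1.x import and the `data_url`/`train_url`/`batch_size` flags used throughout (the flag defaults here are hypothetical):

import tensorflow as tf
import moxing.tensorflow as mox
from tensorflow.examples.tutorials.mnist import input_data

# Flag names mirror the snippets; default values are made up for illustration.
tf.flags.DEFINE_string('data_url', '/tmp/mnist', 'Dataset directory.')
tf.flags.DEFINE_string('train_url', '/tmp/train_log', 'Log/output directory.')
tf.flags.DEFINE_integer('batch_size', 50, 'Batch size per step.')

flags = tf.flags.FLAGS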
Example #2
def main(*args):
    mnist = input_data.read_data_sets(flags.data_url, one_hot=True)

    # Define the input dataset; returns an (image, label) batch.
    def input_fn(run_mode, **kwargs):
        def gen():
            while True:
                yield mnist.train.next_batch(50)

        ds = tf.data.Dataset.from_generator(
            gen,
            output_types=(tf.float32, tf.int64),
            output_shapes=(tf.TensorShape([None,
                                           784]), tf.TensorShape([None, 10])))
        return ds.make_one_shot_iterator().get_next()

    # Define the model for training or evaluation.
    def model_fn(inputs, run_mode, **kwargs):
        x, y_ = inputs
        W = tf.get_variable(name='W', initializer=tf.zeros([784, 10]))
        b = tf.get_variable(name='b', initializer=tf.zeros([10]))
        y = tf.matmul(x, W) + b
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
        predictions = tf.argmax(y, 1)
        correct_predictions = tf.equal(predictions, tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
        export_spec = mox.ExportSpec(inputs_dict={'images': x},
                                     outputs_dict={'predictions': predictions},
                                     version='model')
        return mox.ModelSpec(loss=cross_entropy,
                             log_info={
                                 'loss': cross_entropy,
                                 'accuracy': accuracy
                             },
                             export_spec=export_spec)

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=mox.get_optimizer_fn('sgd', learning_rate=0.01),
            run_mode=mox.ModeKeys.TRAIN,
            batch_size=50,
            auto_batch=False,
            log_dir=flags.train_url,
            max_number_of_steps=1000,
            log_every_n_steps=10,
            export_model=mox.ExportKeys.TF_SERVING)
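After training, the same `input_fn`/`model_fn` pair can presumably be reused for evaluation by switching the run mode and pointing `checkpoint_path` at the training directory, as Example #4 below does. A minimal follow-up sketch; the step cap is needed here because this `input_fn` yields batches forever:

# Hedged sketch: evaluate the checkpoint written by the training run above.
mox.run(input_fn=input_fn,
        model_fn=model_fn,
        run_mode=mox.ModeKeys.EVAL,
        batch_size=50,
        auto_batch=False,
        checkpoint_path=flags.train_url,
        max_number_of_steps=100)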
Example #3
def main(*args):
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    steps_per_epoch = int(
        math.ceil(
            float(NUM_SAMPLES_TRAIN) /
            (flags.batch_size * num_gpus * num_workers)))

    if flags.is_training:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                optimizer_fn=mox.get_optimizer_fn(name='adam',
                                                  learning_rate=0.001),
                run_mode=mox.ModeKeys.TRAIN,
                batch_size=flags.batch_size,
                log_dir=flags.train_url,
                max_number_of_steps=steps_per_epoch * 150,
                log_every_n_steps=20,
                save_summary_steps=50,
                save_model_secs=120,
                export_model=mox.ExportKeys.TF_SERVING)
    else:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.EVAL,
                batch_size=5,
                log_every_n_steps=1,
                max_number_of_steps=int(NUM_SAMPLES_EVAL / 5),
                checkpoint_path=flags.train_url)
        mox.run(input_fn=input_fn,
                output_fn=output_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.PREDICT,
                batch_size=24,
                max_number_of_steps=int(NUM_SAMPLES_TEST / 24),
                log_every_n_steps=50,
                output_every_n_steps=int(NUM_SAMPLES_TEST / 24),
                checkpoint_path=flags.train_url)

        # Write results to a file. tf.gfile allows writing files to EBS/S3.
        submission_file = os.path.join(flags.train_url, 'submission.csv')
        result = submission.to_csv(path_or_buf=None, index=False)
        with tf.gfile.Open(submission_file, 'w') as f:
            f.write(result)
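`input_fn`, `model_fn`, `output_fn`, `NUM_SAMPLES_*`, and `submission` come from the surrounding script; Example #6 below shows the full program. The `output_fn` contract, as used here, appears to be: it receives a list of the `output_info` dicts produced by `model_fn` in PREDICT mode, delivered once every `output_every_n_steps` steps. A minimal sketch of that shape, with the dict keys taken from Example #6:

def output_fn(outputs):
    # `outputs` appears to be a list of ModelSpec.output_info dicts,
    # one per predict step since the last flush.
    for output in outputs:
        for sample_id, logits in zip(output['id'], output['logits']):
            pass  # accumulate rows for submission.csv here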
Example #4
    batch_size = 100
    num_batches = mnist.test.num_examples // batch_size

    def gen():
        for _ in range(num_batches):
            yield mnist.test.next_batch(batch_size)

    ds = tf.data.Dataset.from_generator(
        gen,
        output_types=(tf.float32, tf.int64),
        output_shapes=(tf.TensorShape([None, 784]), tf.TensorShape([None,
                                                                    10])))
    return ds.make_one_shot_iterator().get_next()


def model_fn(inputs, run_mode, **kwargs):
    x, y_ = inputs
    W = tf.get_variable(name='W', initializer=tf.zeros([784, 10]))
    b = tf.get_variable(name='b', initializer=tf.zeros([10]))
    y = tf.matmul(x, W) + b
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return mox.ModelSpec(log_info={'accuracy': accuracy})


mox.run(input_fn=input_fn,
        model_fn=model_fn,
        run_mode=mox.ModeKeys.EVAL,
        checkpoint_path=flags.train_url,
        max_number_of_steps=sys.maxsize)
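The huge `max_number_of_steps` works here because the generator is finite: after `num_batches` batches the one-shot iterator raises `tf.errors.OutOfRangeError`, and that is what actually ends the eval loop. The same pattern in plain TF1, as a small sketch:

it = tf.data.Dataset.range(3).make_one_shot_iterator().get_next()
with tf.Session() as sess:
    try:
        while True:
            sess.run(it)
    except tf.errors.OutOfRangeError:
        pass  # dataset exhausted; this, not the step cap, ends the loop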
Example #5
    x, y_ = inputs
    W = tf.get_variable(name='W', initializer=tf.zeros([784, 10]))
    b = tf.get_variable(name='b', initializer=tf.zeros([10]))
    y = tf.matmul(x, W) + b
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    predictions = tf.argmax(y, 1)
    correct_predictions = tf.equal(predictions, tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
    export_spec = mox.ExportSpec(inputs_dict={'images': x},
                                 outputs_dict={'predictions': predictions})
    return mox.ModelSpec(loss=cross_entropy,
                         log_info={
                             'loss': cross_entropy,
                             'accuracy': accuracy
                         },
                         export_spec=export_spec)


if __name__ == '__main__':
    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=mox.get_optimizer_fn('sgd', learning_rate=0.01),
            run_mode=mox.ModeKeys.TRAIN,
            batch_size=50,
            auto_batch=False,
            log_dir=flags.train_url,
            max_number_of_steps=1000,
            log_every_n_steps=10,
            export_model=mox.ExportKeys.TF_SERVING)
Example #6
def main(*args):
    _data_url = flags.data_url
    _train_url = flags.train_url
    if not mox.file.is_directory(_train_url):
        mox.file.make_dirs(_train_url)
    mox.file.make_dirs('/cache/data_url')
    mox.file.make_dirs('/cache/train_url')
    mox.file.copy_parallel(_data_url, '/cache/data_url')
    mox.file.copy_parallel(_train_url, '/cache/train_url')
    flags.data_url = '/cache/data_url'
    flags.train_url = '/cache/train_url'
    atexit.register(
        lambda: mox.file.copy_parallel('/cache/train_url', _train_url))
    # Remove any existing handlers from the root logger.
    logger = logging.getLogger()
    while logger.handlers:
        logger.handlers.pop()

    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    steps_per_epoch = int(
        math.ceil(
            float(NUM_SAMPLES_TRAIN) /
            (flags.batch_size * num_gpus * num_workers)))
    submission = pd.DataFrame(columns=['id', 'is_iceberg'])

    def input_fn(run_mode, **kwargs):
        if run_mode == mox.ModeKeys.TRAIN:
            num_samples = NUM_SAMPLES_TRAIN
            num_epochs = None
            shuffle = True
            file_pattern = 'iceberg-train-*.tfrecord'
        else:
            num_epochs = 1
            shuffle = False
            if run_mode == mox.ModeKeys.EVAL:
                num_samples = NUM_SAMPLES_EVAL
                file_pattern = 'iceberg-eval-*.tfrecord'
            else:
                num_samples = NUM_SAMPLES_TEST
                file_pattern = 'iceberg-test-*.tfrecord'
        keys_to_features = {
            'band_1':
            tf.FixedLenFeature((75 * 75, ), tf.float32, default_value=None),
            'band_2':
            tf.FixedLenFeature((75 * 75, ), tf.float32, default_value=None),
            'angle':
            tf.FixedLenFeature([1], tf.float32, default_value=None),
        }
        items_to_handlers = {
            'band_1': slim.tfexample_decoder.Tensor('band_1', shape=[75, 75]),
            'band_2': slim.tfexample_decoder.Tensor('band_2', shape=[75, 75]),
            'angle': slim.tfexample_decoder.Tensor('angle', shape=[])
        }
        if run_mode == mox.ModeKeys.PREDICT:
            keys_to_features['id'] = tf.FixedLenFeature([1],
                                                        tf.string,
                                                        default_value=None)
            items_to_handlers['id'] = slim.tfexample_decoder.Tensor('id',
                                                                    shape=[])
        else:
            keys_to_features['label'] = tf.FixedLenFeature([1],
                                                           tf.int64,
                                                           default_value=None)
            items_to_handlers['label'] = slim.tfexample_decoder.Tensor(
                'label', shape=[])
        dataset = mox.get_tfrecord(dataset_dir=flags.data_url,
                                   file_pattern=file_pattern,
                                   num_samples=num_samples,
                                   keys_to_features=keys_to_features,
                                   items_to_handlers=items_to_handlers,
                                   num_epochs=num_epochs,
                                   shuffle=shuffle)
        if run_mode == mox.ModeKeys.PREDICT:
            band_1, band_2, id_or_label, angle = dataset.get(
                ['band_1', 'band_2', 'id', 'angle'])
            # A non-DMA-safe string tensor may not be copied to a GPU,
            # so we encode the string as a list of integers.
            id_or_label = tf.py_func(
                lambda s: np.array([ord(ch) for ch in s]), [id_or_label],
                tf.int64)
            # We know `id` is an 8-character string.
            id_or_label = tf.reshape(id_or_label, shape=(8, ))
        else:
            band_1, band_2, id_or_label, angle = dataset.get(
                ['band_1', 'band_2', 'label', 'angle'])
        band_3 = band_1 + band_2

        # Rescale the input image to [0, 1]
        def rescale(*args):
            ret_images = []
            for image in args:
                image = tf.cast(image, tf.float32)
                image_min = tf.reduce_min(image)
                image_max = tf.reduce_max(image)
                image = (image - image_min) / (image_max - image_min)
                ret_images.append(image)
            return ret_images

        band_1, band_2, band_3 = rescale(band_1, band_2, band_3)
        image = tf.stack([band_1, band_2, band_3], axis=2)
        # Data augmentation
        if run_mode == mox.ModeKeys.TRAIN:
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_flip_up_down(image)
            image = tf.image.rot90(image,
                                   k=tf.random_uniform(shape=(),
                                                       maxval=3,
                                                       minval=0,
                                                       dtype=tf.int32))
        return image, id_or_label, angle

    def model_v1(images, angles, run_mode):
        is_training = (run_mode == mox.ModeKeys.TRAIN)
        # Conv Layer 1
        x = Conv2D(64,
                   kernel_size=(3, 3),
                   activation='relu',
                   input_shape=(75, 75, 3))(images)
        x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv Layer 2
        x = Conv2D(128, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv Layer 3
        x = Conv2D(128, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv Layer 4
        x = Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Flatten the data for upcoming dense layers
        x = Flatten()(x)
        x = Concatenate()([x, angles])
        # Dense Layers
        x = Dense(512)(x)
        x = Activation('relu')(x)
        x = Dropout(0.2)(x, training=is_training)
        # Dense Layer 2
        x = Dense(256)(x)
        x = Activation('relu')(x)
        x = Dropout(0.2)(x, training=is_training)
        # Output layer: 2-class logits
        logits = Dense(2)(x)
        return logits

    def model_fn(inputs, run_mode, **kwargs):
        # In train or eval, id_or_labels represents labels. In predict, id_or_labels represents id.
        images, id_or_labels, angles = inputs
        # Reshape angles from [batch_size] to [batch_size, 1]
        angles = tf.expand_dims(angles, 1)
        # Apply your version of model
        logits = model_v1(images, angles, run_mode)
        if run_mode == mox.ModeKeys.PREDICT:
            logits = tf.nn.softmax(logits)
            # Clip the predicted probabilities to bound the log loss.
            logits = tf.clip_by_value(logits,
                                      clip_value_min=0.05,
                                      clip_value_max=0.95)
            model_spec = mox.ModelSpec(output_info={
                'id': id_or_labels,
                'logits': logits
            })
        else:
            labels_one_hot = slim.one_hot_encoding(id_or_labels, 2)
            loss = tf.losses.softmax_cross_entropy(
                logits=logits,
                onehot_labels=labels_one_hot,
                label_smoothing=0.0,
                weights=1.0)
            model_spec = mox.ModelSpec(loss=loss, log_info={'loss': loss})
        return model_spec

    def output_fn(outputs):
        global submission
        for output in outputs:
            for id, logits in zip(output['id'], output['logits']):
                # Decode id from integer list to string.
                id = ''.join([chr(ch) for ch in id])
                # Get the probability of label==1
                is_iceberg = logits[1]
                df = pd.DataFrame([[id, is_iceberg]],
                                  columns=['id', 'is_iceberg'])
                submission = submission.append(df)

    if flags.is_training:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                optimizer_fn=mox.get_optimizer_fn(name='adam',
                                                  learning_rate=0.001),
                run_mode=mox.ModeKeys.TRAIN,
                batch_size=flags.batch_size,
                log_dir=flags.train_url,
                max_number_of_steps=steps_per_epoch * 150,
                log_every_n_steps=20,
                save_summary_steps=50,
                save_model_secs=120)
    else:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.EVAL,
                batch_size=5,
                log_every_n_steps=1,
                max_number_of_steps=int(NUM_SAMPLES_EVAL / 5),
                checkpoint_path=flags.train_url)
        mox.run(input_fn=input_fn,
                output_fn=output_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.PREDICT,
                batch_size=24,
                max_number_of_steps=int(NUM_SAMPLES_TEST / 24),
                log_every_n_steps=50,
                output_every_n_steps=int(NUM_SAMPLES_TEST / 24),
                checkpoint_path=flags.train_url)
        # Write results to a file. tf.gfile allows writing files to EBS/S3.
        submission_file = os.path.join(flags.train_url, 'submission.csv')
        result = submission.to_csv(path_or_buf=None, index=False)
        with tf.gfile.Open(submission_file, 'w') as f:
            f.write(result)
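The trickiest part of this example is shipping string ids through the GPU-bound pipeline: `input_fn` encodes them as integer tensors with `tf.py_func`, and `output_fn` decodes them back. An isolated round-trip sketch of that trick (under Python 3 the lambda receives bytes, hence the `decode()`; the id value is hypothetical):

import numpy as np
import tensorflow as tf

raw_id = tf.constant('5941774d')  # hypothetical 8-character id
encoded = tf.py_func(
    lambda s: np.array([ord(c) for c in s.decode()], dtype=np.int64),
    [raw_id], tf.int64)
encoded = tf.reshape(encoded, shape=(8,))
with tf.Session() as sess:
    codes = sess.run(encoded)
print(''.join(chr(c) for c in codes))  # -> '5941774d'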
Example #7
            # Decode id from integer list to string.
            id = ''.join([chr(ch) for ch in id])
            # Get the probability of label==1
            is_iceberg = logits[1]
            df = pd.DataFrame([[id, is_iceberg]], columns=['id', 'is_iceberg'])
            submission = submission.append(df)


if __name__ == '__main__':
    if flags.is_training:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                optimizer_fn=mox.get_optimizer_fn(name='adam',
                                                  learning_rate=0.001),
                run_mode=mox.ModeKeys.TRAIN,
                batch_size=flags.batch_size,
                log_dir=flags.train_url,
                max_number_of_steps=steps_per_epoch * 150,
                log_every_n_steps=20,
                save_summary_steps=50,
                save_model_secs=120)
    else:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.EVAL,
                batch_size=5,
                log_every_n_steps=1,
                max_number_of_steps=int(NUM_SAMPLES_EVAL / 5),
                checkpoint_path=flags.train_url)
        mox.run(input_fn=input_fn,
                output_fn=output_fn,
Example #8
      gen, output_types=(tf.float32, tf.int64),
      output_shapes=(tf.TensorShape([None, 784]), tf.TensorShape([None, 10])))
  return ds.make_one_shot_iterator().get_next()


def model_fn(inputs, run_mode, **kwargs):
  x, y_ = inputs
  W = tf.get_variable(name='W', initializer=tf.zeros([784, 10]))
  b = tf.get_variable(name='b', initializer=tf.zeros([10]))
  y = tf.matmul(x, W) + b
  cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
  predictions = tf.argmax(y, 1)
  correct_predictions = tf.equal(predictions, tf.argmax(y_, 1))
  accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
  export_spec = mox.ExportSpec(inputs_dict={'images': x}, outputs_dict={'predictions': predictions})
  return mox.ModelSpec(loss=cross_entropy, log_info={'loss': cross_entropy, 'accuracy': accuracy},
                       export_spec=export_spec)


if __name__ == '__main__':
  mox.run(input_fn=input_fn,
          model_fn=model_fn,
          optimizer_fn=mox.get_optimizer_fn('sgd', learning_rate=0.01),
          run_mode=mox.ModeKeys.TRAIN,
          batch_size=50,
          auto_batch=False,
          log_dir=flags.train_url,
          max_number_of_steps=1000,
          log_every_n_steps=10,
          export_model=mox.ExportKeys.TF_SERVING)
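`export_model=mox.ExportKeys.TF_SERVING` writes a TF-Serving SavedModel; Example #14 suggests it lands under `<train_url>/model`. Assuming that layout, the export can be loaded back with the stock TF1 loader; a minimal sketch (the export path is an assumption):

import os
import tensorflow as tf

export_dir = os.path.join(flags.train_url, 'model')  # assumed export location
with tf.Session(graph=tf.Graph()) as sess:
    tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    # The 'images'/'predictions' signature names come from the ExportSpec above.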
Example #9
def main(*args, **kwargs):
    if flags.use_controller:
        convert_ps_to_controller()

    job_name = mox.get_flag('job_name')
    task_index = mox.get_flag('task_index')

    if flags.local_cache == 'hard':
        if flags.use_controller:
            # In all-reduce mode, worker-0 does not download the dataset (controller-0 downloads it).
            imagenet_data, imagenet_160_data = download_dataset(
                flags.data_url,
                flags.data_url_160,
                skip_download=(job_name == 'worker' and task_index == 0))
        else:
            # The PS does not download the dataset.
            imagenet_data, imagenet_160_data = download_dataset(
                flags.data_url,
                flags.data_url_160,
                skip_download=(job_name == 'ps'))

        log_dir = '/cache/cache-outputs'
    else:
        imagenet_data = flags.data_url
        imagenet_160_data = flags.data_url_160
        log_dir = flags.train_url

    print('Dataset download finished at %s' % time.time())

    if (not job_name or
        (job_name == 'worker' and task_index == 0)) and flags.train_url:
        if not mox.file.is_directory(log_dir):
            mox.file.make_dirs(log_dir)
    else:
        log_dir = None

    model_meta = mox.get_model_meta(flags.model_name)
    labels_offset = model_meta.default_labels_offset
    num_workers = len(mox.get_flag('worker_hosts').split(','))

    assert flags.bs_and_ims_strategy is not None
    scheduler = config_bs_ims(flags.bs_and_ims_strategy)
    max_step = int(scheduler[-1][0])

    def input_fn(mode, **kwargs):

        if not flags.synthetic:
            ds_strategy_spec = []
            ds_switch_steps = []

            if flags.split_dataset_like_mxnet and mox.get_flag('job_name'):
                if num_workers == 4:
                    file_pattern = 'train-*-of-*-node-%d-*-*' % task_index
                elif num_workers == 8:
                    file_pattern = 'train-*-of-*-node-*-%d-*' % task_index
                elif num_workers == 16:
                    file_pattern = 'train-*-of-*-node-*-*-%d' % task_index
                else:
                    raise ValueError('num_workers should be 4, 8, 16')

            else:
                file_pattern = flags.file_pattern

            for step, ims, bs in scheduler:
                # Switch to the next dataset 2 steps early because the
                # pipeline has 2 prefetch queues.
                ds_switch_steps.append(step - 2)
                if ims == 128:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_160_data,
                                      file_pattern), bs, ims, 0.08))
                elif ims == 224:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_data,
                                      file_pattern), bs, ims, 0.087))
                elif ims == 288:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_data,
                                      file_pattern), bs, ims, 0.5))
                else:
                    raise ValueError('image is not in [128, 224, 288]')

            # The last dataset stage does not need to be switched
            ds_switch_steps.pop(-1)
            tf.logging.info('Dataset will be switched at step: %s' %
                            ds_switch_steps)

            dataset = ProgressiveImagenetDataset(
                num_samples=flags.num_samples,
                strategy_spec=ds_strategy_spec,
                ds_switch_steps=ds_switch_steps,
                shuffle=True,
                num_parallel=flags.num_readers,
                labels_offset=labels_offset,
                private_num_threads=flags.private_num_threads,
                shuffle_buffer_size=512 * 8 * 2)

            image, label = dataset.get(['image', 'label'])

            image_shape = tf.shape(image)[2]
            batch_size = tf.shape(label)[0]
            tf.summary.scalar(name='image_shape', tensor=image_shape)
            tf.summary.scalar(name='batch_size', tensor=batch_size)

        else:

            import numpy as np
            image = tf.constant(
                np.random.randint(low=0,
                                  high=255,
                                  size=[flags.batch_size, 128, 128, 3],
                                  dtype=np.uint8))
            label = tf.constant(
                np.random.randint(low=0,
                                  high=999,
                                  size=[flags.batch_size],
                                  dtype=np.int64))

        if flags.split_to_device:
            input_spec = mox.InputSpec(split_to_device=True)
            input_spec.new_input([image, label])
            return input_spec
        else:
            return image, label

    def model_fn(inputs, mode, **kwargs):
        if not flags.gpu_synthetic:
            if flags.split_to_device:
                images, labels = inputs.get_input(0)
            else:
                images, labels = inputs
        else:
            import numpy as np
            images = tf.constant(
                np.random.randint(low=0,
                                  high=255,
                                  size=[flags.batch_size, 128, 128, 3],
                                  dtype=np.uint8))
            labels = tf.constant(
                np.random.randint(low=0,
                                  high=999,
                                  size=[flags.batch_size],
                                  dtype=np.int64))

        if flags.fp16:
            images = tf.cast(images, tf.float16)

        def preprocess_fn(images, run_mode, *args):
            images = images / 255.0
            channels = tf.split(axis=3, num_or_size_splits=3, value=images)
            for i in range(3):
                channels[i] = (channels[i] - mean[i]) / std[i]
            images = tf.concat(axis=3, values=channels)
            if flags.data_format == 'NCHW':
                images = tf.transpose(images, perm=(0, 3, 1, 2))
            return images

        model_kwargs = {}
        if flags.model_name == 'resnet_v1_50_8k':
            if flags.official_stride:
                model_kwargs['official'] = True
            if flags.fastai_initializer:
                model_kwargs['weights_initializer_params'] = {
                    'factor': 2.0 / 1.3,
                    'mode': 'FAN_OUT'
                }

        mox_model_fn = mox.get_model_fn(name=flags.model_name,
                                        run_mode=mode,
                                        num_classes=1000,
                                        preprocess_fn=preprocess_fn,
                                        weight_decay=flags.weight_decay,
                                        data_format=flags.data_format,
                                        batch_norm_fused=True,
                                        batch_renorm=False,
                                        **model_kwargs)

        logits, end_points = mox_model_fn(images)

        labels_one_hot = slim.one_hot_encoding(labels, 1000)
        loss = tf.losses.softmax_cross_entropy(logits=logits,
                                               onehot_labels=labels_one_hot,
                                               label_smoothing=0.0,
                                               weights=1.0)

        logits_fp32 = tf.cast(logits, tf.float32)
        accuracy_top_1 = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 1), tf.float32))
        accuracy_top_5 = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 5), tf.float32))

        log_info = {
            'ent_loss': loss,
            'top-1': accuracy_top_1,
            'top-5': accuracy_top_5
        }

        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        if len(regularization_losses
               ) > 0 and flags.use_optimizer != 'dymomentumw':
            regularization_loss = tf.add_n(regularization_losses)
            log_info['reg_loss'] = regularization_loss
            loss = loss + regularization_loss
            log_info['total_loss'] = loss

        return mox.ModelSpec(loss=loss, log_info=log_info)

    if flags.strict_sync_replicas:
        mox.set_flag('sync_replicas', False)
        mox.set_flag('chief_inc_global_step', True)

    def optimizer_fn():
        global_step = tf.train.get_or_create_global_step()
        decay_end = 1.0 - flags.cooldown

        if flags.use_lr_schedule == 'lcd':
            lr = linear_cosine_decay(flags.max_lr, flags.min_lr, global_step,
                                     max_step, flags.warmup, decay_end)
            print("Using Linear Cosine Decay Schedule")
        elif flags.use_lr_schedule == 'poly':
            lr = polynomial_decay(flags.max_lr, flags.min_lr, global_step,
                                  max_step, flags.warmup, decay_end)
            print("Using Polynomial Decay Schedule")
        else:
            raise ValueError("lr schedule not provided")

        if flags.use_optimizer == 'dymomentum':
            opt = DyMomentumOptimizer(lr,
                                      flags.max_lr,
                                      flags.min_lr,
                                      max_mom=flags.max_mom,
                                      min_mom=flags.min_mom,
                                      global_step=global_step,
                                      max_iteration=max_step,
                                      use_nesterov=flags.use_nesterov,
                                      cooldown=flags.cooldown,
                                      use_lars=flags.use_lars,
                                      weight_decay=flags.weight_decay)
            print("Using Dynamic Momentum Optimizer")
        elif flags.use_optimizer == 'dymomentumw':
            opt = DyMomentumWOptimizer(lr,
                                       flags.max_lr,
                                       flags.min_lr,
                                       max_mom=flags.max_mom,
                                       min_mom=flags.min_mom,
                                       global_step=global_step,
                                       max_iteration=max_step,
                                       use_nesterov=flags.use_nesterov,
                                       cooldown=flags.cooldown,
                                       use_lars=flags.use_lars,
                                       weight_decay=flags.weight_decay)
            print("Using Dynamic MomentumW Optimizer")
        else:
            raise ValueError("Optimizer not provided")

        tf.summary.scalar(name='momentum', tensor=opt.get_momentum())

        if flags.strict_sync_replicas:
            from moxing.tensorflow.optimizer.simple_sync_optimizer import SimpleSyncOptimizer
            opt = SimpleSyncOptimizer(opt,
                                      num_workers=num_workers,
                                      task_index=task_index)

        return opt

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=flags.run_mode,
            batch_size=flags.batch_size,
            max_number_of_steps=max_step,
            log_every_n_steps=flags.log_every_n_steps,
            log_dir=log_dir,
            auto_batch=False,
            save_summary_steps=flags.save_summary_steps,
            checkpoint_path=flags.checkpoint_url,
            save_model_secs=flags.save_model_secs)

    print('Model upload finished at %s' % time.time())

    if flags.local_cache == 'hard' and log_dir:
        mox.file.copy_parallel(log_dir, flags.train_url)

    print('Training job finished at %s' % time.time())
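`config_bs_ims` is not shown, but from how `scheduler` is consumed it must yield `(step, image_size, batch_size)` triples, ordered by step, with the final step doubling as `max_step`. A hypothetical spec of that shape (the values are invented for illustration):

# Hypothetical progressive-resize schedule: 128px, then 224px, finish at 288px.
scheduler = [
    (4000, 128, 256),   # until step 4000: 128x128 images, batch 256
    (9000, 224, 192),   # until step 9000: 224x224 images, batch 192
    (10000, 288, 64),   # until step 10000: 288x288 images, batch 64
]
max_step = int(scheduler[-1][0])  # 10000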
Example #10
def main(_):
    # Get the number of GPUs and worker nodes currently in use.
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    data_meta = mox.ImageClassificationRawMetadata(base_dir=flags.data_url)

    def input_fn(mode):
        # Create a data augmentation function based on the ResNet-50 paper.
        augmentation_fn = mox.get_data_augmentation_fn(name='resnet_v1_50',
                                                       run_mode=mode,
                                                       output_height=224,
                                                       output_width=224)

        # Create the dataset reader, passing in the augmentation function; read at most 20 epochs.
        dataset = mox.ImageClassificationRawDataset(
            data_meta,
            batch_size=flags.batch_size,
            num_epochs=20,
            augmentation_fn=augmentation_fn)
        image, label = dataset.get(['image', 'label'])
        return image, label

    def model_fn(inputs, mode):
        images, labels = inputs

        # Get a ResNet-50 model: it takes images and returns logits and end_points; we only need the logits here.
        logits, _ = mox.get_model_fn(name='resnet_v1_50',
                                     run_mode=mode,
                                     num_classes=data_meta.num_classes,
                                     weight_decay=0.00004)(images)

        # Compute the cross-entropy loss.
        labels_one_hot = slim.one_hot_encoding(labels, data_meta.num_classes)
        loss = tf.losses.softmax_cross_entropy(logits=logits,
                                               onehot_labels=labels_one_hot)

        # Fetch the regularization losses and add them to the loss; mox.get_collection must be used here instead of tf.get_collection.
        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        regularization_loss = tf.add_n(regularization_losses)
        loss = loss + regularization_loss

        # Compute the classification accuracy.
        accuracy = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits, labels, 1), tf.float32))

        # Return ModelSpec, the class MoXing-TensorFlow uses to define a model.
        return mox.ModelSpec(loss=loss,
                             log_info={
                                 'loss': loss,
                                 'accuracy': accuracy
                             })

    def optimizer_fn():
        # Piecewise learning rate: 0.01 for epochs 0-10, 0.001 for epochs 10-20.
        lr = learning_rate_scheduler.piecewise_lr(
            '10:0.01,20:0.001',
            num_samples=data_meta.total_num_samples,
            global_batch_size=flags.batch_size * num_gpus * num_workers)
        return tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=mox.ModeKeys.TRAIN,
            max_number_of_steps=sys.maxsize,
            log_dir=flags.train_url)
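The `piecewise_lr('10:0.01,20:0.001', ...)` spec apparently means 0.01 until epoch 10 and 0.001 until epoch 20, with epochs converted to steps through the global batch size. A rough plain-TF1 equivalent, as a sketch:

global_batch = flags.batch_size * num_gpus * num_workers
steps_per_epoch = data_meta.total_num_samples // global_batch
global_step = tf.train.get_or_create_global_step()
lr = tf.train.piecewise_constant(global_step,
                                 boundaries=[10 * steps_per_epoch],
                                 values=[0.01, 0.001])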
Example #11
  global submission 
  for output in outputs: 
    for id, logits in zip(output['id'], output['logits']): 
      # Decode id from integer list to string. 
      id = ''.join([chr(ch) for ch in id]) 
      # Get the probability of label==1 
      is_iceberg = logits[1] 
      df = pd.DataFrame([[id, is_iceberg]], columns=['id', 'is_iceberg']) 
      submission = submission.append(df)


if __name__ == '__main__':
  if flags.is_training: 
    mox.run(input_fn=input_fn, 
            model_fn=model_fn, 
            optimizer_fn=mox.get_optimizer_fn(name='adam', learning_rate=0.001), 
            run_mode=mox.ModeKeys.TRAIN, 
            batch_size=flags.batch_size, 
            log_dir=flags.log_dir, 
            max_number_of_steps=steps_per_epoch * 150, 
            log_every_n_steps=20, 
            save_summary_steps=50, 
            save_model_secs=120) 
  else: 
    mox.run(input_fn=input_fn, 
            model_fn=model_fn, 
            run_mode=mox.ModeKeys.EVAL, 
            batch_size=5, 
            log_every_n_steps=1, 
            max_number_of_steps=int(NUM_SAMPLES_EVAL / 5), 
            checkpoint_path=flags.log_dir) 
    mox.run(input_fn=input_fn, 
            output_fn=output_fn, 
            model_fn=model_fn, 
Example #12
    ds = tf.data.Dataset.from_generator(
        gen,
        output_types=(tf.float32, tf.int64),
        output_shapes=(tf.TensorShape([None, 784]), tf.TensorShape([None,
                                                                    10])))
    x, y_ = ds.make_one_shot_iterator().get_next()
    return x, y_


def model_fn(inputs, run_mode, **kwargs):
    x, y_ = inputs
    W = tf.get_variable(name='W', initializer=tf.zeros([784, 10]))
    b = tf.get_variable(name='b', initializer=tf.zeros([10]))
    y = tf.matmul(x, W) + b
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    return mox.ModelSpec(loss=cross_entropy, log_info={'loss': cross_entropy})


def optimizer_fn():
    return tf.train.GradientDescentOptimizer(0.5)


mox.run(input_fn=input_fn,
        model_fn=model_fn,
        optimizer_fn=optimizer_fn,
        run_mode=mox.ModeKeys.TRAIN,
        log_dir=flags.train_url,
        max_number_of_steps=sys.maxsize)
Example #13
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    from efficient_ai.config import CompressorSpec
    compressor_spec = CompressorSpec(logits=y)
    return mox.ModelSpec(loss=cross_entropy,
                         compressor_spec=compressor_spec,
                         log_info={'loss': cross_entropy})


def optimizer_fn():
    return tf.train.GradientDescentOptimizer(0.5)


mox.run(input_fn=input_fn,
        model_fn=model_fn,
        optimizer_fn=optimizer_fn,
        run_mode=mox.ModeKeys.TRAIN,
        log_dir=flags.train_url,
        max_number_of_steps=500,
        batch_size=12)


def dcp():
    from efficient_ai.config import DCPCompressorConfig
    run_mode = mox.ModeKeys.TRAIN
    log_dir = flags.train_url
    max_number_of_steps = 500
    num_classes = 10
    batch_size = None
    new_log_dir = os.path.join(log_dir, 'dcp')

    config = DCPCompressorConfig(
Example #14
def main(*args, **kwargs):
  import time
  st = time.time()
  num_gpus = mox.get_flag('num_gpus')
  num_workers = len(mox.get_flag('worker_hosts').split(','))

  exclude_list = ['global_step']
  model_meta = mox.get_model_meta(flags.model_name)
  exclude_list.append(model_meta.default_logits_pattern)
  checkpoint_exclude_patterns = ','.join(exclude_list)
  mox.set_flag('checkpoint_exclude_patterns', checkpoint_exclude_patterns)

  data_meta = mox.ImageClassificationRawMetadata(base_dir=flags.data_url)
  labels_list = data_meta.labels_list

  mox.set_flag('loss_scale', 1024.0)

  def input_fn(mode, **kwargs):
    data_augmentation_fn = mox.get_data_augmentation_fn(name=flags.model_name,
                                                        run_mode=mode)

    dataset = mox.ImageClassificationRawDataset(data_meta,
                                                batch_size=flags.batch_size,
                                                num_epochs=20,
                                                augmentation_fn=data_augmentation_fn,
                                                reader_class=mox.AsyncRawGenerator)

    images, labels = dataset.get(['image', 'label'])

    return images, labels

  def model_fn(inputs, mode, **kwargs):
    images, labels = inputs

    # CPU inference does not support `NCHW`; GPU supports both formats.
    if mode == mox.ModeKeys.EXPORT:
      data_format = 'NHWC'
    else:
      data_format = 'NCHW'
    mox_model_fn = mox.get_model_fn(
      name=flags.model_name,
      run_mode=mode,
      num_classes=data_meta.num_classes,
      weight_decay=0.00004,
      data_format=data_format,
      batch_norm_fused=True)

    images_fp16 = tf.cast(images, tf.float16)
    with mox.var_scope(force_dtype=tf.float32):
      logits, _ = mox_model_fn(images_fp16)

    labels_one_hot = slim.one_hot_encoding(labels, data_meta.num_classes)
    loss = tf.losses.softmax_cross_entropy(labels_one_hot, logits=logits)

    regularization_losses = mox.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    regularization_loss = tf.add_n(regularization_losses)
    loss = loss + regularization_loss

    logits_fp32 = tf.cast(logits, tf.float32)
    accuracy = tf.reduce_mean(tf.cast(tf.nn.in_top_k(logits_fp32, labels, 1), tf.float32))
    export_spec = mox.ExportSpec(inputs_dict={'images': images},
                                 outputs_dict={'logits': logits_fp32},
                                 version='model')

    return mox.ModelSpec(loss=loss,
                         log_info={'loss': loss, 'accuracy': accuracy},
                         export_spec=export_spec)

  def optimizer_fn():
    lr = learning_rate_scheduler.piecewise_lr('10:0.01,20:0.001',
                                              num_samples=data_meta.total_num_samples,
                                              global_batch_size=flags.batch_size * num_gpus * num_workers)
    opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
    return opt

  mox.run(input_fn=input_fn,
          model_fn=model_fn,
          optimizer_fn=optimizer_fn,
          run_mode=mox.ModeKeys.TRAIN,
          log_dir=flags.train_url,
          checkpoint_path=flags.checkpoint_url,
          max_number_of_steps=sys.maxsize,
          export_model=mox.ExportKeys.TF_SERVING)

  # Write labels.txt so the model can be used for inference in the ModelArts console.
  with mox.file.File(os.path.join(flags.train_url, 'model', 'labels.txt'), 'w') as f:
    f.write('\n'.join(labels_list))

  print(time.time() - st)
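`mox.set_flag('loss_scale', 1024.0)` pairs with the fp16 cast above: a static loss scale keeps small fp16 gradients from underflowing to zero. Conceptually, in plain TF1 (a sketch of the technique, not of MoXing's internals; `loss` and `opt` stand for the values built in the example above):

loss_scale = 1024.0
params = tf.trainable_variables()
# Scale the loss up before differentiation, then scale the gradients back down
# so the update is mathematically unchanged.
scaled_grads = tf.gradients(loss * loss_scale, params)
grads = [g / loss_scale for g in scaled_grads]
train_op = opt.apply_gradients(zip(grads, params))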