Code Example #1
def main():
    args = parse_args()
    experiment = Run()
    params = load_values(args.param_file)
    if params:
        experiment.log_inputs(**params)
    metrics = load_values(args.metric_file)
    if metrics:
        experiment.log_metrics(**metrics)
    if args.tag:
        experiment.log_tags(args.tag)
    for dataset in load_datasets(args.data_file):
        experiment.log_data_ref(**dataset)
    if args.capture_png:
        imgs = discover_png(experiment.get_outputs_path())
        for img in imgs:
            if isinstance(img, str):
                experiment.log_image(img)
            elif isinstance(img, SerialImages):
                for idx, path in enumerate(img.paths):
                    experiment.log_image(path, name=img.name, step=idx)
            else:
                raise NotImplementedError('We should never get here.')
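
The snippet relies on project helpers (`parse_args`, `load_values`, `load_datasets`, `discover_png`, `SerialImages`) defined elsewhere. As a rough, hypothetical sketch of `load_values`, assuming the parameter and metric files are JSON (our assumption, not the project's actual code):

import json

def load_values(path):
    # Hypothetical: parse a JSON file into a dict; return None when no
    # path was supplied so the caller's `if params:` guard works.
    if not path:
        return None
    with open(path) as infile:
        return json.load(infile)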
Code Example #2
# https://polyaxon.com/docs/experimentation/tracking/module/#log_data_ref

experiment.log_data_ref('dataset_X', content=X)
experiment.log_data_ref('dataset_y', content=y)

accuracies, classifier = model(X=X,
                               y=y,
                               n_estimators=args.n_estimators,
                               max_features=args.max_features,
                               min_samples_leaf=args.min_samples_leaf)

accuracy_mean, accuracy_std = (np.mean(accuracies), np.std(accuracies))
counts, bin_edges = np.histogram(accuracies)

# Polyaxon

experiment.log_metrics(accuracy_mean=accuracy_mean,
                       accuracy_std=accuracy_std)
for step in range(accuracies.size):
    experiment.log_metrics(accuracy=accuracies[step], step=step)

outpath = os.path.join(experiment.get_outputs_path(), 'model.pkl')
with open(outpath, 'wb') as outfile:
    pickle.dump(classifier, outfile)

experiment.log_model(
    outpath,
    name='top cross validation model',
    framework='sklearn'
)
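
The `model` function called above is not shown in this snippet. A minimal sketch of what it could be, assuming a scikit-learn random forest evaluated by cross-validation (the hyperparameter names match `RandomForestClassifier`; this is our reconstruction, not the original code):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def model(X, y, n_estimators, max_features, min_samples_leaf):
    # Hypothetical: cross-validate a random forest, then fit one on all
    # the data; returns (per-fold accuracies, fitted classifier).
    classifier = RandomForestClassifier(n_estimators=n_estimators,
                                        max_features=max_features,
                                        min_samples_leaf=min_samples_leaf)
    accuracies = cross_val_score(classifier, X, y, cv=5, scoring='accuracy')
    classifier.fit(X, y)
    return accuracies, classifier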
Code Example #3
File: model.py  Project: zhaohb/polyaxon-examples
    parser.add_argument(
        '--max_iter',
        type=int,
        default=1000)
    parser.add_argument(
        '--tol',
        type=float,
        default=0.001
    )
    args = parser.parse_args()

    # Polyaxon
    experiment = Run()

    (X, y) = load_data()

    # Polyaxon
    experiment.log_data_ref(content=X, name='dataset_X')
    experiment.log_data_ref(content=y, name='dataset_y')

    accuracies = model(X=X,
                       y=y,
                       loss=args.loss,
                       penalty=args.penalty,
                       l1_ratio=args.l1_ratio,
                       max_iter=args.max_iter,
                       tol=args.tol)
    accuracy_mean, accuracy_std = (np.mean(accuracies), np.std(accuracies))
    print('Accuracy: {} +/- {}'.format(accuracy_mean, accuracy_std))
    # Polyaxon
    experiment.log_metrics(accuracy_mean=accuracy_mean, accuracy_std=accuracy_std)
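
As in the previous example, `model` lives elsewhere in zhaohb/polyaxon-examples; its argument names (`loss`, `penalty`, `l1_ratio`, `max_iter`, `tol`) match scikit-learn's `SGDClassifier`. A hypothetical sketch under that assumption:

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

def model(X, y, loss, penalty, l1_ratio, max_iter, tol):
    # Hypothetical: return per-fold cross-validation accuracies.
    classifier = SGDClassifier(loss=loss, penalty=penalty, l1_ratio=l1_ratio,
                               max_iter=max_iter, tol=tol)
    return cross_val_score(classifier, X, y, cv=5, scoring='accuracy')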
Code Example #4
def main(unused_argv):

    # Horovod: initialize Horovod.
    hvd.init()

    # Polyaxon
    if hvd.rank() == 0:
        experiment = Run()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (train_data, train_labels), (eval_data, eval_labels) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # Polyaxon
    if hvd.rank() == 0:
        experiment.log_data_ref(content=train_data, name='x_train')
        experiment.log_data_ref(content=train_labels, name='y_train')
        experiment.log_data_ref(content=eval_data, name='x_test')
        experiment.log_data_ref(content=eval_labels, name='y_test')

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data},
                                                        y=train_labels,
                                                        batch_size=100,
                                                        num_epochs=None,
                                                        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(input_fn=train_input_fn,
                           steps=3000 // hvd.size(),
                           hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data},
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)

    # Polyaxon
    if hvd.rank() == 0:
        experiment.log_metrics(**eval_results)
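
Because `experiment` is created only on Horovod rank 0, every tracking call is wrapped in a rank check. A small guard helper (a sketch, not part of the original example) keeps that check from spreading through the code:

import horovod.tensorflow as hvd

def log_metrics_rank0(experiment, **metrics):
    # Hypothetical: only the root process talks to the Polyaxon tracking
    # API; all other ranks skip logging.
    if hvd.rank() == 0 and experiment is not None:
        experiment.log_metrics(**metrics)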
Code Example #5
def train(mnist):

    # Define input/output placeholders
    input_x = tf.placeholder(tf.float32, [None, INPUT_SIZE, INPUT_SIZE, 1], name="input_x")
    input_y = tf.placeholder(tf.float32, [None, OUTPUT_SIZE], name="input_y")
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    l2_loss = tf.constant(0.0)
    print("1 step ok!")

    # Polyaxon
    experiment = Run()
    experiment.log_data_ref(content=input_x, name='input_x')
    experiment.log_data_ref(content=input_y, name='input_y')

    # Layer 1: convolution layer conv1
    '''
    input  : [-1,28,28,1]
    filter : [5,5,32]
    output : [-1,28,28,32]
    '''
    with tf.name_scope("conv1"):
        w = get_weights([FILTER1_SIZE, FILTER1_SIZE, 1, FILTER1_NUM])
        b = get_biases([FILTER1_NUM])
        with tf.device("/fpga:0"):
            conv1_op = tf.nn.conv2d(
                input=input_x,
                filter=w,
                strides=[1, 1, 1, 1],
                padding="SAME",
                name='conv1_op')
        re1 = tf.nn.bias_add(conv1_op, b)
        with tf.device("/fpga:0"):
            conv1 = tf.nn.relu(re1, name="relu")
    print("2 step ok!")

    # Layer 2: pooling layer pooling2
    '''
    input  : [-1,28,28,32]
    output : [-1,14,14,32]
    '''
    with tf.name_scope("pooling2"):
        with tf.device("/fpga:0"):
            pooling2 = tf.nn.max_pool(
                value=conv1,
                ksize=[1, 2, 2, 1],
                strides=[1, 2, 2, 1],
                padding="SAME",
                name="pooling2")
    print("3 step ok!")

    # Layer 3: convolution layer conv3
    '''
    input  : [-1,14,14,32]
    filter : [5,5,64]
    output : [-1,14,14,64]
    '''
    with tf.name_scope("conv3"):
        w = get_weights([FILTER3_SIZE, FILTER3_SIZE, FILTER1_NUM, FILTER3_NUM])
        b = get_biases([FILTER3_NUM])
        with tf.device("/fpga:0"):
            conv3_op = tf.nn.conv2d(
                input=pooling2,
                filter=w,
                strides=[1, 1, 1, 1],
                padding="SAME",
                name="conv3_op")
        re3 = tf.nn.bias_add(conv3_op, b)
        with tf.device("/fpga:0"):
            conv3 = tf.nn.relu(re3, name="relu")
    print("4 step ok!")

    # Layer 4: pooling layer pooling4
    '''
    input  : [-1,14,14,64]
    output : [-1,7,7,64]
    '''
    with tf.name_scope("pooling4"):
        with tf.device("/fpga:0"):
            pooling4 = tf.nn.max_pool(
                value=conv3,
                ksize=[1, 2, 2, 1],
                strides=[1, 2, 2, 1],
                padding="SAME",
                name="pooling4")
    # Flatten the pooling output
    '''
    input  : [-1,7,7,64]
    output : [-1,3136]
    '''
    pooling4_flat = tf.reshape(pooling4, [-1, FLAT_SIZE])
    print("5 step ok!")

    # Layer 5: fully connected layer fc5
    '''
    input  : [-1,3136]
    output : [-1,512]
    '''
    with tf.name_scope("fc5"):
        w = get_weights([FLAT_SIZE, FC5_SIZE])
        b = get_biases([FC5_SIZE])
        xw_res = tf.nn.xw_plus_b(pooling4_flat, w, b, name="fc5")
        with tf.device("/fpga:0"):
            fc5 = tf.nn.relu(xw_res, name="relu")
        fc5_drop = tf.nn.dropout(fc5, dropout_keep_prob)
        l2_loss += tf.nn.l2_loss(w) + tf.nn.l2_loss(b)
    print("6 step ok!")

    # Layer 6: fully connected layer (output)
    '''
    input  : [-1,512]
    output : [-1,10]
    '''
    with tf.name_scope("fc6"):
        w = get_weights([FC5_SIZE, OUTPUT_SIZE])
        b = get_biases([OUTPUT_SIZE])
        y_hat = tf.nn.xw_plus_b(fc5_drop, w, b, name="y_hat")
        l2_loss += tf.nn.l2_loss(w) + tf.nn.l2_loss(b)
    print("7 step ok!")


    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=y_hat, labels=input_y)
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + L2NORM_RATE * l2_loss
    print("8 step ok!")

    correct_predictions = tf.equal(tf.argmax(y_hat, 1), tf.argmax(input_y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

    global_step = tf.Variable(0, trainable=False)
    train_op = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss, global_step=global_step)
    print("9 step ok!")

    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        for i in range(TRAIN_STEP):
            xs_pre, ys = mnist.train.next_batch(BATCH_SIZE)
            xs = np.reshape(xs_pre, [-1, INPUT_SIZE, INPUT_SIZE, 1])
            feed_dict = {
                input_x: xs,
                input_y: ys,
                dropout_keep_prob: 0.5
            }

            _, step, train_loss, train_acc = sess.run([train_op, global_step, loss, accuracy], feed_dict=feed_dict)

            if i % 2 == 0:
                print("step:{}, train loss:{:g}, train_acc:{:g}".format(step, train_loss, train_acc))
                # Log with the global step so each point lands on the metric timeline.
                experiment.log_metrics(loss=train_loss, accuracy=train_acc, step=step)

        test_x = np.reshape(mnist.test.images[0:100], [-1, INPUT_SIZE, INPUT_SIZE, 1])
        test_y = mnist.test.labels[0:100]
        feed_test = {
            input_x: test_x,
            input_y: test_y,
            dropout_keep_prob: 1.0
        }
        test_loss, test_acc, data = sess.run([loss, accuracy, y_hat], feed_dict=feed_test)
        print("After {} training steps, in test dataset, loss is {:g}, acc is {:g}".format(TRAIN_STEP, test_loss, test_acc))
        experiment.log_metrics(loss=test_loss, accuracy=test_acc)
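
The `get_weights` and `get_biases` helpers used throughout this example are defined elsewhere. A plausible sketch, assuming the usual TF1 truncated-normal / small-constant initializers (our assumption, not the original code):

import tensorflow as tf

def get_weights(shape):
    # Hypothetical: truncated-normal weight initializer.
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name="w")

def get_biases(shape):
    # Hypothetical: small constant bias initializer.
    return tf.Variable(tf.constant(0.1, shape=shape), name="b")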
Code Example #6
File: model.py  Project: zhaohb/polyaxon
    # Polyaxon
    experiment.log_data_ref(content=x_train, name='x_train')
    experiment.log_data_ref(content=y_train, name='y_train')
    experiment.log_data_ref(content=x_test, name='x_test')
    experiment.log_data_ref(content=y_test, name='y_test')

    with tf.Session() as sess:
        model = create_model(conv1_size=args.conv1_size,
                             conv1_out=args.conv1_out,
                             conv1_activation=args.conv1_activation,
                             pool1_size=args.pool1_size,
                             conv2_size=args.conv2_size,
                             conv2_out=args.conv2_out,
                             conv2_activation=args.conv2_activation,
                             pool2_size=args.pool2_size,
                             fc1_activation=args.fc1_activation,
                             fc1_size=args.fc1_size,
                             optimizer=args.optimizer,
                             log_learning_rate=args.log_learning_rate)
        sess.run(tf.global_variables_initializer())
        train_model(model,
                    x_train,
                    y_train,
                    batch_size=args.batch_size,
                    dropout=args.dropout,
                    epochs=args.epochs)
        accuracy = evaluate_model(model, x_test, y_test)

        # Polyaxon
        experiment.log_metrics(accuracy=accuracy)
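
This snippet starts mid-function: `args` and `experiment` are created earlier in model.py. The missing setup presumably looks something like this (a sketch; the real file defines the argparse flags used above):

import argparse
from polyaxon.tracking import Run

parser = argparse.ArgumentParser()
# ... flag definitions for conv1_size, batch_size, epochs, etc. ...
args = parser.parse_args()

# Polyaxon
experiment = Run()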
Code Example #7
def main(_):
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:%s/task:%d/%s" %
                (task["type"], task["index"], FLAGS.device),
                cluster=cluster)):
        worker_device = "/job:%s/task:%d/%s" % (task["type"], task["index"],
                                                FLAGS.device)
        logging.info("worker_device: %s", worker_device)

        ###
        ### Training
        ###

        #
        # read training data
        #

        # image - 784 (=28 x 28) elements of grey-scaled integer value [0, 1]
        # label - digit (0, 1, ..., 9)
        train_queue = tf.train.string_input_producer(
            [FLAGS.train_file], num_epochs=2
        )  # the data repeats for two epochs, then raises OutOfRangeError
        train_reader = tf.TFRecordReader()
        _, train_serialized_exam = train_reader.read(train_queue)
        train_exam = tf.parse_single_example(train_serialized_exam,
                                             features={
                                                 'image_raw':
                                                 tf.FixedLenFeature([],
                                                                    tf.string),
                                                 'label':
                                                 tf.FixedLenFeature([],
                                                                    tf.int64)
                                             })
        train_image = tf.decode_raw(train_exam['image_raw'], tf.uint8)
        train_image.set_shape([784])
        train_image = tf.cast(train_image, tf.float32) * (1. / 255)
        train_label = tf.cast(train_exam['label'], tf.int32)
        train_batch_image, train_batch_label = tf.train.batch(
            [train_image, train_label], batch_size=batch_size)

        # Polyaxon
        experiment = Run()
        experiment.log_data_ref(content=train_image, name='train_image')
        experiment.log_data_ref(content=train_label, name='train_label')

        #
        # define training graph
        #

        # define input
        plchd_image = tf.placeholder(dtype=tf.float32, shape=(None, 784))
        plchd_label = tf.placeholder(dtype=tf.int32, shape=(None))

        # define network and inference
        # (simple 2 fully connected hidden layer : 784->128->64->10)
        with tf.name_scope('hidden1'):
            weights = tf.Variable(tf.truncated_normal([784, 128],
                                                      stddev=1.0 /
                                                      math.sqrt(float(784))),
                                  name='weights')
            biases = tf.Variable(tf.zeros([128]), name='biases')
            hidden1 = tf.nn.relu(tf.matmul(plchd_image, weights) + biases)
        with tf.name_scope('hidden2'):
            weights = tf.Variable(tf.truncated_normal([128, 64],
                                                      stddev=1.0 /
                                                      math.sqrt(float(128))),
                                  name='weights')
            biases = tf.Variable(tf.zeros([64]), name='biases')
            hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
        with tf.name_scope('softmax_linear'):
            weights = tf.Variable(tf.truncated_normal([64, 10],
                                                      stddev=1.0 /
                                                      math.sqrt(float(64))),
                                  name='weights')
            biases = tf.Variable(tf.zeros([10]), name='biases')
            logits = tf.matmul(hidden2, weights) + biases

        # define optimization
        global_step = tf.train.create_global_step()  # start without checkpoint
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.07)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=plchd_label,
                                                      logits=logits)
        train_op = optimizer.minimize(loss=loss, global_step=global_step)

        #
        # run session
        #

        with tf.train.MonitoredTrainingSession(master=server.target,
                                               checkpoint_dir=FLAGS.out_dir,
                                               is_chief=is_chief) as sess:

            # when the data is exhausted, OutOfRangeError is raised and the MonitoredSession ends

            local_step_value = 0
            run_metadata = tf.RunMetadata()
            array_image, array_label = sess.run(
                [train_batch_image, train_batch_label],
                run_metadata=run_metadata)
            while not sess.should_stop():
                feed_dict = {
                    plchd_image: array_image,
                    plchd_label: array_label
                }
                _, global_step_value, loss_value, array_image, array_label = sess.run(
                    [
                        train_op, global_step, loss, train_batch_image,
                        train_batch_label
                    ],
                    feed_dict=feed_dict)
                local_step_value += 1
                if local_step_value % 100 == 0:  # You can also use tf.train.LoggingTensorHook for output
                    logging.info("Local Step %d, Global Step %d (Loss: %.2f)",
                                 local_step_value, global_step_value,
                                 loss_value)
                    # Polyaxon
                    experiment.log_metrics(step=local_step_value,
                                           loss=loss_value)

        print('training finished')
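
The names `cluster`, `task`, `server`, and `is_chief` come from bootstrap code outside this snippet. A minimal sketch of a TF1-style bootstrap that would supply them, assuming the cluster layout arrives in the TF_CONFIG environment variable (an assumption; the original may build it from flags instead):

import json
import os
import tensorflow as tf

tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
cluster = tf.train.ClusterSpec(tf_config.get('cluster', {}))
task = tf_config.get('task', {'type': 'worker', 'index': 0})
server = tf.train.Server(cluster, job_name=task['type'],
                         task_index=task['index'])
is_chief = task['type'] == 'worker' and task['index'] == 0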
Code Example #8
        loss.backward()
        trainer.step(args.batch_size)
        metric.update([label], [output])

        if nbatch % 100 == 0:
            name, acc = metric.get()
            logging.info('[Epoch %d Batch %d] Training: %s=%f' %
                         (epoch, nbatch, name, acc))

    if hvd.rank() == 0:
        elapsed = time.time() - tic
        speed = nbatch * args.batch_size * hvd.size() / elapsed
        logging.info('Epoch[%d]\tSpeed=%.2f samples/s\tTime cost=%f',
                     epoch, speed, elapsed)

    # Evaluate model accuracy
    _, train_acc = metric.get()
    name, val_acc = evaluate(model, val_data, context)
    if hvd.rank() == 0:
        logging.info('Epoch[%d]\tTrain: %s=%f\tValidation: %s=%f', epoch, name,
                     train_acc, name, val_acc)
        # Polyaxon
        experiment.log_metrics(step=epoch, train_acc=train_acc, val_acc=val_acc)

    if hvd.rank() == 0 and epoch == args.epochs - 1:
        # Polyaxon
        experiment.log_metrics(val_acc=val_acc)

        assert val_acc > 0.96, ("Achieved accuracy (%f) is lower than "
                                "expected (0.96)" % val_acc)