def main():
    """Record CLI-supplied inputs, metrics, tags, data refs, and output
    images on a Polyaxon run.

    Each artifact kind is optional: values/datasets are only logged when the
    corresponding file yields something, and PNG capture is opt-in.
    """
    cli = parse_args()
    run = Run()

    hyperparams = load_values(cli.param_file)
    if hyperparams:
        run.log_inputs(**hyperparams)

    metric_values = load_values(cli.metric_file)
    if metric_values:
        run.log_metrics(**metric_values)

    if cli.tag:
        run.log_tags(cli.tag)

    for ref in load_datasets(cli.data_file):
        run.log_data_ref(**ref)

    if not cli.capture_png:
        return

    # Discovered images are either bare paths or SerialImages (a named
    # sequence logged one step per path).
    for item in discover_png(run.get_outputs_path()):
        if isinstance(item, str):
            run.log_image(item)
        elif isinstance(item, SerialImages):
            for step, png_path in enumerate(item.paths):
                run.log_image(png_path, name=item.name, step=step)
        else:
            raise NotImplementedError('We should never get here.')
# https://polyaxon.com/docs/experimentation/tracking/module/#log_data_ref experiment.log_data_ref('dataset_X', content=X) experiment.log_data_ref('dataset_y', content=y) accuracies, classifier = model(X=X, y=y, n_estimators=args.n_estimators, max_features=args.max_features, min_samples_leaf=args.min_samples_leaf) accuracy_mean, accuracy_std = (np.mean(accuracies), np.std(accuracies)) values, counts = np.histogram(accuracies) # Polyaxon experiment.log_metrics(accuracy_mean=accuracy_mean, accuracy_std=accuracy_std) for step in range(accuracies.size): experiment.log_metrics(accuracy=accuracies[step], step=step) outpath = os.path.join(experiment.get_outputs_path(), 'model.pkl') with(open(outpath, 'wb')) as outfile: pickle.dump(classifier, outfile) experiment.log_model( outpath, name='top cross validation model', framework='sklearn' )
# NOTE(review): this chunk begins INSIDE a parser.add_argument(...) call whose
# opening line is outside the visible source; the first fragment below closes it.
'--max_iter', type=int, default=1000)
parser.add_argument(
    '--tol', type=float, default=0.001
)
args = parser.parse_args()

# Polyaxon: create the tracked run before any logging calls.
experiment = Run()

(X, y) = load_data()

# Polyaxon
experiment.log_data_ref(content=X, name='dataset_X')
experiment.log_data_ref(content=y, name='dataset_y')

# Train (presumably an SGD-style linear model given loss/penalty/l1_ratio);
# returns per-fold accuracies.
accuracies = model(X=X,
                   y=y,
                   loss=args.loss,
                   penalty=args.penalty,
                   l1_ratio=args.l1_ratio,
                   max_iter=args.max_iter,
                   tol=args.tol)
accuracy_mean, accuracy_std = (np.mean(accuracies), np.std(accuracies))
print('Accuracy: {} +/- {}'.format(accuracy_mean, accuracy_std))

# Polyaxon
experiment.log_metrics(accuracy_mean=accuracy_mean, accuracy_std=accuracy_std)
def main(unused_argv):
    """Train and evaluate an MNIST Estimator across Horovod workers.

    Rank 0 alone creates the Polyaxon run, logs data refs/metrics, and owns
    the checkpoint directory; every later use of `experiment` is rank-0
    guarded, so non-zero ranks never touch it.
    """
    # Horovod: initialize Horovod. Must happen before any rank() calls below.
    hvd.init()

    # Polyaxon — only the coordinator tracks the experiment.
    if hvd.rank() == 0:
        experiment = Run()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset. Per-rank filename avoids download
    # collisions between workers sharing the cache.
    (train_data, train_labels), (eval_data, eval_labels) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # Polyaxon
    if hvd.rank() == 0:
        experiment.log_data_ref(content=train_data, name='x_train')
        experiment.log_data_ref(content=train_labels, name='y_train')
        experiment.log_data_ref(content=eval_data, name='x_test')
        experiment.log_data_ref(content=eval_labels, name='y_test')

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data},
                                                        y=train_labels,
                                                        batch_size=100,
                                                        num_epochs=None,
                                                        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(input_fn=train_input_fn,
                           steps=3000 // hvd.size(),
                           hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data},
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)

    # Polyaxon — eval_results is the Estimator's metrics dict, logged as-is.
    if hvd.rank() == 0:
        experiment.log_metrics(**eval_results)
def train(mnist):
    """Build a 2-conv / 2-pool / 2-FC MNIST classifier and train it.

    Convolutions, ReLUs and max-pools are explicitly placed on "/fpga:0";
    bias-adds and the FC matmuls are left on the default device.  Training
    loss/accuracy are logged to Polyaxon every other step, and a final
    evaluation on the first 100 test images is logged at the end.

    NOTE(review): this body was re-flowed from a single collapsed source line;
    the exact nesting of statements under the with-blocks (e.g. whether the
    "N step ok!" prints sit inside the name scopes) is reconstructed — confirm
    against the original file.  Device placement of ops is unaffected by the
    reconstruction: only the statements shown under `tf.device` create ops there.
    """
    # Define input/output placeholders.
    input_x = tf.placeholder(tf.float32, [None, INPUT_SIZE, INPUT_SIZE, 1], name="input_x")
    input_y = tf.placeholder(tf.float32, [None, OUTPUT_SIZE], name="input_y")
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    # Accumulator for the L2 regularization term (FC weights/biases only).
    l2_loss = tf.constant(0.0)
    print("1 step ok!")

    # Polyaxon
    experiment = Run()
    experiment.log_data_ref(content=input_x, name='input_x')
    experiment.log_data_ref(content=input_y, name='input_y')

    # Layer 1: convolution conv1
    ''' input : [-1,28,28,1] filter : [5,5,32] output : [-1,28,28,32] '''
    with tf.name_scope("conv1"):
        w = get_weights([FILTER1_SIZE, FILTER1_SIZE, 1, FILTER1_NUM])
        b = get_biases([FILTER1_NUM])
        with tf.device("/fpga:0"):
            conv1_op = tf.nn.conv2d(
                input=input_x,
                filter=w,
                strides=[1, 1, 1, 1],
                padding="SAME",
                name='conv1_op')
        # bias-add intentionally outside the FPGA device scope
        re1 = tf.nn.bias_add(conv1_op, b)
        with tf.device("/fpga:0"):
            conv1 = tf.nn.relu(re1, name="relu")
    print("2 step ok!")

    # Layer 2: max-pooling pooling2
    ''' input : [-1,28,28,32] output : [-1,14,14,32] '''
    with tf.name_scope("pooling2"):
        with tf.device("/fpga:0"):
            pooling2 = tf.nn.max_pool(
                value=conv1,
                ksize=[1, 2, 2, 1],
                strides=[1, 2, 2, 1],
                padding="SAME",
                name="pooling1")
    print("3 step ok!")

    # Layer 3: convolution conv3
    ''' input : [-1,14,14,32] filter : [5,5,64] output : [-1,14,14,64] '''
    with tf.name_scope("conv3"):
        w = get_weights([FILTER3_SIZE, FILTER3_SIZE, FILTER1_NUM, FILTER3_NUM])
        b = get_biases([FILTER3_NUM])
        with tf.device("/fpga:0"):
            conv3_op = tf.nn.conv2d(
                input=pooling2,
                filter=w,
                strides=[1, 1, 1, 1],
                padding="SAME",
                name="conv3_op")
        re3 = tf.nn.bias_add(conv3_op, b)
        with tf.device("/fpga:0"):
            conv3 = tf.nn.relu(re3, name="relu")
    print("4 step ok!")

    # Layer 4: max-pooling pooling4
    ''' input : [-1,14,14,64] output : [-1,7,7,64] '''
    with tf.name_scope("pooling4"):
        with tf.device("/fpga:0"):
            pooling4 = tf.nn.max_pool(
                value=conv3,
                ksize=[1, 2, 2, 1],
                strides=[1, 2, 2, 1],
                padding="SAME",
                name="pooling4")

    # Flatten the pooled feature map.
    ''' input : [-1,7,7,64] output : [-1,3136] '''
    pooling4_flat = tf.reshape(pooling4, [-1, FLAT_SIZE])
    print("5 step ok!")

    # Layer 5: fully-connected fc5 (with dropout and L2 regularization)
    ''' input : [-1,3136] output : [-1,512] '''
    with tf.name_scope("fc5"):
        w = get_weights([FLAT_SIZE, FC5_SIZE])
        b = get_biases([FC5_SIZE])
        xw_res = tf.nn.xw_plus_b(pooling4_flat, w, b, name="fc5")
        with tf.device("/fpga:0"):
            fc5 = tf.nn.relu(xw_res, name="relu")
        fc5_drop = tf.nn.dropout(fc5, dropout_keep_prob)
        l2_loss += tf.nn.l2_loss(w) + tf.nn.l2_loss(b)
    print("6 step ok!")

    # Layer 6: fully-connected output layer
    ''' input : [-1,512] output : [-1,10] '''
    with tf.name_scope("fc6"):
        w = get_weights([FC5_SIZE, OUTPUT_SIZE])
        b = get_biases([OUTPUT_SIZE])
        y_hat = tf.nn.xw_plus_b(fc5_drop, w, b, name="y_hat")
        l2_loss += tf.nn.l2_loss(w) + tf.nn.l2_loss(b)
    print("7 step ok!")

    # Softmax cross-entropy loss plus weighted L2 penalty.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=y_hat, labels=input_y)
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + L2NORM_RATE * l2_loss
    print("8 step ok!")

    # Accuracy over the batch.
    correct_predictions = tf.equal(tf.argmax(y_hat, 1), tf.argmax(input_y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

    global_step = tf.Variable(0, trainable=False)
    train_op = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss, global_step=global_step)
    print("9 step ok!")

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for i in range(TRAIN_STEP):
            xs_pre, ys = mnist.train.next_batch(BATCH_SIZE)
            xs = np.reshape(xs_pre, [-1, INPUT_SIZE, INPUT_SIZE, 1])
            feed_dict = {
                input_x: xs,
                input_y: ys,
                dropout_keep_prob: 0.5
            }
            _, step, train_loss, train_acc = sess.run(
                [train_op, global_step, loss, accuracy], feed_dict=feed_dict)
            if i % 2 == 0:
                print("step:{} ,train loss:{:g}, train_acc:{:g}".format(step, train_loss, train_acc))
                # NOTE(review): logged without an explicit step; confirm
                # Polyaxon attaches its own step, otherwise values may overwrite.
                experiment.log_metrics(loss=train_loss, accuracy=train_acc)

        # Evaluate on the first 100 test images only.
        test_x = np.reshape(mnist.test.images[0:100], [-1, INPUT_SIZE, INPUT_SIZE, 1])
        test_y = mnist.test.labels[0:100]
        feed_test = {
            input_x: test_x,
            input_y: test_y,
            dropout_keep_prob: 1.0
        }
        test_loss, test_acc, data = sess.run([loss, accuracy, y_hat], feed_dict=feed_test)
        print("After {} training steps, in test dataset, loss is {:g}, acc is {:g}".format(TRAIN_STEP, test_loss, test_acc))
        experiment.log_metrics(loss=test_loss, accuracy=test_acc)
# Polyaxon experiment.log_data_ref(content=x_train, name='x_train') experiment.log_data_ref(content=y_train, name='y_train') experiment.log_data_ref(content=x_test, name='x_test') experiment.log_data_ref(content=y_test, name='y_test') with tf.Session() as sess: model = create_model(conv1_size=args.conv1_size, conv1_out=args.conv1_out, conv1_activation=args.conv1_activation, pool1_size=args.pool1_size, conv2_size=args.conv2_size, conv2_out=args.conv2_out, conv2_activation=args.conv2_activation, pool2_size=args.pool2_size, fc1_activation=args.fc1_activation, fc1_size=args.fc1_size, optimizer=args.optimizer, log_learning_rate=args.log_learning_rate) sess.run(tf.global_variables_initializer()) train_model(model, x_train, y_train, batch_size=args.batch_size, dropout=args.dropout, epochs=args.epochs) accuracy = evaluate_model(model, x_test, y_test) # Polyaxon experiment.log_metrics(accuracy=accuracy)
def main(_):
    """Train a 784->128->64->10 MLP on MNIST TFRecords under between-graph
    distributed TensorFlow, logging loss to Polyaxon every 100 local steps.

    Relies on module-level `task`, `cluster`, `server`, `is_chief`,
    `batch_size` and `FLAGS` being configured before this runs.
    """
    # BUG FIX: the original assigned `worker_device` with a trailing comma,
    # turning it into a 1-tuple so the log line printed a tuple repr.
    # Compute the device string once and reuse it for the setter and the log.
    worker_device = "/job:%s/task:%d/%s" % (task["type"], task["index"],
                                            FLAGS.device)
    logging.info("worker_device: %s", worker_device)

    with tf.device(
            tf.train.replica_device_setter(worker_device=worker_device,
                                           cluster=cluster)):
        ###
        ### Training
        ###

        # Read training data.
        # image - 784 (=28 x 28) elements of grey-scaled integer value [0, 1]
        # label - digit (0, 1, ..., 9)
        # Data is repeated for 2 epochs; when it is over, OutOfRangeError
        # terminates the monitored session loop below.
        train_queue = tf.train.string_input_producer(
            [FLAGS.train_file],
            num_epochs=2)
        train_reader = tf.TFRecordReader()
        _, train_serialized_exam = train_reader.read(train_queue)
        train_exam = tf.parse_single_example(
            train_serialized_exam,
            features={
                'image_raw': tf.FixedLenFeature([], tf.string),
                'label': tf.FixedLenFeature([], tf.int64)
            })
        train_image = tf.decode_raw(train_exam['image_raw'], tf.uint8)
        train_image.set_shape([784])
        train_image = tf.cast(train_image, tf.float32) * (1. / 255)
        train_label = tf.cast(train_exam['label'], tf.int32)
        # NOTE(review): `batch_size` comes from module scope — confirm it is
        # assigned before main() executes.
        train_batch_image, train_batch_label = tf.train.batch(
            [train_image, train_label], batch_size=batch_size)

        # Polyaxon
        experiment = Run()
        experiment.log_data_ref(content=train_image, name='train_image')
        experiment.log_data_ref(content=train_label, name='train_label')

        # Define the training graph: inputs first.
        plchd_image = tf.placeholder(dtype=tf.float32, shape=(None, 784))
        # BUG FIX: the original used shape=(None), which is just None (no
        # constraint at all); (None,) declares the intended rank-1 label batch.
        plchd_label = tf.placeholder(dtype=tf.int32, shape=(None,))

        # Network and inference
        # (simple 2 fully connected hidden layer : 784->128->64->10)
        with tf.name_scope('hidden1'):
            weights = tf.Variable(
                tf.truncated_normal([784, 128],
                                    stddev=1.0 / math.sqrt(float(784))),
                name='weights')
            biases = tf.Variable(tf.zeros([128]), name='biases')
            hidden1 = tf.nn.relu(tf.matmul(plchd_image, weights) + biases)
        with tf.name_scope('hidden2'):
            weights = tf.Variable(
                tf.truncated_normal([128, 64],
                                    stddev=1.0 / math.sqrt(float(128))),
                name='weights')
            biases = tf.Variable(tf.zeros([64]), name='biases')
            hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
        with tf.name_scope('softmax_linear'):
            weights = tf.Variable(
                tf.truncated_normal([64, 10],
                                    stddev=1.0 / math.sqrt(float(64))),
                name='weights')
            biases = tf.Variable(tf.zeros([10]), name='biases')
            logits = tf.matmul(hidden2, weights) + biases

        # Optimization.
        global_step = tf.train.create_global_step()  # start without checkpoint
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.07)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=plchd_label,
                                                      logits=logits)
        train_op = optimizer.minimize(loss=loss, global_step=global_step)

    # Run the session. When input data is exhausted, OutOfRangeError ends the
    # MonitoredTrainingSession loop.
    with tf.train.MonitoredTrainingSession(master=server.target,
                                           checkpoint_dir=FLAGS.out_dir,
                                           is_chief=is_chief) as sess:
        local_step_value = 0
        run_metadata = tf.RunMetadata()
        # Prime the first batch; each subsequent sess.run both trains on the
        # current batch and fetches the next one.
        array_image, array_label = sess.run(
            [train_batch_image, train_batch_label],
            run_metadata=run_metadata)
        while not sess.should_stop():
            feed_dict = {
                plchd_image: array_image,
                plchd_label: array_label
            }
            _, global_step_value, loss_value, array_image, array_label = sess.run(
                [
                    train_op, global_step, loss,
                    train_batch_image, train_batch_label
                ],
                feed_dict=feed_dict)
            local_step_value += 1
            if local_step_value % 100 == 0:
                # You can also use tf.train.LoggingTensorHook for output
                logging.info("Local Step %d, Global Step %d (Loss: %.2f)",
                             local_step_value, global_step_value, loss_value)
                # Polyaxon
                experiment.log_metrics(step=local_step_value, loss=loss_value)

    print('training finished')
# NOTE(review): this chunk begins inside the per-batch training loop of an
# MXNet + Horovod script; epoch, nbatch, label, output, loss, trainer, metric,
# model, val_data, context, tic, args, hvd and experiment are defined earlier
# in the file.  The first four statements below run per batch; everything from
# the rank-0 speed report onward runs once per epoch — confirm nesting against
# the original file.
loss.backward()
trainer.step(args.batch_size)
metric.update([label], [output])
if nbatch % 100 == 0:
    name, acc = metric.get()
    logging.info('[Epoch %d Batch %d] Training: %s=%f' %
                 (epoch, nbatch, name, acc))

# End-of-epoch throughput report (coordinator only).
if hvd.rank() == 0:
    elapsed = time.time() - tic
    speed = nbatch * args.batch_size * hvd.size() / elapsed
    logging.info('Epoch[%d]\tSpeed=%.2f samples/s\tTime cost=%f',
                 epoch, speed, elapsed)

# Evaluate model accuracy
_, train_acc = metric.get()
name, val_acc = evaluate(model, val_data, context)
if hvd.rank() == 0:
    logging.info('Epoch[%d]\tTrain: %s=%f\tValidation: %s=%f',
                 epoch, name, train_acc, name, val_acc)
    # Polyaxon — per-epoch train/validation accuracy.
    experiment.log_metrics(step=epoch, train_acc=train_acc, val_acc=val_acc)

# Final-epoch sanity check; also re-logs the last validation accuracy.
if hvd.rank() == 0 and epoch == args.epochs - 1:
    # Polyaxon
    experiment.log_metrics(val_acc=val_acc)
    assert val_acc > 0.96, "Achieved accuracy (%f) is lower than expected\
 (0.96)" % val_acc