def _pipeline(self, config, inputs):
    """Build the forward pass for the configured model, plus optional loss/grads.

    Returns (outputs, loss, grads); loss/grads are None when not applicable.
    """
    # Dispatch table: model name -> forward-builder in `md`.
    forward_by_name = {
        'cbow': md.cbow_forward,
        'rnn': md.rnn_forward,
        'lstm': md.lstm_forward,
        'lstm_gru': md.lstm_fw_gru_bw,
        'gru_lstm': md.gru_fw_lstm_bw,
        'att': md.attention_forward,
        'lstm_att': md.lstm_attention_forward,
        'att2rnn': md.attention_to_rnn_forward,
    }
    if config.model not in forward_by_name:
        raise NotImplementedError()
    model_func = forward_by_name[config.model]
    self.variables, outputs = model_func(config, inputs)
    loss, grads = None, None
    if config.supervised:
        loss = md.get_loss(config, inputs, outputs)
        if config.is_train:
            grads = self.opt.compute_gradients(loss)
    return outputs, loss, grads
def train(): img = tf.placeholder(shape=[config.batch_size, config.Config['min_dim'], config.Config['min_dim'], 3], dtype=tf.float32) #ig = AddCoords(x_dim=512,y_dim=512)(img) anchors_num = sum( [config.Config['feature_maps'][s] ** 2 * config.Config['aspect_num'][s] for s in range(5)]) loc = tf.placeholder(shape=[config.batch_size, anchors_num, 4], dtype=tf.float32) conf = tf.placeholder(shape=[config.batch_size, anchors_num], dtype=tf.float32) pred_loc, pred_confs, vbs = retinanet.model(img,config) train_tensors = get_loss(conf, loc, pred_loc, pred_confs,config) gen = data_gen.get_batch_inception(batch_size=config.batch_size,image_size=config.Config['min_dim'],max_detect=50) global_step = slim.get_or_create_global_step() lr = tf.train.exponential_decay( learning_rate=0.001, global_step=global_step, decay_steps=40000, decay_rate=0.7, staircase=True) tf.summary.scalar('lr', lr) sum_op = tf.summary.merge_all() optimizer = tf.train.MomentumOptimizer(learning_rate=lr,momentum=0.9) train_op = slim.learning.create_train_op(train_tensors, optimizer) vbs = [] for s in slim.get_variables(): print(s.name) if 'resnet_v2_50' in s.name and 'Momentum' not in s.name: print(s.name) vbs.append(s) saver = tf.train.Saver(vbs) def restore(sess): saver.restore(sess, config.check_dir) sv = tf.train.Supervisor(logdir=config.save_dir, summary_op=None, init_fn=restore) with sv.managed_session() as sess: for step in range(200000): print(' '+' '.join(['*']*(step%10))) images, true_box, true_label = q.get() loct, conft = np_utils.get_loc_conf(true_box, true_label, batch_size=config.batch_size,cfg=config.Config) feed_dict = {img: images, loc: loct, conf: conft} ls, step = sess.run([train_op, global_step], feed_dict=feed_dict) if step % 10 == 0: print('step:' + str(step) + ' ' + 'class_loss:' + str(ls[0]) + ' ' + 'loc_loss:' + str(ls[1]) ) summaries = sess.run(sum_op, feed_dict=feed_dict) sv.summary_computed(sess, summaries)
def train():
    """Train the COCO classifier (TF1-era queue-runner pipeline)."""
    global_step = tf.Variable(0, trainable=False)
    dataset = coco_input.get_dataset()
    labels, images = dataset.train_input()
    network = model.Network(is_train=True)
    logits = network.inference(images)
    # Histogram summary for every trainable variable.
    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)
    entropy, loss = model.get_loss(labels, logits)
    lr, opt = get_opt(loss, global_step)
    summary_op = tf.merge_all_summaries()
    #gpu_options = tf.GPUOptions(allow_growth=True)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        init = tf.initialize_all_variables()
        sess.run(init)
        # Optionally warm-start from a pretrained checkpoint.
        if FLAGS.dir_pretrain is not None:
            saver = tf.train.Saver(model.get_pretrain_variables())
            restore_model(saver, sess)
        summary_writer = tf.train.SummaryWriter("log", sess.graph)
        tf.train.start_queue_runners(sess=sess)
        # Saver used for periodic checkpoints below.
        saver = tf.train.Saver(model.get_restore_variables())
        for num_iter in range(1, FLAGS.max_steps + 1):
            start_time = time.time()
            value_entropy, value_loss, value_lr, _ = sess.run(
                [entropy, loss, lr, opt])
            duration = time.time() - start_time
            assert not np.isnan(value_loss), 'Model diverged with loss = NaN'
            if num_iter % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                print(
                    "step = {} entropy = {:.2f} loss = {:.2f} ({:.1f} examples/sec; {:.1f} sec/batch)"
                    .format(num_iter, value_entropy, value_loss,
                            examples_per_sec, sec_per_batch))
            if num_iter % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, num_iter)
            if num_iter % 1000 == 0:
                # BUGFIX: was a Python 2 `print` statement; use the function
                # form like every other print in this block (py2/py3-safe).
                print("lr = {:.2f}".format(value_lr))
                checkpoint_path = os.path.join(FLAGS.dir_parameter, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=num_iter)
def evaluate():
    """Restore the trained segmentation model and evaluate every room in ROOM_PATH_LIST."""
    is_training = False
    with tf.device('/gpu:' + str(GPU_INDEX)):
        pointclouds_pl, labels_pl = placeholder_inputs(BATCH_SIZE, NUM_POINT)
        is_training_pl = tf.placeholder(tf.bool, shape=())
        # simple model
        pred = get_model(pointclouds_pl, is_training_pl)
        loss = get_loss(pred, labels_pl)
        # Softmax over class logits for per-point class probabilities.
        pred_softmax = tf.nn.softmax(pred)
        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
    # Create a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    config.log_device_placement = True
    sess = tf.Session(config=config)
    # Restore variables from disk.
    saver.restore(sess, MODEL_PATH)
    log_string("Model restored.")
    # Tensor handles consumed by eval_one_epoch.
    ops = {
        'pointclouds_pl': pointclouds_pl,
        'labels_pl': labels_pl,
        'is_training_pl': is_training_pl,
        'pred': pred,
        'pred_softmax': pred_softmax,
        'loss': loss
    }
    total_correct = 0
    total_seen = 0
    fout_out_filelist = open(FLAGS.output_filelist, 'w')
    for room_path in ROOM_PATH_LIST:
        # Derive per-room prediction/ground-truth output filenames
        # ([:-4] strips the file extension).
        out_data_label_filename = os.path.basename(
            room_path)[:-4] + '_pred.txt'
        out_data_label_filename = os.path.join(DUMP_DIR, out_data_label_filename)
        out_gt_label_filename = os.path.basename(room_path)[:-4] + '_gt.txt'
        out_gt_label_filename = os.path.join(DUMP_DIR, out_gt_label_filename)
        print(room_path, out_data_label_filename)
        # Accumulate correct/seen counts for overall accuracy.
        a, b = eval_one_epoch(sess, ops, room_path, out_data_label_filename,
                              out_gt_label_filename)
        total_correct += a
        total_seen += b
        fout_out_filelist.write(out_data_label_filename + '\n')
    fout_out_filelist.close()
    log_string('all room eval accuracy: %f' % (total_correct / float(total_seen)))
def main(config):
    """Build loaders, model, loss and trainer from `config`, then run training.

    `config` is a nested dict (dataset/loss/arch/metric sections); relies on the
    module-level `args` for local_rank.
    """
    import torch
    from model import get_model, get_loss, get_converter, get_post_processing
    from metric import get_metric
    from data_loader import get_dataloader
    from tools.rec_trainer import RecTrainer as rec
    from tools.det_trainer import DetTrainer as det

    # Multi-GPU: bring up a NCCL process group for distributed training.
    if torch.cuda.device_count() > 1:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl",
            init_method="env://",
            world_size=torch.cuda.device_count(),
            rank=args.local_rank)
        config['distributed'] = True
    else:
        config['distributed'] = False
    config['local_rank'] = args.local_rank

    train_loader = get_dataloader(config['dataset']['train'], config['distributed'])
    assert train_loader is not None
    if 'validate' in config['dataset']:
        validate_loader = get_dataloader(config['dataset']['validate'], False)
    else:
        validate_loader = None

    criterion = get_loss(config['loss']).cuda()
    if config.get('post_processing', None):
        post_p = get_post_processing(config['post_processing'])
    else:
        post_p = None
    metric = get_metric(config['metric'])

    if config['arch']['algorithm'] == 'rec':
        # Recognition needs a label converter; its alphabet size fixes num_class.
        converter = get_converter(config['converter'])
        config['arch']['num_class'] = len(converter.character)
    else:
        converter = None
    model = get_model(config['arch'])

    # BUGFIX: dispatch trainer classes explicitly instead of eval() on a
    # config-controlled string — eval is unsafe and opaque to readers.
    trainer_classes = {'rec': rec, 'det': det}
    algorithm = config['arch']['algorithm']
    if algorithm not in trainer_classes:
        raise NotImplementedError("unknown algorithm: {}".format(algorithm))
    trainer = trainer_classes[algorithm](
        config=config,
        model=model,
        criterion=criterion,
        train_loader=train_loader,
        post_process=post_p,
        metric=metric,
        validate_loader=validate_loader,
        converter=converter)
    trainer.train()
def test():
    """Restore a pretrained mesh model and run the test routine on src/ref pairs."""
    with tf.Graph().as_default():
        with tf.device('/gpu:0'):
            # Placeholders for the source and reference meshes.
            src_mesh = model.mesh_placeholder_inputs(BATCH_SIZE, MAX_NVERTS, MAX_NTRIS, NUM_POINTS, 'src')
            ref_mesh = model.mesh_placeholder_inputs(BATCH_SIZE, MAX_NVERTS, MAX_NTRIS, NUM_POINTS, 'ref')
            is_training_pl = tf.placeholder(tf.bool, shape=())
            print(is_training_pl)
            print("--- Get model")
            end_points = model.get_model(src_mesh, ref_mesh, NUM_POINTS, is_training_pl)
            # get_loss also augments end_points with loss-related tensors.
            loss, end_points = model.get_loss(end_points, NUM_PART_CATEGORIES)
        # Add ops to save and restore all the variables.
        # Create a session
        # with tf.device('/gpu:0'):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        config.log_device_placement = False
        sess = tf.Session(config=config)
        # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
        # Init variables
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver()
        # Restore the latest checkpoint from PRETRAINED_MODEL_PATH, if any.
        ckptstate = tf.train.get_checkpoint_state(PRETRAINED_MODEL_PATH)
        if ckptstate is not None:
            LOAD_MODEL_FILE = os.path.join(
                PRETRAINED_MODEL_PATH,
                os.path.basename(ckptstate.model_checkpoint_path))
            saver.restore(sess, LOAD_MODEL_FILE)
            print("Model loaded in file: %s" % LOAD_MODEL_FILE)
        else:
            # No checkpoint found — nothing to evaluate.
            print("Fail to load modelfile: %s" % PRETRAINED_MODEL_PATH)
            return
        # Tensor handles consumed by test_().
        ops = {
            'src_mesh': src_mesh,
            'ref_mesh': ref_mesh,
            'is_training_pl': is_training_pl,
            'end_points': end_points
        }
        test_(sess, ops)
def train():
    """Train the point-cloud classifier on the 3D MNIST HDF5 dataset."""
    start = time()
    file_train = h5py.File("data/3DMNIST/train_point_clouds.h5", "r")
    file_test = h5py.File("data/3DMNIST/test_point_clouds.h5", "r")
    # N / NE: number of train / eval samples to load (module-level constants).
    data_train = load_data(file_train, N)
    data_test = load_data(file_test, NE)
    log('Data loaded in %2fs' % (time() - start))
    with tf.Graph().as_default():
        is_training = tf.placeholder(tf.bool, shape=())
        inputs, labels = placeholder(B, N)
        # 10-way classifier (digits 0-9).
        pred = get_model(inputs, is_training, k=10, s=S, use_tnet=TNET, bn_mom=BN_MOM)
        loss = get_loss(pred, labels)
        # LR is fed per-step so it can be decayed mid-training (see epoch 20).
        learning_rate = tf.placeholder(tf.float32, shape=[])
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        # Run batch-norm moving-average updates together with each train step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss)
        saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)
        # Tensor handles consumed by train_one_epoch / eval_one_epoch.
        ops = {
            'inputs': inputs,
            'labels': labels,
            'is_training': is_training,
            'pred': pred,
            'loss': loss,
            'train_op': train_op,
            'learning_rate': learning_rate,
        }
        log('\nStart training\n')
        start = time()
        for ep in range(MAX_EPOCH):
            # One-shot LR decay at epoch 20 (mutates the module-level LR).
            if ep == 20:
                global LR
                LR /= 10
            log("#### EPOCH {:03} ####".format(ep + 1))
            begin = time()
            train_one_epoch(data_train, sess, ops)
            log("---- Time elapsed: {:.2f}s".format(time() - begin))
            eval_one_epoch(data_test, sess, ops)
            # Checkpoint after every epoch (overwrites the same file).
            save_path = saver.save(sess, os.path.join(LOG_DIR, "model.ckpt"))
        log("Total time: {:.2f}s".format(time() - start))
def train():
    """Train the Inception-V2 SSD detector (6 feature maps)."""
    # Image batch placeholder: NHWC, square min_dim input.
    img = tf.placeholder(shape=[
        config.batch_size, config.Config['min_dim'], config.Config['min_dim'], 3
    ], dtype=tf.float32)
    # Total anchors: cells-per-map * aspects-per-cell, over all 6 maps.
    anchors_num = sum([
        config.Config['feature_maps'][s]**2 * config.Config['aspect_num'][s]
        for s in range(6)
    ])
    # Ground-truth regression targets and class labels per anchor.
    loc = tf.placeholder(shape=[config.batch_size, anchors_num, 4],
                         dtype=tf.float32)
    conf = tf.placeholder(shape=[config.batch_size, anchors_num],
                          dtype=tf.float32)
    pred_loc, pred_confs, vbs = inception_500_ince.inception_v2_ssd(img, config)
    train_tensors, sum_op = get_loss(conf, loc, pred_loc, pred_confs, config)
    # NOTE(review): `gen` is unused below; batches come from `q.get()` — confirm.
    gen = data_gen.get_batch_inception(batch_size=config.batch_size,
                                       image_size=config.Config['min_dim'],
                                       max_detect=50)
    optimizer = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
    train_op = slim.learning.create_train_op(train_tensors, optimizer)
    saver = tf.train.Saver(vbs)

    def restore(sess):
        # init_fn for the Supervisor: warm-start the ImageNet backbone weights.
        saver.restore(sess, '/home/dsl/all_check/inception_v2.ckpt')

    sv = tf.train.Supervisor(logdir='/home/dsl/all_check/face_detect/voc-1',
                             summary_op=None, init_fn=restore)
    with sv.managed_session() as sess:
        # Effectively "run forever"; stopped externally.
        for step in range(1000000000):
            images, true_box, true_label = q.get()
            # Encode ground-truth boxes/labels into per-anchor targets.
            loct, conft = np_utils.get_loc_conf(true_box, true_label,
                                                batch_size=config.batch_size,
                                                cfg=config.Config)
            feed_dict = {img: images, loc: loct, conf: conft}
            ls = sess.run(train_op, feed_dict=feed_dict)
            if step % 10 == 0:
                summaries = sess.run(sum_op, feed_dict=feed_dict)
                sv.summary_computed(sess, summaries)
                print(ls)
def train():
    """Train the Inception-V3 SSD detector with exponential LR decay."""
    # Image batch placeholder: NHWC, square min_dim input.
    img = tf.placeholder(shape=[config.batch_size, config.Config['min_dim'], config.Config['min_dim'], 3], dtype=tf.float32)
    # Total anchors: cells-per-map * aspects-per-cell, over all 6 maps.
    anchors_num = sum(
        [config.Config['feature_maps'][s] ** 2 * config.Config['aspect_num'][s] for s in range(6)])
    # Ground-truth regression targets and class labels per anchor.
    loc = tf.placeholder(shape=[config.batch_size, anchors_num, 4], dtype=tf.float32)
    conf = tf.placeholder(shape=[config.batch_size, anchors_num], dtype=tf.float32)
    pred_loc, pred_confs, vbs = inceptionv3_500_ince.inception_v2_ssd(img, config)
    train_tensors = get_loss(conf, loc, pred_loc, pred_confs, config)
    global_step = get_or_create_global_step()
    # Define your exponentially decaying learning rate
    # (x0.7 every 20k steps).
    lr = tf.train.exponential_decay(
        learning_rate=0.001,
        global_step=global_step,
        decay_steps=20000,
        decay_rate=0.7,
        staircase=True)
    tf.summary.scalar('lr', lr)
    sum_op = tf.summary.merge_all()
    # NOTE(review): `gen` is unused below; batches come from `q.get()` — confirm.
    gen = data_gen.get_batch_inception(batch_size=config.batch_size, image_size=config.Config['min_dim'], max_detect=50)
    optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
    train_op = slim.learning.create_train_op(train_tensors, optimizer)
    saver = tf.train.Saver(vbs)

    def restore(sess):
        # init_fn for the Supervisor: warm-start the ImageNet backbone weights.
        saver.restore(sess, '/home/dsl/all_check/inception_v3.ckpt')

    sv = tf.train.Supervisor(logdir='/home/dsl/all_check/face_detect/voc-v32',
                             summary_op=None, init_fn=restore)
    with sv.managed_session() as sess:
        # Effectively "run forever"; stopped externally.
        for step in range(1000000000):
            images, true_box, true_label = q.get()
            # Encode ground-truth boxes/labels into per-anchor targets.
            loct, conft = np_utils.get_loc_conf(true_box, true_label, batch_size=config.batch_size, cfg=config.Config)
            feed_dict = {img: images, loc: loct, conf: conft}
            t = time.time()
            # NOTE: rebinds loop var `step` to the graph's global step value.
            ls, step = sess.run([train_op, global_step], feed_dict=feed_dict)
            if step % 10 == 0:
                print(time.time() - t)
                summaries = sess.run(sum_op, feed_dict=feed_dict)
                sv.summary_computed(sess, summaries)
                print(ls)
def initTF():
    """Build the classification graph, restore the checkpoint, and stash the
    session and tensor handles in module-level globals for later inference."""
    global tf_session, ops
    with tf.device('/gpu:0'):
        pointclouds_pl, labels_pl = model.placeholder_inputs(
            BATCH_SIZE, NUM_POINTS)
        is_training_pl = tf.placeholder(tf.bool, shape=())
        pred, end_points = model.get_model(pointclouds_pl, is_training_pl)
        # Registers loss ops into the 'losses' collection (return value unused).
        model.get_loss(pred, labels_pl, end_points)
        losses = tf.get_collection('losses')
        total_loss = tf.add_n(losses, name='total_loss')
        saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    config.log_device_placement = False
    tf_session = tf.Session(config=config)
    # Hard-coded checkpoint location.
    model_path = "/home/gstavrinos/libs/python/python2.7/pointnet2/log/model.ckpt"
    saver.restore(tf_session, model_path)
    # Handles needed at inference time.
    ops = {
        "pointclouds_pl": pointclouds_pl,
        "is_training_pl": is_training_pl,
        "pred": pred
    }
def evaluate():
    """Evaluate the MNIST model in an endless loop, once per interval."""
    with tf.Graph().as_default() as graph, tf.device("/gpu:0"):
        FLAGS.batch_size = 100
        images, labels = mnist_input.validate_input()
        one_hot = tf.one_hot(labels, 10, dtype=tf.float32)
        net = model.Network()
        logits = net.inference(images)
        # Top-1 hit indicator per example, and the cross-entropy tensor.
        in_top_1 = tf.nn.in_top_k(logits, labels, 1)
        entropy, _ = model.get_loss(one_hot, logits)
        writer = tf.train.SummaryWriter(FLAGS.dir_log, graph)
        # Re-evaluate the latest checkpoint forever.
        while True:
            eval_once(writer, in_top_1, entropy)
            time.sleep(FLAGS.eval_interval_secs)
def _pipeline(self, config, inputs):
    """Run the selected model's forward pass; optionally compute loss/grads.

    Returns (outputs, loss, grads); loss/grads stay None when not applicable.
    """
    dispatch = {
        'cbow': cbow_forward,
        'rnn': rnn_forward,
        'att': attention_forward,
    }
    try:
        forward = dispatch[config.model]
    except KeyError:
        raise NotImplementedError()
    self.variables, outputs = forward(config, inputs)
    loss, grads = None, None
    if config.supervised:
        loss = get_loss(config, inputs, outputs)
        if config.is_train:
            grads = self.opt.compute_gradients(loss)
    return outputs, loss, grads
def evaluate():
    """Continuously evaluate the latest checkpoint on the validation split."""
    with tf.Graph().as_default() as graph, tf.device("/gpu:0"):
        dataset = coco_input.get_dataset()
        labels, images = dataset.validate_input()
        net = model.Network(is_train=False)
        logits = net.inference(images)
        # Cross-entropy tensor, and the per-example top-1 hit indicator.
        entropy, _ = model.get_loss(labels, logits)
        in_top_1 = tf.nn.in_top_k(logits, labels, 1)
        writer = tf.train.SummaryWriter(FLAGS.dir_log_val, graph)
        # Re-evaluate forever, sleeping between passes.
        while True:
            eval_once(writer, in_top_1, entropy)
            time.sleep(FLAGS.eval_interval_secs)
def train():
    """Fit the feature classifier on the full dataset for NUM_EPOCHS epochs."""
    with tf.Graph().as_default():
        features, labels = model.placeholder_inputs(BATCH_SIZE, NUM_FEATURES)
        pred = model.get_model(features)
        # with tf.name_scope('loss') as scope:
        loss = model.get_loss(pred, labels)
        tf.summary.scalar('loss', loss)
        # NOTE(review): tf.metrics.accuracy returns (accuracy_tensor, update_op);
        # the names `total, count` are misleading, and the 'accuracy' summary and
        # the value printed below track the update op — confirm this is intended.
        total, count = tf.metrics.accuracy(labels=tf.to_int64(labels),
                                           predictions=tf.argmax(pred, 1),
                                           name='accuracy')
        tf.summary.scalar('accuracy', count)
        # Get training operator
        optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
        train_op = optimizer.minimize(loss)
        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
        # Create a session
        sess = tf.Session()
        # Add summary writers
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'train'),
                                             sess.graph)
        # Init variables
        init = tf.global_variables_initializer()
        # Local variables are required by tf.metrics internal counters.
        local = tf.local_variables_initializer()
        sess.run(init)
        sess.run(local)
        for epoch in range(NUM_EPOCHS):
            # Full-batch training: the entire dataset is fed each epoch.
            data, label = preprocess.load_data()
            feed_dict = {features: data, labels: label}
            summary, _, loss_val, pred_val, accurate = sess.run(
                [merged, train_op, loss, pred, count], feed_dict=feed_dict)
            train_writer.add_summary(summary, epoch)
            print(accurate)
        save_path = saver.save(sess, os.path.join(LOG_DIR, "model.ckpt"))
        return
def train():
    """Train the NASNet-A-mobile SSD detector (6 feature maps)."""
    # Image batch placeholder: NHWC, square min_dim input.
    img = tf.placeholder(
        shape=[config.batch_size, cfg["min_dim"], cfg["min_dim"], 3],
        dtype=tf.float32
    )
    # Total anchors: cells-per-map * aspects-per-cell, over all 6 maps.
    anchors_num = sum(
        [cfg["feature_maps"][s] ** 2 * cfg["aspect_num"][s] for s in range(6)]
    )
    # Ground-truth regression targets and class labels per anchor.
    loc = tf.placeholder(shape=[config.batch_size, anchors_num, 4], dtype=tf.float32)
    conf = tf.placeholder(shape=[config.batch_size, anchors_num], dtype=tf.float32)
    pred_loc, pred_confs, vbs = mobile.nana_mobile(img, config)
    train_tensors, sum_op = get_loss(conf, loc, pred_loc, pred_confs, config)
    # Batch generator consumed via next(gen) in the loop below.
    gen = data_gen.get_batch(batch_size=config.batch_size, image_size=cfg["min_dim"])
    optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
    train_op = slim.learning.create_train_op(train_tensors, optimizer)
    saver = tf.train.Saver(vbs)

    def restore(sess):
        # init_fn for the Supervisor: warm-start the pretrained NASNet weights.
        saver.restore(sess, "/home/dsl/all_check/nasnet-a_mobile_04_10_2017/model.ckpt")

    sv = tf.train.Supervisor(
        logdir="/home/dsl/all_check/face_detect/nana", summary_op=None, init_fn=restore
    )
    with sv.managed_session() as sess:
        # Effectively "run forever"; stopped externally.
        for step in range(1000000000):
            images, true_box, true_label = next(gen)
            # Encode ground-truth boxes/labels into per-anchor targets.
            loct, conft = np_utils.get_loc_conf(
                true_box, true_label, batch_size=config.batch_size, cfg=cfg
            )
            feed_dict = {img: images, loc: loct, conf: conft}
            ls = sess.run(train_op, feed_dict=feed_dict)
            if step % 10 == 0:
                summaries = sess.run(sum_op, feed_dict=feed_dict)
                sv.summary_computed(sess, summaries)
                print(ls)
def train():
    """Train the MNIST classifier (TF1-era queue-runner pipeline)."""
    global_step = tf.Variable(0, trainable=False)
    image, label = mnist_input.train_input()
    network = model.Network()
    logits = network.inference(image, is_train=True)
    # Histogram summary for every trainable variable.
    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)
    entropy, loss = model.get_loss(label, logits)
    lr, opt = get_opt(loss, global_step)
    saver = tf.train.Saver(tf.trainable_variables())
    summary_op = tf.merge_all_summaries()
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        init = tf.initialize_all_variables()
        sess.run(init)
        summary_writer = tf.train.SummaryWriter("log", sess.graph)
        tf.train.start_queue_runners(sess=sess)
        for num_iter in range(1, 1000000):
            value_entropy, value_loss, value_lr, _ = sess.run(
                [entropy, loss, lr, opt])
            if num_iter % 100 == 0:
                # BUGFIX: was a Python 2 `print` statement; use the function
                # form used elsewhere in this file (py2/py3-safe).
                print("lr = {} entropy = {} loss = {}".format(
                    value_lr, value_entropy, value_loss))
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, num_iter)
            if num_iter % 1000 == 0:
                checkpoint_path = os.path.join(FLAGS.dir_parameter, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=num_iter)
def train_step(train_data, optimizer, dev_data):
    """Run one epoch over `train_data`, then evaluate on `dev_data`.

    Returns the dev-set F1 from dev_step.
    """
    model.train()
    num_batches = 0
    loss_sum = 0
    # Walk the training set in config.batch_size slices.
    for start in range(0, len(train_data), config.batch_size):
        optimizer.zero_grad()
        print("run bactch : % d" % start)
        minibatch = train_data[start:start + config.batch_size]
        sentence_tensor, tags_tensor, length_tensor = get_batch(minibatch)
        loss = model.get_loss(sentence_tensor, tags_tensor, length_tensor)
        loss.backward()
        optimizer.step()
        print("minibatch : %d , loss : %.5f " % (start, loss.item()))
        loss_sum += loss.item()
        num_batches += 1
    print("-------------------------------------------------------------")
    print("avg loss : %.5f" % (loss_sum / num_batches))
    print("-------------------------------------------------------------")
    return dev_step(dev_data)
def model_fn(features, labels, mode, params):
    """TPUEstimator model_fn: builds the loss and an SGD train op.

    PREDICT is rejected; TRAIN returns a TPUEstimatorSpec.
    NOTE(review): any other mode (e.g. EVAL) falls through and returns None —
    confirm EVAL is never requested.
    """
    if mode == tf.estimator.ModeKeys.PREDICT:
        raise RuntimeError("mode {} is not supported yet".format(mode))
    loss = get_loss(features, labels, args)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Exponential decay: x0.96 every 100k global steps.
        learning_rate = tf.compat.v1.train.exponential_decay(
            args.lr,
            tf.compat.v1.train.get_global_step(),
            decay_steps=100000,
            decay_rate=0.96)
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=learning_rate)
        if args.use_tpu:
            # Average gradients across TPU shards.
            optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)
        return tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=optimizer.minimize(loss, tf.compat.v1.train.get_global_step()))
def validation(self, data_valid, step, data, loss_weight_base, value_weight, value_ratio):
    """Compute and log the mean validation loss; returns it.

    Puts the model into eval mode for the pass and restores train mode after.
    `step` is advanced locally per batch and used as the logging step.
    """
    self.model.eval()
    running_valid_loss = 0
    for inp, out, out_real, lens in data_valid:
        loss, y_p = forecast_model.get_loss(inp=inp, out=out, lens=lens,
                                            cuda=True, gn=self.model,
                                            glucose_dat=data,
                                            criterion=self.criterion,
                                            base=loss_weight_base,
                                            out_real=out_real,
                                            value_weight=value_weight,
                                            value_ratio=value_ratio)
        step += 1
        # BUGFIX: `loss.data.cpu().numpy()[0]` indexes a 0-d array and raises
        # on PyTorch >= 0.4; `.item()` is the supported scalar accessor.
        running_valid_loss += loss.item()
    running_valid_loss = running_valid_loss / len(data_valid)
    print('validation loss: {:.3f}'.format(running_valid_loss))
    self.writer.add_scalar(tag='valid_total_loss',
                           scalar_value=running_valid_loss,
                           global_step=step)
    self.model.train()
    return running_valid_loss
def train_sup(self, epoch_lim, data, valid_data, early_stopping_lim,
              batch_size, num_workers, track_embeddings, validation_rate,
              loss_weight_base=1, value_weight=0, value_ratio=0):
    """
    Training loop
    :param epoch_lim: total number of training epochs
    :param data: training data
    :param valid_data: validation data
    :param early_stopping_lim: Number of epochs to run without validation
        improvement before stopping if None, never stop early
    :param batch_size: training batch_size
    :param num_workers: number of CPU workers to use for data loading
    :param track_embeddings: Save out embedding information at end of run
    :param validation_rate: Check validation performance every validation_rate
        training epochs
    :param loss_weight_base: A constant between 0 and 1 used to interpolate
        between Single (=0) and Multi (=1) Step forecasting.
    :param value_weight: A constant multiplier for the real-value loss,
        set to 0 in the paper
    :param value_ratio: The proportion of loss used for the MSE loss term
        (as opposed for the cross-entropy loss), set to 0 in the paper
    :return loss array, model:
    """
    # "Never stop early" is implemented as the largest possible patience.
    if early_stopping_lim is None:
        early_stopping_lim = epoch_lim
    train_sampler = sampler.RandomSampler(np.arange(len(data)))
    data_train = DataLoader(data, batch_size=batch_size,
                            sampler=train_sampler, drop_last=True)
    valid_sampler = sampler.SequentialSampler(np.arange(len(valid_data)))
    data_valid = DataLoader(valid_data, batch_size=batch_size,
                            sampler=valid_sampler)
    step = 0                       # global batch counter, used as logging step
    bsf_loss = np.inf              # best-so-far validation loss
    epochs_without_improvement = 0
    improvements = []
    for epoch in range(epoch_lim):
        if epochs_without_improvement > early_stopping_lim:
            print('Exceeded early stopping limit, stopping')
            break
        # Periodic validation + early-stopping bookkeeping.
        if epoch % validation_rate == 0:
            valid_loss = self.validation(data_valid=data_valid, step=step,
                                         data=data,
                                         loss_weight_base=loss_weight_base,
                                         value_weight=value_weight,
                                         value_ratio=value_ratio)
            (bsf_loss, epochs_without_improvement,
             improvements) = self.manage_early_stopping(
                 bsf_loss=bsf_loss,
                 early_stopping_lim=early_stopping_lim,
                 epochs_without_improvement=epochs_without_improvement,
                 valid_loss=valid_loss,
                 validation_rate=validation_rate,
                 improvements=improvements)
        running_train_loss = 0
        for inp, out, out_real, lens in tqdm(data_train):
            loss, y_p = forecast_model.get_loss(inp=inp, out=out, lens=lens,
                                                cuda=True, gn=self.model,
                                                glucose_dat=data,
                                                criterion=self.criterion,
                                                base=loss_weight_base,
                                                out_real=out_real,
                                                value_weight=value_weight,
                                                value_ratio=value_ratio)
            step += 1
            # NOTE(review): indexing a 0-d array raises on PyTorch >= 0.4;
            # `.item()` is the modern equivalent — confirm target torch version.
            running_train_loss += loss.data.cpu().numpy()[0]
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        running_train_loss = running_train_loss / len(data_train)
        self.writer.add_scalar(tag='train_loss',
                               scalar_value=running_train_loss,
                               global_step=step)
    # Persist the final model once training ends (early-stopped or not).
    torch.save(self.model.state_dict(),
               '{}/final_sup.pt'.format(self.model_dir))
    if track_embeddings:
        self.embed(data_valid, step, embed_batch=100)
    return improvements
def main():
    """Train and evaluate the Netflix ratings autoencoder, then plot losses."""
    args = parse_args()
    print("Params:")
    print(args)
    print()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Ratings placeholders: 17770 movies x variable number of users.
    # X is the (possibly corrupted) input, Y the reconstruction target.
    X = tf.placeholder(tf.float32, [17770, None], name='X')
    Y = tf.placeholder(tf.float32, [17770, None], name='Y')
    # Training autoencoder (with dropout) and a dropout-free eval copy
    # sharing the same weights.
    Yhat, weights = model.autoencoder(X, args.layers,
                                      keep_prob=(1.0 - args.dropout),
                                      constrained=args.constrained)
    YhatDev, weights = model.autoencoder(X, args.layers,
                                         constrained=args.constrained,
                                         weights=weights)
    loss = model.get_loss(Y, Yhat)
    loss_sum, loss_examples = model.get_test_loss(Y, Yhat)
    loss_sum_dev, loss_examples_dev = model.get_test_loss(Y, YhatDev)
    losses = (loss, loss_sum, loss_examples, loss_sum_dev, loss_examples_dev)
    optimizer = model.get_optimizer(args.optimizer_type, args.lr, args.momentum)
    # Pick the small or the full dataset split paths.
    if args.small_dataset:
        train_path = "../data/netflix/output_small_train"
        dev_path = "../data/netflix/output_small_dev"
        test_path = "../data/netflix/output_small_test"
    else:
        train_path = "../data/netflix/output_train"
        dev_path = "../data/netflix/output_dev"
        test_path = "../data/netflix/output_test"
    data_train = data_manager.Data(size=args.chunk_size, batch=args.batch_size,
                                   path=train_path)
    data_dev = data_manager.Data(size=args.chunk_size, batch=args.batch_size,
                                 path=dev_path, test=True)
    data_test = data_manager.Data(size=args.chunk_size, batch=args.batch_size,
                                  path=test_path, test=True)
    train_losses, eval_losses = model.train(
        data_train, data_dev, losses, optimizer, X, Y, Yhat,
        epochs=args.epochs, dense_refeeding=args.dense_refeeding)
    model.test(data_test, X, Y, YhatDev)
    # Plot per-epoch train/dev loss curves.
    t, = plt.plot([i + 1 for i in range(len(train_losses))], train_losses,
                  label="Train")
    e, = plt.plot([i + 1 for i in range(len(eval_losses))], eval_losses,
                  label="Dev")
    plt.legend(handles=[t, e])
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()
    print([i + 1 for i in range(len(train_losses))])
    print(train_losses)
    print([i + 1 for i in range(len(eval_losses))])
    print(eval_losses)
def train(rank, world_size, args):
    """Per-process worker for distributed multi-task BERT training.

    Spawned once per GPU; `rank` selects the device, `world_size` is the
    number of processes. Trains for args.n_epoch epochs, evaluating on the
    dev split and checkpointing (rank 0 only) after each epoch.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = args.port
    dist.init_process_group('nccl', rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    # Training data: one sub-dataset per task, batches drawn task-at-a-time.
    train_data = MultiTaskDataset([get_data(t, 'train') for t in args.tasks])
    train_sampler = DistMultiTaskBatchSampler(train_data, args.batch_size,
                                              drop_last=True, rank=rank,
                                              world_size=world_size)
    train_loader = DataLoader(
        train_data,
        batch_sampler=train_sampler,
        collate_fn=lambda x: collate(tokenizer, x, args.max_length),
        pin_memory=True)
    dev_data = MultiTaskDataset([get_data(t, 'dev') for t in args.tasks])
    dev_sampler = DistMultiTaskBatchSampler(dev_data, args.batch_size,
                                            drop_last=False, rank=rank,
                                            world_size=world_size)
    dev_loader = DataLoader(dev_data, batch_sampler=dev_sampler,
                            collate_fn=lambda x: collate(tokenizer, x),
                            pin_memory=True)
    # One classification head (and one loss) per task.
    model = BertMultiTask([get_n_classes(t) for t in args.tasks],
                          [get_loss(t) for t in args.tasks]).cuda()
    model = DDP(model, device_ids=[rank], find_unused_parameters=True)
    optimizer = torch.optim.Adamax(model.parameters(), args.lr)
    step = 0
    # Only rank 0 writes TensorBoard summaries and checkpoints.
    if rank == 0:
        writer = SummaryWriter(args.log_dir)
    for epoch in range(args.n_epoch):
        # ---- training phase ----
        model.train()
        batch_time = data_time = total_loss = 0
        start = time()
        for b, (inputs, labels, task_id) in enumerate(train_loader):
            step += 1
            inputs = {key: inputs[key].cuda() for key in inputs}
            labels = labels.cuda()
            data_time += time() - start
            logits, loss = model(inputs, task_id, labels)
            # Sum the loss onto rank 0 for logging; divide by world_size
            # to recover the mean.
            dist.reduce(loss, 0)
            total_loss += loss.item() / world_size
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Per-task accuracy accumulators; `task` one-hot marks which
            # task this batch belonged to so reduced counts can be averaged.
            acc = torch.zeros(len(args.tasks)).cuda()
            acc_sub = torch.tensor(0).cuda()
            task = torch.zeros(len(args.tasks)).cuda()
            task[task_id] = 1
            with torch.no_grad():
                # Task id 5 is multi-label (threshold at 0); others are
                # single-label argmax classification.
                if args.tasks[task_id] == 5:
                    correct = (logits > 0) == labels.bool()
                    acc[task_id] = correct.float().mean()
                    # "Subset" accuracy: every label in the example correct.
                    acc_sub = correct.all(dim=1).float().mean()
                else:
                    acc[task_id] = (logits.argmax(
                        dim=1) == labels).float().mean()
            dist.reduce(acc, 0)
            dist.reduce(acc_sub, 0)
            dist.reduce(task, 0)
            if rank == 0:
                writer.add_scalar(f'train/loss', loss.item(), step)
                for i, t in enumerate(args.tasks):
                    if task[i] > 0:
                        name = get_task_name(t)
                        writer.add_scalar(f'train/{name}_acc',
                                          acc[i] / task[i] * 100, step)
                        if t == 5:
                            writer.add_scalar(f'train/{name}_acc_sub',
                                              acc_sub / task[i] * 100, step)
            if (b + 1) % args.print_freq == 0:
                print(
                    f'Epoch {epoch+1} Train: {(b+1):05d}/{len(train_loader):05d} ' \
                    f'Batch {(batch_time/(b+1)):.3f}s Data {(data_time/(b+1)):.3f}s ' \
                    f'Loss {(total_loss/(b+1)):.4f}'
                )
            batch_time += time() - start
            start = time()
        # ---- evaluation phase ----
        model.eval()
        batch_time = data_time = 0
        all_correct = torch.tensor(0).cuda()   # subset-correct count (task 5)
        correct = torch.zeros(len(args.tasks)).cuda()
        total = torch.zeros(len(args.tasks)).cuda()
        # Per-class true/false positives and false negatives, per task (for F1).
        tp = [torch.zeros(get_n_classes(t)).cuda() for t in args.tasks]
        fp = [torch.zeros(get_n_classes(t)).cuda() for t in args.tasks]
        fn = [torch.zeros(get_n_classes(t)).cuda() for t in args.tasks]
        start = time()
        for inputs, labels, task_id in dev_loader:
            inputs = {key: inputs[key].cuda() for key in inputs}
            labels = labels.cuda()
            data_time += time() - start
            with torch.no_grad():
                logits = model(inputs, task_id)
            if args.tasks[task_id] == 5:
                # Multi-label: mean per-label accuracy plus subset accuracy.
                correct[task_id] += ((logits > 0) == labels.bool()).float().mean(
                    dim=1).sum()
                all_correct += ((logits > 0) == labels.bool()).all(
                    dim=1).sum()
                tp[task_id] += ((logits > 0) & labels.bool()).sum(dim=0)
                fp[task_id] += ((logits > 0) & ~labels.bool()).sum(dim=0)
                fn[task_id] += ((logits <= 0) & labels.bool()).sum(dim=0)
            else:
                # Single-label: per-class confusion counts.
                for p, l in zip(logits.argmax(dim=1), labels):
                    if p == l:
                        correct[task_id] += 1
                        tp[task_id][p] += 1
                    else:
                        fp[task_id][p] += 1
                        fn[task_id][l] += 1
            total[task_id] += labels.shape[0]
            batch_time += time() - start
            start = time()
        # Aggregate all eval statistics onto rank 0.
        dist.reduce(all_correct, 0)
        dist.reduce(correct, 0)
        dist.reduce(total, 0)
        for tpl, fpl, fnl in zip(tp, fp, fn):
            dist.reduce(tpl, 0)
            dist.reduce(fpl, 0)
            dist.reduce(fnl, 0)
        if rank == 0:
            l = len(dev_loader)
            str_out = f'Epoch {epoch+1} Dev : {l:05d}/{l:05d} ' \
                      f'Batch {(batch_time/l):.3f}s Data {(data_time/l):.3f}s '
            for i, t in enumerate(args.tasks):
                name = get_task_name(t)
                acc = correct[i] / total[i] * 100
                writer.add_scalar(f'dev/{name}_acc', acc, step)
                # Macro-averaged F1 from per-class tp/fp/fn.
                f1 = (tp[i] / (tp[i] + (fp[i] + fn[i]) / 2)).mean() * 100
                writer.add_scalar(f'dev/{name}_f1', f1, step)
                if t == 5:
                    acc_sub = all_correct / total[i] * 100
                    writer.add_scalar(f'dev/{name}_acc_sub', acc_sub, step)
                    str_out += f'{name} Acc {acc_sub.item():.2f} F1 {f1.item():.2f} '
                else:
                    str_out += f'{name} Acc {acc.item():.2f} F1 {f1.item():.2f} '
            print(str_out)
        if rank == 0:
            # Checkpoint the unwrapped module after each epoch.
            torch.save(model.module.state_dict(),
                       f'{args.log_dir}/epoch_{epoch+1}.pth')
    dist.destroy_process_group()
# ---- Top-level script: build an adversarial-perturbation update op
# (old TF 0.x APIs: axis-first tf.concat, tf.mul). ----
batch_size = 70
images, y_ = input.input_pipeline(filenames, labels, batch_size)
sess = tf.Session()
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
with tf.variable_scope("model") as scope:
    adver_y = model.model(images, False)
# Swap the two label columns so the loss targets the *wrong* class
# (tf.concat with axis as the first argument — TF <= 0.x signature).
shifted_y_ = tf.concat(
    1, [tf.slice(y_, [0, 1], [-1, 1]), tf.slice(y_, [0, 0], [-1, 1])])
adver_loss = model.get_loss(adver_y, shifted_y_)
# Gradient of the adversarial loss w.r.t. the input images.
grad = tf.gradients(adver_loss, images)[0]
#scale_grad = tf.abs(tf.truncated_normal(shape=grad.get_shape(), stddev=.01))
update_prob = .1
update_mag = .01
# Per-example mask: magnitude update_mag with probability (1 - update_prob),
# 0 otherwise.
scale_grad = tf.to_float(
    tf.random_uniform(shape=[batch_size]) > update_prob) * update_mag
grad_shape = grad.get_shape().as_list()
# Broadcast the per-example scale to the full image shape:
# tile over H*W*C, reshape to [H, W, C, batch], then move batch to the front.
scale_grad = tf.tile(scale_grad,
                     [grad_shape[1] * grad_shape[2] * grad_shape[3]])
scale_grad = tf.reshape(scale_grad, grad_shape[1:4] + [batch_size])
scale_grad = tf.transpose(scale_grad, [3, 0, 1, 2])
# FGSM-style step against the gradient sign (tf.mul is the pre-1.0 name).
update = -tf.mul(tf.sign(grad), scale_grad)
num_workers=4) testloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) # ============================ step 2/6 模型 ============================ if args.pointnet: net = get_model(40, normal_channel=args.normal) else: net = cls_3d() net.to(device) # ============================ step 3/6 损失函数 ============================ if args.pointnet: criterion = get_loss() # 负对数似然损失 else: criterion = nn.CrossEntropyLoss() # 选择损失函数 # ============================ step 4/6 优化器 ============================ # 选择优化器 if args.optim == 'sgd': optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9, weight_decay=L2_REG) elif args.optim == 'adagrad': optimizer = optim.Adagrad(net.parameters(), lr=LR, weight_decay=L2_REG) elif args.optim == 'rmsprop': optimizer = optim.RMSprop(net.parameters(), lr=LR,
def train():
    """Train the model on a single GPU.

    Builds the point-cloud segmentation graph (model, loss, accuracy, mIoU,
    optimizer) on the GPU selected by PARAMS["gpu"], then runs
    PARAMS["max_epoch"] epochs: training every epoch, evaluating every 5th,
    checkpointing the best-accuracy model and a rolling model.ckpt every 10th.
    Exits the process when done (terminates the data-stacking worker first).
    """
    with tf.Graph().as_default():
        # Background data-preparation process plus the two data stacks it fills.
        stacker, stack_validation, stack_train = init_stacking()

        with tf.device("/gpu:" + str(PARAMS["gpu"])):
            pointclouds_pl, labels_pl, smpws_pl = model.get_placeholders(
                PARAMS["num_point"], hyperparams=PARAMS)
            is_training_pl = tf.compat.v1.placeholder(tf.bool, shape=())

            # Note the global_step=batch parameter to minimize.
            # That tells the optimizer to helpfully increment the 'batch'
            # parameter for you every time it trains.
            batch = tf.Variable(0)
            bn_decay = get_bn_decay(batch)
            tf.summary.scalar("bn_decay", bn_decay)

            print("--- Get model and loss")
            # Get model and loss
            pred, end_points = model.get_model(
                pointclouds_pl,
                is_training_pl,
                NUM_CLASSES,
                hyperparams=PARAMS,
                bn_decay=bn_decay,
            )
            loss = model.get_loss(pred, labels_pl, smpws_pl, end_points)
            tf.summary.scalar("loss", loss)

            # Compute per-point accuracy (argmax over the class axis).
            correct = tf.equal(tf.argmax(pred, 2),
                               tf.compat.v1.to_int64(labels_pl))
            accuracy = tf.reduce_sum(tf.cast(correct, tf.float32)) / float(
                PARAMS["batch_size"] * PARAMS["num_point"])
            tf.summary.scalar("accuracy", accuracy)

            # Compute mean intersection over union (streaming metric:
            # update_iou_op must be run to accumulate its statistics).
            mean_intersection_over_union, update_iou_op = tf.compat.v1.metrics.mean_iou(
                tf.compat.v1.to_int32(labels_pl),
                tf.compat.v1.to_int32(tf.argmax(pred, 2)), NUM_CLASSES)
            tf.summary.scalar(
                "mIoU", tf.compat.v1.to_float(mean_intersection_over_union))

            print("--- Get training operator")
            # Get training operator
            learning_rate = get_learning_rate(batch)
            tf.summary.scalar("learning_rate", learning_rate)
            if PARAMS["optimizer"] == "momentum":
                optimizer = tf.train.MomentumOptimizer(
                    learning_rate, momentum=PARAMS["momentum"])
            else:
                # Only "momentum" and "adam" are supported.
                assert PARAMS["optimizer"] == "adam"
                optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
            train_op = optimizer.minimize(loss, global_step=batch)

            # Add ops to save and restore all the variables.
            saver = tf.compat.v1.train.Saver()

        # Create a session (soft placement lets CPU-only ops fall back).
        config = tf.compat.v1.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        config.log_device_placement = False
        sess = tf.compat.v1.Session(config=config)

        # Add summary writers
        merged = tf.compat.v1.summary.merge_all()
        train_writer = tf.compat.v1.summary.FileWriter(
            os.path.join(PARAMS["logdir"], "train"), sess.graph)
        validation_writer = tf.compat.v1.summary.FileWriter(
            os.path.join(PARAMS["logdir"], "validation"), sess.graph)

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer())
        sess.run(
            tf.compat.v1.local_variables_initializer())  # important for mIoU

        # Bundle of graph handles passed to the per-epoch train/eval helpers.
        ops = {
            "pointclouds_pl": pointclouds_pl,
            "labels_pl": labels_pl,
            "smpws_pl": smpws_pl,
            "is_training_pl": is_training_pl,
            "pred": pred,
            "loss": loss,
            "train_op": train_op,
            "merged": merged,
            "step": batch,
            "end_points": end_points,
            "update_iou": update_iou_op,
        }

        # Train for hyper_params["max_epoch"] epochs
        best_acc = 0
        for epoch in range(PARAMS["max_epoch"]):
            print("in epoch", epoch)
            print("max_epoch", PARAMS["max_epoch"])

            log_string("**** EPOCH %03d ****" % (epoch))
            sys.stdout.flush()

            # Train one epoch
            train_one_epoch(sess, ops, train_writer, stack_train)

            # Evaluate, save, and compute the accuracy
            if epoch % 5 == 0:
                acc = eval_one_epoch(sess, ops, validation_writer,
                                     stack_validation)
                if acc > best_acc:
                    # New best validation accuracy: keep an epoch-tagged copy.
                    best_acc = acc
                    save_path = saver.save(
                        sess,
                        os.path.join(PARAMS["logdir"],
                                     "best_model_epoch_%03d.ckpt" % (epoch)),
                    )
                    log_string("Model saved in file: %s" % save_path)
                    print("Model saved in file: %s" % save_path)

            # Save the variables to disk.
            if epoch % 10 == 0:
                save_path = saver.save(
                    sess, os.path.join(PARAMS["logdir"], "model.ckpt"))
                log_string("Model saved in file: %s" % save_path)
                print("Model saved in file: %s" % save_path)

        # Kill the data-stacking process, close the log file and exit.
        stacker.terminate()
        LOG_FOUT.close()
        sys.exit()
# Top-level training wiring: queue input -> model -> loss/optimizer/error,
# run under tf.train.Supervisor for automatic init, summaries and checkpoints.
import tensorflow as tf

import input
import model

with tf.variable_scope("input"):
    # NOTE(review): positional arguments (12500, .90, True, path) are
    # opaque here — confirm meanings against input.get_filenames_labels.
    filenames, labels = input.get_filenames_labels(12500, .90, True,
                                                   "../train_preprocessed2")
    # Batched pipeline: batch size 80.
    x, y_ = input.input_pipeline(filenames, labels, 80)
with tf.variable_scope("model") as scope:
    # Second arg True: presumably training mode (the adversarial script in
    # this project passes False) — confirm against model.model.
    y = model.model(x, True)
with tf.variable_scope("optimizer"):
    loss = model.get_loss(y, y_)
    optimizer = model.get_optimizer(loss)
with tf.variable_scope("error"):
    error = model.get_error(y, y_)
saver = tf.train.Saver()
with tf.variable_scope("summary"):
    logs_path = "../logs"
    merged_summary_op = model.get_summary_op(x, loss, error)
# Supervisor writes summaries every 60 s and checkpoints every 600 s to the
# same "../logs" directory as logs_path above.
sv = tf.train.Supervisor(logdir="../logs",
                         init_op=tf.global_variables_initializer(),
                         summary_op=merged_summary_op,
                         saver=saver,
                         save_summaries_secs=60,
                         save_model_secs=600)
def main():
    """Train an RNN on preprocessed songs, then sample new music from it.

    Command line:
        <data directory> <hidden layer size> <min song length> <steps>
        <epochs> <batch_size>

    Exits with status 2 on wrong argument count. Writes generated MIDI via
    sample_midi ('gen_1') plus the seed song for comparison ('base_1').
    """
    if len(sys.argv) != 7:
        print(
            "Usage: {0} <data directory> <hidden layer size> <min song length> <steps> <epochs> <batch_size>"
            .format(sys.argv[0]))
        exit(2)

    path = sys.argv[1]
    hidden_size = int(sys.argv[2])
    min_len = int(sys.argv[3])  # NOTE(review): parsed but never used below
    steps = int(sys.argv[4])
    epochs = int(sys.argv[5])
    batch_size = int(sys.argv[6])

    all_songs = get_songs(path)
    print('Preprocessed Songs')
    total_songs = len(all_songs)
    # Each song is a 2-D array: axis 0 = time steps, axis 1 = features.
    input_size = all_songs[0].shape[1]
    output_size = input_size
    rnn_units = hidden_size
    learning_rate = 0.001
    keep_probability = 0.6  # dropout keep-prob during training
    disp = 1  # log every `disp` epochs
    print(total_songs, input_size)
    print(all_songs[0].shape)

    # Build the graph: placeholders -> RNN -> loss/optimizer/accuracy.
    model_inputs, model_targets, keep_prob, lr = model_placeholders(
        input_size, output_size, steps)
    parameters = model_parameters(output_size, hidden_size)  # w1, b1
    final_outputs, prediction = rnn_layer(model_inputs, parameters, rnn_units,
                                          keep_prob, steps)
    loss = get_loss(final_outputs, model_targets)
    optimizer = get_optimizer(loss, lr)
    accuracy = get_accuracy(model_targets, prediction)

    init = tf.global_variables_initializer()
    session = tf.Session()
    print('Start Training')
    with session as sess:
        sess.run(init)
        for epoch in range(epochs):
            inputs, targets = generate_batches(all_songs, batch_size, steps,
                                               input_size, output_size)
            feed_dict = {
                model_inputs: inputs,
                model_targets: targets,
                keep_prob: keep_probability,
                lr: learning_rate
            }
            sess.run(optimizer, feed_dict=feed_dict)
            if epoch % disp == 0 or epoch == 10:
                l, a = sess.run([loss, accuracy], feed_dict=feed_dict)
                s = 'Epoch: {}, Loss: {:.4f}, Accuracy: {:.3f} \n'.format(
                    epoch, l, a)
                logger(epoch, epochs, s=s)

        # Generate new midi files, seeded with the first `steps` frames of a
        # (usually random) training song.
        get_random = False
        idx = 11 if get_random else np.random.randint(total_songs)
        song = all_songs[idx][:steps].tolist()
        print('Sampling new music')
        for i in range(100):
            # NOTE(review): song[-steps] is a single frame; if the model
            # expects a [1, steps, input_size] window this should probably be
            # song[-steps:] — confirm against model_placeholders.
            initial = np.array([song[-steps]])
            # BUG FIX: feed_dict was the set literal {model_inputs, initial},
            # which raises TypeError in sess.run; it must map the placeholder
            # to its value.
            sample = sess.run(prediction, feed_dict={model_inputs: initial})
            song = sample_music(sample, output_size, song)

        sample_midi(song, name='gen_1')
        sample_midi(all_songs[idx], name='base_1')
def train_neural_network():
    # Train loop (Python 2) with the tfdbg CLI wrapper: builds the graph from
    # queue-fed inputs, trains for EPOCHS x NUM_BATCHES steps, and
    # periodically measures accuracy on a held-out batch.
    tf.reset_default_graph()
    with tf.Session() as sess:
        # Wrap the session so every run() drops into the tfdbg CLI.
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        # initialize lookup table (raw label string/id -> class index)
        table = initialize_lookup_table()

        train_feature_filenames, train_label_filenames = get_filenames()
        with tf.name_scope('raw_inputs'):
            features, raw_labels = input.getFiles(train_feature_filenames,
                                                  train_label_filenames)
        with tf.name_scope('processed_labels'):
            labels = preprocess_labels(raw_labels, table)

        # create_model also returns a separate feed-dict-driven test head.
        output, test_output, test_features, test_labels = model.create_model(
            features, labels)
        with tf.name_scope('loss'):
            loss = model.get_loss(output, labels)
        with tf.name_scope('training_accuracy'):
            training_accuracy = model.compute_accuracy(output, labels)
        with tf.name_scope('dev_accuracy'):
            dev_accuracy = model.compute_accuracy(test_output, test_labels)
        train_step = model.get_optimizer(loss)
        training_fetches = [
            features, raw_labels, labels, output, loss, training_accuracy,
            train_step
        ]

        # initialize variables
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # add graph summary for tensorboard
        writer = tf.summary.FileWriter(constants.TENSORBOARD_DIR, sess.graph)

        # start queue runner for data loading
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        # get dev features. NOTE(review): this draws ONE batch from the same
        # training queue, so the "dev" set is really a training batch —
        # confirm this is intended.
        dev_features, dev_labels = sess.run([features, labels])
        # check if we received the labels correctly or not
        print dev_labels

        for epoch in range(1, constants.EPOCHS + 1):
            for batch in range(1, constants.NUM_BATCHES + 1):
                # train the model (one optimizer step; fetches also pull the
                # tensors used for logging below)
                model_features, model_raw_labels, model_labels, model_output, model_loss, model_accuracy, _ = sess.run(
                    training_fetches)

                print "Epoch {}/{} ; Batch {}/{} ; Accuracy {} ; Loss {}".format(
                    epoch, constants.EPOCHS, batch, constants.NUM_BATCHES,
                    model_accuracy, model_loss)
                print model_output

                # evaluate the accuracy every TEST_PERIOD batches
                if (batch % constants.TEST_PERIOD == 0):
                    mdev_accuracy = sess.run(dev_accuracy,
                                             feed_dict={
                                                 test_features: dev_features,
                                                 test_labels: dev_labels
                                             })
def main():
    """Distributed (or single-machine) TF training entry point.

    Reads data/host configuration from environment variables, exposes them as
    tf.app.flags, builds a tf.data pipeline + model graph on the appropriate
    device, trains with periodic validation via MonitoredTrainingSession, and
    finally evaluates on an independent test set.
    """
    # Training Data
    xtrain = 'Xtrain.txt'
    ytrain = 'Ytrain.txt'
    # Validation Data
    xtest = 'Xtest.txt'
    ytest = 'Ytest.txt'

    # Training Parameters
    batch_size = 500        # Batch size
    num_epochs = 5          # Number epochs
    train_holdout = 0.2     # Portion of training features used for validation
    learning_rate = 0.005   # Starting learning rate
    steps_per_epoch = 50    # Number of training steps per epoch

    # ----- Begin Main Code
    # Get environment variables (absent -> single-machine defaults).
    # NOTE(review): bare except silently maps ANY failure to single-machine
    # mode; also TASK_INDEX arrives as a string while the fallback is int 0 —
    # the is_chief test below compares against int 0. Confirm both.
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    # Get local file paths
    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)
    ROOT_PATH_TO_LOCAL_DATA = os.path.expanduser(LOCAL_DATASET_LOCATION)

    # Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS
    # Flags for environment variables
    flags.DEFINE_string("job_name", job_name, "job name: worker or ps")
    flags.DEFINE_integer("task_index", task_index,
                         "Worker task index, should be >= 0. task_index=0 is "
                         "the chief worker task that performs the variable "
                         "initialization and checkpoint handling")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")
    # Training file flags
    flags.DEFINE_string("xtrain",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = xtrain
                        ),
                        "Path to training dataset.")
    flags.DEFINE_string("ytrain",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = ytrain
                        ),
                        "Path to training dataset.")
    flags.DEFINE_string("log_dir",
                        get_logs_path(root=PATH_TO_LOCAL_LOGS),
                        "Path to store logs and checkpoints.")
    # Validation file flags
    flags.DEFINE_string("xtest",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = xtest
                        ),
                        "Path to testing dataset.")
    flags.DEFINE_string("ytest",
                        get_data_path(
                            dataset_name = "emanrao/variantnn-demo",
                            local_root = ROOT_PATH_TO_LOCAL_DATA,
                            local_repo = LOCAL_DATASET_NAME,
                            path = ytest
                        ),
                        "Path to testing dataset.")
    # Training parameter flags
    # NOTE(review): the bracketed defaults in the help strings ([100], [50],
    # [0.0005]) do not match the actual defaults above.
    flags.DEFINE_integer("batch_size", batch_size, "Batch size [100].")
    flags.DEFINE_integer("num_epochs", num_epochs, "Number epochs [50].")
    flags.DEFINE_float("train_holdout", train_holdout,
                       "Portion of training features withheld from traing and used for validation [0.2].")
    flags.DEFINE_float("learning_rate", learning_rate,
                       "Starting learning rate [0.0005].")
    flags.DEFINE_integer("steps_per_epoch", steps_per_epoch,
                         "Number of training steps per epoch")

    # Configure Distributed Environment
    def device_and_target():
        # Returns (device_setter_or_None, session_master_target).
        # If FLAGS.job_name is not set, we're running single-machine
        # TensorFlow. Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        cluster_spec = tf.train.ClusterSpec({
            "ps": FLAGS.ps_hosts.split(","),
            "worker": FLAGS.worker_hosts.split(","),
        })
        server = tf.train.Server(
            cluster_spec, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
        # Parameter servers block here forever serving variables.
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on separate
        # parameter servers (ps). The non-Variable ops will be placed on the
        # workers.
        return (
            tf.train.replica_device_setter(
                worker_device=worker_device, cluster=cluster_spec),
            server.target,
        )

    device, target = device_and_target()

    # ----- Read Data -----
    # Check Flags
    if FLAGS.log_dir is None or FLAGS.log_dir == "":
        raise ValueError("Must specify an explicit `log_dir`")
    if FLAGS.xtrain is None or FLAGS.xtrain == "":
        raise ValueError("Must specify an explicit `xtrain`")
    if FLAGS.ytrain is None or FLAGS.ytrain == "":
        raise ValueError("Must specify an explicit `ytrain`")
    if FLAGS.xtest is None or FLAGS.xtest == "":
        raise ValueError("Must specify an explicit `xtest`")
    if FLAGS.ytest is None or FLAGS.ytest == "":
        raise ValueError("Must specify an explicit `ytest`")

    print('Training dataset file: ', FLAGS.xtrain)
    print('Training target file: ', FLAGS.ytrain)
    print('Testing dataset file: ', FLAGS.xtest)
    print('Testing target file: ', FLAGS.ytest)
    print('Log Files Saved To: ', FLAGS.log_dir)

    # Read in data
    Xtrain, Ytrain = read_flat_file(FLAGS.xtrain, FLAGS.ytrain)
    Xtest, Ytest = read_flat_file(FLAGS.xtest, FLAGS.ytest)

    # Hold out the LAST train_holdout fraction of the training data for
    # per-epoch validation.
    num_train = int(np.round(Xtrain.shape[0] * (1-FLAGS.train_holdout)))
    num_held = int(Xtrain.shape[0]-num_train)
    print('Training on {:d} features'.format(num_train))
    print('Validating on {:d} features (once per epoch)'.format(num_held))
    Xval = Xtrain[num_train:]
    Yval = Ytrain[num_train:]
    Xtrain = Xtrain[:num_train]
    Ytrain = Ytrain[:num_train]

    num_batches = int(np.floor(Ytrain.shape[0]/FLAGS.batch_size))
    if num_batches==0: # if the requested batch size exceeds the dataset, use one batch
        num_batches=1
        FLAGS.batch_size = Ytrain.shape[0]

    # ----- Define Graph -----
    tf.reset_default_graph()
    with tf.device(device):
        # X_in = tf.placeholder(tf.float32, [None, 15, 4, 3])
        # Y_out = tf.placeholder(tf.float32, [None, 8])
        global_step = tf.train.get_or_create_global_step()

        # Create Datasets
        train_dataset = tf.data.Dataset.from_tensor_slices((Xtrain, Ytrain))
        # train_dataset = train_dataset.shuffle(buffer_size=10000)
        train_dataset = train_dataset.batch(FLAGS.batch_size)
        # train_dataset = train_dataset.repeat(FLAGS.num_epochs)

        val_dataset = tf.data.Dataset.from_tensor_slices((Xval, Yval))
        val_dataset = val_dataset.batch(Yval.shape[0])  # whole val set = 1 batch
        # val_dataset = val_dataset.repeat(FLAGS.num_epochs)

        test_dataset = tf.data.Dataset.from_tensor_slices((Xtest, Ytest))
        test_dataset = test_dataset.batch(FLAGS.batch_size)

        # Create a single reinitializable Iterator shared by all three
        # datasets; the *_init_op below switch which dataset it reads.
        iter = tf.data.Iterator.from_structure(train_dataset.output_types,
                                               train_dataset.output_shapes)
        features, labels = iter.get_next()
        # Create initialisation operations
        train_init_op = iter.make_initializer(train_dataset)
        val_init_op = iter.make_initializer(val_dataset)
        test_init_op = iter.make_initializer(test_dataset)

        # Apply model
        with tf.name_scope('predictions'):
            predictions = get_model(features, FLAGS)
        with tf.name_scope('loss'):
            loss = get_loss(predictions,labels)
            tf.summary.scalar('loss', loss)  # add to tensorboard
        with tf.name_scope('train'):
            train_step = (
                tf.train.AdamOptimizer(FLAGS.learning_rate)
                .minimize(loss, global_step=global_step)
            )

    summ = tf.summary.merge_all()
    writer = tf.summary.FileWriter(FLAGS.log_dir)

    #%% Train Model with periodic validation
    def run_train_epoch(target, FLAGS, epoch_index):
        # Runs one epoch: steps_per_epoch training steps (enforced by the
        # StopAtStepHook against the persistent global_step), then one pass
        # over the validation set on the final step.
        print('Epoch {:d} Training...'.format(epoch_index))
        i=1
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.steps_per_epoch*epoch_index)] # Increment number of required training steps
        scaffold = tf.train.Scaffold(
            local_init_op=[train_init_op, val_init_op],
            saver=tf.train.Saver(max_to_keep=5)
        )
        with tf.train.MonitoredTrainingSession(
            master=target,
            is_chief=(FLAGS.task_index == 0),
            checkpoint_dir=FLAGS.log_dir,
            hooks = hooks,
            scaffold=scaffold
        ) as sess:
            writer.add_graph(sess.graph)
            sess.run(train_init_op) # switch to train dataset
            while not sess.should_stop():
                [current_loss,_,s] = sess.run([loss, train_step, summ])
                iteration = (epoch_index)*FLAGS.steps_per_epoch + i
                print("Iteration {} Training Loss: {:.4f}".format(iteration,current_loss))
                i += 1
                #writer.add_summary(s, i)
                if i==FLAGS.steps_per_epoch: # validate on last step
                    sess.run(val_init_op) # switch to val dataset
                    # Drain the (single-batch) validation dataset.
                    while True:
                        try:
                            # run and save validation parameters
                            v_loss = sess.run(loss)
                            print("Epoch {} Validation Loss: {:.4f}".format(epoch_index, v_loss))
                        except tf.errors.OutOfRangeError:
                            break

    for e in range(1,FLAGS.num_epochs+1):
        run_train_epoch(target, FLAGS,e)

    # ----- Test Model on Different Dataset -----
    with tf.train.MonitoredTrainingSession(
        master=target,
        is_chief=(FLAGS.task_index == 0)
    ) as sess:
        sess.run(test_init_op) # initialize to test dataset
        # Rebinding `loss` to its evaluated value is safe only because this is
        # its final use.
        loss = sess.run(loss)
        print("Test Set Loss (independent dataset): {:.4f}".format(loss))
import numpy as np import tensorflow as tf from model import get_model,get_loss import time batch_size=1 num_point=4096 xyzrgb=9 pointclouds_pl=tf.constant(np.random.rand(batch_size,num_point,xyzrgb),dtype=tf.float32) labels_pl=tf.constant(np.random.rand(batch_size,num_point),dtype=tf.int32) is_training_pl = tf.constant(True, shape=()) bn_decay=None pred = get_model(pointclouds_pl, is_training_pl, bn_decay=bn_decay) loss = get_loss(pred, labels_pl) init=tf.global_variables_initializer() with tf.Session() as sess: sess.run(init) for i in range(10): t1 = time.time() sess.run(pred) # sess.run(loss) t2=time.time() print t2-t1