# Shared imports used by the functions below.
import functools
import os
from collections import defaultdict
from hashlib import md5

import numpy as np
import tensorflow as tf
from tqdm import tqdm


def get_dataset_regress(dataset_path: str, batch_size: int, buffer_size: int,
                        indices, shuffle=False, partial=False):
    assert isinstance(batch_size, int) and batch_size > 0
    assert isinstance(buffer_size, int) and buffer_size > 0
    assert indices is not None and len(indices) > 0

    # Keep only the records selected by `indices` (e.g. one side of a train/test split).
    records = np.array(get_records(dataset_path, partial))
    records = list(records[indices])
    shape = load_shape(dataset_path)

    dataset = (tf.data.TFRecordDataset(records)
               .map(functools.partial(_decode, shape)))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(records))
    return dataset.batch(batch_size).prefetch(buffer_size)
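# Usage sketch for get_dataset_regress (hypothetical path and record count,
# not from the source): split the records 90/10 by index, then build a
# pipeline over each side of the split.
n_records = 1000  # assumed; in practice use len(get_records(path, partial))
test_idx = np.random.choice(n_records, n_records // 10, replace=False)
train_idx = np.setdiff1d(np.arange(n_records), test_idx)

train_ds = get_dataset_regress("data/my_dataset", batch_size=32,
                               buffer_size=4, indices=train_idx)
test_ds = get_dataset_regress("data/my_dataset", batch_size=32,
                              buffer_size=4, indices=test_idx)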
def standardize(dataset: str):
    """
    Standardizes a dataset by the voxelwise mean and global standard deviation
    computed over its train split, saving both alongside the dataset shape.

    :param dataset: Name of an existing dataset containing a train split.
    :return: None. Writes shape, mean, and std to the standardized dataset path.
    """
    assert isinstance(dataset, str) and len(dataset)

    tf.enable_eager_execution()

    train_path = _util.get_rel_datasets_path(dataset, "train")
    _util.ensure_dir(train_path)

    dataset_path = _util.get_rel_datasets_path(dataset)
    standardized_name = _get_standardized_name(dataset)
    standardized_path = _util.get_rel_datasets_path(standardized_name)
    # _util.ensure_path_free(standardized_path, empty_ok=True)
    # _util.mkdir(standardized_path)

    train_data = _dataset.get_dataset(train_path, partial=True)
    train_iter = train_data.repeat().make_one_shot_iterator()
    train_records = _dataset.get_records(train_path, partial=True)

    # First pass: compute the voxelwise sample mean over the train split.
    total = train_iter.next()[0][0]
    for _ in tqdm(train_records[1:]):
        sample = train_iter.next()
        total += sample[0][0]
    mean = total / len(train_records)

    # Second pass: accumulate squared deviations from the mean.
    total = tf.square(train_iter.next()[0][0] - mean)
    for _ in tqdm(train_records[1:]):
        sample = train_iter.next()
        scan = sample[0][0]
        total += tf.square(scan - mean)
    # Average over samples as well as voxels before taking the square root.
    std = tf.sqrt(tf.reduce_mean(total / len(train_records)))

    _standardize_dataset(train_path, dataset, mean, std)
    _standardize_dataset(_util.get_rel_datasets_path(dataset, "dev"), dataset, mean, std)
    _standardize_dataset(_util.get_rel_datasets_path(dataset, "test"), dataset, mean, std)

    _dataset.save_shape(standardized_path, _dataset.load_shape(dataset_path))
    _dataset.save_mean(standardized_path, mean.numpy())
    _dataset.save_std(standardized_path, std.numpy())
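# For reference, the two-pass streaming computation above matches this
# in-memory NumPy sketch (hypothetical `scans` array of shape [N, *volume]):
scans = np.random.rand(8, 16, 16, 16).astype(np.float32)  # assumed shape

mean = scans.mean(axis=0)                    # voxelwise mean volume
std = np.sqrt(np.mean((scans - mean) ** 2))  # single global scalar std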
def predict(dataset: str, encoder_weights: str, model: str, model_weights: str):
    """
    Assess regression model performance.

    :param dataset: Name of dataset over which to test.
    :param encoder_weights: Path to trained encoder weights.
    :param model: Model type to use for regression.
    :param model_weights: Path to trained regression weights.
    """
    assert isinstance(dataset, str) and len(dataset)
    assert isinstance(encoder_weights, str) and len(encoder_weights)
    assert isinstance(model, str) and len(model)
    assert isinstance(model_weights, str) and len(model_weights)

    model = _get_model(model)

    if not os.path.isabs(encoder_weights):
        encoder_weights = _util.get_rel_weights_path(encoder_weights)
    _util.ensure_dir(os.path.dirname(encoder_weights))

    if not os.path.isabs(model_weights):
        model_weights = _util.get_rel_weights_path(model_weights)
    _util.ensure_dir(os.path.dirname(model_weights))

    test_dataset = _dataset.get_dataset_by_name(
        os.path.join(dataset, "test"), partial=True).batch(1)
    shape = _dataset.load_shape(_util.get_rel_datasets_path(dataset))

    label = tf.placeholder(dtype=tf.float32, shape=[None, 1])
    features = tf.placeholder(dtype=tf.float32, shape=[None, len(_hcp.FEATURES)])

    config = tf.ConfigProto(device_count={'GPU': 0})
    with tf.Session(config=config) as sess:
        # Define input, output, and intermediate operation.
        encoder, (scan, code) = _tt_utils.load_encoder(sess, encoder_weights, 1, shape)

        _logger.info("Counting dataset...")
        test_batches = _tt_utils.dataset_iter_len(
            sess, test_dataset.make_one_shot_iterator().get_next())
        _logger.info("\tTest samples: {}".format(test_batches))

        model(model_weights, sess, encoder, scan, features, code, label,
              test_dataset, test_batches)
def train(dataset: str, epochs: int, batch_size: int, buffer_size: int, lr: float,
          l2_reg=0., tv_reg=0., ssim_loss=0., sobel_loss=0.):
    """
    Trains an Autoencoder using the specified parameters.

    :param dataset: Existing dataset over which to train.
        Must contain train, dev, {mean,std}.pickle, shape.json
    :param epochs: Number of iterations over training data before termination.
    :param batch_size: Number of training samples per batch.
    :param buffer_size: Number of batches to prefetch.
    :param lr: Adam optimization initial learning rate.
    :param l2_reg: L2 regularization coefficient for kernel weights.
    :param tv_reg: Total Variation regularization coefficient for data.
    :param ssim_loss: SSIM regularization coefficient for data.
    :param sobel_loss: L2 regularization coefficient for data Sobel difference.
    """
    assert isinstance(dataset, str) and len(dataset)
    assert isinstance(epochs, int) and epochs > 0
    assert isinstance(batch_size, int) and batch_size > 0
    assert isinstance(buffer_size, int) and buffer_size > 0
    assert isinstance(lr, float) and lr > 0
    assert isinstance(l2_reg, float) and l2_reg >= 0
    assert isinstance(tv_reg, float) and tv_reg >= 0
    assert isinstance(ssim_loss, float) and ssim_loss >= 0
    assert isinstance(sobel_loss, float) and sobel_loss >= 0

    # Load and ensure required paths.
    weights_path = _util.get_weights_path_by_param(
        model="autoencoder",
        dataset=dataset,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        l2_reg=l2_reg,
        tv_reg=tv_reg,
        ssim_loss=ssim_loss,
        sobel_loss=sobel_loss)
    log_path = os.path.join(weights_path, "logs")
    _util.ensure_path_free(log_path, empty_ok=True)
    _util.mkdir(log_path)

    dataset_path = _util.get_rel_datasets_path(dataset)
    _util.ensure_dir(dataset_path)

    # Load model and input shape.
    shape = _dataset.load_shape(dataset_path)
    mean = _dataset.load_mean(dataset_path)
    std = _dataset.load_std(dataset_path)
    model = Autoencoder(l2_reg)

    # Create input/output placeholders.
    inp = tf.image.per_image_standardization(
        tf.placeholder(tf.float32, shape=[None, *shape]))
    out = model.call(inp)

    # Initialize loss functions.
    total_loss, l2_loss, l2_reg, tv_reg, ssim_loss, sobel_loss = \
        _get_losses(inp, out, batch_size, model.losses,
                    l2_reg, tv_reg, ssim_loss, sobel_loss)

    # Configure training operation.
    train_op = _get_train_op(total_loss, lr)

    # Load datasets.
    train_dataset = (_dataset.get_dataset(
        os.path.join(dataset_path, "train"), partial=True)
        .map(_only_cropped_scan).batch(batch_size).prefetch(buffer_size))
    dev_dataset = (_dataset.get_dataset(
        os.path.join(dataset_path, "dev"), partial=True)
        .map(_only_cropped_scan).batch(batch_size).prefetch(buffer_size))

    # Setup logging and weight saving.
    _tboard.configure(log_path, flush_secs=2)
    saver = tf.train.Saver()

    # Initialize training loop variables.
    best_dev_loss, dev_loss = np.inf, np.inf

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        _logger.info("Counting datasets...")
        train_batches = dataset_iter_len(
            sess, train_dataset.make_one_shot_iterator().get_next())
        _logger.info("\tTrain samples: {}".format(train_batches))
        dev_batches = dataset_iter_len(
            sess, dev_dataset.make_one_shot_iterator().get_next())
        _logger.info("\tDev samples: {}".format(dev_batches))

        train_loss = total_loss / train_batches
        dev_loss = total_loss / dev_batches

        # Re-create the train dataset: counting exhausted its one-shot iterator.
        train_dataset = (_dataset.get_dataset(
            os.path.join(dataset_path, "train"), partial=True)
            .map(_only_cropped_scan).batch(batch_size).prefetch(buffer_size))

        for epoch in tqdm(range(epochs)):
            train_iter = train_dataset.make_one_shot_iterator().get_next()

            losses = defaultdict(float)
            for _ in range(train_batches):
                sample = sess.run(train_iter)
                _, _train_loss, _l2_loss, _l2_reg, _tv_reg, _ssim_loss, _sobel_loss = \
                    sess.run(
                        [train_op, train_loss, l2_loss, l2_reg, tv_reg, ssim_loss, sobel_loss],
                        feed_dict={inp: sample})
                losses["train/loss/total"] += _train_loss
                losses["train/loss/l2_loss"] += _l2_loss
                losses["train/reg/l2"] += _l2_reg
                losses["train/reg/tv"] += _tv_reg
                losses["train/loss/ssim"] += _ssim_loss
                losses["train/loss/sobel"] += _sobel_loss

            # Increment before doing anything else to avoid zero-indexed epochs.
            epoch += 1

            # Log training losses to tensorboard.
            for name, val in losses.items():
                _tboard.log_value(name, val, step=epoch)
            _logger.info("Epoch {}: train loss {}".format(
                epoch, losses["train/loss/total"]))

            # Compute dev metrics every 2 epochs.
            if epoch < 2 or epoch % 2 == 0:
                losses.clear()

                # Compute and log dev loss.
                _dev_loss, _l2_loss, _l2_reg, _tv_reg, _ssim_loss, _sobel_loss = \
                    _get_dev_loss(sess, inp, dev_dataset, dev_batches, dev_loss,
                                  l2_loss, l2_reg, tv_reg, ssim_loss, sobel_loss)

                # Log dev losses to tensorboard.
                _logger.info("Epoch {}: dev loss {}".format(epoch, _dev_loss))
                _tboard.log_value("dev/loss/total", _dev_loss, step=epoch)
                _tboard.log_value("dev/loss/l2_loss", _l2_loss, step=epoch)
                _tboard.log_value("dev/reg/l2", _l2_reg, step=epoch)
                _tboard.log_value("dev/reg/tv", _tv_reg, step=epoch)
                _tboard.log_value("dev/loss/ssim", _ssim_loss, step=epoch)
                _tboard.log_value("dev/loss/sobel", _sobel_loss, step=epoch)

                # Save best model.
                if _dev_loss < best_dev_loss:
                    save_path = saver.save(
                        sess, os.path.join(weights_path, "{}.ckpt".format(epoch)))
                    _logger.info("Saved new best model to {}".format(save_path))
                    best_dev_loss = _dev_loss

                # Plot some reconstruction images.
                _log_reconstruction_imgs("eval", sess, dev_dataset, inp, out,
                                         epoch, mean, std)
                _log_reconstruction_imgs("train", sess, train_dataset, inp, out,
                                         epoch, mean, std)
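# The batch-counting helpers used throughout (dataset_iter_len, _iter_len) are
# not shown in this section; a minimal sketch, assuming they simply exhaust a
# one-shot iterator and count how many batches it yields:
def dataset_iter_len(sess, next_op):
    """Count batches by evaluating `next_op` until the dataset is exhausted."""
    count = 0
    while True:
        try:
            sess.run(next_op)
            count += 1
        except tf.errors.OutOfRangeError:
            return count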
def regression(dataset: str, batch_size: int, encoder_weights: str, lr: float,
               epsilon: float, model: str):
    """
    Creates and trains a regression model with variable batch size.

    :param dataset: Name of dataset over which to train.
    :param batch_size: Number of training samples per batch.
    :param encoder_weights: Path to trained encoder weights.
    :param lr: Model learning rate.
    :param epsilon: Cutoff for training termination.
    :param model: Model type to use for regression.
    """
    assert isinstance(dataset, str) and len(dataset)
    assert isinstance(batch_size, int) and batch_size > 0
    assert isinstance(encoder_weights, str) and len(encoder_weights)
    assert isinstance(lr, float) and lr > 0
    assert isinstance(epsilon, float) and epsilon > 0
    assert isinstance(model, str) and len(model)

    model_name = model
    model = _get_model(model)

    if not os.path.isabs(encoder_weights):
        encoder_weights = _util.get_rel_weights_path(encoder_weights)
    _util.ensure_dir(os.path.dirname(encoder_weights))

    # Note: these are weights for THIS model.
    weights_path = _util.get_weights_path_by_param(
        model=model_name,
        dataset=dataset,
        encoder=md5(encoder_weights.encode("ascii")).hexdigest(),
        lr=lr,
        batch_size=batch_size,
        epsilon=epsilon,
    )
    log_path = os.path.join(weights_path, "logs")

    train_dataset = _dataset.get_dataset_by_name(
        os.path.join(dataset, "train"), partial=True).batch(
        batch_size, drop_remainder=True)
    dev_dataset = _dataset.get_dataset_by_name(
        os.path.join(dataset, "dev"), partial=True).batch(
        batch_size, drop_remainder=True)
    shape = _dataset.load_shape(_util.get_rel_datasets_path(dataset))

    label = tf.placeholder(dtype=tf.float32, shape=[None, 1])
    features = tf.placeholder(dtype=tf.float32, shape=[None, len(_hcp.FEATURES)])

    _tboard.configure(log_path, flush_secs=2)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Define input, output, and intermediate operation.
        encoder, (scan, code) = _tt_utils.load_encoder(sess, encoder_weights,
                                                       batch_size, shape)

        _logger.info("Counting datasets...")
        train_batches = _tt_utils.dataset_iter_len(
            sess, train_dataset.make_one_shot_iterator().get_next())
        _logger.info("\tTrain samples: {}".format(train_batches))
        dev_batches = _tt_utils.dataset_iter_len(
            sess, dev_dataset.make_one_shot_iterator().get_next())
        _logger.info("\tDev samples: {}".format(dev_batches))

        model(sess, encoder, scan, code, features, label, epsilon,
              train_dataset, train_batches, dev_dataset, dev_batches,
              lr, weights_path)
def train(dataset: str, weights: str, epochs=1000, batch_size=64, grad_norm=1000,
          buffer_size=8, lr=1e-3, l2_reg=1e-1, tv_reg=1e-2, partial=False):
    """
    Trains an Autoencoder using the specified parameters.

    :param dataset: Existing dataset over which to train.
    :param weights: Name of the weights directory for checkpoints and logs.
    :param epochs: Number of iterations over training data before termination.
    :param batch_size: Number of training samples per batch.
    :param grad_norm: Global norm at which to clip gradients; 0 disables clipping.
    :param buffer_size: Number of batches to prefetch.
    :param lr: Adam optimization initial learning rate.
    :param l2_reg: L2 regularization coefficient for kernel weights.
    :param tv_reg: Total Variation regularization coefficient for data.
    :param partial: Whether to load only partially-available records.
    :return: None
    """
    assert isinstance(dataset, str) and len(dataset)
    assert isinstance(weights, str) and len(weights)
    assert isinstance(epochs, int) and epochs > 0
    assert isinstance(batch_size, int) and batch_size > 0
    assert isinstance(grad_norm, int) and grad_norm >= 0
    assert isinstance(buffer_size, int) and buffer_size > 0
    assert isinstance(lr, float) and lr > 0
    assert isinstance(l2_reg, float) and l2_reg >= 0
    assert isinstance(tv_reg, float) and tv_reg >= 0
    assert isinstance(partial, bool)

    # Load and ensure required paths.
    weights_path = _get_weights_path(weights)
    log_path = _get_log_path(weights)

    dataset_path = _util.get_rel_datasets_path(dataset)
    _util.ensure_dir(dataset_path)

    # Load model and input shape.
    shape = load_shape(dataset_path)
    model = Autoencoder(l2_reg)

    # Create input/output placeholders.
    inp = tf.placeholder(tf.float32, shape=[None, *shape])
    out = model.call(inp)

    # Initialize loss: L2 reconstruction, kernel regularization, and TV.
    loss = tf.nn.l2_loss(inp - out)
    if l2_reg > 0:
        loss += tf.add_n(model.losses)
    if tv_reg > 0:
        loss += tv_reg * tf.reduce_sum(total_variation_5d(tf.expand_dims(out, 4)))

    # Configure training operation.
    train_op = _get_train_op(loss, lr, grad_norm)

    # Load datasets.
    train_dataset = _get_dataset(os.path.join(dataset_path, "train"),
                                 batch_size, buffer_size, partial)
    dev_dataset = _get_dataset(os.path.join(dataset_path, "dev"),
                               batch_size, buffer_size, partial)

    # Setup logging and weight saving.
    _tboard.configure(log_path, flush_secs=5)
    saver = tf.train.Saver()

    # Initialize training loop variables.
    best_dev_loss, dev_loss = np.inf, np.inf

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        _logger.info("Counting datasets...")
        train_batches = _iter_len(
            sess, train_dataset.make_one_shot_iterator().get_next())
        _logger.info("\tTrain samples: {}".format(train_batches))
        dev_batches = _iter_len(
            sess, dev_dataset.make_one_shot_iterator().get_next())
        _logger.info("\tDev samples: {}".format(dev_batches))

        for epoch in tqdm(range(epochs)):
            train_iter = train_dataset.make_one_shot_iterator().get_next()

            train_loss = 0
            for _ in range(train_batches):
                _, new_train_loss = sess.run(
                    [train_op, loss], feed_dict={inp: sess.run(train_iter)})
                train_loss += new_train_loss

            # Increment before doing anything else to avoid zero-indexed epochs.
            epoch += 1
            _tboard.log_value("epoch", epoch, step=epoch)

            train_loss /= train_batches * batch_size
            _logger.info("Epoch {}: train {}".format(epoch, train_loss))
            _tboard.log_value("train loss", train_loss, step=epoch)

            if epoch % 20 == 0:
                # Compute and log dev loss.
                new_dev_loss = _get_dev_loss(sess, inp, dev_dataset, dev_batches,
                                             batch_size, loss)
                _logger.info("Epoch {}: dev {} diff {}".format(
                    epoch, new_dev_loss, dev_loss - new_dev_loss))
                dev_loss = new_dev_loss

                if dev_loss < best_dev_loss:
                    save_path = saver.save(
                        sess, os.path.join(weights_path, "{}.ckpt".format(epoch)))
                    _logger.info("Saved new best model to {}".format(save_path))
                    best_dev_loss = new_dev_loss

                # Plot some reconstruction images.
                _log_reconstruction_imgs("eval", sess, dev_dataset, inp, out,
                                         epoch, weights_path)
                _log_reconstruction_imgs("train", sess, train_dataset, inp, out,
                                         epoch, weights_path)
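# The train-op helper is not shown in this section. A minimal sketch of what
# _get_train_op(loss, lr, grad_norm) might look like, assuming Adam with
# optional global-norm gradient clipping (an assumption inferred from the
# grad_norm parameter, not confirmed by the source):
def _get_train_op(loss, lr, grad_norm=0):
    """Adam step; clips gradients to `grad_norm` (global norm) when non-zero."""
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    if grad_norm:
        grads, variables = zip(*optimizer.compute_gradients(loss))
        grads, _ = tf.clip_by_global_norm(grads, grad_norm)
        return optimizer.apply_gradients(zip(grads, variables))
    return optimizer.minimize(loss)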
# NOTE: the original definition line (and function name) is not shown in this
# section; the signature below is reconstructed from the asserts that follow.
def train_svm(dataset: str, batch_size: int, test_size: int, buffer_size: int,
              lr: float, eps: float, partial=False):
    """
    Trains a linear SVM regressor given a dataset, the train and test
    batch size, the learning rate, and the threshold for acceptable error.
    """
    # Make sure that all necessary variables are in the necessary forms.
    assert isinstance(dataset, str) and len(dataset)
    assert isinstance(batch_size, int) and batch_size > 0
    assert isinstance(test_size, int) and test_size > 0
    assert isinstance(buffer_size, int) and buffer_size > 0
    assert isinstance(lr, float) and lr > 0
    assert isinstance(eps, float) and eps > 0
    assert isinstance(partial, bool)

    # Get dataset path.
    dataset_path = _util.get_rel_datasets_path(dataset)
    _util.ensure_dir(dataset_path)
    shape = load_shape(dataset_path)

    # Subset data into randomly shuffled train and test sets.
    # For now, hardcode the number of files for the train/test split:
    # 110 of 1096 records is a 10% validation split.
    testset = np.random.choice(1096, 110, replace=False)
    test_set = get_dataset_regress(dataset_path, test_size, buffer_size,
                                   testset, partial=partial)
    trainset = np.random.permutation(np.delete(np.arange(1096), testset))
    train_set = get_dataset_regress(dataset_path, batch_size, buffer_size,
                                    trainset, partial=partial)

    # Define SVM input variables.
    feat = tf.placeholder(dtype=tf.float32, shape=[None, 1])
    label = tf.placeholder(dtype=tf.float32, shape=[None, 1])
    w = tf.Variable(tf.random_normal(shape=[1, 1]))
    b = tf.Variable(tf.random_normal(shape=[1, 1]))

    # Define output.
    svm_out = tf.add(tf.matmul(feat, w), b)
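    # The snippet ends at the model output. A natural continuation (a sketch,
    # not from the source) would define the epsilon-insensitive SVM regression
    # loss and a gradient-descent step using the `eps` and `lr` parameters:
    # errors smaller than `eps` incur no penalty, larger errors are penalized
    # linearly, and an L2 term on `w` keeps the regressor flat.
    svm_loss = tf.reduce_mean(
        tf.maximum(0., tf.abs(svm_out - label) - eps)) + tf.nn.l2_loss(w)
    svm_train_op = tf.train.GradientDescentOptimizer(lr).minimize(svm_loss)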