def load_data(train_path, test_path):
    tr_path, tr_filename = os.path.split(train_path)
    train_dir = get_data_path(
        dataset_name="svenchmie/titanic_data/titanic_train.csv",
        local_root=tr_path,
        local_repo=tr_filename,
        path=''
    )
    # [:-1] strips the trailing character from the resolved path before reading
    train = pd.read_csv(train_dir[:-1], engine="python")
    train = train[FEATURE_CLASSES].dropna(axis=0, how='any')
    train_x, train_y = train, train.pop('survived')

    te_path, te_filename = os.path.split(test_path)
    test_dir = get_data_path(
        dataset_name="svenchmie/titanic_data/titanic_test.csv",
        local_root=te_path,
        local_repo=te_filename,
        path=''
    )
    test = pd.read_csv(test_dir[:-1], engine="python")
    test = test[FEATURE_CLASSES].dropna(axis=0, how='any')
    test_x, test_y = test, test.pop('survived')

    return (train_x, train_y), (test_x, test_y)
def parse_args():
    """Parse arguments"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            description='''Train a convolutional neural network with the MNIST dataset.
                            For distributed mode, you must run this with mpirun. See README.md''')
    # Experiment related parameters
    parser.add_argument('--local_data_root', type=str, default=os.path.join(FILE_DIR, 'data'),
                        help='Path to dataset. This path will be /data on Clusterone.')
    parser.add_argument('--local_log_root', type=str, default=os.path.join(FILE_DIR, 'logs'),
                        help='Path to store logs and checkpoints. This path will be /logs on Clusterone.')
    parser.add_argument('--data_subpath', type=str, default='',
                        help='Which sub-directory the data will sit inside local_data_root (locally) ' +
                             'or /data/ (on Clusterone).')
    # CNN model params
    parser.add_argument('--kernel_size', type=int, default=3,
                        help='Size of the CNN kernels to use.')
    parser.add_argument('--hidden_units', type=str, default='32,64',
                        help='Comma-separated list of integers. Number of hidden units to use in the CNN model.')
    parser.add_argument('--learning_rate', type=float, default=0.01,
                        help='Initial learning rate used in Adam optimizer.')
    parser.add_argument('--learning_decay', type=float, default=0.0001,
                        help='Exponential decay rate of the learning rate per step.')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='Dropout rate used after each convolutional layer.')
    parser.add_argument('--batch_size', type=int, default=512,
                        help='Batch size to use during training and evaluation.')
    # Training params
    parser.add_argument('--verbosity', type=str, default='INFO',
                        choices=['CRITICAL', 'ERROR', 'WARN', 'INFO', 'DEBUG'],
                        help='TF logging level. To see intermediate results printed, set this to INFO or DEBUG.')
    parser.add_argument('--fashion', action='store_true',
                        help='Download and use fashion MNIST data instead of the default handwritten digit MNIST.')
    parser.add_argument('--parallel_batches', type=int, default=2,
                        help='Number of parallel batches to prepare in data pipeline.')
    parser.add_argument('--max_ckpts', type=int, default=2,
                        help='Maximum number of checkpoints to keep.')
    parser.add_argument('--ckpt_steps', type=int, default=100,
                        help='How frequently to save a model checkpoint.')
    parser.add_argument('--save_summary_steps', type=int, default=10,
                        help='How frequently to save TensorBoard summaries.')
    parser.add_argument('--log_step_count_steps', type=int, default=10,
                        help='How frequently to log loss & global steps/s.')
    parser.add_argument('--eval_steps', type=int, default=100,
                        help='How frequently to run evaluation step.')
    parser.add_argument('--max_steps', type=int, default=1000000,
                        help='Maximum number of steps to run.')

    # Parse args
    opts = parser.parse_args()
    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_root,
                                  local_repo='',
                                  path=opts.data_subpath)
    opts.log_dir = get_logs_path(root=opts.local_log_root)
    opts.hidden_units = [int(n) for n in opts.hidden_units.split(',')]
    return opts
def get_args():
    '''Return parsed args'''
    parser = ArgumentParser()
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')
    parser.add_argument('--fashion', type=str2bool, default=False,
                        help='Use Fashion MNIST data')
    # Model params
    parser.add_argument('--cnn', type=str2bool, default=False,
                        help='If true, use CNN. Otherwise, use MLP. Default: False')
    parser.add_argument('--kernel_size', type=int, default=3,
                        help='Ignored if cnn is False')
    parser.add_argument('--hidden_units', type=int, nargs='*', default=[256, 256])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=512)
    # Training params
    parser.add_argument('--eval_secs', type=int, default=120,
                        help='throttle_secs for EvalSpec')

    opts = parser.parse_args()
    opts.data_dir = get_data_path(dataset_name='adrianyi/mnist-data',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)
    return opts
def get_args():
    '''Return parsed args'''
    parser = ArgumentParser()
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')
    # Model params
    parser.add_argument('--hidden_units', type=int, nargs='*', default=[256, 256])
    parser.add_argument('--activation', type=str, default='relu',
                        help='Activation function. See Keras activation functions. Default: relu')
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=128)

    opts = parser.parse_args()
    opts.data_dir = get_data_path(dataset_name='adrianyi/mnist-data',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)
    return opts
def get_args():
    """Return parsed args"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')
    parser.add_argument('--dist', type=str2bool, default=False)
    # Model params
    parser.add_argument('--hidden_units', type=int, nargs='*', default=[32, 64])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--epochs', type=int, default=9999999)
    parser.add_argument('--cuda', type=str2bool, default=None,
                        help='Use CUDA. If left empty, CUDA will be used if available.')
    parser.add_argument('--ckpt_epochs', type=int, default=1)
    # Logging
    parser.add_argument('--log_freq', type=int, default=100,
                        help='Number of steps before saving loss, etc.')
    parser.add_argument('--log_level', type=str, default='info', choices=['info', 'debug'])

    opts = parser.parse_args()
    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)
    # Fall back to auto-detection only when --cuda was not set explicitly
    opts.cuda = torch.cuda.is_available() if opts.cuda is None else opts.cuda
    opts.device = torch.device('cuda' if opts.cuda else 'cpu')
    opts.distributed = n_workers > 1 or opts.dist
    return opts
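# Several of the argument parsers in this collection pass type=str2bool, but the
# helper itself is not shown in any snippet. The implementation below is a minimal
# sketch of a typical version, included as an assumption rather than the original.
from argparse import ArgumentTypeError

def str2bool(v):
    """Parse common true/false spellings passed on the command line."""
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise ArgumentTypeError('Boolean value expected, got %r' % v)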
def get_env(self):
    # Configure distributed task
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    flags = self.flags

    # Flags for configuring the distributed task
    flags.DEFINE_string("job_name", job_name,
                        "job name: worker or ps")
    flags.DEFINE_integer("task_index", task_index,
                         "Worker task index, should be >= 0. task_index=0 is "
                         "the chief worker task that performs the variable "
                         "initialization and checkpoint handling")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")

    # Training related flags
    flags.DEFINE_string("data_dir",
                        get_data_path(
                            dataset_name=self.cloud_user_repo,  # all mounted repos
                            local_root=self.data_path,
                            local_repo=self.local_repo,
                            path=self.cloud_data_path
                        ),
                        "Path to dataset. It is recommended to use get_data_path() "
                        "to define your data directory so that you can switch "
                        "from local to Clusterone without changing your code. "
                        "If you set the data directory manually, make sure to use "
                        "/data/ as the root path when running on ClusterOne cloud.")
    flags.DEFINE_string("log_dir",
                        get_logs_path(root=self.logs_path),
                        "Path to store logs and checkpoints. It is recommended "
                        "to use get_logs_path() to define your logs directory "
                        "so that you can switch from local to Clusterone without "
                        "changing your code. "
                        "If you set your logs directory manually, make sure "
                        "to use /logs/ when running on ClusterOne cloud.")
    self.flags = flags
def get_args():
    """Parse arguments"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            description='''Train a convolutional neural network with the MNIST dataset.
                            For distributed mode, the script will use a few environment variables as defaults:
                            JOB_NAME, TASK_INDEX, PS_HOSTS, and WORKER_HOSTS. These environment variables are
                            available on distributed TensorFlow jobs on the Clusterone platform by default.
                            If running this locally, you will need to set these environment variables or pass
                            them in as arguments (i.e. python mnist.py --job_name worker --task_index 0
                            --worker_hosts "localhost:2222,localhost:2223" --ps_hosts "localhost:2224").
                            If these are not set, the script will run in non-distributed (single instance) mode.''')

    # Configuration for distributed task
    parser.add_argument('--job_name', type=str, default=os.environ.get('JOB_NAME', None),
                        choices=['worker', 'ps'],
                        help='Task type for the node in the distributed cluster. Worker-0 will be set as master.')
    parser.add_argument('--task_index', type=int, default=os.environ.get('TASK_INDEX', 0),
                        help='Worker task index, should be >= 0. task_index=0 is the chief worker.')
    parser.add_argument('--ps_hosts', type=str, default=os.environ.get('PS_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')
    parser.add_argument('--worker_hosts', type=str, default=os.environ.get('WORKER_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')
    # Experiment related parameters
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')
    # Training params
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Initial learning rate used in Adam optimizer.')
    parser.add_argument('--learning_decay', type=float, default=0.001,
                        help='Exponential decay rate of the learning rate per step.')
    parser.add_argument('--batch_size', type=int, default=512,
                        help='Batch size to use during training and evaluation.')

    opts = parser.parse_args()

    # Clusterone snippet: grabs the correct paths, depending on whether the job runs locally or on Clusterone
    opts.data_dir = get_data_path(dataset_name='',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)
    return opts
def get_data_path(dataset_name, local_root, local_repo='', path=''):
    """
    Dataset specification, see: get_data_path,
    https://clusterone.com/documentation/api/#get_data_path

    If local_root starts with gs:// we assume a Google Cloud Storage bucket and
    return local_root / local_repo / path.

    :param str dataset_name: TensorPort dataset repository name, e.g. user_name/repo_name
    :param str local_root: root directory for the dataset,
        e.g. /home/username/datasets, gs://my-project/my_dir
    :param str local_repo: repo name inside the root data path, e.g. my_repo_data/
    :param str path: path inside the repository (optional), e.g. train
    :return str: the real path of the dataset
    """
    if local_root.startswith('gs://'):
        return os.path.join(local_root, local_repo, path)
    return clusterone.get_data_path(dataset_name=dataset_name,
                                    local_root=local_root,
                                    local_repo=local_repo,
                                    path=path)
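# A minimal usage sketch of the wrapper above. The dataset name and local paths
# below are placeholders invented for illustration, not values taken from any of
# the snippets in this collection.
train_csv = get_data_path(
    dataset_name='myuser/my-dataset',       # Clusterone/TensorPort dataset repo
    local_root='/home/myuser/datasets',     # local root when running outside the cloud
    local_repo='my-dataset',                # folder inside local_root
    path='train.csv',                       # file or sub-path inside the repo
)
# With a gs:// root the wrapper bypasses clusterone.get_data_path and simply joins
# the pieces: get_data_path('myuser/my-dataset', 'gs://my-bucket', 'my-dataset',
# 'train.csv') returns 'gs://my-bucket/my-dataset/train.csv'.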
def get_args():
    '''Return parsed args'''
    parser = ArgumentParser()
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')
    # Model params
    parser.add_argument('--hidden_units', type=int, nargs='*', default=[32, 64])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.5)
    # Runtime params
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--num_steps', type=int, default=9999999)
    parser.add_argument('--input_threads', type=int, default=None)

    opts = parser.parse_args()
    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)
    if opts.input_threads is None:
        import multiprocessing
        opts.input_threads = multiprocessing.cpu_count()
    return opts
def parse_args():
    """Parse arguments"""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
                            description='''Trains a self-steering car model in single-instance or distributed mode.
                            For distributed mode, the script will use a few environment variables as defaults:
                            JOB_NAME, TASK_INDEX, PS_HOSTS, and WORKER_HOSTS. These environment variables are
                            available on distributed TensorFlow jobs on the Clusterone platform by default.
                            If running this locally, you will need to set these environment variables or pass
                            them in as arguments (i.e. python main.py --job_name worker --task_index 0
                            --worker_hosts "localhost:2222,localhost:2223" --ps_hosts "localhost:2224").
                            If these are not set, the script will run in non-distributed (single instance) mode.''')

    # Configuration for distributed task
    parser.add_argument('--job_name', type=str, default=os.environ.get('JOB_NAME', None),
                        choices=['worker', 'ps'],
                        help='Task type for the node in the distributed cluster. Worker-0 will be set as master.')
    parser.add_argument('--task_index', type=int, default=os.environ.get('TASK_INDEX', 0),
                        help='Worker task index, should be >= 0. task_index=0 is the chief worker.')
    parser.add_argument('--ps_hosts', type=str, default=os.environ.get('PS_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')
    parser.add_argument('--worker_hosts', type=str, default=os.environ.get('WORKER_HOSTS', ''),
                        help='Comma-separated list of hostname:port pairs.')
    # Experiment related parameters
    parser.add_argument('--local_data_root', type=str, default=os.path.abspath('./data/'),
                        help='Path to dataset. This path will be /data on Clusterone.')
    parser.add_argument('--local_log_root', type=str, default=os.path.abspath('./logs/'),
                        help='Path to store logs and checkpoints. This path will be /logs on Clusterone.')
    parser.add_argument('--data_subpath', type=str, default='',
                        help='Which sub-directory the data will sit inside local_data_root (locally) ' +
                             'or /data/ (on Clusterone)')
    parser.add_argument('--absolute_data_path', type=str, default=None,
                        help='Using this will ignore other data path arguments.')
    # Model params
    parser.add_argument('--dropout_rate1', type=float, default=0.2,
                        help='Dropout rate after the convolutional layers.')
    parser.add_argument('--dropout_rate2', type=float, default=0.5,
                        help='Dropout rate after the dense layer.')
    parser.add_argument('--fc_dim', type=int, default=512,
                        help='Number of dimensions in the dense layer.')
    parser.add_argument('--nogood', action='store_true',
                        help='Ignore "goods" filters')
    parser.add_argument('--learning_rate', type=float, default=0.0001,
                        help='Initial learning rate used in Adam optimizer.')
    parser.add_argument('--learning_decay', type=float, default=0.0001,
                        help='Exponential decay rate of the learning rate per step.')
    # Training params
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Batch size to use during training and evaluation.')
    parser.add_argument('--max_steps', type=int, default=10000,
                        help='Max number of steps to train for.')
    parser.add_argument('--verbosity', type=str, default='INFO',
                        choices=['CRITICAL', 'ERROR', 'WARN', 'INFO', 'DEBUG'],
                        help='TF logging level. To log intermediate results, set this to INFO or DEBUG.')
    parser.add_argument('--num_threads', type=int, default=1,
                        help='Number of threads to use to prepare data')
    parser.add_argument('--max_ckpts', type=int, default=2,
                        help='Maximum number of checkpoints to keep')
    parser.add_argument('--ckpt_steps', type=int, default=100,
                        help='How frequently to save a model checkpoint')
    parser.add_argument('--save_summary_steps', type=int, default=10,
                        help='How frequently to save TensorBoard summaries')
    parser.add_argument('--log_step_count_steps', type=int, default=10,
                        help='How frequently to log loss & global steps/s')
    parser.add_argument('--eval_secs', type=int, default=60,
                        help='How frequently to run evaluation step. ' +
                             'By default, there is no evaluation dataset, thus effectively no evaluation.')

    # Parse args
    opts = parser.parse_args()
    if opts.absolute_data_path is None:
        opts.train_data = get_data_path(dataset_name='*/*',
                                        local_root=opts.local_data_root,
                                        local_repo=opts.data_subpath,
                                        path='camera/training/*.h5')
    else:
        opts.train_data = os.path.join(opts.absolute_data_path, 'camera/training/*.h5')
    opts.log_dir = get_logs_path(root=opts.local_log_root)
    opts.ps_hosts = opts.ps_hosts.split(',') if opts.ps_hosts else []
    opts.worker_hosts = opts.worker_hosts.split(',') if opts.worker_hosts else []
    return opts
def r_square(X, Y):  # header assumed; the original excerpt begins mid-function
    """Squared Pearson correlation coefficient between predictions X and targets Y."""
    avx, avy = np.mean(X), np.mean(Y)  # assumed: the means referenced below
    sum1, sumx, sumy = 0, 0, 0
    for i in range(len(X)):
        sum1 += (X[i] - avx) * (Y[i] - avy)
        sumx += (X[i] - avx) * (X[i] - avx)
        sumy += (Y[i] - avy) * (Y[i] - avy)
    return sum1 * sum1 / (sumx * sumy)


sys.stdout.write("reading data ... ")
sys.stdout.flush()
start = datetime.datetime.now()

# file path when running on clusterone: /data/my_username/dataset_name/
data_path = get_data_path(
    dataset_name='zhaoxiaq/qianqianmerckdata',  # on ClusterOne
    local_root='~/',                            # path to local dataset
    local_repo='TrainingSet',                   # local data folder name
    path='ACT1_competition_training.csv'        # file within the data folder
)
train_1 = pd.read_csv(data_path, engine='python',
                      dtype={"MOLECULE": object, "Act": float})
# file path when running on my own pc or cloud
# train_1 = pd.read_csv('TrainingSet/ACT1_competition_training.csv', dtype={"MOLECULE": object, "Act": float})

stop = datetime.datetime.now()
sys.stdout.write("done\n")
sys.stdout.write("took {} seconds\n".format((stop - start).total_seconds()))
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import MobileNetV2
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from keras.optimizers import Adam

from clusterone import get_data_path, get_logs_path

N_CLASSES = 200
BATCH_SIZE = 32

train_data_dir = get_data_path(
    dataset_name='artem/artem-tiny-imagenet',
    local_root='/Users/artem/Documents/Scratch/tiny_imagenet/',
    local_repo='tiny-imagenet-200',
    path='train'
)
val_data_dir = get_data_path(
    dataset_name='artem/artem-tiny-imagenet',
    local_root='/Users/artem/Documents/Scratch/tiny_imagenet/',
    local_repo='tiny-imagenet-200',
    path='val/for_keras'
)
models_dir = get_data_path(
    dataset_name='artem/artem-tiny-imagenet',
    local_root='/Users/artem/Documents/Scratch/tiny_imagenet/',
    local_repo='',
    path='models'
)
log_dir = get_logs_path('/Users/artem/Documents/Scratch/tiny_imagenet/logs/')
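# A minimal sketch (not part of the original snippet) of how these directories could
# be wired into Keras data generators and callbacks. The 64x64 target size and the
# checkpoint filename are assumptions made for illustration only.
train_gen = ImageDataGenerator(rescale=1. / 255).flow_from_directory(
    train_data_dir, target_size=(64, 64), batch_size=BATCH_SIZE,
    class_mode='categorical')
val_gen = ImageDataGenerator(rescale=1. / 255).flow_from_directory(
    val_data_dir, target_size=(64, 64), batch_size=BATCH_SIZE,
    class_mode='categorical')
callbacks = [
    ModelCheckpoint(models_dir + '/mobilenetv2_best.h5', save_best_only=True),
    TensorBoard(log_dir=log_dir),
    EarlyStopping(patience=5),
]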
flags.DEFINE_string("job_name", job_name, "job name: worker or ps") flags.DEFINE_integer("task_index", task_index, "Worker task index, should be >= 0. task_index=0 is " "the chief worker task that performs the variable " "initialization and checkpoint handling") flags.DEFINE_string("ps_hosts", ps_hosts, "Comma-separated list of hostname:port pairs") flags.DEFINE_string("worker_hosts", worker_hosts, "Comma-separated list of hostname:port pairs") # Training related flags flags.DEFINE_string("data_dir", get_data_path( dataset_name = "cyi/cyi-datasets", #all mounted repo local_root = ROOT_PATH_TO_LOCAL_DATA, local_repo = "mnist", path = 'data' ), "Path to store logs and checkpoints. It is recommended" "to use get_logs_path() to define your logs directory." "so that you can switch from local to clusterone without" "changing your code." "If you set your logs directory manually make sure" "to use /logs/ when running on ClusterOne cloud.") flags.DEFINE_string("log_dir", get_logs_path(root=PATH_TO_LOCAL_LOGS), "Path to dataset. It is recommended to use get_data_path()" "to define your data directory.so that you can switch " "from local to clusterone without changing your code." "If you set the data directory manually makue sure to use" "/data/ as root path when running on ClusterOne cloud.")
# coding: utf-8

# In[ ]:

import numpy as np
# from sklearn.model_selection import train_test_split
import os
import tensorflow as tf
import h5py

from clusterone import get_data_path, get_logs_path

# In[ ]:

train_dir = get_data_path(
    dataset_name='/data/bhavikaj/bhavikaj-asl/data.h5',  # on ClusterOne
    local_root='data.h5',                                # path to local dataset
    local_repo='',                                       # local data folder name
    path=''                                              # folder within the data folder
)

# In[ ]:

# Load hdf5 dataset from the path resolved above
h5f = h5py.File(train_dir, 'r')
X_train = h5f['X_train']
y_trainHot = h5f['y_trainHot']
X_test = h5f['X_test']
y_testHot = h5f['y_testHot']

## Defining the Model
# Training using a simple model
def main(args):
    print('args =', args)

    sys.stdout.write("reading data ... ")
    sys.stdout.flush()
    start = datetime.datetime.now()

    # file path when running on clusterone: /data/my_username/dataset_name/
    data_path = get_data_path(
        dataset_name='zhaoxiaq/qianqianmerckdata',  # on ClusterOne
        local_root='~/',                            # path to local dataset
        local_repo='TrainingSet',                   # local data folder name
        path='ACT1_competition_training.csv'        # file within the data folder
    )
    train_1 = pd.read_csv(data_path, engine='python',
                          dtype={"MOLECULE": object, "Act": float})

    stop = datetime.datetime.now()
    sys.stdout.write("done\n")
    sys.stdout.write("took {} seconds\n".format((stop - start).total_seconds()))
    sys.stdout.flush()

    y = train_1['Act'].values
    y = np.reshape(y, (-1, 1))
    train_1 = train_1.drop(['Act', 'MOLECULE'], axis=1)
    train_1 = train_1.apply(lambda x: np.log(x + 1))
    x = train_1

    seed = args.seed if args.seed else random.randint(0, pow(2, 32) - 1)
    print("SEED =", seed)
    X_train, X_dev, Y_train, Y_dev = train_test_split(x, y, train_size=0.80, random_state=seed)
    X_val, X_test, Y_val, Y_test = train_test_split(X_dev, Y_dev, train_size=0.50, random_state=seed)

    X_placeholder = tf.placeholder(tf.float64, (None, X_train.shape[1]))
    Y_placeholder = tf.placeholder(tf.float64, (None, Y_train.shape[1]))

    # define parameters
    features = np.shape(X_train)[1]
    target_size = np.shape(X_train)[0]
    learning_rate = 0.001  # switch to 0.05
    start_epoch = args.restart_epoch if args.restart_epoch else 0
    epochs = args.epochs
    checkpoint_freq = args.checkpoint
    batch_size = 300
    batch_size_placeholder = tf.placeholder(tf.int64)

    # network parameters
    n_hidden_1 = 100
    n_hidden_2 = 50
    n_hidden_3 = 25
    n_hidden_4 = 25

    ds_train = tf.data.Dataset.from_tensor_slices((X_placeholder, Y_placeholder)) \
        .shuffle(buffer_size=round(len(X_train) * 0.3)) \
        .batch(batch_size_placeholder)
    ds_test = tf.data.Dataset.from_tensor_slices((X_placeholder, Y_placeholder)) \
        .batch(batch_size_placeholder)
    ds_iter = tf.data.Iterator.from_structure(ds_train.output_types, ds_train.output_shapes)
    next_x, next_y = ds_iter.get_next()
    train_init_op = ds_iter.make_initializer(ds_train)
    test_init_op = ds_iter.make_initializer(ds_test)

    # placeholder for the dropout keep probability
    keep_prob = tf.placeholder(tf.float64)

    # initialize weights and biases
    weights = {'w1': tf.Variable(tf.truncated_normal([features, n_hidden_1], 0, 1, dtype=tf.float64)),
               'w2': tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2], 0, 1, dtype=tf.float64)),
               'w3': tf.Variable(tf.truncated_normal([n_hidden_2, n_hidden_3], 0, 1, dtype=tf.float64)),
               'w4': tf.Variable(tf.truncated_normal([n_hidden_3, n_hidden_4], 0, 1, dtype=tf.float64)),
               'out': tf.Variable(tf.truncated_normal([n_hidden_4, 1], 0, 1, dtype=tf.float64))}
    biases = {'b1': tf.Variable(tf.truncated_normal([n_hidden_1], 0, 1, dtype=tf.float64)),
              'b2': tf.Variable(tf.truncated_normal([n_hidden_2], 0, 1, dtype=tf.float64)),
              'b3': tf.Variable(tf.truncated_normal([n_hidden_3], 0, 1, dtype=tf.float64)),
              'b4': tf.Variable(tf.truncated_normal([n_hidden_4], 0, 1, dtype=tf.float64)),
              'out': tf.Variable(tf.truncated_normal([1], 0, 1, dtype=tf.float64))}

    # construct model
    y_pred = multilayer_perceptron(next_x, weights, biases, keep_prob)

    # define cost function (mean squared error) and Adam optimizer
    cost = tf.losses.mean_squared_error(next_y, y_pred)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    # initialize variables
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=8, keep_checkpoint_every_n_hours=1)

    with tf.Session() as sess:
        if args.restart:
            saver.restore(sess, args.restart)
        else:
            sess.run(init_op)

        for epoch in range(epochs):
            sess.run(train_init_op, feed_dict={X_placeholder: X_train,
                                               Y_placeholder: Y_train,
                                               batch_size_placeholder: batch_size})
            count = 0
            while True:
                try:
                    count += 1
                    _, c = sess.run((optimizer, cost), feed_dict={keep_prob: 0.75})
                    print('Epoch:', (epoch + 1), 'Batch:', count, 'cost =', c)
                except tf.errors.OutOfRangeError:
                    break

            # Calculate R^2 each epoch
            sess.run(test_init_op, feed_dict={X_placeholder: X_test,
                                              Y_placeholder: Y_test,
                                              batch_size_placeholder: len(X_test)})
            results, test_cost = sess.run((y_pred, cost), feed_dict={keep_prob: 1.0})
            print(epoch, test_cost, r_square(np.reshape(results, (len(results),)), Y_test))

            # Save model every `checkpoint_freq` epochs
            if epoch % checkpoint_freq == 0:
                saver.save(sess, './snap', global_step=epoch)
import argparse
import os
import math
import shutil

import cv2
import numpy as np
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets

from clusterone import get_data_path, get_logs_path

LOCAL_DATA_PATH = os.path.abspath(os.path.expanduser('../../data/'))
LOCAL_LOGS_PATH = os.path.abspath(os.path.expanduser('logs/'))

# Storage directory for the MNIST dataset.
# Returns LOCAL_DATA_PATH when running locally, '/data/malo/mnist' when running on Clusterone.
data_dir = get_data_path(dataset_name="malo/mnist",
                         local_root=LOCAL_DATA_PATH,
                         local_repo="mnist",
                         path='')
# Storage directory for the log files produced by this script.
logs_dir = get_logs_path(LOCAL_LOGS_PATH)

# The MNIST dataset has 10 classes, representing the digits 0 through 9
NUM_CLASSES = 10
# The MNIST images are always 28x28 pixels
IMAGE_SIZE = 28
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE

# Each hidden layer gets 128 neurons
hidden1_units = 128
hidden2_units = 128
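# A minimal sketch (not from the original file) of how data_dir could be consumed
# with the read_data_sets helper imported above; the one_hot setting is an assumption.
mnist = read_data_sets(data_dir, one_hot=False)
print('Training samples:', mnist.train.num_examples)
print('Test samples:', mnist.test.num_examples)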
flags.DEFINE_integer(
    "task_index", task_index,
    "Worker task index, should be >= 0. task_index=0 is "
    "the chief worker task that performs the variable "
    "initialization")
flags.DEFINE_string("ps_hosts", ps_hosts,
                    "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", worker_hosts,
                    "Comma-separated list of hostname:port pairs")

# Training related flags
flags.DEFINE_string(
    "data_dir",
    get_data_path(
        dataset_name="malo/mnist",  # all mounted repos
        local_root=ROOT_PATH_TO_LOCAL_DATA,
        local_repo="mnist",
        path=''),
    "Path to dataset. It is recommended to use get_data_path() "
    "to define your data directory so that you can switch "
    "from local to Clusterone without changing your code. "
    "If you set the data directory manually, make sure to use "
    "/data/ as the root path when running on ClusterOne cloud.")
flags.DEFINE_string(
    "log_dir",
    get_logs_path(root=PATH_TO_LOCAL_LOGS),
    "Path to store logs and checkpoints. It is recommended "
    "to use get_logs_path() to define your logs directory "
    "so that you can switch from local to Clusterone without "
    "changing your code. "
    "If you set your logs directory manually, make sure "
    "to use /logs/ when running on ClusterOne cloud.")
import os
import argparse
import time

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from tensorboard_logger import configure, log_value
from clusterone import get_logs_path, get_data_path

from .model.model import fetch_metrics, TinyImageNetModel
from .model.data_loader import fetch_label_map, fetch_dataloader

TRAIN_DATA_DIR = get_data_path(
    dataset_name='mohsen/clusterone-tiny-imagenet-example',
    local_root=os.path.expanduser('~/'),
    local_repo='tiny-imagenet-200',
    path='train'
)
EVAL_DATA_DIR = get_data_path(
    dataset_name='mohsen/clusterone-tiny-imagenet-example',
    local_root=os.path.expanduser('~/'),
    local_repo='tiny-imagenet-200',
    path='val/for_keras'
)
UNIQUE_LABELS_PATH = get_data_path(
    dataset_name='mohsen/clusterone-tiny-imagenet-example',
    local_root=os.path.expanduser('~/'),
    local_repo='tiny-imagenet-200',
    path='wnids.txt'
)
LOGS_PATH = get_logs_path('./logs')
flags.DEFINE_integer(
    "task_index", task_index,
    "Worker task index, should be >= 0. task_index=0 is "
    "the chief worker task that performs the variable "
    "initialization and checkpoint handling")
flags.DEFINE_string("ps_hosts", ps_hosts,
                    "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", worker_hosts,
                    "Comma-separated list of hostname:port pairs")

# Training related flags
flags.DEFINE_string(
    "train_data_dir",
    get_data_path(
        dataset_name='artem/artem-tiny-imagenet',
        local_root=os.path.expanduser('~/Documents/Scratch/tiny_imagenet/'),
        local_repo='tiny-imagenet-200',
        path='train'),
    "Path to the training data. It is recommended to use get_data_path() "
    "to define your data directory so that you can switch "
    "from local to Clusterone without changing your code. "
    "If you set the data directory manually, make sure to use "
    "/data/ as the root path when running on ClusterOne cloud.")
flags.DEFINE_string(
    "val_data_dir",
    get_data_path(
        dataset_name='artem/artem-tiny-imagenet',
        local_root=os.path.expanduser('~/Documents/Scratch/tiny_imagenet/'),
        local_repo='tiny-imagenet-200',
        path='val/for_keras'),
    "Path to the validation data. It is recommended"
flags.DEFINE_string("job_name", job_name, "job name: worker or ps") flags.DEFINE_integer("task_index", task_index, "Worker task index, should be >= 0. task_index=0 is " "the chief worker task the performs the variable " "initialization") flags.DEFINE_string("ps_hosts", ps_hosts, "Comma-separated list of hostname:port pairs") flags.DEFINE_string("worker_hosts", worker_hosts, "Comma-separated list of hostname:port pairs") # Training related flags flags.DEFINE_string("data_dir", get_data_path( dataset_name = "sjay87/", #all mounted repo local_root = ROOT_PATH_TO_LOCAL_DATA, local_repo = "", path = "" ), "Path to dataset. It is recommended to use get_data_path()" "to define your data directory.so that you can switch " "from local to ClusterOne without changing your code." "If you set the data directory manually makue sure to use" "/data/ as root path when running on ClusterOne cloud.") flags.DEFINE_string("log_dir", get_logs_path(root = PATH_TO_LOCAL_LOGS), "Path to store logs and checkpoints. It is recommended" "to use get_logs_path() to define your logs directory." "so that you can switch from local to clusterone without" "changing your code." "If you set your logs directory manually make sure" "to use /logs/ when running on ClusterOne cloud.")
flags.DEFINE_integer(
    "input_width", None,
    "The size of image to use (will be center cropped). If None, same value as input_height [None]")
flags.DEFINE_integer("output_height", 64,
                     "The size of the output images to produce [64]")
flags.DEFINE_integer(
    "output_width", None,
    "The size of the output images to produce. If None, same value as output_height [None]")
flags.DEFINE_string("dataset", "celebA",
                    "The name of dataset [celebA, mnist, lsun]")
flags.DEFINE_string(
    "data_path",
    get_data_path(dataset_name="%s/*" % CLUSTERONE_USERNAME,
                  local_root=ROOT_PATH_TO_LOCAL_DATA,
                  local_repo=LOCAL_REPO,
                  path=""),
    "data path for zip file")
flags.DEFINE_string("checkpoint_dir", get_logs_path(LOCAL_PATH_TO_LOGS),
                    "Directory name to save the checkpoints [checkpoint]")
flags.DEFINE_string(
    "sample_dir", get_logs_path("samples"),
    "Directory name to save the image samples [samples]"
)  # TODO: replace with os.path.join(logs/samples) when folders are supported
flags.DEFINE_boolean("train", True,
                     "True for training, False for testing [True]")
flags.DEFINE_boolean("crop", True,
                     "True for training, False for testing [True]")
flags.DEFINE_boolean("visualize", False,
                     "True for visualizing, False for nothing [False]")

FLAGS = flags.FLAGS
flags = tf.app.flags
flags.DEFINE_integer("number_worker_gpu", 0, "Number of worker GPUs")
flags.DEFINE_integer("number_ps_gpu", 0, "Number of PS GPUs")
flags.DEFINE_integer("batch_size", 2097152, "Batch size")
FLAGS = tf.flags.FLAGS

USERNAME = "******"
DATASET_NAME = "openslr_small"
PROBLEM = 'librispeech_clean'

DATA_PATH = get_data_path(
    dataset_name="%s/%s" % (USERNAME, DATASET_NAME),  # on clusterone
    local_root=os.path.expanduser("~/Data"),
    local_repo="openSLR",
    path='')
CHECKPOINTS_PATH = get_logs_path(root=os.path.expanduser("~/logs"))
if not os.path.exists(CHECKPOINTS_PATH):
    os.makedirs(CHECKPOINTS_PATH)

try:
    job_name = os.environ['JOB_NAME']
    task_index = int(os.environ['TASK_INDEX'])
    ps_hosts = os.environ['PS_HOSTS'].split(',')
    worker_hosts = os.environ['WORKER_HOSTS'].split(',')
    if job_name == 'ps':
        ps_hosts[task_index] = 'localhost:%s' % (
def main():
    """Main wrapper"""
    # clusterone snippet 1 - get environment variables
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    if job_name == None:  # if running locally
        if LOCAL_LOG_LOCATION == "...":
            raise ValueError("LOCAL_LOG_LOCATION needs to be defined")
        if LOCAL_DATASET_LOCATION == "...":
            raise ValueError("LOCAL_DATASET_LOCATION needs to be defined")
        if LOCAL_DATASET_NAME == "...":
            raise ValueError("LOCAL_DATASET_NAME needs to be defined")

    # Path to your data locally. This enables running the model both locally and on
    # ClusterOne without changes
    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)
    ROOT_PATH_TO_LOCAL_DATA = os.path.expanduser(LOCAL_DATASET_LOCATION)
    # end of clusterone snippet 1

    # Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    # clusterone snippet 2: flags.
    # Define the path from the root data directory to your data.
    # We use glob to match any .h5 datasets in Documents/comma locally, or in data/ on ClusterOne
    flags.DEFINE_string(
        "train_data_dir",
        get_data_path(
            dataset_name="tensorbot/*",
            local_root=ROOT_PATH_TO_LOCAL_DATA,
            local_repo=LOCAL_DATASET_NAME,  # all repos (we use glob downstream, see read_data.py)
            path='camera/training/*.h5'     # all .h5 files
        ),
        """Path to training dataset. It is recommended to use get_data_path()
        to define your data directory. If you set your dataset directory manually,
        make sure to use /data/ as the root path when running on TensorPort cloud.
        On TensorPort, the data will be mounted in /data/user/clusterone_dataset_name,
        so you can access `path` with /data/user/clusterone_dataset_name/path.""")
    flags.DEFINE_string(
        "logs_dir",
        get_logs_path(root=PATH_TO_LOCAL_LOGS),
        "Path to store logs and checkpoints. It is recommended "
        "to use get_logs_path() to define your logs directory. "
        "If you set your logs directory manually, make sure "
        "to use /logs/ when running on TensorPort cloud.")

    # Define worker specific environment variables. Handled automatically.
    flags.DEFINE_string("job_name", job_name,
                        "job name: worker or ps")
    flags.DEFINE_integer("task_index", task_index,
                         "Worker task index, should be >= 0. task_index=0 is "
                         "the chief worker task that performs the variable "
                         "initialization")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")
    # end of clusterone snippet 2

    # Training flags - feel free to play with these!
    flags.DEFINE_integer("batch", 64, "Batch size")
    flags.DEFINE_integer("time", 1, "Number of frames per sample")
    flags.DEFINE_integer("steps_per_epoch", 10000, "Number of training steps per epoch")
    flags.DEFINE_integer("nb_epochs", 200, "Number of epochs")

    # Model flags - feel free to play with these!
    flags.DEFINE_float("dropout_rate1", .2, "Dropout rate on first dropout layer")
    flags.DEFINE_float("dropout_rate2", .5, "Dropout rate on second dropout layer")
    flags.DEFINE_float("starter_lr", 1e-6, "Starter learning rate. Exponential decay is applied")
    flags.DEFINE_integer("fc_dim", 512, "Size of the dense layer")
    flags.DEFINE_boolean("nogood", False, "Ignore `goods` filters.")

    # clusterone snippet 3: configure distributed environment
    def device_and_target():
        # If FLAGS.job_name is not set, we're running single-machine TensorFlow.
        # Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        cluster_spec = tf.train.ClusterSpec({
            "ps": FLAGS.ps_hosts.split(","),
            "worker": FLAGS.worker_hosts.split(","),
        })
        server = tf.train.Server(cluster_spec,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on separate
        # parameter servers (ps). The non-Variable ops will be placed on the workers.
        return (
            tf.train.replica_device_setter(worker_device=worker_device,
                                           cluster=cluster_spec),
            server.target,
        )

    device, target = device_and_target()
    # end of clusterone snippet 3

    print(FLAGS.logs_dir)
    print(FLAGS.train_data_dir)

    if FLAGS.logs_dir is None or FLAGS.logs_dir == "":
        raise ValueError("Must specify an explicit `logs_dir`")
    if FLAGS.train_data_dir is None or FLAGS.train_data_dir == "":
        raise ValueError("Must specify an explicit `train_data_dir`")
    # if FLAGS.val_data_dir is None or FLAGS.val_data_dir == "":
    #     raise ValueError("Must specify an explicit `val_data_dir`")

    TIME_LEN = 1  # 1 video frame. Others not supported.

    # Define graph
    with tf.device(device):
        # X = tf.placeholder(tf.float32, [FLAGS.batch, 3, 160, 320], name="X")
        # Y = tf.placeholder(tf.float32, [FLAGS.batch, 1], name="Y")  # angle only
        # S = tf.placeholder(tf.float32, [FLAGS.batch, 1], name="S")  # speed
        if FLAGS.task_index == 0:
            print("Looking for data in %s" % FLAGS.train_data_dir)
        reader = DataReader(FLAGS.train_data_dir)
        x, y, s = reader.read_row_tf()
        x.set_shape((3, 160, 320))
        y.set_shape((1))
        s.set_shape((1))
        X, Y, S = tf.train.batch([x, y, s], batch_size=FLAGS.batch)

        predictions = get_model(X, FLAGS)
        # Adding numpy operation to graph. Adding image to summary
        steering_summary = tf.summary.image(
            "green-is-predicted", render_steering_tf(X, Y, S, predictions))
        loss = get_loss(predictions, Y)
        training_summary = tf.summary.scalar('Training_Loss', loss)  # add to tboard

        # Batch generators
        global_step = tf.contrib.framework.get_or_create_global_step()
        learning_rate = tf.train.exponential_decay(FLAGS.starter_lr, global_step,
                                                   1000, 0.96, staircase=True)
        train_step = (tf.train.AdamOptimizer(learning_rate)
                      .minimize(loss, global_step=global_step))

    def run_train_epoch(target, FLAGS, epoch_index):
        """Restores the last checkpoint and runs a training epoch

        Inputs:
            - target: device setter for distributed work
            - FLAGS:
                - requires FLAGS.logs_dir from which the model will be restored. Note that
                  whatever most recent checkpoint is in that directory will be used.
                - requires FLAGS.steps_per_epoch
            - epoch_index: index of current epoch
        """
        hooks = [tf.train.StopAtStepHook(last_step=FLAGS.steps_per_epoch * epoch_index)]  # Increment number of required training steps
        i = 1
        with tf.train.MonitoredTrainingSession(master=target,
                                               is_chief=(FLAGS.task_index == 0),
                                               checkpoint_dir=FLAGS.logs_dir,
                                               hooks=hooks) as sess:
            while not sess.should_stop():
                variables = [loss, learning_rate, train_step]
                current_loss, lr, _ = sess.run(variables)
                print("Iteration %s - Batch loss: %s"
                      % ((epoch_index) * FLAGS.steps_per_epoch + i, current_loss))
                i += 1

    for e in range(FLAGS.nb_epochs):
        run_train_epoch(target, FLAGS, e)
def main():
    # Training Data
    xtrain = 'Xtrain.txt'
    ytrain = 'Ytrain.txt'
    # Validation Data
    xtest = 'Xtest.txt'
    ytest = 'Ytest.txt'

    # Training Parameters
    batch_size = 500        # Batch size
    num_epochs = 5          # Number of epochs
    train_holdout = 0.2     # Portion of training features used for validation
    learning_rate = 0.005   # Starting learning rate
    steps_per_epoch = 50    # Number of training steps per epoch

    # ----- Begin Main Code

    # Get environment variables
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    # Get local file paths
    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)
    ROOT_PATH_TO_LOCAL_DATA = os.path.expanduser(LOCAL_DATASET_LOCATION)

    # Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    # Flags for environment variables
    flags.DEFINE_string("job_name", job_name,
                        "job name: worker or ps")
    flags.DEFINE_integer("task_index", task_index,
                         "Worker task index, should be >= 0. task_index=0 is "
                         "the chief worker task that performs the variable "
                         "initialization and checkpoint handling")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")

    # Training file flags
    flags.DEFINE_string("xtrain",
                        get_data_path(
                            dataset_name="emanrao/variantnn-demo",
                            local_root=ROOT_PATH_TO_LOCAL_DATA,
                            local_repo=LOCAL_DATASET_NAME,
                            path=xtrain
                        ),
                        "Path to training dataset.")
    flags.DEFINE_string("ytrain",
                        get_data_path(
                            dataset_name="emanrao/variantnn-demo",
                            local_root=ROOT_PATH_TO_LOCAL_DATA,
                            local_repo=LOCAL_DATASET_NAME,
                            path=ytrain
                        ),
                        "Path to training dataset.")
    flags.DEFINE_string("log_dir",
                        get_logs_path(root=PATH_TO_LOCAL_LOGS),
                        "Path to store logs and checkpoints.")

    # Validation file flags
    flags.DEFINE_string("xtest",
                        get_data_path(
                            dataset_name="emanrao/variantnn-demo",
                            local_root=ROOT_PATH_TO_LOCAL_DATA,
                            local_repo=LOCAL_DATASET_NAME,
                            path=xtest
                        ),
                        "Path to testing dataset.")
    flags.DEFINE_string("ytest",
                        get_data_path(
                            dataset_name="emanrao/variantnn-demo",
                            local_root=ROOT_PATH_TO_LOCAL_DATA,
                            local_repo=LOCAL_DATASET_NAME,
                            path=ytest
                        ),
                        "Path to testing dataset.")

    # Training parameter flags
    flags.DEFINE_integer("batch_size", batch_size, "Batch size [100].")
    flags.DEFINE_integer("num_epochs", num_epochs, "Number of epochs [50].")
    flags.DEFINE_float("train_holdout", train_holdout,
                       "Portion of training features withheld from training and used for validation [0.2].")
    flags.DEFINE_float("learning_rate", learning_rate, "Starting learning rate [0.0005].")
    flags.DEFINE_integer("steps_per_epoch", steps_per_epoch, "Number of training steps per epoch")

    # Configure Distributed Environment
    def device_and_target():
        # If FLAGS.job_name is not set, we're running single-machine TensorFlow.
        # Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        cluster_spec = tf.train.ClusterSpec({
            "ps": FLAGS.ps_hosts.split(","),
            "worker": FLAGS.worker_hosts.split(","),
        })
        server = tf.train.Server(cluster_spec,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on separate
        # parameter servers (ps). The non-Variable ops will be placed on the workers.
        return (
            tf.train.replica_device_setter(worker_device=worker_device,
                                           cluster=cluster_spec),
            server.target,
        )

    device, target = device_and_target()

    # ----- Read Data -----

    # Check Flags
    if FLAGS.log_dir is None or FLAGS.log_dir == "":
        raise ValueError("Must specify an explicit `log_dir`")
    if FLAGS.xtrain is None or FLAGS.xtrain == "":
        raise ValueError("Must specify an explicit `xtrain`")
    if FLAGS.ytrain is None or FLAGS.ytrain == "":
        raise ValueError("Must specify an explicit `ytrain`")
    if FLAGS.xtest is None or FLAGS.xtest == "":
        raise ValueError("Must specify an explicit `xtest`")
    if FLAGS.ytest is None or FLAGS.ytest == "":
        raise ValueError("Must specify an explicit `ytest`")

    print('Training dataset file: ', FLAGS.xtrain)
    print('Training target file: ', FLAGS.ytrain)
    print('Testing dataset file: ', FLAGS.xtest)
    print('Testing target file: ', FLAGS.ytest)
    print('Log Files Saved To: ', FLAGS.log_dir)

    # Read in data
    Xtrain, Ytrain = read_flat_file(FLAGS.xtrain, FLAGS.ytrain)
    Xtest, Ytest = read_flat_file(FLAGS.xtest, FLAGS.ytest)

    num_train = int(np.round(Xtrain.shape[0] * (1 - FLAGS.train_holdout)))
    num_held = int(Xtrain.shape[0] - num_train)
    print('Training on {:d} features'.format(num_train))
    print('Validating on {:d} features (once per epoch)'.format(num_held))

    Xval = Xtrain[num_train:]
    Yval = Ytrain[num_train:]
    Xtrain = Xtrain[:num_train]
    Ytrain = Ytrain[:num_train]

    num_batches = int(np.floor(Ytrain.shape[0] / FLAGS.batch_size))
    if num_batches == 0:  # if the batch size exceeds the dataset size, use a single batch
        num_batches = 1
        FLAGS.batch_size = Ytrain.shape[0]

    # ----- Define Graph -----
    tf.reset_default_graph()
    with tf.device(device):
        # X_in = tf.placeholder(tf.float32, [None, 15, 4, 3])
        # Y_out = tf.placeholder(tf.float32, [None, 8])
        global_step = tf.train.get_or_create_global_step()

        # Create Datasets
        train_dataset = tf.data.Dataset.from_tensor_slices((Xtrain, Ytrain))
        # train_dataset = train_dataset.shuffle(buffer_size=10000)
        train_dataset = train_dataset.batch(FLAGS.batch_size)
        # train_dataset = train_dataset.repeat(FLAGS.num_epochs)
        val_dataset = tf.data.Dataset.from_tensor_slices((Xval, Yval))
        val_dataset = val_dataset.batch(Yval.shape[0])
        # val_dataset = val_dataset.repeat(FLAGS.num_epochs)
        test_dataset = tf.data.Dataset.from_tensor_slices((Xtest, Ytest))
        test_dataset = test_dataset.batch(FLAGS.batch_size)

        # Create Iterator
        iter = tf.data.Iterator.from_structure(train_dataset.output_types,
                                               train_dataset.output_shapes)
        features, labels = iter.get_next()

        # Create initialisation operations
        train_init_op = iter.make_initializer(train_dataset)
        val_init_op = iter.make_initializer(val_dataset)
        test_init_op = iter.make_initializer(test_dataset)

        # Apply model
        with tf.name_scope('predictions'):
            predictions = get_model(features, FLAGS)
        with tf.name_scope('loss'):
            loss = get_loss(predictions, labels)
            tf.summary.scalar('loss', loss)  # add to tboard
        with tf.name_scope('train'):
            train_step = (tf.train.AdamOptimizer(FLAGS.learning_rate)
                          .minimize(loss, global_step=global_step))

    summ = tf.summary.merge_all()
    writer = tf.summary.FileWriter(FLAGS.log_dir)

    # %% Train Model with periodic validation
    def run_train_epoch(target, FLAGS, epoch_index):
        print('Epoch {:d} Training...'.format(epoch_index))
        i = 1
        hooks = [tf.train.StopAtStepHook(last_step=FLAGS.steps_per_epoch * epoch_index)]  # Increment number of required training steps
        scaffold = tf.train.Scaffold(
            local_init_op=[train_init_op, val_init_op],
            saver=tf.train.Saver(max_to_keep=5)
        )
        with tf.train.MonitoredTrainingSession(master=target,
                                               is_chief=(FLAGS.task_index == 0),
                                               checkpoint_dir=FLAGS.log_dir,
                                               hooks=hooks,
                                               scaffold=scaffold) as sess:
            writer.add_graph(sess.graph)
            sess.run(train_init_op)  # switch to train dataset
            while not sess.should_stop():
                [current_loss, _, s] = sess.run([loss, train_step, summ])
                iteration = (epoch_index) * FLAGS.steps_per_epoch + i
                print("Iteration {} Training Loss: {:.4f}".format(iteration, current_loss))
                i += 1
                # writer.add_summary(s, i)
                if i == FLAGS.steps_per_epoch:  # validate at the end of the epoch
                    sess.run(val_init_op)  # switch to val dataset
                    while True:
                        try:
                            # run and save validation parameters
                            v_loss = sess.run(loss)
                            print("Epoch {} Validation Loss: {:.4f}".format(epoch_index, v_loss))
                        except tf.errors.OutOfRangeError:
                            break

    for e in range(1, FLAGS.num_epochs + 1):
        run_train_epoch(target, FLAGS, e)

    # ----- Test Model on Different Dataset -----
    with tf.train.MonitoredTrainingSession(master=target,
                                           is_chief=(FLAGS.task_index == 0)) as sess:
        sess.run(test_init_op)  # initialize to test dataset
        test_loss = sess.run(loss)
        print("Test Set Loss (independent dataset): {:.4f}".format(test_loss))
    ps_hosts = os.environ['PS_HOSTS']
    worker_hosts = os.environ['WORKER_HOSTS']
except:
    job_name = None
    task_index = 0
    ps_hosts = None
    worker_hosts = None

flags = tf.app.flags

# Training related flags
flags.DEFINE_string(
    "data_dir",
    get_data_path(
        dataset_name="kelvinchngphysicist/2d10",  # all mounted repos
        local_root=ROOT_PATH_TO_LOCAL_DATA,
        local_repo="2d10",
        path=''),
    "Path to dataset. It is recommended to use get_data_path() "
    "to define your data directory so that you can switch "
    "from local to Clusterone without changing your code. "
    "If you set the data directory manually, make sure to use "
    "/data/ as the root path when running on ClusterOne cloud.")
flags.DEFINE_string(
    "log_dir",
    get_logs_path(root=PATH_TO_LOCAL_LOGS),
    "Path to store logs and checkpoints. It is recommended "
    "to use get_logs_path() to define your logs directory "
    "so that you can switch from local to Clusterone without "
    "changing your code. "
    "If you set your logs directory manually, make sure "
    "to use /logs/ when running on ClusterOne cloud.")
    input0 = input - m
    m.squeeze()
    return m + torch.log(torch.sum(torch.exp(input0), dim=1))


def get_log_odds(raw_marginals):
    marginals = torch.clamp(raw_marginals.mean(dim=0), 1e-7, 1 - 1e-7)
    return torch.log(marginals / (1 - marginals))


train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10(root=get_data_path(
                         dataset_name="%s/cifars3" % CLUSTERONE_USERNAME,
                         local_root=opt.dataroot,
                         local_repo="",
                         path=""
                     ),
                     train=True, download=True,
                     transform=transforms.Compose([
                         transforms.ToTensor()
                     ])),
    batch_size=batch_size, shuffle=True)

save_image_dir = get_logs_path(opt.save_image_dir)
save_model_dir = get_logs_path(opt.save_model_dir)

netE = tocuda(Encoder(latent_size, True))
netG = tocuda(Generator(latent_size))
netD = tocuda(Discriminator(latent_size, 0.2, 1))