def main():
    """Run a MongoDB-backed hyperopt search and store the best point found.

    Environment variables read (a missing one raises ``KeyError``):
    ``MONGO_DB_HOST``, ``MONGO_DB_PORT``, ``EXPERIMENT_NAME`` and, once the
    search has finished, ``JOB_NAME``.

    The function keeps retrying until a ``MongoTrials`` handle can be
    created, then calls ``fmin`` on ``obj`` with a single ``hp.uniform``
    parameter ``x`` over (-2, 2) for at most 100 evaluations. Only the job
    whose ``JOB_NAME`` is ``"ps"`` writes ``results.json`` into the logs
    directory returned by ``get_logs_path("./logs")``.
    """
    mongo_db_host = os.environ["MONGO_DB_HOST"]
    mongo_db_port = os.environ["MONGO_DB_PORT"]
    experiment_name = os.environ["EXPERIMENT_NAME"]
    mongo_connect_str = "mongo://{0}:{1}/foo_db/jobs".format(
        mongo_db_host, mongo_db_port)

    while True:
        try:
            trials = MongoTrials(mongo_connect_str, exp_key=experiment_name)
        except ServerSelectionTimeoutError:
            # The database is not reachable yet; try again.
            continue

        space = {'x': hp.uniform('x', -2, 2)}
        best = fmin(obj,
                    space=space,
                    trials=trials,
                    algo=tpe.suggest,
                    max_evals=100)

        if os.environ["JOB_NAME"] == "ps":
            save_path = os.path.join(get_logs_path("./logs"), "results.json")
            with open(save_path, "w") as f:
                # Encode exactly once. The previous
                # json.dump(json.dumps(best), f) wrote a JSON *string* whose
                # content was JSON, so readers had to decode the file twice.
                json.dump(best, f)
        return
def get_args():
    '''Parse the command line and attach the resolved data/log directories.

    Returns the argparse namespace with two extra attributes:
    ``data_dir`` (from ``get_data_path``) and ``log_dir`` (from
    ``get_logs_path``).
    '''
    # (flag, add_argument keyword arguments), in the order they are registered.
    arg_table = (
        ('--local_data_dir', dict(type=str, default='data/',
                                  help='Path to local data directory')),
        ('--local_log_dir', dict(type=str, default='logs/',
                                 help='Path to local log directory')),
        ('--fashion', dict(type=str2bool, default=False,
                           help='Use Fashion MNIST data')),
        # Model params
        ('--cnn', dict(type=str2bool, default=False,
                       help='If true, use CNN. Otherwise, use MLP. Default: False')),
        ('--kernel_size', dict(type=int, default=3,
                               help='Ignored if cnn is False')),
        ('--hidden_units', dict(type=int, nargs='*', default=[256, 256])),
        ('--learning_rate', dict(type=float, default=0.001)),
        ('--learning_decay', dict(type=float, default=0.001)),
        ('--dropout', dict(type=float, default=0.1)),
        ('--batch_size', dict(type=int, default=512)),
        # Training params
        ('--eval_secs', dict(type=int, default=120,
                             help='throttle_secs for EvalSpec')),
    )

    parser = ArgumentParser()
    for flag, options in arg_table:
        parser.add_argument(flag, **options)

    opts = parser.parse_args()

    data_location = dict(dataset_name='adrianyi/mnist-data',
                         local_root=opts.local_data_dir,
                         local_repo='',
                         path='')
    opts.data_dir = get_data_path(**data_location)
    opts.log_dir = get_logs_path(root=opts.local_log_dir)
    return opts
def get_args():
    '''Parse CLI options and add the resolved ``data_dir`` / ``log_dir``.'''
    parser = ArgumentParser()
    add = parser.add_argument

    # Local directories
    for flag, fallback, text in (
            ('--local_data_dir', 'data/', 'Path to local data directory'),
            ('--local_log_dir', 'logs/', 'Path to local log directory')):
        add(flag, type=str, default=fallback, help=text)

    # Model params
    add('--hidden_units', type=int, nargs='*', default=[256, 256])
    add('--activation',
        type=str,
        default='relu',
        help='Activation function. See Keras activation functions. Default: relu')
    add('--learning_rate', type=float, default=0.001)
    add('--dropout', type=float, default=0.1)
    add('--batch_size', type=int, default=128)

    opts = parser.parse_args()

    local_data_root = opts.local_data_dir
    opts.data_dir = get_data_path(dataset_name='adrianyi/mnist-data',
                                  local_root=local_data_root,
                                  local_repo='',
                                  path='')
    local_log_root = opts.local_log_dir
    opts.log_dir = get_logs_path(root=local_log_root)
    return opts
def parse_args():
    """Build the CLI parser, parse the arguments and post-process them.

    After parsing, the namespace gains ``data_dir`` and ``log_dir`` (resolved
    through ``get_data_path`` / ``get_logs_path``) and ``hidden_units`` is
    converted from its comma-separated string form into a list of ints.
    """
    # Each entry is (flag, keyword arguments for add_argument); the order is
    # the registration order and therefore the order shown in --help.
    experiment_args = [
        ('--local_data_root', dict(
            type=str, default=os.path.join(FILE_DIR, 'data'),
            help='Path to dataset. This path will be /data on Clusterone.')),
        ('--local_log_root', dict(
            type=str, default=os.path.join(FILE_DIR, 'logs'),
            help='Path to store logs and checkpoints. This path will be /logs on Clusterone.')),
        ('--data_subpath', dict(
            type=str, default='',
            help='Which sub-directory the data will sit inside local_data_root (locally) '
                 'or /data/ (on Clusterone).')),
    ]
    model_args = [
        ('--kernel_size', dict(
            type=int, default=3,
            help='Size of the CNN kernels to use.')),
        ('--hidden_units', dict(
            type=str, default='32,64',
            help='Comma-separated list of integers. Number of hidden units to use in CNN model.')),
        ('--learning_rate', dict(
            type=float, default=0.01,
            help='Initial learning rate used in Adam optimizer.')),
        ('--learning_decay', dict(
            type=float, default=0.0001,
            help='Exponential decay rate of the learning rate per step.')),
        ('--dropout', dict(
            type=float, default=0.5,
            help='Dropout rate used after each convolutional layer.')),
        ('--batch_size', dict(
            type=int, default=512,
            help='Batch size to use during training and evaluation.')),
    ]
    training_args = [
        ('--verbosity', dict(
            type=str, default='INFO',
            choices=['CRITICAL', 'ERROR', 'WARN', 'INFO', 'DEBUG'],
            help='TF logging level. To see intermediate results printed, set this to INFO or DEBUG.')),
        ('--fashion', dict(
            action='store_true',
            help='Download and use fashion MNIST data instead of the default handwritten digit MNIST.')),
        ('--parallel_batches', dict(
            type=int, default=2,
            help='Number of parallel batches to prepare in data pipeline.')),
        ('--max_ckpts', dict(
            type=int, default=2,
            help='Maximum number of checkpoints to keep.')),
        ('--ckpt_steps', dict(
            type=int, default=100,
            help='How frequently to save a model checkpoint.')),
        ('--save_summary_steps', dict(
            type=int, default=10,
            help='How frequently to save TensorBoard summaries.')),
        ('--log_step_count_steps', dict(
            type=int, default=10,
            help='How frequently to log loss & global steps/s.')),
        ('--eval_steps', dict(
            type=int, default=100,
            help='How frequently to run evaluation step.')),
        ('--max_steps', dict(
            type=int, default=1000000,
            help='Maximum number of steps to run.')),
    ]

    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description='''Train a convolution neural network with MNIST dataset.
                       For distributed mode, you must run this with mpirun. See README.md''')
    for flag, options in experiment_args + model_args + training_args:
        parser.add_argument(flag, **options)

    # Parse args
    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_root,
                                  local_repo='',
                                  path=opts.data_subpath)
    opts.log_dir = get_logs_path(root=opts.local_log_root)

    # '32,64' -> [32, 64]
    unit_tokens = opts.hidden_units.split(',')
    opts.hidden_units = list(map(int, unit_tokens))
    return opts
def main():
    """Write a small sample dictionary to ``test_json.json`` in the logs dir.

    The target directory is whatever ``get_logs_path('./logs')`` returns.
    """
    test_dict = {'dog': 'bernese'}
    file_path = os.path.join(get_logs_path('./logs'), 'test_json.json')
    with open(file_path, 'w') as f:
        # Serialise once. json.dump(json.dumps(test_dict), f) stored a JSON
        # string literal containing escaped JSON rather than a JSON object.
        json.dump(test_dict, f)
def get_logs_path(path):
    """
    Resolve the directory to use for logs.

    A path that starts with ``gs://`` is handed back unchanged; every other
    path is delegated to ``clusterone.get_logs_path``, see
    https://clusterone.com/documentation/api/#get_logs_path

    :param str path: the path for the logs dir
    :return str: the real path for the logs
    """
    is_gs_path = path.startswith('gs://')
    return path if is_gs_path else clusterone.get_logs_path(path)
def get_args():
    """Return parsed args, extended with resolved paths and device settings.

    Besides the raw command-line options the returned namespace carries:

    * ``data_dir`` / ``log_dir`` -- resolved via ``get_data_path`` and
      ``get_logs_path``;
    * ``cuda`` -- the explicit ``--cuda`` value if one was given, otherwise
      ``torch.cuda.is_available()``;
    * ``device`` -- ``torch.device('cuda')`` or ``torch.device('cpu')``
      according to ``cuda``;
    * ``distributed`` -- true when ``n_workers > 1`` or ``--dist`` is set
      (``n_workers`` is expected to be defined at module level).
    """
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('--local_data_dir', type=str, default='data/',
                        help='Path to local data directory')
    parser.add_argument('--local_log_dir', type=str, default='logs/',
                        help='Path to local log directory')
    # The string default is passed through str2bool by argparse.
    parser.add_argument('--dist', type=str2bool, default='False')

    # Model params
    parser.add_argument('--hidden_units', type=int, nargs='*', default=[32, 64])
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--learning_decay', type=float, default=0.001)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--epochs', type=int, default=9999999)
    parser.add_argument(
        '--cuda', type=str2bool, default=None,
        help='Use CUDA. If left empty, CUDA will be used if available.')
    parser.add_argument('--ckpt_epochs', type=int, default=1)

    # Logging
    parser.add_argument('--log_freq', type=int, default=100,
                        help='Number of steps before saving loss, etc.')
    parser.add_argument('--log_level', type=str, default='info',
                        choices=['info', 'debug'])

    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    # Auto-detect only when the flag was left empty. The previous
    # `opts.cuda or torch.cuda.is_available()` turned an explicit
    # `--cuda False` back into True on machines that have CUDA.
    if opts.cuda is None:
        opts.cuda = torch.cuda.is_available()
    opts.device = torch.device('cuda' if opts.cuda else 'cpu')

    opts.distributed = n_workers > 1 or opts.dist
    return opts
def get_env(self):
    """Define the distributed-task and path flags on ``self.flags``.

    ``JOB_NAME``, ``TASK_INDEX``, ``PS_HOSTS`` and ``WORKER_HOSTS`` are read
    from the environment. If any of them is missing, all four fall back to
    the single-machine defaults (``None`` / ``0``). The data and log
    directories are resolved with ``get_data_path`` / ``get_logs_path`` from
    ``self.cloud_user_repo``, ``self.data_path``, ``self.local_repo``,
    ``self.cloud_data_path`` and ``self.logs_path``.
    """
    # Configure distributed task
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except KeyError:
        # Only a missing variable is expected here; a bare `except:` would
        # also hide unrelated errors such as KeyboardInterrupt.
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    flags = self.flags

    # Flags for configuring the distributed task
    flags.DEFINE_string("job_name", job_name, "job name: worker or ps")
    flags.DEFINE_integer("task_index", task_index,
                         "Worker task index, should be >= 0. task_index=0 is "
                         "the chief worker task that performs the variable "
                         "initialization and checkpoint handling")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")

    # Training related flags
    flags.DEFINE_string("data_dir",
                        get_data_path(
                            dataset_name=self.cloud_user_repo,  # all mounted repo
                            local_root=self.data_path,
                            local_repo=self.local_repo,
                            path=self.cloud_data_path
                        ),
                        "Path to dataset. It is recommended to use get_data_path()"
                        "to define your data directory.so that you can switch "
                        "from local to clusterone without changing your code."
                        "If you set the data directory manually make sure to use"
                        "/data/ as root path when running on ClusterOne cloud.")
    flags.DEFINE_string("log_dir",
                        get_logs_path(root=self.logs_path),
                        "Path to store logs and checkpoints. It is recommended"
                        "to use get_logs_path() to define your logs directory."
                        "so that you can switch from local to clusterone without"
                        "changing your code."
                        "If you set your logs directory manually make sure"
                        "to use /logs/ when running on ClusterOne cloud.")

    self.flags = flags
def get_args():
    """Parse arguments and attach the resolved ``data_dir`` / ``log_dir``.

    The distributed-task options default to the ``JOB_NAME``, ``TASK_INDEX``,
    ``PS_HOSTS`` and ``WORKER_HOSTS`` environment variables when those are
    set.
    """
    from_env = os.environ.get
    host_list_help = 'Comma-separated list of hostname:port pairs.'

    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description='''Train a convolution neural network with MNIST dataset.
        For distributed mode, the script will use few environment variables as defaults:
        JOB_NAME, TASK_INDEX, PS_HOSTS, and WORKER_HOSTS. These environment variables will be
        available on distributed Tensorflow jobs on Clusterone platform by default.
        If running this locally, you will need to set these environment variables
        or pass them in as arguments (i.e. python mnist.py --job_name worker --task_index 0
        --worker_hosts "localhost:2222,localhost:2223" --ps_hosts "localhost:2224").
        If these are not set, the script will run in non-distributed (single instance) mode.''')

    option_table = (
        # Configuration for distributed task
        ('--job_name', dict(
            type=str, default=from_env('JOB_NAME'), choices=['worker', 'ps'],
            help='Task type for the node in the distributed cluster. Worker-0 will be set as master.')),
        ('--task_index', dict(
            type=int, default=from_env('TASK_INDEX', 0),
            help='Worker task index, should be >= 0. task_index=0 is the chief worker.')),
        ('--ps_hosts', dict(
            type=str, default=from_env('PS_HOSTS', ''), help=host_list_help)),
        ('--worker_hosts', dict(
            type=str, default=from_env('WORKER_HOSTS', ''), help=host_list_help)),
        # Experiment related parameters
        ('--local_data_dir', dict(
            type=str, default='data/', help='Path to local data directory')),
        ('--local_log_dir', dict(
            type=str, default='logs/', help='Path to local log directory')),
        # Training params
        ('--learning_rate', dict(
            type=float, default=0.001,
            help='Initial learning rate used in Adam optimizer.')),
        ('--learning_decay', dict(
            type=float, default=0.001,
            help='Exponential decay rate of the learning rate per step.')),
        ('--batch_size', dict(
            type=int, default=512,
            help='Batch size to use during training and evaluation.')),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    opts = parser.parse_args()

    # Clusterone snippet: picks the right paths for a local run or a
    # Clusterone job.
    opts.data_dir = get_data_path(dataset_name='',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)
    return opts
def main(argv):
    """Train and evaluate a DNN classifier on the Titanic data.

    ``argv`` is the full argument vector; everything after the program name
    is parsed with the module-level ``parser``. The hidden-layer layout comes
    from the module-level ``network``. The layout and the test accuracy are
    printed at the end.
    """
    args = parser.parse_args(argv[1:])

    local_log_root = os.path.abspath(
        os.path.expanduser('~/Documents/tf_logs/logs/titanic'))
    log_path = get_logs_path(root=local_log_root)

    train, test = load_data(args.train_path, args.test_path)
    # The second return value (new feature classes) is not needed here.
    (train_x, train_y), _ = preprocess_data(train, ['sex', 'embarked'])
    (test_x, test_y), _ = preprocess_data(test, ['sex', 'embarked'])

    feature_keys = ('pclass', 'age', 'sibsp', 'parch',
                    'sex_male', 'sex_female',
                    'embarked_C', 'embarked_Q', 'embarked_S')
    passenger_features = [
        tf.feature_column.numeric_column(key=name) for name in feature_keys
    ]

    classifier = tf.estimator.DNNClassifier(hidden_units=network,
                                            feature_columns=passenger_features,
                                            model_dir=log_path,
                                            n_classes=2)

    def training_input():
        return train_input_fn(train_x, train_y)

    def evaluation_input():
        return eval_input_fn(test_x, test_y)

    classifier.train(input_fn=training_input, steps=1000)
    eval_result = classifier.evaluate(input_fn=evaluation_input)

    print('\nNetwork layout: %s' % network)
    print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
def main(argv):
    """Train and evaluate a small DNN classifier on two Titanic features.

    Arguments after the program name in ``argv`` are parsed with the
    module-level ``parser``. The model uses the ``pclass`` and ``age``
    columns, three hidden layers of 20 units, and prints the test accuracy.
    """
    args = parser.parse_args(argv[1:])

    expanded = os.path.expanduser('~/Documents/tf_logs/logs/titanic_basic')
    log_path = get_logs_path(root=os.path.abspath(expanded))

    train_set, test_set = load_data(args.train_path, args.test_path)
    train_x, train_y = train_set
    test_x, test_y = test_set

    passenger_features = [
        tf.feature_column.numeric_column(key=column)
        for column in ('pclass', 'age')
    ]

    classifier = tf.estimator.DNNClassifier(
        hidden_units=[20, 20, 20],
        feature_columns=passenger_features,
        model_dir=log_path,
        n_classes=2)

    def feed_training():
        return train_input_fn(train_x, train_y)

    def feed_evaluation():
        return eval_input_fn(test_x, test_y)

    classifier.train(input_fn=feed_training, steps=1000)
    eval_result = classifier.evaluate(input_fn=feed_evaluation)

    print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
def get_args():
    '''Parse CLI options, resolve data/log paths and default the thread count.

    When ``--input_threads`` is not given it is set to
    ``multiprocessing.cpu_count()``.
    '''
    parser = ArgumentParser()

    path_flags = (
        ('--local_data_dir', 'data/', 'Path to local data directory'),
        ('--local_log_dir', 'logs/', 'Path to local log directory'),
    )
    for flag, fallback, description in path_flags:
        parser.add_argument(flag, type=str, default=fallback, help=description)

    # Model params
    parser.add_argument('--hidden_units', type=int, nargs='*', default=[32, 64])
    for flag, fallback in (('--learning_rate', 0.001),
                           ('--learning_decay', 0.001),
                           ('--dropout', 0.5)):
        parser.add_argument(flag, type=float, default=fallback)

    # Runtime params
    for flag, fallback in (('--batch_size', 512),
                           ('--num_steps', 9999999),
                           ('--input_threads', None)):
        parser.add_argument(flag, type=int, default=fallback)

    opts = parser.parse_args()

    opts.data_dir = get_data_path(dataset_name='*/*',
                                  local_root=opts.local_data_dir,
                                  local_repo='',
                                  path='')
    opts.log_dir = get_logs_path(root=opts.local_log_dir)

    if opts.input_threads is None:
        from multiprocessing import cpu_count
        opts.input_threads = cpu_count()
    return opts
# Validation data location. The help texts of `val_data_dir` and `log_dir`
# were swapped (the data flag described the logs directory and vice versa),
# contained the typo "makue", and ran words together where adjacent string
# literals had no separating space.
flags.DEFINE_string(
    "val_data_dir",
    get_data_path(
        dataset_name='artem/artem-tiny-imagenet',
        local_root=os.path.expanduser('~/Documents/Scratch/tiny_imagenet/'),
        local_repo='tiny-imagenet-200',
        path='val/for_keras'),
    "Path to dataset. It is recommended to use get_data_path() "
    "to define your data directory so that you can switch "
    "from local to clusterone without changing your code. "
    "If you set the data directory manually make sure to use "
    "/data/ as root path when running on ClusterOne cloud.")

flags.DEFINE_string(
    "log_dir",
    get_logs_path(
        os.path.expanduser('~/Documents/Scratch/tiny_imagenet/logs/')),
    "Path to store logs and checkpoints. It is recommended "
    "to use get_logs_path() to define your logs directory "
    "so that you can switch from local to clusterone without "
    "changing your code. "
    "If you set your logs directory manually make sure "
    "to use /logs/ when running on ClusterOne cloud.")

FLAGS = flags.FLAGS


def device_and_target():
    # If FLAGS.job_name is not set, we're running single-machine TensorFlow.
    # Don't set a device.
    if FLAGS.job_name is None:
        print("Running single-machine training")
        return (None, "")
print('[*] Finished') if __name__ == "__main__": parser = argparse.ArgumentParser(description='toynet template') parser.add_argument('--epoch', type=int, default=20, help='epoch size') parser.add_argument('--batch_size', type=int, default=100, help='mini-batch size') parser.add_argument('--lr', type=float, default=2e-4, help='learning rate') parser.add_argument('--y_dim', type=int, default=10, help='the number of classes') parser.add_argument('--target', type=int, default=-1, help='target class for targeted generation') parser.add_argument('--eps', type=float, default=1e-9, help='epsilon') parser.add_argument('--env_name', type=str, default='train', help='experiment name') parser.add_argument('--dataset', type=str, default='FMNIST', help='dataset type') parser.add_argument('--dset_dir', type=str, default='datasets', help='dataset directory path') parser.add_argument('--summary_dir', type=str, default=get_logs_path('summary'), help='summary directory path') parser.add_argument('--output_dir', type=str, default=get_logs_path('output'),help='output directory path') parser.add_argument('--ckpt_dir', type=str, default=get_logs_path('checkpoints'), help='checkpoint directory path') parser.add_argument('--load_ckpt', type=str, default='', help='') parser.add_argument('--cuda', type=str2bool, default=True, help='enable cuda') parser.add_argument('--silent', type=str2bool, default=False, help='') parser.add_argument('--mode', type=str, default='train', help='train / test / generate / universal') parser.add_argument('--seed', type=int, default=1, help='random seed') parser.add_argument('--iteration', type=int, default=1, help='the number of iteration for FGSM') parser.add_argument('--epsilon', type=float, default=0.03, help='epsilon for FGSM and i-FGSM') parser.add_argument('--alpha', type=float, default=2/255, help='alpha for i-FGSM') parser.add_argument('--tensorboard', type=str2bool, default=True, help='enable tensorboard') parser.add_argument('--visdom', type=str2bool, 
default=False, help='enable visdom') parser.add_argument('--visdom_port', type=str, default=55558, help='visdom port') args = parser.parse_args()
# Training related flags
# The help texts of `data_dir` and `log_dir` were swapped (each described the
# other flag), contained the typo "makue", and ran words together where
# adjacent string literals had no separating space.
flags.DEFINE_string(
    "data_dir",
    get_data_path(
        dataset_name="malo/mnist",  # all mounted repo
        local_root=ROOT_PATH_TO_LOCAL_DATA,
        local_repo="mnist",
        path=''),
    "Path to dataset. It is recommended to use get_data_path() "
    "to define your data directory so that you can switch "
    "from local to clusterone without changing your code. "
    "If you set the data directory manually make sure to use "
    "/data/ as root path when running on ClusterOne cloud.")

flags.DEFINE_string(
    "log_dir",
    get_logs_path(root=PATH_TO_LOCAL_LOGS),
    "Path to store logs and checkpoints. It is recommended "
    "to use get_logs_path() to define your logs directory "
    "so that you can switch from local to clusterone without "
    "changing your code. "
    "If you set your logs directory manually make sure "
    "to use /logs/ when running on ClusterOne cloud.")

FLAGS = flags.FLAGS


def device_and_target():
    # If FLAGS.job_name is not set, we're running single-node TensorFlow.
    # Don't set a device.
    if FLAGS.job_name is None:
        print("Running single-machine training")
        return (None, "")
def main():
    """Main wrapper: define flags, build the graph and run the training epochs.

    Steps, in order:

    1. Read ``JOB_NAME``, ``TASK_INDEX``, ``PS_HOSTS`` and ``WORKER_HOSTS``
       from the environment; if any is missing, fall back to single-machine
       defaults. For a local run the ``LOCAL_*`` module constants must have
       been changed from their ``"..."`` placeholder, otherwise
       ``ValueError`` is raised.
    2. Define the TensorFlow flags (paths, cluster layout, hyper-parameters).
    3. Create the cluster server / device setter via ``device_and_target``.
    4. Build the input pipeline, model, loss, summaries and optimizer.
    5. Run ``FLAGS.nb_epochs`` training epochs, each in a fresh
       ``MonitoredTrainingSession`` using ``FLAGS.logs_dir`` as checkpoint
       directory.

    Raises:
        ValueError: on an unset ``LOCAL_*`` placeholder (local runs), or on a
            missing ``task_index`` / ``ps_hosts`` / ``worker_hosts`` /
            ``logs_dir`` / ``train_data_dir``.
    """
    # clusterone snippet 1 - get environment variables
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except KeyError:
        # A missing variable means we are not on the cluster. Catch only
        # that case instead of using a bare `except:`.
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    if job_name is None:  # if running locally
        if LOCAL_LOG_LOCATION == "...":
            raise ValueError("LOCAL_LOG_LOCATION needs to be defined")
        if LOCAL_DATASET_LOCATION == "...":
            raise ValueError("LOCAL_DATASET_LOCATION needs to be defined")
        if LOCAL_DATASET_NAME == "...":
            raise ValueError("LOCAL_DATASET_NAME needs to be defined")

    # Path to your data locally. This will enable to run the model both
    # locally and on ClusterOne without changes
    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)
    ROOT_PATH_TO_LOCAL_DATA = os.path.expanduser(LOCAL_DATASET_LOCATION)
    # end of clusterone snippet 1

    # Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    # clusterone snippet 2: flags.

    # Define the path from the root data directory to your data.
    # We use glob to match any .h5 datasets in Documents/comma locally, or in
    # data/ on ClusterOne
    flags.DEFINE_string(
        "train_data_dir",
        get_data_path(
            dataset_name="tensorbot/*",
            local_root=ROOT_PATH_TO_LOCAL_DATA,
            local_repo=LOCAL_DATASET_NAME,  # all repos (we use glob downstream, see read_data.py)
            path='camera/training/*.h5'  # all .h5 files
        ),
        """Path to training dataset. It is recommended to use get_data_path()
        to define your data directory. If you set your dataset directory manually make sure to use /data/
        as root path when running on TensorPort cloud.
        On tensrport, the data will be mounted in /data/user/clusterone_dataset_name,
        so you can acces `path` with /data/user/clusterone_dataset_name/path
        """)
    flags.DEFINE_string(
        "logs_dir",
        get_logs_path(root=PATH_TO_LOCAL_LOGS),
        "Path to store logs and checkpoints. It is recommended"
        "to use get_logs_path() to define your logs directory."
        "If you set your logs directory manually make sure"
        "to use /logs/ when running on TensorPort cloud.")

    # Define worker specific environment variables. Handled automatically.
    flags.DEFINE_string("job_name", job_name, "job name: worker or ps")
    flags.DEFINE_integer(
        "task_index", task_index,
        "Worker task index, should be >= 0. task_index=0 is "
        "the chief worker task the performs the variable "
        "initialization")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")
    # end of clusterone snippet 2

    # Training flags - feel free to play with that!
    flags.DEFINE_integer("batch", 64, "Batch size")
    flags.DEFINE_integer("time", 1, "Number of frames per sample")
    flags.DEFINE_integer("steps_per_epoch", 10000,
                         "Number of training steps per epoch")
    flags.DEFINE_integer("nb_epochs", 200, "Number of epochs")

    # Model flags - feel free to play with that!
    flags.DEFINE_float("dropout_rate1", .2,
                       "Dropout rate on first dropout layer")
    flags.DEFINE_float("dropout_rate2", .5,
                       "Dropout rate on second dropout layer")
    flags.DEFINE_float("starter_lr", 1e-6,
                       "Starter learning rate. Exponential decay is applied")
    flags.DEFINE_integer("fc_dim", 512, "Size of the dense layer")
    flags.DEFINE_boolean("nogood", False, "Ignore `goods` filters.")

    # clusterone snippet 3: configure distributed environment
    def device_and_target():
        """Return ``(device, session target)`` for the current task."""
        # If FLAGS.job_name is not set, we're running single-machine
        # TensorFlow. Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        cluster_spec = tf.train.ClusterSpec({
            "ps": FLAGS.ps_hosts.split(","),
            "worker": FLAGS.worker_hosts.split(","),
        })
        server = tf.train.Server(cluster_spec,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on
        # separate parameter servers (ps). The non-Variable ops will be
        # placed on the workers.
        return (
            tf.train.replica_device_setter(worker_device=worker_device,
                                           cluster=cluster_spec),
            server.target,
        )

    device, target = device_and_target()
    # end of clusterone snippet 3

    print(FLAGS.logs_dir)
    print(FLAGS.train_data_dir)

    if FLAGS.logs_dir is None or FLAGS.logs_dir == "":
        raise ValueError("Must specify an explicit `logs_dir`")
    if FLAGS.train_data_dir is None or FLAGS.train_data_dir == "":
        raise ValueError("Must specify an explicit `train_data_dir`")
    # if FLAGS.val_data_dir is None or FLAGS.val_data_dir == "":
    #     raise ValueError("Must specify an explicit `val_data_dir`")

    TIME_LEN = 1  # 1 video frame. Other not supported.

    # Define graph
    with tf.device(device):
        # X = tf.placeholder(tf.float32, [FLAGS.batch, 3, 160, 320], name="X")
        # Y = tf.placeholder(tf.float32,[FLAGS.batch,1], name="Y") # angle only
        # S = tf.placeholder(tf.float32,[FLAGS.batch,1], name="S") #speed

        if FLAGS.task_index == 0:
            print("Looking for data in %s" % FLAGS.train_data_dir)
        reader = DataReader(FLAGS.train_data_dir)
        x, y, s = reader.read_row_tf()
        x.set_shape((3, 160, 320))
        y.set_shape((1))
        s.set_shape((1))

        X, Y, S = tf.train.batch([x, y, s], batch_size=FLAGS.batch)
        predictions = get_model(X, FLAGS)
        # Adding numpy operation to graph. Adding image to summary
        steering_summary = tf.summary.image(
            "green-is-predicted",
            render_steering_tf(X, Y, S, predictions))
        loss = get_loss(predictions, Y)
        training_summary = tf.summary.scalar('Training_Loss',
                                             loss)  # add to tboard

        # Batch generators
        global_step = tf.contrib.framework.get_or_create_global_step()
        learning_rate = tf.train.exponential_decay(FLAGS.starter_lr,
                                                   global_step,
                                                   1000,
                                                   0.96,
                                                   staircase=True)

        train_step = (tf.train.AdamOptimizer(learning_rate).minimize(
            loss, global_step=global_step))

    def run_train_epoch(target, FLAGS, epoch_index):
        """Restores the last checkpoint and runs a training epoch
        Inputs:
            - target: device setter for distributed work
            - FLAGS:
                - requires FLAGS.logs_dir from which the model will be
                  restored. Note that whatever most recent checkpoint from
                  that directory will be used.
                - requires FLAGS.steps_per_epoch
            - epoch_index: zero-based index of current epoch
        """
        # Epoch `e` must run until global step steps_per_epoch * (e + 1).
        # The previous `steps_per_epoch * epoch_index` gave last_step == 0 for
        # the first epoch, which stopped it after a single step and left the
        # printed iteration number one epoch ahead of the real global step.
        hooks = [
            tf.train.StopAtStepHook(
                last_step=FLAGS.steps_per_epoch * (epoch_index + 1))
        ]
        i = 1

        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.logs_dir,
                hooks=hooks) as sess:

            while not sess.should_stop():
                variables = [loss, learning_rate, train_step]
                current_loss, lr, _ = sess.run(variables)

                print(
                    "Iteration %s - Batch loss: %s" %
                    ((epoch_index) * FLAGS.steps_per_epoch + i, current_loss))
                i += 1

    for e in range(FLAGS.nb_epochs):
        run_train_epoch(target, FLAGS, e)
def main():
    """Train an MLP classifier on MNIST with Chainer, configured from the CLI.

    The output directory given by ``--out`` is passed through
    ``get_logs_path`` before use. After training, ``writer.close()`` is
    called (``writer`` is not defined in this function and is expected to
    exist at module level).
    """
    # (option strings, add_argument keyword arguments) in registration order.
    cli_options = (
        (('--batchsize', '-b'), dict(
            type=int, default=100,
            help='Number of images in each mini-batch')),
        (('--epoch', '-e'), dict(
            type=int, default=20,
            help='Number of sweeps over the dataset to train')),
        (('--frequency', '-f'), dict(
            type=int, default=-1,
            help='Frequency of taking a snapshot')),
        (('--gpu', '-g'), dict(
            type=int, default=-1,
            help='GPU ID (negative value indicates CPU)')),
        (('--out', '-o'), dict(
            default='result',
            help='Directory to output the result')),
        (('--resume', '-r'), dict(
            default='',
            help='Resume the training from snapshot')),
        (('--unit', '-u'), dict(
            type=int, default=1000,
            help='Number of units')),
        (('--noplot',), dict(
            dest='plot', action='store_false',
            help='Disable PlotReport extension')),
    )
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    for names, options in cli_options:
        parser.add_argument(*names, **options)
    args = parser.parse_args()
    args.out = get_logs_path(root=args.out)

    for label, value in (('GPU', args.gpu),
                         ('# unit', args.unit),
                         ('# Minibatch-size', args.batchsize),
                         ('# epoch', args.epoch)):
        print('{}: {}'.format(label, value))
    print('')

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(MLP(args.unit, 10))
    use_gpu = args.gpu >= 0
    if use_gpu:
        # Make the requested GPU current, then copy the model onto it.
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=args.gpu)
    stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))

    # Dump a computational graph from 'loss' variable at the first iteration.
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Take a snapshot for each specified epoch; -1 means "once, at the end".
    if args.frequency == -1:
        frequency = args.epoch
    else:
        frequency = max(1, args.frequency)
    trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Save two plot images to the result dir
    if args.plot and extensions.PlotReport.available():
        for metric in ('loss', 'accuracy'):
            trainer.extend(
                extensions.PlotReport(
                    ['main/' + metric, 'validation/main/' + metric],
                    'epoch',
                    file_name=metric + '.png'))

    # TensorBoard
    trainer.extend(TensorBoardReport(args.out))

    # Print selected entries of the log to stdout.
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called
    # by either the updater or the evaluator.
    report_columns = [
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy', 'elapsed_time'
    ]
    trainer.extend(extensions.PrintReport(report_columns))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
    writer.close()
import tensorflow as tf from keras import backend as K from keras.datasets import mnist from keras.models import Model from keras.optimizers import Adam from keras.layers import Dense, Input, Flatten from keras.utils import to_categorical from keras.callbacks import ModelCheckpoint, TensorBoard from clusterone import get_data_path, get_logs_path log_dir = get_logs_path( '/Users/artem/Documents/Scratch/mnist_keras_distributed/logs/') def train(): # # Data # (x_train, y_train), (x_test, y_test) = mnist.load_data() y_train = to_categorical(y_train, num_classes=10) y_test = to_categorical(y_test, num_classes=10) x_train = x_train.astype('float32') / 255. x_test = x_test.astype('float32') / 255. # # Model # img_inp = Input(shape=(28, 28))
local_repo = 'tiny-imagenet-200', path = 'train' ) EVAL_DATA_DIR = get_data_path( dataset_name = 'mohsen/clusterone-tiny-imagenet-example', local_root = os.path.expanduser('~/'), local_repo = 'tiny-imagenet-200', path = 'val/for_keras' ) UNIQUE_LABELS_PATH = get_data_path( dataset_name = 'mohsen/clusterone-tiny-imagenet-example', local_root = os.path.expanduser('~/'), local_repo = 'tiny-imagenet-200', path = 'wnids.txt' ) LOGS_PATH = get_logs_path('./logs') configure(LOGS_PATH) parser = argparse.ArgumentParser() parser.add_argument('--batch_size', default=64, type=int) parser.add_argument('--num_workers', default=4, type=int) parser.add_argument('--num_epochs', default=1, type=int) parser.add_argument('--save_summary_steps', default=50, type=int) args = parser.parse_args() args.cuda = torch.cuda.is_available() if args.cuda: device = torch.device('cuda:0') print('Using GPU') else: device = torch.device('cpu')
# separating data into training, validation, and testing x_train = x_data[:train] x_val = x_data[train:train + val] x_test = x_data[train + val:] y_train = y_data[:train] y_val = y_data[train:train + val] y_test = y_data[train + val:] return x_train, x_val, x_test, y_train, y_val, y_test if __name__ == "__main__": # turn of warnings os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' log_path = get_logs_path( r"C:\Users\Ryan Meredith\Documents\github\housing_prices\logs\\") # root mean squared metric def rmse(y_true, y_pred): return backend.sqrt( backend.mean(backend.square(y_pred - y_true), axis=-1)) print("Begin collecting data") x_train, x_val, x_test, y_train, y_val, y_test = get_data() print("Finished collecting data") input_nodes = x_train.shape[1] # layers: (input: 326), 1000, 500, 200, 100, (output: 1) # dropout of 10% between each layer model = Sequential()
if (task_type in ('chief', 'master')) or (task_type == 'worker' and task_index == 0): TF_CONFIG['cluster']['worker'][task_index] = local_ip TF_CONFIG['task']['type'] = 'chief' os.environ['TF_CONFIG'] = json.dumps(TF_CONFIG) except KeyError as ex: print(ex) job_name = None task_index = 0 ps_hosts = None worker_hosts = None flags.DEFINE_string( "log_dir", get_logs_path( os.path.expanduser('~/Documents/Scratch/cluster1_experiments/logs')), "Path to dataset. It is recommended to use get_data_path()" "to define your data directory.so that you can switch " "from local to clusterone without changing your code." "If you set the data directory manually makue sure to use" "/data/ as root path when running on ClusterOne cloud.") tf.flags.DEFINE_integer('n_gpus', 1, 'number of gpus to utilize') FLAGS = flags.FLAGS def make_model(): model_inp = tf.keras.layers.Input(shape=( 28, 28, ), name='input')
def main():
    """Train, validate and test the model built by ``get_model``.

    Steps performed:

    1. Read the cluster layout (``JOB_NAME``, ``TASK_INDEX``, ``PS_HOSTS``,
       ``WORKER_HOSTS``) from the environment; fall back to single-machine
       mode when any of them is missing.
    2. Register ``tf.app.flags`` for file paths and hyper-parameters.
    3. Load the flat files, hold out the tail of the training set for
       validation, and build the graph around one re-initializable iterator.
    4. Run ``FLAGS.num_epochs`` training epochs, then evaluate one batch of
       the independent test set.

    Raises:
        ValueError: if a required path or cluster flag is empty.
    """
    # Training data
    xtrain = 'Xtrain.txt'
    ytrain = 'Ytrain.txt'
    # Independent test data
    xtest = 'Xtest.txt'
    ytest = 'Ytest.txt'

    # Default training parameters (each can be overridden by its flag)
    batch_size = 500        # Batch size
    num_epochs = 5          # Number of epochs
    train_holdout = 0.2     # Portion of training features used for validation
    learning_rate = 0.005   # Starting learning rate
    steps_per_epoch = 50    # Number of training steps per epoch

    # ----- Begin Main Code
    # Get environment variables. Only a missing variable means "not running
    # distributed"; any other error should surface instead of being hidden.
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except KeyError:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None

    # Get local file paths
    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)
    ROOT_PATH_TO_LOCAL_DATA = os.path.expanduser(LOCAL_DATASET_LOCATION)

    def dataset_path(filename):
        """Resolve one file of the demo dataset through get_data_path."""
        return get_data_path(
            dataset_name="emanrao/variantnn-demo",
            local_root=ROOT_PATH_TO_LOCAL_DATA,
            local_repo=LOCAL_DATASET_NAME,
            path=filename,
        )

    # Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    # Flags for environment variables
    flags.DEFINE_string("job_name", job_name, "job name: worker or ps")
    flags.DEFINE_integer("task_index", task_index,
                         "Worker task index, should be >= 0. task_index=0 is "
                         "the chief worker task that performs the variable "
                         "initialization and checkpoint handling")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")

    # Training file flags
    flags.DEFINE_string("xtrain", dataset_path(xtrain),
                        "Path to training dataset.")
    flags.DEFINE_string("ytrain", dataset_path(ytrain),
                        "Path to training dataset.")
    flags.DEFINE_string("log_dir", get_logs_path(root=PATH_TO_LOCAL_LOGS),
                        "Path to store logs and checkpoints.")

    # Test file flags
    flags.DEFINE_string("xtest", dataset_path(xtest),
                        "Path to testing dataset.")
    flags.DEFINE_string("ytest", dataset_path(ytest),
                        "Path to testing dataset.")

    # Training parameter flags
    # NOTE(review): the bracketed defaults in the help texts below do not
    # match the defaults defined above (500 / 5 / 0.005) - confirm which
    # is intended before editing the strings.
    flags.DEFINE_integer("batch_size", batch_size, "Batch size [100].")
    flags.DEFINE_integer("num_epochs", num_epochs, "Number epochs [50].")
    flags.DEFINE_float("train_holdout", train_holdout,
                       "Portion of training features withheld from traing and used for validation [0.2].")
    flags.DEFINE_float("learning_rate", learning_rate,
                       "Starting learning rate [0.0005].")
    flags.DEFINE_integer("steps_per_epoch", steps_per_epoch,
                         "Number of training steps per epoch")

    # Configure Distributed Environment
    def device_and_target():
        """Return ``(device, session_target)`` for the current task."""
        # If FLAGS.job_name is not set, we're running single-machine
        # TensorFlow. Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        cluster_spec = tf.train.ClusterSpec({
            "ps": FLAGS.ps_hosts.split(","),
            "worker": FLAGS.worker_hosts.split(","),
        })
        server = tf.train.Server(
            cluster_spec,
            job_name=FLAGS.job_name,
            task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on
        # separate parameter servers (ps). The non-Variable ops will be
        # placed on the workers.
        return (
            tf.train.replica_device_setter(
                worker_device=worker_device,
                cluster=cluster_spec),
            server.target,
        )

    device, target = device_and_target()

    # ----- Read Data -----
    # Check Flags
    if FLAGS.log_dir is None or FLAGS.log_dir == "":
        raise ValueError("Must specify an explicit `log_dir`")
    if FLAGS.xtrain is None or FLAGS.xtrain == "":
        raise ValueError("Must specify an explicit `xtrain`")
    if FLAGS.ytrain is None or FLAGS.ytrain == "":
        raise ValueError("Must specify an explicit `ytrain`")
    if FLAGS.xtest is None or FLAGS.xtest == "":
        raise ValueError("Must specify an explicit `xtest`")
    if FLAGS.ytest is None or FLAGS.ytest == "":
        raise ValueError("Must specify an explicit `ytest`")

    print('Training dataset file: ', FLAGS.xtrain)
    print('Training target file: ', FLAGS.ytrain)
    print('Testing dataset file: ', FLAGS.xtest)
    print('Testing target file: ', FLAGS.ytest)
    print('Log Files Saved To: ', FLAGS.log_dir)

    # Read in data
    Xtrain, Ytrain = read_flat_file(FLAGS.xtrain, FLAGS.ytrain)
    Xtest, Ytest = read_flat_file(FLAGS.xtest, FLAGS.ytest)

    # The last `train_holdout` fraction of the training rows is held out
    # for validation; the rest is used for training.
    num_train = int(np.round(Xtrain.shape[0] * (1 - FLAGS.train_holdout)))
    num_held = int(Xtrain.shape[0] - num_train)
    print('Training on {:d} features'.format(num_train))
    print('Validating on {:d} features (once per epoch)'.format(num_held))

    Xval = Xtrain[num_train:]
    Yval = Ytrain[num_train:]
    Xtrain = Xtrain[:num_train]
    Ytrain = Ytrain[:num_train]

    num_batches = int(np.floor(Ytrain.shape[0] / FLAGS.batch_size))
    if num_batches == 0:
        # The requested batch size exceeds the training set: use one batch
        # holding the whole training set.
        num_batches = 1
        FLAGS.batch_size = Ytrain.shape[0]

    # ----- Define Graph -----
    tf.reset_default_graph()
    with tf.device(device):
        global_step = tf.train.get_or_create_global_step()

        # Create Datasets
        train_dataset = tf.data.Dataset.from_tensor_slices((Xtrain, Ytrain))
        train_dataset = train_dataset.batch(FLAGS.batch_size)

        val_dataset = tf.data.Dataset.from_tensor_slices((Xval, Yval))
        val_dataset = val_dataset.batch(Yval.shape[0])

        test_dataset = tf.data.Dataset.from_tensor_slices((Xtest, Ytest))
        test_dataset = test_dataset.batch(FLAGS.batch_size)

        # One iterator shared by all three datasets; the *_init_op below
        # selects which dataset it currently reads from.
        iterator = tf.data.Iterator.from_structure(
            train_dataset.output_types,
            train_dataset.output_shapes)
        features, labels = iterator.get_next()

        # Create initialisation operations
        train_init_op = iterator.make_initializer(train_dataset)
        val_init_op = iterator.make_initializer(val_dataset)
        test_init_op = iterator.make_initializer(test_dataset)

        # Apply model
        with tf.name_scope('predictions'):
            predictions = get_model(features, FLAGS)
        with tf.name_scope('loss'):
            loss = get_loss(predictions, labels)
            tf.summary.scalar('loss', loss)  # add to TensorBoard
        with tf.name_scope('train'):
            train_step = (
                tf.train.AdamOptimizer(FLAGS.learning_rate)
                .minimize(loss, global_step=global_step)
            )

        summ = tf.summary.merge_all()

    writer = tf.summary.FileWriter(FLAGS.log_dir)

    # ----- Train model with periodic validation -----
    def run_train_epoch(target, FLAGS, epoch_index):
        """Run one training epoch (1-based ``epoch_index``) with validation."""
        print('Epoch {:d} Training...'.format(epoch_index))
        i = 1
        # Raise the global-step limit by one epoch's worth of steps.
        hooks = [tf.train.StopAtStepHook(
            last_step=FLAGS.steps_per_epoch * epoch_index)]
        scaffold = tf.train.Scaffold(
            local_init_op=[train_init_op, val_init_op],
            saver=tf.train.Saver(max_to_keep=5)
        )

        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.log_dir,
                hooks=hooks,
                scaffold=scaffold
        ) as sess:
            writer.add_graph(sess.graph)
            sess.run(train_init_op)  # switch to train dataset

            while not sess.should_stop():
                # The merged summary is evaluated but not written.
                [current_loss, _, _] = sess.run([loss, train_step, summ])
                # Epochs are numbered from 1, so the first epoch covers
                # iterations 1..steps_per_epoch.
                iteration = (epoch_index - 1) * FLAGS.steps_per_epoch + i
                print("Iteration {} Training Loss: {:.4f}".format(
                    iteration, current_loss))
                i += 1

                if i == FLAGS.steps_per_epoch:  # validate on last session
                    sess.run(val_init_op)  # switch to val dataset
                    while True:
                        try:
                            v_loss = sess.run(loss)
                            print("Epoch {} Validation Loss: {:.4f}".format(
                                epoch_index, v_loss))
                        except tf.errors.OutOfRangeError:
                            break

    try:
        for e in range(1, FLAGS.num_epochs + 1):
            run_train_epoch(target, FLAGS, e)

        # ----- Test Model on Different Dataset -----
        # NOTE(review): no checkpoint_dir is passed here, so in
        # single-machine mode this session presumably starts from freshly
        # initialized variables rather than the trained ones - confirm.
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=(FLAGS.task_index == 0)
        ) as sess:
            sess.run(test_init_op)  # initialize to test dataset
            test_loss = sess.run(loss)
            print("Test Set Loss (independent dataset): {:.4f}".format(
                test_loss))
    finally:
        # Flush and release the event file even when training fails.
        writer.close()
def main():
    """Train an MLP on MNIST with Chainer's data-parallel updater.

    This script is almost identical to train_mnist.py. The only difference
    is that this script uses data-parallel computation on several GPUs.
    See train_mnist.py for more details.

    The number of devices is taken from the length of the ``worker`` list in
    the ``TF_CONFIG`` environment variable; when that variable is missing or
    malformed, only GPU 0 is used.
    """
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=400,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--out', '-o', default='result_parallel',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()
    args.out = get_logs_path(root=args.out)

    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    chainer.backends.cuda.get_device_from_id(0).use()

    model = L.Classifier(MLP(args.unit, 10))
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # ParallelUpdater implements the data-parallel gradient computation on
    # multiple GPUs. It accepts "devices" argument that specifies which GPU
    # to use.
    try:
        config = json.loads(os.environ['TF_CONFIG'])
        n_gpus = len(config['cluster']['worker'])
    except (KeyError, ValueError, TypeError):
        # TF_CONFIG is absent, not valid JSON, or lacks the expected keys.
        devices = {'main': 0}
    else:
        # Must be a dict (name -> device id). The previous set
        # comprehension made the 'main' assignment raise TypeError, which
        # the bare except swallowed, so only one GPU was ever used.
        devices = {str(i): i for i in range(1, n_gpus)}
        devices['main'] = 0

    updater = training.updaters.ParallelUpdater(
        train_iter,
        optimizer,
        # The device of the name 'main' is used as a "master", while others
        # are used as slaves. Names other than 'main' are arbitrary.
        devices=devices,
    )
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=0))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(TensorBoardReport(args.out))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
flags.DEFINE_integer("batch_size", 2097152, "Batch size") FLAGS = tf.flags.FLAGS USERNAME = "******" DATASET_NAME = "openslr_small" PROBLEM = 'librispeech_clean' DATA_PATH = get_data_path( dataset_name="%s/%s" % (USERNAME, DATASET_NAME), #on clusterone local_root=os.path.expanduser("~/Data"), local_repo="openSLR", path='') CHECKPOINTS_PATH = get_logs_path(root=os.path.expanduser("~/logs")) if not os.path.exists(CHECKPOINTS_PATH): os.makedirs(CHECKPOINTS_PATH) try: job_name = os.environ['JOB_NAME'] task_index = int(os.environ['TASK_INDEX']) ps_hosts = os.environ['PS_HOSTS'].split(',') worker_hosts = os.environ['WORKER_HOSTS'].split(',') if job_name == 'ps': ps_hosts[task_index] = 'localhost:%s' % ( ps_hosts[task_index].split(':')[-1]) elif job_name == 'worker': worker_hosts[task_index] = 'localhost:%s' % ( worker_hosts[task_index].split(':')[-1])
) flags.DEFINE_integer("output_height", 64, "The size of the output images to produce [64]") flags.DEFINE_integer( "output_width", None, "The size of the output images to produce. If None, same value as output_height [None]" ) flags.DEFINE_string("dataset", "celebA", "The name of dataset [celebA, mnist, lsun]") flags.DEFINE_string( "data_path", get_data_path(dataset_name="%s/*" % CLUSTERONE_USERNAME, local_root=ROOT_PATH_TO_LOCAL_DATA, local_repo=LOCAL_REPO, path=""), "data path for zip file") flags.DEFINE_string("checkpoint_dir", get_logs_path(LOCAL_PATH_TO_LOGS), "Directory name to save the checkpoints [checkpoint]") flags.DEFINE_string( "sample_dir", get_logs_path("samples"), "Directory name to save the image samples [samples]" ) #TODO: replace with os.path.join(logs/samples) when folders are supported flags.DEFINE_boolean("train", True, "True for training, False for testing [True]") flags.DEFINE_boolean("crop", True, "True for training, False for testing [True]") flags.DEFINE_boolean("visualize", False, "True for visualizing, False for nothing [False]") FLAGS = flags.FLAGS def main(_):
def parse_args():
    """Build the command-line parser, parse ``sys.argv`` and resolve files.

    After parsing, the train/test file patterns are joined onto
    ``--data_dir`` and globbed. The first match of each becomes
    ``opts.train_data`` / ``opts.test_data``; in ``--benchmark`` mode
    ``opts.test_data`` is set to ``''`` when nothing matches.
    ``opts.log_dir`` is resolved through ``get_logs_path``.

    Returns:
        The populated ``argparse`` namespace.

    Raises:
        IOError: if no training file matches, or if no test file matches
            outside benchmark mode.
    """
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description=('Train a logistic regressor using XGBoost. '
                     'For distributed mode, use dmlc-core submit. '))
    add = parser.add_argument

    # Experiment related parameters
    add('--data_dir', type=str, default=os.path.join(FILE_DIR, 'data'),
        help='Directory where your data files are.')
    add('--local_log_root', type=str, default=os.path.join(FILE_DIR, 'logs'),
        help='Path to store logs and checkpoints. This path will be /logs on Clusterone.')
    add('--train_file_pattern', type=str, default='*.train',
        help='Use * as wildcard. Describe sub-directory/filename pattern for train data.')
    add('--test_file_pattern', type=str, default='*.test',
        help='Use * as wildcard. Describe sub-directory/filename pattern for test data.')
    add('--model_name', type=str, default='saved.model',
        help='Filename to use for saved model.')

    # General params
    add('--silent', type=int, default=0, choices=[0, 1],
        help='0 means printing running messages, 1 means silent mode')

    # Booster params
    add('--eta', type=float, default=0.3,
        help='Step size shrinkage used in update to prevents overfitting. '
             'After each boosting step, we can directly get the weights of new features, '
             'and eta shrinks the feature weights to make the boosting process more conservative. '
             'Range: [0,1]')
    add('--gamma', type=float, default=0.0,
        help='Minimum loss reduction required to make a further partition on a leaf node of the tree. '
             'The larger gamma is, the more conservative the algorithm will be. Range: [0,inf]')
    add('--max_depth', type=int, default=6,
        help='Maximum depth of a tree. Increasing this value will make the model more complex and more '
             'likely to overfit. 0 indicates no limit. Note that limit is required when grow_policy is '
             'set of depthwise.')
    add('--min_child_weight', type=float, default=1.0,
        help='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step '
             'results in a leaf node with the sum of instance weight less than min_child_weight, then '
             'the building process will give up further partitioning. In linear regression task, this '
             'simply corresponds to minimum number of instances needed to be in each node. The larger '
             'min_child_weight is, the more conservative the algorithm will be.')
    add('--subsample', type=float, default=1.0,
        help='Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would '
             'randomly sample half of the training data prior to growing trees. and this will prevent '
             'overfitting. Subsampling will occur once in every boosting iteration.')
    add('--colsample_bytree', type=float, default=1.0,
        help='Subsample ratio of columns when constructing each tree. Subsampling will occur once in '
             'every boosting iteration.')
    add('--l2', type=float, default=1.0,
        help='L2 regularization term on weights. Increasing this value will make model more '
             'conservative.')
    add('--l1', type=float, default=0.0,
        help='L1 regularization term on weights. Increasing this value will make model more '
             'conservative.')
    add('--tree_method', type=str, default='auto',
        choices=['auto', 'exact', 'approx', 'hist', 'gpu_exact', 'gpu_hist'],
        help='The tree construction algorithm used in XGBoost. '
             'Distributed and external memory version only support tree_method=approx.')
    add('--scale_pos_weight', type=float, default=1.,
        help='Control the balance of positive and negative weights, useful for unbalanced classes. '
             'A typical value to consider: sum(negative instances) / sum(positive instances).')

    # Learning task parameters
    add('--objective', type=str, default='binary:logistic',
        choices=['binary:logistic', 'binary:logitraw'],
        help='See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters')
    add('--eval_metric', type=str, nargs='*', default=['error'],
        choices=['logloss', 'auc', 'error'],
        help='Evaluation metrics for validation data. '
             'See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters')
    add('--seed', type=int, default=0, help='Random number seed')

    # Command line parameters
    add('--num_round', type=int, default=10,
        help='The number of rounds for boosting')

    # Train params
    add('--cache_data', action='store_true',
        help='Use external memory version')

    # Testing/Debugging
    add('--set_verbosity', type=str, default='INFO',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help='Logging verbosity level')
    add('--benchmark', action='store_true',
        help='Runs with benchmark settings. Ignores all XGBoost parameter inputs.')

    # Parse args
    opts = parser.parse_args()

    # Turn the bare patterns into full glob patterns under data_dir.
    opts.train_file_pattern = os.path.join(opts.data_dir,
                                           opts.train_file_pattern)
    opts.test_file_pattern = os.path.join(opts.data_dir,
                                          opts.test_file_pattern)
    train_matches = glob.glob(opts.train_file_pattern)
    test_matches = glob.glob(opts.test_file_pattern)

    # Training data is always required.
    if not train_matches:
        raise IOError(
            'Did not detect any files with train_file_pattern "{}"'.format(
                opts.train_file_pattern))
    opts.train_data = train_matches[0]
    if len(train_matches) > 1:
        logging.warning(
            'Detected multiple files. Using {}.'.format(opts.train_data))

    # Test data is required unless running the benchmark.
    if opts.benchmark:
        opts.test_data = ''
    elif test_matches:
        opts.test_data = test_matches[0]
        if len(test_matches) > 1:
            logging.warning(
                'Detected multiple files. Using {}.'.format(opts.test_data))
    else:
        raise IOError(
            'Did not detect any files with test_file_pattern "{}"'.format(
                opts.test_file_pattern))

    opts.log_dir = get_logs_path(root=opts.local_log_root)
    return opts
import numpy as np
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
from clusterone import get_data_path, get_logs_path

# Local fallback locations, made absolute relative to the current working
# directory at import time.
LOCAL_DATA_PATH = os.path.abspath(os.path.expanduser('../../data/'))
LOCAL_LOGS_PATH = os.path.abspath(os.path.expanduser('logs/'))

# Storage directory for the MNIST dataset.
# Per the original author, this resolves to a path under LOCAL_DATA_PATH
# when running locally and to '/data/malo/mnist' when running on Clusterone.
# NOTE(review): that is the behavior of clusterone.get_data_path, which is
# not visible here - confirm against the library.
data_dir = get_data_path(dataset_name="malo/mnist",
                         local_root=LOCAL_DATA_PATH,
                         local_repo="mnist",
                         path='')

# Storage directory for the log files produced by this script.
logs_dir = get_logs_path(LOCAL_LOGS_PATH)

# The MNIST dataset has 10 classes, representing the digits 0 through 9
NUM_CLASSES = 10

# The MNIST images are always 28x28 pixels
IMAGE_SIZE = 28
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE  # length of one flattened image

# Each hidden layer gets 128 neurons
hidden1_units = 128
hidden2_units = 128

# Further hyperparameters
learning_rate = 0.5
batch_size = 100
def main():
    """Train the ``model_fn`` network on Fashion-MNIST, optionally distributed.

    The cluster layout is read from ``JOB_NAME``, ``TASK_INDEX``,
    ``PS_HOSTS`` and ``WORKER_HOSTS``; when any of them is missing the script
    runs in single-machine mode. Each epoch opens its own
    ``MonitoredTrainingSession`` that checkpoints into ``FLAGS.logs_dir``,
    and TensorBoard summaries go to ``<logs_dir>/train`` and
    ``<logs_dir>/test``.

    Raises:
        ValueError: if ``logs_dir`` or a required cluster flag is empty.
    """
    # clusterone snippet 1 - get environment variables
    # Only a missing variable means "not distributed"; other errors should
    # not be silently swallowed.
    try:
        job_name = os.environ['JOB_NAME']
        task_index = os.environ['TASK_INDEX']
        ps_hosts = os.environ['PS_HOSTS']
        worker_hosts = os.environ['WORKER_HOSTS']
    except KeyError:
        job_name = None
        task_index = 0
        ps_hosts = None
        worker_hosts = None
    # end of clusterone snippet 1

    # Flags
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    PATH_TO_LOCAL_LOGS = os.path.expanduser(LOCAL_LOG_LOCATION)

    # clusterone snippet 2: flags.
    flags.DEFINE_string("logs_dir", get_logs_path(root=PATH_TO_LOCAL_LOGS),
                        "Path to store logs and checkpoints")

    # Define worker specific environment variables. Handled automatically.
    flags.DEFINE_string("job_name", job_name, "job name: worker or ps")
    flags.DEFINE_integer(
        "task_index", task_index,
        "Worker task index, should be >= 0. task_index=0 is "
        "the chief worker task the performs the variable "
        "initialization")
    flags.DEFINE_string("ps_hosts", ps_hosts,
                        "Comma-separated list of hostname:port pairs")
    flags.DEFINE_string("worker_hosts", worker_hosts,
                        "Comma-separated list of hostname:port pairs")
    # end of clusterone snippet 2

    flags.DEFINE_integer("nb_epochs", 20, "Number of epochs")

    # clusterone snippet 3: configure distributed environment
    def device_and_target():
        """Return ``(device, session_target)`` for the current task."""
        # If FLAGS.job_name is not set, we're running single-machine
        # TensorFlow. Don't set a device.
        if FLAGS.job_name is None:
            print("Running single-machine training")
            return (None, "")

        # Otherwise we're running distributed TensorFlow.
        print("Running distributed training")
        if FLAGS.task_index is None or FLAGS.task_index == "":
            raise ValueError("Must specify an explicit `task_index`")
        if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
            raise ValueError("Must specify an explicit `ps_hosts`")
        if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
            raise ValueError("Must specify an explicit `worker_hosts`")

        # Represents a cluster as a set of "tasks", organized into "jobs".
        cluster_spec = tf.train.ClusterSpec({
            "ps": FLAGS.ps_hosts.split(","),          # job1
            "worker": FLAGS.worker_hosts.split(","),  # job2
        })

        # Server instance encapsulates a set of devices and a tf.Session
        # target that can participate in distributed training. A server
        # belongs to a cluster (specified by a tf.train.ClusterSpec), and
        # corresponds to a particular task in a named job.
        server = tf.train.Server(cluster_spec,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

        worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
        # The device setter will automatically place Variables ops on
        # separate parameter servers (ps). The non-Variable ops will be
        # placed on the workers.
        return (tf.train.replica_device_setter(worker_device=worker_device,
                                               cluster=cluster_spec),
                server.target)

    device, target = device_and_target()  # place tensors, session
    # end of clusterone snippet 3

    if FLAGS.logs_dir is None or FLAGS.logs_dir == "":
        raise ValueError("Must specify an explicit `logs_dir`")

    with tf.device(device):
        with tf.name_scope("input"):
            (x_train, y_train), (x_test, y_test) = \
                keras.datasets.fashion_mnist.load_data()
            print(x_train.shape, x_test.shape)  # 60k, 10k

            # Scale pixel values to [0, 1].
            x_train = x_train.astype('float32') / 255.
            x_test = x_test.astype('float32') / 255.

            # The first 5000 samples are split off as a validation set
            # (currently not used by the training loop below).
            x_train, x_valid = x_train[5000:], x_train[:5000]
            y_train, y_valid = y_train[5000:], y_train[:5000]

            # Reshape input data from (28, 28) to (28, 28, 1)
            w, h = 28, 28
            x_train = x_train.reshape(x_train.shape[0], w, h, 1)  # NHWC
            x_valid = x_valid.reshape(x_valid.shape[0], w, h, 1)
            x_test = x_test.reshape(x_test.shape[0], w, h, 1)

            # One-hot encode the labels
            y_train = tf.keras.utils.to_categorical(y_train, 10)
            y_valid = tf.keras.utils.to_categorical(y_valid, 10)
            y_test = tf.keras.utils.to_categorical(y_test, 10)

        with tf.name_scope("model"):
            model = model_fn(input_shape, number_of_classes)
            x = model["x"]
            y = model["y"]
            train_mode = model["train_mode"]

    def shuffle(features, labels):
        """Return both arrays reordered by one shared random permutation."""
        idxs = np.random.permutation(features.shape[0])  # shuffled ordering
        return features[idxs], labels[idxs]

    def run_train_epoch(target, FLAGS, epoch_index, train_writer,
                        test_writer):
        """Run one pass over the shuffled training set in its own session."""
        epoch_loss, epoch_accuracy = 0, 0
        x_train_r, y_train_r = shuffle(x_train, y_train)

        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.logs_dir) as sess:
            total_size = x_train.shape[0]
            number_of_batches = int(total_size / batch_size)

            for i in range(number_of_batches):
                step = epoch_index * number_of_batches + i
                mini_x = x_train_r[i * batch_size:(i + 1) * batch_size,
                                   :, :, :]
                mini_y = y_train_r[i * batch_size:(i + 1) * batch_size, :]

                # Optimisation step on the mini-batch.
                _, loss = sess.run([model["train_op"], model["loss"]],
                                   feed_dict={
                                       x: mini_x,
                                       y: mini_y,
                                       train_mode: True
                                   })
                epoch_loss += loss

                # Re-evaluate the same mini-batch with train_mode off to
                # record accuracy and summaries.
                train_accuracy, summary = sess.run(
                    [model["accuracy"], model["summary"]],
                    feed_dict={
                        x: mini_x,
                        y: mini_y,
                        train_mode: False
                    })
                epoch_accuracy += train_accuracy
                train_writer.add_summary(summary, step)

                if step % 200 == 0:
                    # Record summaries and test-set accuracy
                    test_accuracy, summary = sess.run(
                        [model["accuracy"], model["summary"]],
                        feed_dict={
                            x: x_test,
                            y: y_test,
                            train_mode: False
                        })
                    test_writer.add_summary(summary, step)
                    print('test accuracy at step %s: %s' %
                          (step, test_accuracy))

            epoch_loss /= number_of_batches
            epoch_accuracy /= number_of_batches

        print("Epoch: {} loss: {} train accuracy: {}".format(
            epoch_index + 1, np.squeeze(epoch_loss), epoch_accuracy))

    train_wr = tf.summary.FileWriter(FLAGS.logs_dir + '/train',
                                     graph=tf.get_default_graph())
    test_wr = tf.summary.FileWriter(FLAGS.logs_dir + '/test')

    try:
        for e in range(FLAGS.nb_epochs):
            run_train_epoch(target, FLAGS, e, train_wr, test_wr)
    finally:
        # Flush pending summaries and release the event files even when
        # training is interrupted.
        train_wr.close()
        test_wr.close()
val_data_dir = get_data_path( dataset_name = 'artem/artem-tiny-imagenet', local_root = '/Users/artem/Documents/Scratch/tiny_imagenet/', local_repo = 'tiny-imagenet-200', path = 'val/for_keras' ) models_dir = get_data_path( dataset_name = 'artem/artem-tiny-imagenet', local_root = '/Users/artem/Documents/Scratch/tiny_imagenet/', local_repo = '', path = 'models' ) log_dir = get_logs_path('/Users/artem/Documents/Scratch/tiny_imagenet/logs/') def train(): # # Data Preparation # train_datagen = ImageDataGenerator( rescale = 1. / 255, shear_range = 0.2, zoom_range = 0.2, horizontal_flip = True ) val_datagen = ImageDataGenerator( rescale = 1. / 255.
def parse_args():
    """Parse command-line options for the self-steering car trainer.

    Cluster options default to the ``JOB_NAME``, ``TASK_INDEX``,
    ``PS_HOSTS`` and ``WORKER_HOSTS`` environment variables. After parsing:

    * ``opts.train_data`` is the glob of training ``.h5`` files, taken from
      ``--absolute_data_path`` when given, otherwise resolved through
      ``get_data_path``;
    * ``opts.log_dir`` is resolved through ``get_logs_path``;
    * ``opts.ps_hosts`` / ``opts.worker_hosts`` become lists (empty when the
      option is empty).

    Returns:
        The populated ``argparse`` namespace.
    """
    env = os.environ
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description=(
            'Trains a self-steering car model in single-instance or '
            'distributed mode. For distributed mode, the script will use '
            'few environment variables as defaults: JOB_NAME, TASK_INDEX, '
            'PS_HOSTS, and WORKER_HOSTS. These environment variables will '
            'be available on distributed Tensorflow jobs on Clusterone '
            'platform by default. If running this locally, you will need '
            'to set these environment variables or pass them in as '
            'arguments (i.e. python main.py --job_name worker --task_index '
            '0 --worker_hosts "localhost:2222,localhost:2223" --ps_hosts '
            '"localhost:2224"). If these are not set, the script will run '
            'in non-distributed (single instance) mode.'))
    add = parser.add_argument

    # Configuration for distributed task
    add('--job_name', type=str, default=env.get('JOB_NAME', None),
        choices=['worker', 'ps'],
        help='Task type for the node in the distributed cluster. Worker-0 will be set as master.')
    add('--task_index', type=int, default=env.get('TASK_INDEX', 0),
        help='Worker task index, should be >= 0. task_index=0 is the chief worker.')
    add('--ps_hosts', type=str, default=env.get('PS_HOSTS', ''),
        help='Comma-separated list of hostname:port pairs.')
    add('--worker_hosts', type=str, default=env.get('WORKER_HOSTS', ''),
        help='Comma-separated list of hostname:port pairs.')

    # Experiment related parameters
    add('--local_data_root', type=str, default=os.path.abspath('./data/'),
        help='Path to dataset. This path will be /data on Clusterone.')
    add('--local_log_root', type=str, default=os.path.abspath('./logs/'),
        help='Path to store logs and checkpoints. This path will be /logs on Clusterone.')
    add('--data_subpath', type=str, default='',
        help='Which sub-directory the data will sit inside local_data_root (locally) '
             + 'or /data/ (on Clusterone)')
    add('--absolute_data_path', type=str, default=None,
        help='Using this will ignore other data path arguments.')

    # Model params
    add('--dropout_rate1', type=float, default=0.2,
        help='Dropout rate after the convolutional layers.')
    add('--dropout_rate2', type=float, default=0.5,
        help='Dropout rate after the dense layer.')
    add('--fc_dim', type=int, default=512,
        help='Number of dimensions in the dense layer.')
    add('--nogood', action='store_true',
        help='Ignore "goods" filters')
    add('--learning_rate', type=float, default=0.0001,
        help='Initial learning rate used in Adam optimizer.')
    add('--learning_decay', type=float, default=0.0001,
        help='Exponential decay rate of the learning rate per step.')

    # Training params
    add('--batch_size', type=int, default=64,
        help='Batch size to use during training and evaluation.')
    add('--max_steps', type=int, default=10000,
        help='Max number of steps to train for.')
    add('--verbosity', type=str, default='INFO',
        choices=['CRITICAL', 'ERROR', 'WARN', 'INFO', 'DEBUG'],
        help='TF logging level. To log intermediate results, set this to INFO or DEBUG.')
    add('--num_threads', type=int, default=1,
        help='Number of threads to use to prepare data')
    add('--max_ckpts', type=int, default=2,
        help='Maximum number of checkpoints to keep')
    add('--ckpt_steps', type=int, default=100,
        help='How frequently to save a model checkpoint')
    add('--save_summary_steps', type=int, default=10,
        help='How frequently to save TensorBoard summaries')
    add('--log_step_count_steps', type=int, default=10,
        help='How frequently to log loss & global steps/s')
    add('--eval_secs', type=int, default=60,
        help='How frequently to run evaluation step. '
             + 'By default, there is no evaluation dataset, thus effectively no evaluation.')

    # Parse args
    opts = parser.parse_args()

    # An explicit absolute path overrides the Clusterone path resolution.
    if opts.absolute_data_path is not None:
        opts.train_data = os.path.join(opts.absolute_data_path,
                                       'camera/training/*.h5')
    else:
        opts.train_data = get_data_path(dataset_name='*/*',
                                        local_root=opts.local_data_root,
                                        local_repo=opts.data_subpath,
                                        path='camera/training/*.h5')

    opts.log_dir = get_logs_path(root=opts.local_log_root)

    # Turn the comma-separated host strings into lists; an empty option
    # yields an empty list.
    if opts.ps_hosts:
        opts.ps_hosts = opts.ps_hosts.split(',')
    else:
        opts.ps_hosts = []
    if opts.worker_hosts:
        opts.worker_hosts = opts.worker_hosts.split(',')
    else:
        opts.worker_hosts = []

    return opts