def __init__(self, should_raise):
    # The links in this refcycle from Thread back to self
    # should be cleaned up when the thread completes.
    self.should_raise = should_raise
    self.thread = threading.Thread(target=self._run,
                                   args=(self,),
                                   kwargs={'yet_another': self})
    self.thread.start()
def submit(self, fn, *args, **kwargs):
    t = threading2.Thread(target=fn, group=self._group,
                          args=args, kwargs=kwargs)
    t.daemon = self._daemonize
    self._threads.append(t)
    t.start()
def test_enumerate_after_join(self):
    # Try hard to trigger #1703448: a thread is still returned in
    # threading.enumerate() after it has been join()ed.
    enum = threading.enumerate
    old_interval = sys.getcheckinterval()
    try:
        for i in xrange(1, 100):
            # Try a couple times at each thread-switching interval
            # to get more interleavings.
            sys.setcheckinterval(i // 5)
            t = threading.Thread(target=lambda: None)
            t.start()
            t.join()
            l = enum()
            self.assertFalse(t in l,
                "#1703448 triggered after %d trials: %s" % (i, l))
    finally:
        sys.setcheckinterval(old_interval)
def test_daemonize_active_thread(self):
    thread = threading.Thread()
    thread.start()
    self.assertRaises(RuntimeError, setattr, thread, "daemon", True)
def test_joining_inactive_thread(self):
    thread = threading.Thread()
    self.assertRaises(RuntimeError, thread.join)
def test_start_thread_again(self):
    thread = threading.Thread()
    thread.start()
    self.assertRaises(RuntimeError, thread.start)
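# A minimal standalone sketch (not part of the test suite above, names are
# illustrative only) showing the threading.Thread lifecycle rules that the
# tests above assert: join() requires a started thread, the daemon flag is
# frozen once a thread has started, and start() may only be called once.
import threading

def _thread_lifecycle_demo():
    t = threading.Thread(target=lambda: None)
    try:
        t.join()                 # joining a never-started thread
    except RuntimeError:
        print("join() before start() raises RuntimeError")
    t.start()
    try:
        t.daemon = True          # daemon status of an active thread
    except RuntimeError:
        print("setting daemon on an active thread raises RuntimeError")
    t.join()
    try:
        t.start()                # threads can only be started once
    except RuntimeError:
        print("calling start() twice raises RuntimeError")

if __name__ == "__main__":
    _thread_lifecycle_demo()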
def train(target, dataset, cluster_spec):
  """Train Inception on a dataset for a number of steps."""
  # Number of workers and parameter servers are inferred from the workers and ps
  # hosts string.
  num_workers = len(cluster_spec.as_dict()['worker'])
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])
  # If no value is given, num_replicas_to_aggregate defaults to be the number of
  # workers.
  if FLAGS.num_replicas_to_aggregate == -1:
    num_replicas_to_aggregate = num_workers
  else:
    num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

  # Both should be greater than 0 in a distributed training.
  assert num_workers > 0 and num_parameter_servers > 0, (
      'num_workers and num_parameter_servers must be > 0.')

  # Choose worker 0 as the chief. Note that any worker could be the chief
  # but there should be only one chief.
  is_chief = (FLAGS.task_id == 0)
  # batchSizeManager = BatchSizeManager(32, 4)

  # Ops are assigned to worker by default.
  tf.logging.info('num_parameter_servers: ' + str(num_parameter_servers))
  partitioner = tf.fixed_size_partitioner(num_parameter_servers, 0)

  device_setter = tf.train.replica_device_setter(ps_tasks=num_parameter_servers)
  slim = tf.contrib.slim
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    with tf.variable_scope('root', partitioner=partitioner):
      # Variables and their related init/assign ops are assigned to ps.
      # with slim.arg_scope(
      #     [slim.variables.variable, slim.variables.global_step],
      #     device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
      with tf.device(device_setter):
        # Create a variable to count the number of train() calls. This equals
        # the number of updates applied to the variables.
        # global_step = slim.variables.global_step()
        global_step = tf.Variable(0, trainable=False)

        # Calculate the learning rate schedule.
        batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        # Decay steps need to be divided by the number of replicas to aggregate.
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                          num_replicas_to_aggregate)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate * num_workers,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)
        # Add a summary to track the learning rate.
        # tf.summary.scalar('learning_rate', lr)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.RMSPropOptimizer(lr,
                                        RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        images, labels = image_processing.distorted_inputs(
            dataset,
            batch_size,
            num_preprocess_threads=FLAGS.num_preprocess_threads)
        print(images.get_shape())
        print(labels.get_shape())

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        # num_classes = dataset.num_classes() + 1
        num_classes = dataset.num_classes()
        print(num_classes)
        # logits = inception.inference(images, num_classes, for_training=True)
        network_fn = nets_factory.get_network_fn('inception_v3',
                                                 num_classes=num_classes)
        (logits, _) = network_fn(images)
        print(logits.get_shape())
        # Add classification loss.
        # inception.loss(logits, labels, batch_size)

        # Gather all of the losses including regularization losses.
        labels = tf.one_hot(labels, 1000, 1, 0)
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels)
        # losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
        # losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
            [tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        # total_loss = tf.add_n(losses, name='total_loss')

        if is_chief:
          # Compute the moving average of all individual losses and the
          # total loss.
          loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
          loss_averages_op = loss_averages.apply(losses + [total_loss])

          # Attach a scalar summary to all individual losses and the total loss;
          # do the same for the averaged version of the losses.
          # for l in losses + [total_loss]:
          #   loss_name = l.op.name
          #   # Name each loss as '(raw)' and name the moving average version of
          #   # the loss as the original loss name.
          #   tf.summary.scalar(loss_name + ' (raw)', l)
          #   tf.summary.scalar(loss_name, loss_averages.average(l))

          # Add dependency to compute loss_averages.
          with tf.control_dependencies([loss_averages_op]):
            total_loss = tf.identity(total_loss)

        # Track the moving averages of all trainable variables.
        # Note that we maintain a 'double-average' of the BatchNormalization
        # global statistics.
        # This is not needed when the number of replicas is small but important
        # for synchronous distributed training with tens of workers/replicas.
        exp_moving_averager = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())

        # Add histograms for model variables.
        # for var in variables_to_average:
        #   tf.summary.histogram(var.op.name, var)

        # Create synchronous replica optimizer.
        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=num_replicas_to_aggregate,
            total_num_replicas=num_workers,
            variable_averages=exp_moving_averager,
            variables_to_average=variables_to_average)

        # batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
        # assert batchnorm_updates, 'Batchnorm updates are missing'
        # batchnorm_updates_op = tf.group(*batchnorm_updates)
        # # Add dependency to compute batchnorm_updates.
        # with tf.control_dependencies([batchnorm_updates_op]):
        #   total_loss = tf.identity(total_loss)

        # Compute gradients with respect to the loss, then rescale them by the
        # ratio of the fed batch size to the configured batch size.
        # grads = opt.compute_gradients(total_loss)
        grads0 = opt.compute_gradients(total_loss)
        grads = [(tf.scalar_mul(tf.cast(batch_size / FLAGS.batch_size, tf.float32),
                                grad), var) for grad, var in grads0]

        # Add histograms for gradients.
        # for grad, var in grads:
        #   if grad is not None:
        #     tf.summary.histogram(var.op.name + '/gradients', grad)

        apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

        with tf.control_dependencies([apply_gradients_op]):
          train_op = tf.identity(total_loss, name='train_op')

        # Get chief queue_runners and init_tokens, which are used to synchronize
        # replicas. More details can be found in SyncReplicasOptimizer.
        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()

        # Create a saver.
        saver = tf.train.Saver()

        # Build the summary operation based on the TF collection of Summaries.
        # summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init_op = tf.global_variables_initializer()

        # We run the summaries in the same thread as the training operations by
        # passing in None for summary_op to avoid a summary_thread being started.
        # Running summaries and training operations in parallel could run out of
        # GPU memory.
        sv = tf.train.Supervisor(is_chief=is_chief,
                                 logdir=FLAGS.train_dir,
                                 init_op=init_op,
                                 summary_op=None,
                                 global_step=global_step,
                                 recovery_wait_secs=1,
                                 saver=None,
                                 save_model_secs=FLAGS.save_interval_secs)
        tf.logging.info('%s Supervisor' % datetime.now())

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)

        # Get a session.
        sess = sv.prepare_or_wait_for_session(target, config=sess_config)

        # Start the queue runners.
        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        tf.logging.info('Started %d queues for processing input data.',
                        len(queue_runners))

        if is_chief:
          sv.start_queue_runners(sess, chief_queue_runners)
          sess.run(init_tokens_op)

        # Train, checking for NaNs. Concurrently run the summary operation at a
        # specified interval. Note that the summary_op and train_op never run
        # simultaneously in order to prevent running out of GPU memory.
        # next_summary_time = time.time() + FLAGS.save_summaries_secs
        step = 0
        time0 = time.time()
        batch_size_num = 1
        while not sv.should_stop():
          try:
            start_time = time.time()

            batch_size_num = 32
            batch_size_num = 2 * int(step / 5) + 16
            # batch_size_num = int((int(step)/3*10)) % 100000 + 1
            # if step < 5:
            #   batch_size_num = 32
            #   batch_size_num = (batch_size_num) % 64 + 1
            # else:
            #   batch_size_num = 80

            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()

            my_images, loss_value, step = sess.run(
                [images, train_op, global_step],
                feed_dict={batch_size: batch_size_num},
                options=run_options,
                run_metadata=run_metadata)
            b = time.time()
            # assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step > FLAGS.max_steps:
              break

            duration = time.time() - start_time
            thread = threading2.Thread(target=get_computation_time,
                                       name="get_computation_time",
                                       args=(run_metadata.step_stats, step,))
            thread.start()

            # tl = timeline.Timeline(run_metadata.step_stats)
            # last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue')
            c0 = time.time()
            # batch_size_num = batchSizeManager.dictate_new_batch_size(FLAGS.task_id, last_batch_time)
            # batch_size_num = rpcClient.update_batch_size(FLAGS.task_id, last_batch_time, available_cpu, available_memory, step, batch_size_num)
            # ctf = tl.generate_chrome_trace_format()
            # with open("timeline.json", 'a') as f:
            #   f.write(ctf)

            if step % 1 == 0:
              examples_per_sec = FLAGS.batch_size / float(duration)
              c = time.time()
              tf.logging.info("time statistics"
                              + " - train_time: " + str(b - start_time)
                              + " - get_batch_time: " + str(c0 - b)
                              + " - get_bs_time: " + str(c - c0)
                              + " - accum_time: " + str(c - time0)
                              + " - batch_size: " + str(batch_size_num))
              format_str = ('Worker %d: %s: step %d, loss = %.2f '
                            '(%.1f examples/sec; %.3f sec/batch)')
              tf.logging.info(format_str %
                              (FLAGS.task_id, datetime.now(), step, loss_value,
                               examples_per_sec, duration))

              # Determine if the summary_op should be run on the chief worker.
              # if is_chief and next_summary_time < time.time():
              #   tf.logging.info('Running Summary operation on the chief.')
              #   summary_str = sess.run(summary_op)
              #   sv.summary_computed(sess, summary_str)
              #   tf.logging.info('Finished running Summary operation.')
              #   # Determine the next time for running the summary.
              #   next_summary_time += FLAGS.save_summaries_secs
          except:
            if is_chief:
              tf.logging.info('Chief got exception while running!')
            raise

        # Stop the supervisor. This also waits for service threads to finish.
        sv.stop()
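# A minimal sketch (an assumption, not part of the original file) of how the
# train() function above could be driven: build a tf.train.ClusterSpec and a
# tf.train.Server from the ps/worker host flags and hand the server target to
# train(). FLAGS.ps_hosts, FLAGS.worker_hosts, FLAGS.job_name, FLAGS.task_id
# and the ImagenetData dataset helper are assumed to exist in the surrounding
# project.
def _run_distributed_training():
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
  server = tf.train.Server(cluster_spec,
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_id)
  if FLAGS.job_name == 'ps':
    server.join()  # parameter servers only host variables and block here
  else:
    dataset = ImagenetData(subset='train')  # hypothetical dataset helper
    train(server.target, dataset, cluster_spec)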
def train():
  global updated_batch_size_num
  global passed_info
  global shall_update
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print('PS hosts are: %s' % ps_hosts)
  print('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_id)
  # batchSizeManager = BatchSizeManager(FLAGS.batch_size, len(worker_hosts))

  if FLAGS.job_name == 'ps':
    # rpcServer = batchSizeManager.create_rpc_server(ps_hosts[0].split(':')[0])
    # rpcServer.serve()
    server.join()

  # rpcClient = batchSizeManager.create_rpc_client(ps_hosts[0].split(':')[0])
  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts))
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    partitioner = tf.fixed_size_partitioner(len(ps_hosts), axis=0)
    with tf.variable_scope('root', partitioner=partitioner):
      with tf.device(device_setter):
        global_step = tf.Variable(0, trainable=False)

        decay_steps = 50000 * 350.0 / FLAGS.batch_size
        batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
        images, labels = cifar10.distorted_inputs(batch_size)
        # print(str(tf.shape(images)) + str(tf.shape(labels)))
        re = tf.shape(images)[0]
        inputs = tf.reshape(images, [-1, _HEIGHT, _WIDTH, _DEPTH])
        # labels = tf.reshape(labels, [-1, _NUM_CLASSES])
        labels = tf.one_hot(labels, 10, 1, 0)

        # network_fn = nets_factory.get_network_fn('inception_v3', num_classes=10)
        network_fn = nets_factory.get_network_fn('vgg_16', num_classes=10)
        (logits, _) = network_fn(inputs)
        print(logits.get_shape())
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels)
        # logits = cifar10.inference(images, batch_size)
        # loss = cifar10.loss(logits, labels, batch_size)
        loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
            [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE * len(worker_hosts),
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        opt = tf.train.GradientDescentOptimizer(lr)

        # Track the moving averages of all trainable variables.
        exp_moving_averager = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())

        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=len(worker_hosts),
            # replica_id=FLAGS.task_id,
            total_num_replicas=len(worker_hosts))
            # variable_averages=exp_moving_averager,
            # variables_to_average=variables_to_average)

        # Compute gradients with respect to the loss.
        # grads0 = opt.compute_gradients(loss)
        # grads = list()
        # for grad, var in grads0:
        #   grads.append((tf.scalar_mul(tf.cast(batch_size/FLAGS.batch_size, tf.float32), grad), var))
        grads0 = opt.compute_gradients(loss)
        grads = [(tf.scalar_mul(tf.cast(batch_size / FLAGS.batch_size, tf.float32),
                                grad), var) for grad, var in grads0]
        # grads = tf.map_fn(lambda x: (tf.scalar_mul(tf.cast(batch_size/FLAGS.batch_size, tf.float32), x[0]), x[1]), grads0)
        # grads = tf.while_loop(lambda x: x, grads0)
        # grads = opt.compute_gradients(loss)

        apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

        with tf.control_dependencies([apply_gradients_op]):
          train_op = tf.identity(loss, name='train_op')

        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()

        # saver = tf.train.Saver()
        sv = tf.train.Supervisor(
            is_chief=is_chief,
            logdir=FLAGS.train_dir,
            init_op=tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer()),
            summary_op=None,
            global_step=global_step,
            # saver=saver,
            saver=None,
            recovery_wait_secs=1,
            save_model_secs=60)
        tf.logging.info('%s Supervisor' % datetime.now())
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        sess_config.gpu_options.allow_growth = True

        # Get a session.
        sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
        # sess.run(tf.global_variables_initializer())

        # Start the queue runners.
        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        sv.start_queue_runners(sess, chief_queue_runners)
        sess.run(init_tokens_op)

        # Train CIFAR-10 for a number of steps.
        # available_cpu = psutil.cpu_percent(interval=None)
        # thread = threading2.Thread(target=local_update_batch_size,
        #                            name="update_batch_size_thread",
        #                            args=(rpcClient, FLAGS.task_id,))
        # thread.start()
        time0 = time.time()
        batch_size_num = FLAGS.batch_size
        loss_list = []
        threshold = 0.95
        for step in range(FLAGS.max_steps):
          start_time = time.time()
          run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
          run_metadata = tf.RunMetadata()

          batch_size_num = 128
          num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size_num
          decay_steps_num = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

          _, loss_value, gs = sess.run(
              [train_op, loss, global_step],
              feed_dict={batch_size: batch_size_num},
              options=run_options,
              run_metadata=run_metadata)
          # _, loss_value, gs = sess.run([train_op, loss, global_step], feed_dict={batch_size: batch_size_num})
          b = time.time()

          # tl = timeline.Timeline(run_metadata.step_stats)
          # ctf = tl.generate_chrome_trace_format()
          # last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue')
          thread = threading2.Thread(target=get_computation_time,
                                     name="get_computation_time",
                                     args=(run_metadata.step_stats, step,))
          thread.start()

          c0 = time.time()
          if step % 1 == 0:
            duration = time.time() - start_time
            num_examples_per_step = batch_size_num
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = float(duration)
            c = time.time()
            # tf.logging.info("time statistics - batch_process_time: " + str(last_batch_time)
            #                 + " - train_time: " + str(b - start_time)
            #                 + " - get_batch_time: " + str(c0 - b)
            #                 + " - get_bs_time: " + str(c - c0)
            #                 + " - accum_time: " + str(c - time0))
            format_str = ("time: " + str(time.time()) +
                          '; %s: step %d (global_step %d), loss = %.2f '
                          '(%.1f examples/sec; %.3f sec/batch)')
            tf.logging.info(format_str % (datetime.now(), step, gs, loss_value,
                                          examples_per_sec, sec_per_batch))
            loss_list.append(loss_value)
            # Terminate once the two most recent losses have both dropped
            # below threshold * initial loss.
            if (step > 10 and loss_list[-1] < loss_list[0] * threshold
                and loss_list[-2] < loss_list[0] * threshold):
              tf.logging.info("Loss dropped below %.0f%% of the initial loss; "
                              "terminating at step %d" % (threshold * 100, step))
              exit()
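# A small, self-contained sketch (hypothetical helper, not in the original
# code) of the early-termination rule used in the loop above: stop once the
# two most recent losses have both dropped below 95% of the first recorded
# loss. Shown as plain Python so the criterion can be unit-tested in isolation.
def should_terminate(loss_list, min_steps=10, threshold=0.95):
  """Return True when the last two losses fall below threshold * initial loss."""
  if len(loss_list) <= min_steps:
    return False
  cutoff = loss_list[0] * threshold
  return loss_list[-1] < cutoff and loss_list[-2] < cutoff

# Example: should_terminate([4.0] * 10 + [3.7, 3.6]) returns True, since both
# 3.7 and 3.6 are below 0.95 * 4.0 = 3.8.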
        if payload["_action"] == "new":
            _allocate_new_worker(payload["_uuid"], payload)
        elif payload["_action"] == "stopped":
            _allocate_stopped_worker(payload)
        elif payload["_action"] == "snapshotted":
            _allocate_snapshot_worker(payload["_uuid"], payload)
        elif payload["_action"] == "manage":
            _manage_stack_worker(payload["_uuid"], payload["_manage"],
                                 payload["_key"])
        time.sleep(2)

_new_queue = Queue.Queue()
_new_stack_worker = threading2.Thread(target=_stack_worker)
_new_stack_worker.daemon = True
_new_stack_worker.start()

def _allocate_backend_from_snapshot(cluster_uuid, payload, key_name):
    """
    Allocate the backend from a snapshot.
    """
    snapshot_uuid = payload['_file']
    backends = docker.fetch_snapshot_backend(snapshot_uuid)

    if backends:
        return _allocate_backend(cluster_uuid=cluster_uuid,
                                 payload=None,
                                 key_name=key_name,