Example #1
 def __init__(self, should_raise):
     # The links in this refcycle from Thread back to self
     # should be cleaned up when the thread completes.
     self.should_raise = should_raise
     self.thread = threading.Thread(target=self._run,
                                    args=(self, ),
                                    kwargs={'yet_another': self})
     self.thread.start()
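The cycle here is deliberate: self holds the Thread, and the Thread holds self again through both args and kwargs. CPython's Thread.run() deletes its target/args/kwargs references in a finally clause once the callable returns, which is what makes the cycle collectible after the thread finishes. A minimal sketch of that behavior, assuming a trivial _run and using weakref to observe collection (the names below are illustrative, not from the original):

import gc
import threading
import weakref

class CycleOwner(object):
    """Mirrors the snippet above: a refcycle through the Thread's args."""
    def __init__(self):
        self.thread = threading.Thread(target=self._run, args=(self,))
        self.thread.start()

    def _run(self, owner):
        pass  # the thread finishing is what triggers the cleanup

owner = CycleOwner()
owner.thread.join()
ref = weakref.ref(owner)
del owner
gc.collect()
assert ref() is None  # collectible once run() dropped target/args/kwargs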
Example #2
 def submit(self, fn, *args, **kwargs):
     t = threading2.Thread(target=fn,
                           group=self._group,
                           args=args,
                           kwargs=kwargs)
     t.daemon = self._daemonize
     self._threads.append(t)
     t.start()
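Note that this submit() starts the thread but neither returns it nor joins it; callers can only reach the workers through self._threads. A minimal sketch of the same pattern on the stdlib threading module (threading2 adds real thread groups, which stdlib lacks), with a return value and a drain helper added; the class and method names are illustrative:

import threading

class SimplePool(object):
    """Illustrative pool-like wrapper mirroring the submit() pattern above."""
    def __init__(self, daemonize=False):
        self._daemonize = daemonize
        self._threads = []

    def submit(self, fn, *args, **kwargs):
        t = threading.Thread(target=fn, args=args, kwargs=kwargs)
        t.daemon = self._daemonize
        self._threads.append(t)
        t.start()
        return t  # returning the thread lets callers join() it directly

    def join_all(self):
        for t in self._threads:
            t.join()

def work(n):
    total = sum(range(n))  # stand-in for real work

pool = SimplePool()
pool.submit(work, 1000)
pool.join_all()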
Example #3
 def test_enumerate_after_join(self):
     # Try hard to trigger #1703448: a thread is still returned in
     # threading.enumerate() after it has been join()ed.
     enum = threading.enumerate
     old_interval = sys.getcheckinterval()
     try:
         for i in xrange(1, 100):
             # Try a couple times at each thread-switching interval
             # to get more interleavings.
             sys.setcheckinterval(i // 5)
             t = threading.Thread(target=lambda: None)
             t.start()
             t.join()
             l = enum()
             self.assertFalse(
                 t in l, "#1703448 triggered after %d trials: %s" % (i, l))
     finally:
         sys.setcheckinterval(old_interval)
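This test is written for Python 2: xrange and sys.setcheckinterval() (a bytecode-count switch interval) no longer exist in Python 3, where the analogue is sys.setswitchinterval(), measured in seconds. A rough, hedged Python 3 translation of the same check:

import sys
import threading

def test_enumerate_after_join_py3():
    old_interval = sys.getswitchinterval()
    try:
        for i in range(1, 100):
            # Switch intervals are seconds here, not bytecode counts.
            sys.setswitchinterval(i * 0.0002)
            t = threading.Thread(target=lambda: None)
            t.start()
            t.join()
            l = threading.enumerate()
            assert t not in l, "#1703448 triggered after %d trials: %s" % (i, l)
    finally:
        sys.setswitchinterval(old_interval)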
Example #4
 def test_daemonize_active_thread(self):
     thread = threading.Thread()
     thread.start()
     self.assertRaises(RuntimeError, setattr, thread, "daemon", True)
Example #5
 def test_joining_inactive_thread(self):
     thread = threading.Thread()
     self.assertRaises(RuntimeError, thread.join)
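join() on a never-started thread raises immediately because there is nothing to wait for; Thread objects are strictly single-use, which is also why the double-start test that follows raises. A small sketch of the valid lifecycle:

import threading

def work():
    pass

t = threading.Thread(target=work)
t.start()   # must precede join(); joining an unstarted thread raises RuntimeError
t.join()    # returns once work() has finished

t2 = threading.Thread(target=work)  # a fresh object; t itself cannot be restarted
t2.start()
t2.join()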
Example #6
 def test_start_thread_again(self):
     thread = threading.Thread()
     thread.start()
     self.assertRaises(RuntimeError, thread.start)
Example #7
def train(target, dataset, cluster_spec):
  """Train Inception on a dataset for a number of steps."""
  # The numbers of workers and parameter servers are inferred from the worker
  # and ps host strings.
  num_workers = len(cluster_spec.as_dict()['worker'])
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])
  # If no value is given, num_replicas_to_aggregate defaults to the number of
  # workers.
  if FLAGS.num_replicas_to_aggregate == -1:
    num_replicas_to_aggregate = num_workers
  else:
    num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

  # Both must be greater than 0 in distributed training.
  assert num_workers > 0 and num_parameter_servers > 0, (
      'num_workers and num_parameter_servers must be > 0.')

  # Choose worker 0 as the chief. Note that any worker could be the chief
  # but there should be only one chief.
  is_chief = (FLAGS.task_id == 0)

  #batchSizeManager = BatchSizeManager(32, 4)

  # Ops are assigned to worker by default.
  tf.logging.info('num_parameter_servers: %d' % num_parameter_servers)
  partitioner = tf.fixed_size_partitioner(num_parameter_servers, 0)

  device_setter = tf.train.replica_device_setter(ps_tasks=num_parameter_servers)
  slim = tf.contrib.slim
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
   with tf.variable_scope('root', partitioner=partitioner):
     # Variables and their related init/assign ops are assigned to ps.
#    with slim.arg_scope(
#        [slim.variables.variable, slim.variables.global_step],
#        device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
    with tf.device(device_setter):
#	partitioner=partitioner):
      # Create a variable to count the number of train() calls. This equals the
      # number of updates applied to the variables.
#      global_step = slim.variables.global_step()
      global_step = tf.Variable(0, trainable=False)

      # Calculate the learning rate schedule.

      batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
      num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                               FLAGS.batch_size)
      # Decay steps need to be divided by the number of replicas to aggregate.
      decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                        num_replicas_to_aggregate)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(FLAGS.initial_learning_rate*num_workers,
                                      global_step,
                                      decay_steps,
                                      FLAGS.learning_rate_decay_factor,
                                      staircase=True)
      # Add a summary to track the learning rate.
#      tf.summary.scalar('learning_rate', lr)

      # Create an optimizer that performs gradient descent.
      opt = tf.train.RMSPropOptimizer(lr,
                                      RMSPROP_DECAY,
                                      momentum=RMSPROP_MOMENTUM,
                                      epsilon=RMSPROP_EPSILON)

      images, labels = image_processing.distorted_inputs(
          dataset,
          batch_size,
          num_preprocess_threads=FLAGS.num_preprocess_threads)
      print(images.get_shape())
      print(labels.get_shape())

      # Number of classes in the Dataset label set plus 1.
      # Label 0 is reserved for an (unused) background class.
#      num_classes = dataset.num_classes() + 1
      num_classes = dataset.num_classes()
      print(num_classes)
#      logits = inception.inference(images, num_classes, for_training=True)
      network_fn = nets_factory.get_network_fn('inception_v3', num_classes=num_classes)
      (logits, _) = network_fn(images)
      print(logits.get_shape())
      # Add classification loss.
#      inception.loss(logits, labels, batch_size)

      # Gather all of the losses including regularization losses.
      labels = tf.one_hot(labels, 1000, 1, 0)  # depth hardcoded to the 1000 ImageNet classes
      cross_entropy = tf.losses.softmax_cross_entropy(
          logits=logits,
          onehot_labels=labels)
#      losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
#      losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
      losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
      total_loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
          [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

#      total_loss = tf.add_n(losses, name='total_loss')

      if is_chief:
        # Compute the moving average of all individual losses and the
        # total loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])

        # Attach a scalar summary to all individual losses and the total loss;
        # do the same for the averaged version of the losses.
#        for l in losses + [total_loss]:
#          loss_name = l.op.name
          # Name each loss as '(raw)' and name the moving average version of the
          # loss as the original loss name.
#          tf.summary.scalar(loss_name + ' (raw)', l)
#          tf.summary.scalar(loss_name, loss_averages.average(l))

        # Add dependency to compute loss_averages.
        with tf.control_dependencies([loss_averages_op]):
          total_loss = tf.identity(total_loss)

      # Track the moving averages of all trainable variables.
      # Note that we maintain a 'double-average' of the BatchNormalization
      # global statistics.
      # This is not needed when the number of replicas is small, but it is
      # important for synchronous distributed training with tens of workers/replicas.
      exp_moving_averager = tf.train.ExponentialMovingAverage(
          MOVING_AVERAGE_DECAY, global_step)

      variables_to_average = (
          tf.trainable_variables() + tf.moving_average_variables())

      # Add histograms for model variables.
#      for var in variables_to_average:
#        tf.summary.histogram(var.op.name, var)

      # Create synchronous replica optimizer.
      opt = tf.train.SyncReplicasOptimizer(
          opt,
          replicas_to_aggregate=num_replicas_to_aggregate,
          total_num_replicas=num_workers,
          variable_averages=exp_moving_averager,
          variables_to_average=variables_to_average)

#      batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
#      assert batchnorm_updates, 'Batchnorm updates are missing'
#      batchnorm_updates_op = tf.group(*batchnorm_updates)
#      # Add dependency to compute batchnorm_updates.
#      with tf.control_dependencies([batchnorm_updates_op]):
#        total_loss = tf.identity(total_loss)

      # Compute gradients with respect to the loss, rescaled by the ratio of
      # the dynamic batch size to the nominal one.
      # grads = opt.compute_gradients(total_loss)
      grads0 = opt.compute_gradients(total_loss)
      grads = [(tf.scalar_mul(tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var)
               for grad, var in grads0]

      # Add histograms for gradients.
#      for grad, var in grads:
#        if grad is not None:
#          tf.summary.histogram(var.op.name + '/gradients', grad)

      apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

      with tf.control_dependencies([apply_gradients_op]):
        train_op = tf.identity(total_loss, name='train_op')

      # Get the chief queue_runners and init_tokens, which are used to
      # synchronize replicas. More details can be found in SyncReplicasOptimizer.
      chief_queue_runners = [opt.get_chief_queue_runner()]
      init_tokens_op = opt.get_init_tokens_op()

      # Create a saver.
      saver = tf.train.Saver()

      # Build the summary operation based on the TF collection of Summaries.
#      summary_op = tf.summary.merge_all()

      # Build an initialization operation to run below.
      init_op = tf.global_variables_initializer()

      # We run the summaries in the same thread as the training operations by
      # passing in None for summary_op to avoid a summary_thread being started.
      # Running summaries and training operations in parallel could run out of
      # GPU memory.
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=init_op,
                               summary_op=None,
                               global_step=global_step,
                               recovery_wait_secs=1,
                               saver=None,
                               save_model_secs=FLAGS.save_interval_secs)

      tf.logging.info('%s Supervisor' % datetime.now())

      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)

      # Get a session.
      sess = sv.prepare_or_wait_for_session(target, config=sess_config)

      # Start the queue runners.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      tf.logging.info('Started %d queues for processing input data.',
                      len(queue_runners))

      if is_chief:
        sv.start_queue_runners(sess, chief_queue_runners)
        sess.run(init_tokens_op)

      # Train, checking for NaNs. Concurrently run the summary operation at a
      # specified interval. Note that the summary_op and train_op never run
      # simultaneously, in order to prevent running out of GPU memory.
#      next_summary_time = time.time() + FLAGS.save_summaries_secs
      step = 0
      time0 = time.time()
      batch_size_num = 1
      while not sv.should_stop():
        try:
          start_time = time.time()

          batch_size_num = 32  # default; immediately overridden below
          batch_size_num = 2 * int(step / 5) + 16  # grow the batch size with the step
#	   batch_size_num = int((int(step)/3*10)) % 100000 + 1
#          if step < 5:
#            batch_size_num = 32 
#          batch_size_num = (batch_size_num ) % 64 + 1
#          else:
#            batch_size_num = 80

          run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
          run_metadata = tf.RunMetadata()

          my_images, loss_value, step = sess.run(
              [images, train_op, global_step],
              feed_dict={batch_size: batch_size_num},
              options=run_options,
              run_metadata=run_metadata)
          b = time.time()
#          assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
          if step > FLAGS.max_steps:
            break
          duration = time.time() - start_time
          thread = threading2.Thread(target=get_computation_time,
                                     name="get_computation_time",
                                     args=(run_metadata.step_stats, step))
          thread.start()
#          tl = timeline.Timeline(run_metadata.step_stats)
#          last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue')
          c0 = time.time()
#          batch_size_num = batchSizeManager.dictate_new_batch_size(FLAGS.task_id, last_batch_time)
#          batch_size_num = rpcClient.update_batch_size(FLAGS.task_id, last_batch_time, available_cpu, available_memory, step, batch_size_num) 
#          ctf = tl.generate_chrome_trace_format()
#          with open("timeline.json", 'a') as f:
#            f.write(ctf)

          if step % 1 == 0:
            examples_per_sec = FLAGS.batch_size / float(duration)
            c = time.time()
            tf.logging.info("time statistics" + " - train_time: " + str(b-start_time) + " - get_batch_time: " + str(c0-b) + " - get_bs_time:  " + str(c-c0) + " - accum_time: " + str(c-time0) + " - batch_size: " + str(batch_size_num))
            format_str = ('Worker %d: %s: step %d, loss = %.2f'
                          '(%.1f examples/sec; %.3f  sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

          # Determine if the summary_op should be run on the chief worker.
#          if is_chief and next_summary_time < time.time():
#            tf.logging.info('Running Summary operation on the chief.')
#            summary_str = sess.run(summary_op)
#            sv.summary_computed(sess, summary_str)
#            tf.logging.info('Finished running Summary operation.')

            # Determine the next time for running the summary.
#            next_summary_time += FLAGS.save_summaries_secs
        except:
          if is_chief:
            tf.logging.info('Chief got exception while running!')
          raise

      # Stop the supervisor.  This also waits for service threads to finish.
      sv.stop()
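One detail worth noting in the gradient rescaling above: batch_size is an int32 tensor and FLAGS.batch_size a Python int, so batch_size / FLAGS.batch_size is evaluated with integer division before the cast, truncating the ratio (e.g. 48/32 becomes 1). A minimal NumPy sketch of the intended rescaling, dividing in floating point; the function and names are illustrative, not from the original code:

import numpy as np

def scale_gradients(grads, actual_batch, nominal_batch):
    # Divide in floating point so a ratio like 48/32 stays 1.5, not 1.
    scale = float(actual_batch) / float(nominal_batch)
    return [g * scale for g in grads]

grads = [np.ones((2, 2)), np.ones(3)]
scaled = scale_gradients(grads, actual_batch=48, nominal_batch=32)
assert np.allclose(scaled[0], 1.5)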
Example #8
def train():
    global updated_batch_size_num
    global passed_info
    global shall_update
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id)

#    batchSizeManager = BatchSizeManager(FLAGS.batch_size, len(worker_hosts))

    if FLAGS.job_name == 'ps':
#	rpcServer = batchSizeManager.create_rpc_server(ps_hosts[0].split(':')[0])
#        rpcServer.serve()
        server.join()

#    rpcClient = batchSizeManager.create_rpc_client(ps_hosts[0].split(':')[0])
    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

    device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts))
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
      partitioner = tf.fixed_size_partitioner(len(ps_hosts), axis=0)
      with tf.variable_scope('root', partitioner=partitioner):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)

            decay_steps = 50000 * 350.0 / FLAGS.batch_size
            batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
            images, labels = cifar10.distorted_inputs(batch_size)
#            print (str(tf.shape(images))+ str(tf.shape(labels)))
            re = tf.shape(images)[0]
            inputs = tf.reshape(images, [-1, _HEIGHT, _WIDTH, _DEPTH])
#            labels = tf.reshape(labels, [-1, _NUM_CLASSES])
            labels = tf.one_hot(labels, 10, 1, 0)
            #network_fn = nets_factory.get_network_fn('inception_v3', num_classes=10)
            network_fn = nets_factory.get_network_fn('vgg_16', num_classes=10)
            (logits, _) = network_fn(inputs)
            print(logits.get_shape())
            cross_entropy = tf.losses.softmax_cross_entropy(
                logits=logits, 
                onehot_labels=labels)

#            logits = cifar10.inference(images, batch_size)

#            loss = cifar10.loss(logits, labels, batch_size)
            loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
                [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

            # Decay the learning rate exponentially based on the number of steps.
            lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE*len(worker_hosts),
                                            global_step,
                                            decay_steps,
                                            LEARNING_RATE_DECAY_FACTOR,
                                            staircase=True)
            opt = tf.train.GradientDescentOptimizer(lr)

            # Track the moving averages of all trainable variables.
            exp_moving_averager = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
            variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=len(worker_hosts),
#                replica_id=FLAGS.task_id,
                total_num_replicas=len(worker_hosts))
#                variable_averages=exp_moving_averager,
#                variables_to_average=variables_to_average)

            # Compute gradients with respect to the loss.
#            grads0 = opt.compute_gradients(loss) 
#	    grads = list()
#	    for grad, var in grads0:
#		grads.append((tf.scalar_mul(tf.cast(batch_size/FLAGS.batch_size, tf.float32), grad), var))
            grads0 = opt.compute_gradients(loss)
            grads = [(tf.scalar_mul(tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var)
                     for grad, var in grads0]
            #grads = tf.map_fn(lambda x: (tf.scalar_mul(tf.cast(batch_size/FLAGS.batch_size, tf.float32), x[0]), x[1]), grads0)
            #grads = tf.while_loop(lambda x: x, grads0)

#            grads = opt.compute_gradients(loss) 

            apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(loss, name='train_op')

            chief_queue_runners = [opt.get_chief_queue_runner()]
            init_tokens_op = opt.get_init_tokens_op()

#            saver = tf.train.Saver()
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.group(tf.global_variables_initializer(),
                                                      tf.local_variables_initializer()),
                                     summary_op=None,
                                     global_step=global_step,
#                                     saver=saver,
                                     saver=None,
                                     recovery_wait_secs=1,
                                     save_model_secs=60)

            tf.logging.info('%s Supervisor' % datetime.now())
            sess_config = tf.ConfigProto(allow_soft_placement=True,
                                         log_device_placement=FLAGS.log_device_placement)
            sess_config.gpu_options.allow_growth = True

            # Get a session.
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
#            sess.run(tf.global_variables_initializer())

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)

            sv.start_queue_runners(sess, chief_queue_runners)
            sess.run(init_tokens_op)

            """Train CIFAR-10 for a number of steps."""
#            available_cpu = psutil.cpu_percent(interval=None)

#            thread = threading2.Thread(target = local_update_batch_size, name = "update_batch_size_thread", args = (rpcClient, FLAGS.task_id,))
#            thread.start()

            time0 = time.time()
            batch_size_num = FLAGS.batch_size
            loss_list = []
            threshold = 0.95  # early-termination threshold used below
            for step in range(FLAGS.max_steps):

                start_time = time.time()

                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()

                batch_size_num = 128

                num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size_num
                decay_steps_num = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

                _, loss_value, gs = sess.run(
                    [train_op, loss, global_step],
                    feed_dict={batch_size: batch_size_num},
                    options=run_options, run_metadata=run_metadata)
#                _, loss_value, gs = sess.run([train_op, loss, global_step], feed_dict={batch_size: batch_size_num})
                b = time.time()
#                tl = timeline.Timeline(run_metadata.step_stats)
##                ctf = tl.generate_chrome_trace_format()
#                last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue')
                thread = threading2.Thread(target=get_computation_time,
                                           name="get_computation_time",
                                           args=(run_metadata.step_stats, step))
                thread.start()

                c0 = time.time()

                if step % 1 == 0:  # log every step
                    duration = time.time() - start_time
                    num_examples_per_step = batch_size_num
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    c = time.time()
##                    tf.logging.info("time statistics - batch_process_time: " + str(last_batch_time) + " - train_time: " + str(b-start_time) + " - get_batch_time: " + str(c0-b) + " - get_bs_time: " + str(c-c0) + " - accum_time: " + str(c-time0))

                    format_str = ("time: " + str(time.time()) +
                                  '; %s: step %d (global_step %d), loss = %.2f '
                                  '(%.1f examples/sec; %.3f sec/batch)')
                    tf.logging.info(format_str % (datetime.now(), step, gs, loss_value,
                                                  examples_per_sec, sec_per_batch))
                    loss_list.append(loss_value)
                    # Early termination: stop once the two most recent losses both
                    # fall below threshold * the first recorded loss.
                    if (step > 10 and loss_list[-1] < loss_list[0] * threshold
                            and loss_list[-2] < loss_list[0] * threshold):
                        tf.logging.info("early-terminate at step " + str(step))
                        exit()
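The loop above hard-codes its stopping rule inline: after a warm-up of 10 steps, training exits once the two most recent losses both fall below threshold (0.95) times the first recorded loss. The same heuristic, factored out as a standalone sketch (names illustrative):

def should_terminate(loss_list, step, threshold=0.95, warmup_steps=10):
    """Illustrative refactor of the inline stopping rule above."""
    if step <= warmup_steps or len(loss_list) < 2:
        return False
    baseline = loss_list[0] * threshold
    return loss_list[-1] < baseline and loss_list[-2] < baseline

assert not should_terminate([10.0, 9.0], step=5)        # still warming up
assert should_terminate([10.0, 9.4, 9.3], step=20)      # both recent losses under 9.5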
Example #9
        if payload["_action"] == "new":
            _allocate_new_worker(payload["_uuid"], payload)
        elif payload["_action"] == "stopped":
            _allocate_stopped_worker(payload)
        elif payload["_action"] == "snapshotted":
            _allocate_snapshot_worker(payload["_uuid"], payload)
        elif payload["_action"] == "manage":
            _manage_stack_worker(payload["_uuid"], payload["_manage"],
                                 payload["_key"])

        time.sleep(2)


_new_queue = Queue.Queue()
_new_stack_worker = threading2.Thread(target=_stack_worker)
_new_stack_worker.daemon = True
_new_stack_worker.start()
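This starts the stack worker as a daemon at import time, so it dies with the interpreter rather than being shut down explicitly. A minimal stdlib sketch of the same pattern with an explicit stop event added for graceful shutdown; all names below are illustrative:

import threading
import time

_stop_event = threading.Event()

def _polling_worker():
    # Stand-in for _stack_worker: poll, dispatch, sleep, repeat.
    while not _stop_event.is_set():
        time.sleep(2)

_worker = threading.Thread(target=_polling_worker)
_worker.daemon = True   # do not block interpreter exit
_worker.start()

# For a graceful shutdown instead of relying on daemonization:
# _stop_event.set(); _worker.join()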


def _allocate_backend_from_snapshot(cluster_uuid, payload, key_name):
    """
    Allocate the backend from a snapshot. 
    """
    snapshot_uuid = payload['_file']
    backends = docker.fetch_snapshot_backend(snapshot_uuid)

    if backends:
        return _allocate_backend(cluster_uuid=cluster_uuid,
                                 payload=None,
                                 key_name=key_name,