def train():
    """Train CIFAR-10 for a number of steps.

    Builds the CIFAR-10 graph with the optimizer step split in two:
    cifar10.train_part1 computes (gradient, variable) pairs, and
    cifar10.train_part2 applies gradients fed in through placeholders.
    Both halves are then driven with tf.Session.partial_run for a fixed
    10 steps, exercising the compute-then-apply round trip in-process.

    Fixes vs. the previous revision:
      * ``xrange`` -> ``range``: the file uses Python 3 ``print(...)``
        calls, and ``xrange`` does not exist on Python 3.
      * partial_run handle reuse: a handle returned by
        ``partial_run_setup`` is single-use — every fetch may be
        retrieved at most once and every feed supplied at most once.
        The old loop reused one handle across iterations and fed the
        placeholders twice per iteration (zeros first, then the real
        gradients), both of which partial_run rejects. A fresh handle is
        now set up per step and each placeholder is fed exactly once, on
        the train_op run.
      * Removed the unused ``only_vars`` local and the stale zero-filled
        feed dict that the fixed flow no longer needs.
    """
    g1 = tf.Graph()
    with g1.as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()
        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()
        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)
        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        # (gradient, variable) pairs, not yet applied.
        grads = cifar10.train_part1(loss, global_step)
        only_gradients = [g for g, _ in grads]
        # One placeholder per gradient so computed gradients can be fed
        # back into the apply-gradients half of the graph.
        placeholder_gradients = []
        #with tf.device("/gpu:0"):
        for grad_var in grads:
            placeholder_gradients.append(
                (tf.placeholder('float', shape=grad_var[0].get_shape()),
                 grad_var[1]))
        train_op = cifar10.train_part2(global_step, placeholder_gradients)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        # The placeholders are the only feeds of the partial-run graphs.
        feeds = [p for p, _ in placeholder_gradients]
        print("Reached here", len(feeds))
        for x in feeds:
            print(x, )
        for step in range(10):
            # Handles are single-use: set up a fresh one each step.
            h = sess.partial_run_setup([only_gradients, train_op], feeds)
            # First half: compute raw gradients (needs no feeds).
            res_grads = sess.partial_run(h, only_gradients)
            # Second half: feed each gradient exactly once and apply.
            feed_dict = {}
            for i, grad in enumerate(res_grads):
                feed_dict[placeholder_gradients[i][0]] = grad
            res_train_op = sess.partial_run(h, train_op, feed_dict=feed_dict)
def train():
    """Train CIFAR-10 for a number of steps (parameter-server role).

    Builds only the variable/apply-gradients half of the CIFAR-10 graph
    (cifar10.build_graph_part2), ships the initial variable values to
    MAX_WORKERS workers over the module-level listening socket ``s``,
    then loops: read the global step, pull one gradient list from
    ``gradients_q``, apply it through placeholder feeds, and publish the
    refreshed variable values via the shared ``global_var_vals``.

    Fix vs. the previous revision: ``xrange`` -> ``range`` — ``xrange``
    does not exist on Python 3, and the sibling implementation of this
    loop elsewhere in the file already uses ``range``. The duplicated
    ``global global_var_vals`` declaration was also collapsed to one.

    NOTE(review): ``s``, ``gradients_q``, ``global_var_vals``,
    ``gpu_options``, ``MAX_WORKERS`` and ``FLAGS`` are assumed to be
    module-level — confirm against the rest of the file.
    """
    g1 = tf.Graph()
    with g1.as_default():
        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        #global_step = tf.contrib.framework.get_or_create_global_step()
        # Starts at -1 so the first increment lands on step 0.
        global_step = tf.Variable(-1, name='global_step', trainable=False,
                                  dtype=tf.int32)
        increment_global_step_op = tf.assign(global_step, global_step + 1)
        cifar10.build_graph_part2()
        # One placeholder per trainable variable; worker gradients are
        # fed through these into train_part2.
        placeholder_gradients = []
        #with tf.device("/gpu:0"):
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            placeholder_gradients.append(
                (tf.placeholder('float', shape=var.get_shape()), var))
        # Zero-filled feed so runs that only read state can execute
        # before any real gradients have arrived.
        feed_dict = {}
        i = 0
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            feed_dict[placeholder_gradients[i][0]] = np.zeros(
                placeholder_gradients[i][0].shape)
            i = i + 1
        train_op = cifar10.train_part2(global_step, placeholder_gradients)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1

            def after_run(self, run_context, run_values):
                # Emit throughput statistics every log_frequency steps.
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time
                    examples_per_sec = (FLAGS.log_frequency
                                        * FLAGS.batch_size / duration)
                    sec_per_batch = float(duration / FLAGS.log_frequency)
                    format_str = ('%s: step %d,(%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    gpu_options=gpu_options)) as mon_sess:
            global global_var_vals
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                print(v)
            # Sending the initial value of variables
            var_val = []
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                var_val.append(mon_sess.run(v, feed_dict=feed_dict))
            send_data = pickle.dumps(var_val, pickle.HIGHEST_PROTOCOL)
            global_var_vals.value = send_data
            # The pickled length goes first so the receiver knows how
            # many payload bytes to read.
            size = len(send_data)
            size = pickle.dumps(size, pickle.HIGHEST_PROTOCOL)
            for _ in range(MAX_WORKERS):
                conn, addr = s.accept()
                conn.sendall(size)
                conn.sendall(send_data)
                conn.close()
            print("Sent initial var values to workers")
            while not mon_sess.should_stop():
                val = mon_sess.run(global_step, feed_dict=feed_dict)
                #print("Iteration: ", val)
                if (val == (FLAGS.max_steps - 1)):
                    print("Global step val while stoping.")
                    sys.exit()
                # Block until some worker delivers a gradient list.
                recv_grads = gradients_q.get()
                feed_dict = {}
                for i, grad in enumerate(recv_grads):
                    feed_dict[placeholder_gradients[i][0]] = grad
                res = mon_sess.run(train_op, feed_dict=feed_dict)
                # Publish the updated variable values for the workers.
                var_val = []
                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                    var_val.append(mon_sess.run(v, feed_dict=feed_dict))
                global_var_vals.value = pickle.dumps(
                    var_val, pickle.HIGHEST_PROTOCOL)
def train():
    """Train CIFAR-10 for a number of steps."""
    # Worker-side loop: computes gradients locally, ships them (pickled,
    # length-prefixed) to a remote peer over the module-level socket `s`,
    # receives aggregated gradients back, and applies those through
    # placeholder feeds.
    g1 = tf.Graph()
    with g1.as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()
        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()
        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)
        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        # train_part1 yields (gradient, variable) pairs without applying them.
        grads = cifar10.train_part1(loss, global_step)
        only_gradients = [g for g, _ in grads]
        only_vars = [v for _, v in grads]
        # One placeholder per gradient: the externally received gradients
        # are fed through these into the apply half of the graph.
        placeholder_gradients = []
        #with tf.device("/gpu:0"):
        for grad_var in grads:
            placeholder_gradients.append(
                (tf.placeholder('float', shape=grad_var[0].get_shape()),
                 grad_var[1]))
        # Zero-filled feed used for the first run, before any gradient
        # exchange has happened.
        feed_dict = {}
        for i, grad_var in enumerate(grads):
            feed_dict[placeholder_gradients[i][0]] = np.zeros(
                placeholder_gradients[i][0].shape)
        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train_part2(global_step, placeholder_gradients)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                # Emit loss/throughput statistics every log_frequency steps.
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time
                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    gpu_options=gpu_options)) as mon_sess:
            # NOTE(review): `port` is declared global but not read or
            # written in this function — presumably used elsewhere; confirm.
            global port
            while not mon_sess.should_stop():
                gradients = mon_sess.run(only_gradients, feed_dict=feed_dict)
                # pickling the gradients
                send_data = pickle.dumps(gradients, pickle.HIGHEST_PROTOCOL)
                # finding size of pickled gradients
                to_send_size = len(send_data)
                # Sending the size of the gradients first
                send_size = pickle.dumps(to_send_size, pickle.HIGHEST_PROTOCOL)
                s.sendall(send_size)
                # sending the gradients
                s.sendall(send_data)
                # assumes the peer's pickled length header is exactly
                # 8 bytes — TODO confirm against the sending side.
                recv_size = safe_recv(8, s)
                recv_size = pickle.loads(recv_size)
                recv_data = safe_recv(recv_size, s)
                gradients2 = pickle.loads(recv_data)
                #print("Recevied gradients of size: ", len(recv_data))
                # Feed the returned gradients back in, positionally
                # matched to the placeholders.
                feed_dict = {}
                for i, grad_var in enumerate(gradients2):
                    feed_dict[placeholder_gradients[i][0]] = gradients2[i]
                    #print(gradients[i].shape)
                    #print(gradients2[i].shape)
                res = mon_sess.run(train_op, feed_dict=feed_dict)
def train():
    """Train CIFAR-10 for a number of steps."""
    # Parameter-server loop with explicit acks: sends the initial
    # variable values to MAX_WORKERS workers over TCP, then each round
    # pulls one gradient set per worker from `gradients_q`, applies each,
    # republishes the variables via the shared `global_var_vals`, and
    # waits for one ack per worker on `ack_q` before clearing `done_flag`.
    g1 = tf.Graph()
    with g1.as_default():
        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        #global_step = tf.contrib.framework.get_or_create_global_step()
        # Starts at -1 so the first increment lands on step 0.
        global_step = tf.Variable(-1, name='global_step', trainable=False,
                                  dtype=tf.int32)
        increment_global_step_op = tf.assign(global_step, global_step + 1)
        cifar10.build_graph()
        # One placeholder per trainable variable; workers' gradients are
        # fed through these into train_part2.
        placeholder_gradients = []
        #with tf.device("/gpu:0"):
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            placeholder_gradients.append(
                (tf.placeholder('float', shape=var.get_shape()), var))
        # Zero-filled feed so runs that only read state can execute
        # before any real gradients have arrived.
        feed_dict = {}
        i = 0
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            feed_dict[placeholder_gradients[i][0]] = np.zeros(
                placeholder_gradients[i][0].shape)
            i = i + 1
        # Unlike the sibling variant, train_part2 here also returns a
        # second tensor (`atest`), printed below for debugging.
        train_op, atest = cifar10.train_part2(global_step,
                                              placeholder_gradients)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1

            def after_run(self, run_context, run_values):
                # Emit throughput statistics every log_frequency steps.
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)
                    format_str = ('%s: step %d,(%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    gpu_options=gpu_options)) as mon_sess:
            # Sending the initial value of variables
            global global_var_vals
            global done_flag
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            print("Connecting to port : ", port, " and no of workers: ",
                  MAX_WORKERS)
            s.bind((TCP_IP, port))
            s.listen(5)
            var_val = []
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                var_val.append(mon_sess.run(v, feed_dict=feed_dict))
            #print("Var val:: ",var_val)
            send_data = pickle.dumps(var_val, pickle.HIGHEST_PROTOCOL)
            #print("Send the send_data:: ",send_data)
            global_var_vals.value = send_data
            # The pickled length goes first so each worker knows how many
            # payload bytes to read.
            size = len(send_data)
            #print("Send the size:: ",size)
            size = pickle.dumps(size, pickle.HIGHEST_PROTOCOL)
            #print("Send the size in bytes:: ",size)
            for i in range(MAX_WORKERS):
                conn, addr = s.accept()
                #print("Conn: {}, Addr: {}".format(conn, addr))
                conn.sendall(size)
                #print("Sent Size")
                conn.sendall(send_data)
                conn.close()
            # Listener only serves the initial broadcast; later updates go
            # through `global_var_vals` instead.
            s.close()
            print("Sent initial var values to workers")
            while not mon_sess.should_stop():
                #print("Done with Sending")
                val = mon_sess.run(global_step, feed_dict=feed_dict)
                #print("Iteration: ", val)
                if (val == (FLAGS.max_steps - 1)):
                    print("Global step val while stoping.")
                    return
                #print("Before For")
                # NOTE(review): the inner `enumerate` loop reuses the outer
                # loop variable `i`; harmless since `range` iteration is
                # unaffected, but confusing — consider renaming.
                for i in range(MAX_WORKERS):
                    recv_grads = gradients_q.get()
                    feed_dict = {}
                    for i, grad_var in enumerate(recv_grads):
                        feed_dict[placeholder_gradients[i][0]] = recv_grads[i]
                    res = mon_sess.run(train_op, feed_dict=feed_dict)
                    # NOTE(review): this evaluates `atest` in a brand-new
                    # Session, not `mon_sess` — a fresh Session has no
                    # initialized variables, so this presumably only works
                    # if `atest` is variable-independent; confirm.
                    with tf.Session() as sess:
                        print(atest.eval())
                var_val = []
                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                    v_temp = mon_sess.run(v, feed_dict=feed_dict)
                    var_val.append(v_temp)
                global_var_vals.value = pickle.dumps(var_val,
                                                     pickle.HIGHEST_PROTOCOL)
                #print("New values of variables ready")
                # Signal workers that fresh values are available, then wait
                # for one ack per worker before lowering the flag.
                done_flag.value = 1
                for i in range(MAX_WORKERS):
                    val = ack_q.get()
                done_flag.value = 0