def train():
    """Train CIFAR-10 for a number of steps."""

    g1 = tf.Graph()
    with g1.as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        grads = cifar10.train_part1(loss, global_step)

        only_gradients = [g for g, _ in grads]
        only_vars = [v for _, v in grads]
        placeholder_gradients = []

        #with tf.device("/gpu:0"):
        for grad_var in grads:
            placeholder_gradients.append(
                (tf.placeholder('float',
                                shape=grad_var[0].get_shape()), grad_var[1]))

        train_op = cifar10.train_part2(global_step, placeholder_gradients)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        # Start the input queue runners; without them, reading CIFAR-10
        # batches from distorted_inputs() blocks forever.
        tf.train.start_queue_runners(sess=sess)

        # Declare every gradient placeholder as a feed for the partial run.
        feeds = [placeholder for placeholder, _ in placeholder_gradients]
        print("Set up %d gradient feeds for the partial run" % len(feeds))

        # A partial_run handle may be consumed only once, and each declared
        # feed may be supplied only once per handle, so set up a fresh handle
        # every step: first fetch the gradients, then feed them back through
        # the placeholders and run the update.
        for step in range(10):
            h = sess.partial_run_setup(only_gradients + [train_op], feeds)

            res_grads = sess.partial_run(h, only_gradients)

            feed_dict = {
                placeholder_gradients[j][0]: res_grads[j]
                for j in range(len(res_grads))
            }
            sess.partial_run(h, train_op, feed_dict=feed_dict)
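
# cifar10.train_part1 / cifar10.train_part2 are not defined in this snippet.
# A minimal sketch of the split they are assumed to implement: the usual
# optimizer step broken into compute_gradients and apply_gradients so the
# gradients can cross a process boundary. The optimizer and learning rate
# below are placeholders, not the original implementation:
#
# def train_part1(loss, global_step):
#     opt = tf.train.GradientDescentOptimizer(0.1)
#     return opt.compute_gradients(loss)  # list of (gradient, variable) pairs
#
# def train_part2(global_step, placeholder_gradients):
#     opt = tf.train.GradientDescentOptimizer(0.1)
#     return opt.apply_gradients(placeholder_gradients,
#                                global_step=global_step)


# Example #2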
def train():
    """Train CIFAR-10 for a number of steps."""

    g1 = tf.Graph()
    with g1.as_default():
        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        #global_step = tf.contrib.framework.get_or_create_global_step()

        global_step = tf.Variable(-1,
                                  name='global_step',
                                  trainable=False,
                                  dtype=tf.int32)
        increment_global_step_op = tf.assign(global_step, global_step + 1)

        cifar10.build_graph_part2()

        placeholder_gradients = []

        #with tf.device("/gpu:0"):
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            placeholder_gradients.append(
                (tf.placeholder('float', shape=var.get_shape()), var))
        # Initialize the placeholder feeds with zeros of the right shapes.
        feed_dict = {
            placeholder: np.zeros(placeholder.shape)
            for placeholder, _ in placeholder_gradients
        }
        train_op = cifar10.train_part2(global_step, placeholder_gradients)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    gpu_options=gpu_options)) as mon_sess:

            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                print(v)
            # Sending the initial value of variables
            var_val = []
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                var_val.append(mon_sess.run(v, feed_dict=feed_dict))
            send_data = pickle.dumps(var_val, pickle.HIGHEST_PROTOCOL)
            global global_var_vals
            global_var_vals.value = send_data
            size = len(send_data)
            size = pickle.dumps(size, pickle.HIGHEST_PROTOCOL)
            for i in range(MAX_WORKERS):
                conn, addr = s.accept()
                conn.sendall(size)
                conn.sendall(send_data)
                conn.close()
            print("Sent initial var values to workers")
            while not mon_sess.should_stop():
                val = mon_sess.run(global_step, feed_dict=feed_dict)
                #print("Iteration: ", val)
                if val == (FLAGS.max_steps - 1):
                    print("Global step reached max_steps; stopping.")
                    sys.exit()
                recv_grads = gradients_q.get()
                #print("received gradients from worker")
                feed_dict = {}
                for i, grad_var in enumerate(recv_grads):
                    feed_dict[placeholder_gradients[i][0]] = recv_grads[i]

                res = mon_sess.run(train_op, feed_dict=feed_dict)
                var_val = []
                #print("Run complete with new values")
                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                    var_val.append(mon_sess.run(v, feed_dict=feed_dict))
                global_var_vals.value = pickle.dumps(var_val,
                                                     pickle.HIGHEST_PROTOCOL)
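
# This example depends on module-level plumbing that the snippet does not
# show: a listening socket `s`, a queue `gradients_q` filled by receiver
# processes, shared state `global_var_vals`, and the `gpu_options` passed to
# the session config. A minimal sketch under those assumptions (all names
# and values here are guesses at the original setup, not part of it):
#
# import socket
# from multiprocessing import Manager, Queue
#
# TCP_IP = '0.0.0.0'
# port = 5000
# MAX_WORKERS = 2
#
# s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# s.bind((TCP_IP, port))
# s.listen(MAX_WORKERS)
#
# gradients_q = Queue()                        # gradients pushed by receivers
# global_var_vals = Manager().Value('c', b'')  # latest pickled variable values
# gpu_options = tf.GPUOptions(allow_growth=True)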
# Example #3
def train():
    """Train CIFAR-10 for a number of steps."""

    g1 = tf.Graph()
    with g1.as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        grads = cifar10.train_part1(loss, global_step)

        only_gradients = [g for g, _ in grads]
        only_vars = [v for _, v in grads]

        placeholder_gradients = []

        #with tf.device("/gpu:0"):
        for grad_var in grads:
            placeholder_gradients.append(
                (tf.placeholder('float',
                                shape=grad_var[0].get_shape()), grad_var[1]))

        feed_dict = {}

        for i, grad_var in enumerate(grads):
            feed_dict[placeholder_gradients[i][0]] = np.zeros(
                placeholder_gradients[i][0].shape)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train_part2(global_step, placeholder_gradients)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    gpu_options=gpu_options)) as mon_sess:

            while not mon_sess.should_stop():

                gradients = mon_sess.run(only_gradients, feed_dict=feed_dict)
                # pickling the gradients
                send_data = pickle.dumps(gradients, pickle.HIGHEST_PROTOCOL)
                # finding size of pickled gradients
                to_send_size = len(send_data)
                # Sending the size of the gradients first
                send_size = pickle.dumps(to_send_size, pickle.HIGHEST_PROTOCOL)
                s.sendall(send_size)
                # sending the gradients
                s.sendall(send_data)
                # The peer pickles the payload size first; this assumes that
                # pickled size header is exactly 8 bytes on the wire.
                recv_size = safe_recv(8, s)
                recv_size = pickle.loads(recv_size)
                recv_data = safe_recv(recv_size, s)
                gradients2 = pickle.loads(recv_data)
                #print("Recevied gradients of size: ", len(recv_data))
                feed_dict = {}

                for i, grad_var in enumerate(gradients2):
                    feed_dict[placeholder_gradients[i][0]] = gradients2[i]
                    #print(gradients[i].shape)
                    #print(gradients2[i].shape)

                res = mon_sess.run(train_op, feed_dict=feed_dict)
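
# safe_recv is used above but never defined in this snippet. A minimal
# sketch, assuming it simply loops on recv() until exactly `size` bytes
# have arrived (the original helper may differ):
def safe_recv(size, sock):
    """Receive exactly `size` bytes from `sock`, or raise if it closes."""
    data = b''
    while len(data) < size:
        chunk = sock.recv(size - len(data))
        if not chunk:
            raise RuntimeError("socket closed after %d of %d bytes" %
                               (len(data), size))
        data += chunk
    return data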
# Example #4
def train():
    """Train CIFAR-10 for a number of steps."""

    g1 = tf.Graph()
    with g1.as_default():
        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        #global_step = tf.contrib.framework.get_or_create_global_step()

        global_step = tf.Variable(-1,
                                  name='global_step',
                                  trainable=False,
                                  dtype=tf.int32)
        increment_global_step_op = tf.assign(global_step, global_step + 1)

        cifar10.build_graph()

        placeholder_gradients = []

        #with tf.device("/gpu:0"):
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            placeholder_gradients.append(
                (tf.placeholder('float', shape=var.get_shape()), var))
        # Initialize the placeholder feeds with zeros of the right shapes.
        feed_dict = {
            placeholder: np.zeros(placeholder.shape)
            for placeholder, _ in placeholder_gradients
        }
        train_op, atest = cifar10.train_part2(global_step,
                                              placeholder_gradients)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    gpu_options=gpu_options)) as mon_sess:

            # Sending the initial value of variables
            global global_var_vals
            global done_flag
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            print("Connecting to port : ", port, " and no of workers: ",
                  MAX_WORKERS)
            s.bind((TCP_IP, port))
            s.listen(5)
            var_val = []
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                var_val.append(mon_sess.run(v, feed_dict=feed_dict))

            #print("Var val:: ",var_val)
            send_data = pickle.dumps(var_val, pickle.HIGHEST_PROTOCOL)
            #print("Send the send_data:: ",send_data)
            global_var_vals.value = send_data
            size = len(send_data)
            #print("Send the size:: ",size)
            size = pickle.dumps(size, pickle.HIGHEST_PROTOCOL)
            #print("Send the size in bytes:: ",size)
            for i in range(MAX_WORKERS):
                conn, addr = s.accept()
                #print("Conn: {}, Addr: {}".format(conn, addr))
                conn.sendall(size)
                #print("Sent Size")
                conn.sendall(send_data)
                conn.close()
            s.close()
            print("Sent initial var values to workers")

            while not mon_sess.should_stop():
                #print("Done with Sending")
                val = mon_sess.run(global_step, feed_dict=feed_dict)
                #print("Iteration: ", val)
                if val == (FLAGS.max_steps - 1):
                    print("Global step reached max_steps; stopping.")
                    return
                #print("Before For")
                # Apply one batch of gradients from each worker.
                for _ in range(MAX_WORKERS):
                    recv_grads = gradients_q.get()
                    feed_dict = {}
                    for j, grad in enumerate(recv_grads):
                        feed_dict[placeholder_gradients[j][0]] = grad
                    res = mon_sess.run(train_op, feed_dict=feed_dict)
                    # Evaluate atest in the monitored session; a fresh
                    # tf.Session() would see uninitialized variables.
                    print(mon_sess.run(atest, feed_dict=feed_dict))

                var_val = []

                for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                    v_temp = mon_sess.run(v, feed_dict=feed_dict)
                    var_val.append(v_temp)
                global_var_vals.value = pickle.dumps(var_val,
                                                     pickle.HIGHEST_PROTOCOL)
                #print("New values of variables ready")
                done_flag.value = 1
                for i in range(MAX_WORKERS):
                    val = ack_q.get()
                done_flag.value = 0
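
# done_flag and ack_q implement a simple barrier with the processes that
# serve variable values to the workers: the trainer publishes new values,
# sets done_flag, and waits for one ack per worker before clearing it.
# A sketch of a hypothetical serving loop under those assumptions (this
# helper is not part of the original code):
#
# def serve_vars(conn, done_flag, global_var_vals, ack_q):
#     while done_flag.value == 0:
#         time.sleep(0.001)                   # wait for fresh values
#     data = global_var_vals.value
#     conn.sendall(pickle.dumps(len(data), pickle.HIGHEST_PROTOCOL))
#     conn.sendall(data)
#     ack_q.put(1)                            # acknowledge to the trainer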