Example #1
    def execute(self):
        result_file = os.path.join(self.result_dir, "train_result_{}.txt".format(self.task_index))
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(self.server.target, config=config) as sess:
            K.set_session(sess)
            if self.go_on:
                self.restore_model()
            tb_callback = TensorBoard(log_dir=self.log_dir, write_grads=True, write_images=True)
            ckpt_callback = ModelCheckpoint(self.checkpoint_path,
                                            monitor='loss',
                                            save_weights_only=True)
            reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=3, verbose=1)
            early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=10, verbose=1)

            # add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
            callbacks = [tb_callback, ckpt_callback] if self.task_index == 0 else []

            callbacks += [reduce_lr, early_stopping]
            # try:
            his = self.model.fit_generator(self.generate_rdd_data(),
                                           steps_per_epoch=self.steps_per_epoch,
                                           # validation_data=self.val_generate_data(val_data),
                                           # validation_steps=max(1, self.val_num // self.batch_size),
                                           epochs=self.epochs + self.initial_epoch,
                                           initial_epoch=self.initial_epoch,
                                           workers=0,
                                           callbacks=callbacks)
            logger.debug("{}-{}".format(self.task_index, his.history))
            ModelDir.write_result(result_file, self.get_results(his), self.go_on)
            # except Exception as e:
            #     logger.debug(str(e))
            self.save_model()
            self.tf_feed.terminate()
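The generate_rdd_data() generator handed to fit_generator above is not shown. A minimal sketch of what it could look like, assuming self.tf_feed is a TensorFlowOnSpark TFNode.DataFeed and each Spark RDD record is a (features, label) pair:

    def generate_rdd_data(self):
        # Sketch only: pull batches from the Spark RDD through the TFNode data
        # feed until the feed is exhausted or terminated.
        while not self.tf_feed.should_stop():
            batch = self.tf_feed.next_batch(self.batch_size)
            if not batch:
                return
            xs = np.array([record[0] for record in batch])
            ys = np.array([record[1] for record in batch])
            yield xs, ys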
Example #2
    def execute(self):
        result_file = os.path.join(
            self.result_dir, "train_result_{}.txt".format(self.task_index))
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(self.server.target, config=config) as sess:
            K.set_session(sess)
            if self.go_on:
                self.restore_model()
            tb_callback = TensorBoard(log_dir=self.log_dir,
                                      write_grads=True,
                                      write_images=True)
            ckpt_callback = ModelCheckpoint(self.checkpoint_path,
                                            monitor='loss',
                                            save_weights_only=True)

            # add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
            callbacks = ([tb_callback, ckpt_callback]
                         if self.task_index == 0 else None)

            # train on data read from a generator which is producing data from a Spark RDD
            his = self.model.fit_generator(
                generator=self.generate_rdd_data(),
                steps_per_epoch=self.steps_per_epoch,
                epochs=self.epochs + self.initial_epoch,
                callbacks=callbacks,
                workers=0,
                initial_epoch=self.initial_epoch)
            self.save_model()
            ModelDir.write_result(result_file, self.get_results(his),
                                  self.go_on)
            self.tf_feed.terminate()
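An execute() method like this one runs inside each Spark executor. A rough sketch of the driver side, assuming TensorFlowOnSpark's TFCluster API in InputMode.SPARK; map_fun, TrainWorker, sc, args, num_executors, num_ps, train_rdd and num_epochs are placeholders for illustration:

from tensorflowonspark import TFCluster

def map_fun(args, ctx):
    # Hypothetical wrapper that owns the execute() shown above.
    worker = TrainWorker(args, ctx)
    worker.execute()

cluster = TFCluster.run(sc, map_fun, args, num_executors, num_ps,
                        tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
cluster.train(train_rdd, num_epochs)  # feeds the RDD into each worker's tf_feed
cluster.shutdown()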
Example #3
    def execute(self):
        result_file = os.path.join(
            self.result_dir, "train_result_{}.txt".format(self.task_index))
        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()
        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=self.gpu_memory_fraction)
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options, log_device_placement=False)) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph)
            coord = tf.train.Coordinator()
            tf.train.start_queue_runners(coord=coord, sess=sess)

            steps = 0
            # a plain tf.Session has no should_stop(); poll the Coordinator instead
            while not coord.should_stop() and not self.tf_feed.should_stop():
                if self.go_on:
                    self.restore_model(sess)

                # Training and validation loop
                print('Running training')
                image_list, label_list = self.get_data()
                # Enqueue one epoch of image paths and labels
                labels_array = np.expand_dims(np.array(label_list), 1)
                image_paths_array = np.expand_dims(np.array(image_list), 1)
                control_value = facenet.RANDOM_ROTATE * self.random_rotate + \
                                facenet.RANDOM_CROP * self.random_crop + \
                                facenet.RANDOM_FLIP * self.random_flip + \
                                facenet.FIXED_STANDARDIZATION * self.use_fixed_image_standardization
                control_array = np.ones_like(labels_array) * control_value
                enqueue_op = tf.get_collection(OUTPUTS)[0]
                feed_dict = dict(
                    zip(tf.get_collection(INPUTS),
                        [image_paths_array, labels_array, control_array]))
                sess.run(enqueue_op, feed_dict)

                self.model.add_params(batch_size=self.batch_size,
                                      steps_per_epoch=self.steps_per_epoch,
                                      phase_train=True,
                                      n_classes=self.n_classes)
                keys = ["_task_index", "_epoch"]
                for epoch in range(1, self.epochs + 1):
                    for _ in range(self.steps_per_epoch - 1):
                        sess.run(self.model.fetches,
                                 feed_dict=self.model.feed_dict)
                    res = sess.run(self.model.fetches + [summary_op],
                                   feed_dict=self.model.feed_dict)
                    steps = sess.run(self.global_step)
                    summary_writer.add_summary(res[-1], global_step=steps)
                    results = [dict(zip(keys, res))]
                    ModelDir.write_result(result_file, results, True)
            summary = tf.Summary()
            summary_writer.add_summary(summary, global_step=steps)
            self.tf_feed.terminate()
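restore_model(sess) is not shown in this example. One plausible implementation under the standard TF1 checkpoint layout; self.checkpoint_dir is an assumed attribute:

    def restore_model(self, sess):
        # Sketch only: restore the latest checkpoint written by a tf.train.Saver.
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)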
Example #4
    def train(self, save_dir, result_dir, checkpoint_dir, log_dir):
        result_file = os.path.join(result_dir, "train_result.txt")
        train_set = self.train_set
        config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.compat.v1.Session(config=config) as sess:
            # K.set_session(sess)
            if self.go_on:
                self.restore_model(checkpoint_dir)
            tb_callback = TensorBoard(log_dir=log_dir, write_images=True)
            checkpoint_file = os.path.join(checkpoint_dir,
                                           self.name + '_checkpoint_{epoch}')
            ckpt_callback = ModelCheckpoint(
                checkpoint_file,
                # monitor='loss',
                save_weights_only=True)
            reduce_lr = ReduceLROnPlateau(monitor='loss',
                                          factor=0.1,
                                          patience=3,
                                          verbose=1)
            early_stopping = EarlyStopping(monitor='loss',
                                           min_delta=0,
                                           patience=10,
                                           verbose=1)

            # add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
            callbacks = [tb_callback, ckpt_callback]
            # callbacks = []

            self.model.compile(optimizer=Adam(lr=1e-4),
                               loss={
                                   'yolo_loss': lambda y_true, y_pred: y_pred
                               })
            # print('Unfreeze all of the layers.')
            callbacks.extend([reduce_lr, early_stopping])
            steps_per_epoch = len(train_set) // self.batch_size
            # note that more GPU memory is required after unfreezing the body
            # try:
            his = self.model.fit_generator(
                self.train_generate_data(train_set),
                steps_per_epoch=steps_per_epoch,
                # validation_data=self.val_generate_data(val_data),
                # validation_steps=max(1, self.val_num // self.batch_size),
                epochs=self.initial_epoch + 1,
                initial_epoch=self.initial_epoch,
                workers=1,
                callbacks=callbacks)
            logger.debug(str(his.history))
            # except Exception as e:
            #     logger.debug(str(e))
            # logger.debug('end')
            save_model_path = os.path.join(save_dir, 'model.h5')
            self.model.save(save_model_path)
            ModelDir.write_result(result_file, self.get_results(his))
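The identity loss passed to compile() above follows the common keras-yolo3 convention: the YOLO loss is computed inside the graph by a Lambda layer named 'yolo_loss', so the Keras loss function only has to pass that value through. A rough sketch of that structure; yolo_loss_fn, model_body and y_true_inputs are placeholders for illustration:

from tensorflow.keras.layers import Lambda
from tensorflow.keras.models import Model

# Sketch only: the loss is an output of the model itself, so
# loss={'yolo_loss': lambda y_true, y_pred: y_pred} simply forwards it.
loss_output = Lambda(yolo_loss_fn, output_shape=(1,), name='yolo_loss')(
    [*model_body.output, *y_true_inputs])
model = Model([model_body.input, *y_true_inputs], loss_output)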
Example #5
    def execute(self):
        result_file = os.path.join(
            self.result_dir, "predict_result_{}.txt".format(self.task_index))
        with tf.Session(self.server.target) as sess:
            K.set_session(sess)
            self.load_model()
            his = self.model.predict_generator(self.generate_rdd_data(),
                                               steps=self.steps_per_epoch)
            ModelDir.write_result(result_file, self.get_results(his))
            self.tf_feed.terminate()
            self.delete_tmp_dir()
Example #6
    def execute(self):
        result_file = os.path.join(
            self.result_dir, "predict_result_{}.txt".format(self.task_index))
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(self.server.target, config=config) as sess:
            self.load_model(sess)
            for _ in range(self.steps_per_epoch):
                x, y = self.generate_rdd_data
                if len(x) == 0:
                    break
                predictions = sess.run(self.model.outputs['y'],
                                       self.feed_dict(x=x))
                y_pred = np.argmax(predictions, 1)
                y_true = np.argmax(y, 1) if y is not None else None
                logger.debug(predictions)
                results = self.get_results(y_pred, y_true)
                ModelDir.write_result(result_file, results, True)
            self.tf_feed.terminate()
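The feed_dict(x=x) helper used above is not shown. A minimal sketch, assuming self.model.inputs is a dict of placeholders keyed the same way as self.model.outputs:

    def feed_dict(self, x, y=None):
        # Sketch only: map the input (and, when given, label) placeholders
        # to the batch arrays.
        feed = {self.model.inputs['x']: x}
        if y is not None:
            feed[self.model.inputs['y']] = y
        return feed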
Example #7
    def execute(self):
        result_file = os.path.join(
            self.result_dir, "train_result_{}.txt".format(self.task_index))
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        summary_op = tf.summary.merge_all()
        with tf.Session(self.server.target, config=config) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            self.summary_writer = tf.summary.FileWriter(
                self.log_dir, sess.graph)
            coord = tf.train.Coordinator()
            tf.train.start_queue_runners(coord=coord, sess=sess)

            if self.go_on:
                self.restore_checkpoint(sess)
            names, values = zip(*self.model.fetches.items())
            names = list(names)
            values = list(values)
            res, summary_str = None, None
            for epoch in range(1, self.epochs + 1):
                for _ in range(self.steps_per_epoch):
                    x, y = self.generate_rdd_data
                    if len(x) == 0:
                        break
                    if summary_op is not None:
                        *res, summary_str = sess.run(values + [summary_op],
                                                     self.feed_dict(x=x, y=y))
                    else:
                        res = sess.run(values, self.feed_dict(x=x, y=y))
                result = dict(
                    (k, v) for k, v in zip(names, res) if v is not None)
                result.update(self.common_dict(epoch + self.initial_epoch))
                ModelDir.write_result(result_file, [result], True)
                self.save_checkpoint(sess, epoch + self.initial_epoch,
                                     summary_str)

            self.model.write_model(self.model_config_path, False)
            self.save_model(sess)
            self.tf_feed.terminate()
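save_checkpoint() is also not shown. A minimal sketch, assuming the worker carries self.saver (a tf.train.Saver) and self.checkpoint_path:

    def save_checkpoint(self, sess, epoch, summary_str=None):
        # Sketch only: persist the session state and the last summary for this epoch.
        self.saver.save(sess, self.checkpoint_path, global_step=epoch)
        if summary_str is not None:
            self.summary_writer.add_summary(summary_str, global_step=epoch)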