def execute(self):
    """Run distributed Keras training on this worker.

    Streams batches from the Spark RDD feed into ``self.model``, writes
    the per-epoch history to a per-task result file and, on the chief
    (task 0) only, also saves checkpoints and TensorBoard events.
    Removed: commented-out try/except and validation_data arguments
    (dead code).
    """
    result_file = os.path.join(
        self.result_dir, "train_result_{}.txt".format(self.task_index))
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(self.server.target, config=config) as sess:
        K.set_session(sess)
        if self.go_on:
            # resume from previously saved weights when continuing a run
            self.restore_model()
        tb_callback = TensorBoard(log_dir=self.log_dir,
                                  write_grads=True,
                                  write_images=True)
        ckpt_callback = ModelCheckpoint(self.checkpoint_path,
                                        monitor='loss',
                                        save_weights_only=True)
        reduce_lr = ReduceLROnPlateau(monitor='loss',
                                      factor=0.1,
                                      patience=3,
                                      verbose=1)
        early_stopping = EarlyStopping(monitor='loss',
                                       min_delta=0,
                                       patience=10,
                                       verbose=1)
        # add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
        callbacks = [tb_callback, ckpt_callback] if self.task_index == 0 else []
        # LR scheduling and early stopping run on every worker
        callbacks += [reduce_lr, early_stopping]
        his = self.model.fit_generator(self.generate_rdd_data(),
                                       steps_per_epoch=self.steps_per_epoch,
                                       epochs=self.epochs + self.initial_epoch,
                                       initial_epoch=self.initial_epoch,
                                       workers=0,
                                       callbacks=callbacks)
        logger.debug("{}-{}".format(self.task_index, his.history))
        ModelDir.write_result(result_file, self.get_results(his), self.go_on)
        self.save_model()
    self.tf_feed.terminate()
def execute(self):
    """Recurrent (auto-regressive) prediction worker.

    Seeds the model with the first ``self.units`` timesteps of each RDD
    sample, then for ``self.steps`` iterations feeds each prediction back
    as the next input, appending every predicted label to the per-task
    result file (one line per input sample).
    """
    result_file = os.path.join(
        self.result_dir,
        "recurrent_predict_result_{}.txt".format(self.task_index))
    with tf.Session(self.server.target) as sess:
        K.set_session(sess)
        self.load_model()
        for x, y in self.generate_rdd_data():
            # NOTE(review): the length check reads axis 1 but the seed
            # window below slices along axis 0 (x[:self.units]) -- confirm
            # x's layout; these look inconsistent.
            x_len = x.shape[1]
            if x_len < self.units:
                break
            x_train = np.array(x[:self.units])
            for _ in range(self.steps):
                ys = self.model.predict(x_train, batch_size=1)
                y_label = np.argmax(ys, 1)
                if self.feature_type == 'one_hot':
                    # re-encode the predicted class index as a one-hot row
                    shape = ys.shape
                    y_l = np.zeros(shape)
                    y_l[..., y_label] = 1
                    # slide the window: drop the oldest step, append the
                    # one-hot prediction as the newest step
                    x_train = np.array(
                        [x_train.tolist()[0][1:] + y_l.tolist()])
                else:
                    # append the raw label index instead of a one-hot row
                    x_train = np.array(
                        [x_train.tolist()[0][1:] + y_label.tolist()])
                ModelDir.write_str(result_file,
                                   str(y_label.tolist()[0]) + " ", True)
            # newline terminates this sample's sequence of predictions
            ModelDir.write_str(result_file, "\n", True)
    self.tf_feed.terminate()
    self.delete_tmp_dir()
def execute(self):
    """Train ``self.model`` on RDD-fed batches and record the history.

    Only the chief worker (task 0) writes checkpoints and TensorBoard
    events; every worker appends its history to its own result file.
    """
    out_path = os.path.join(
        self.result_dir, "train_result_{}.txt".format(self.task_index))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(self.server.target, config=sess_config) as sess:
        K.set_session(sess)
        if self.go_on:
            self.restore_model()
        # add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
        if self.task_index == 0:
            callbacks = [
                TensorBoard(log_dir=self.log_dir,
                            write_grads=True,
                            write_images=True),
                ModelCheckpoint(self.checkpoint_path,
                                monitor='loss',
                                save_weights_only=True),
            ]
        else:
            callbacks = None
        # train on data read from a generator which is producing data from a Spark RDD
        history = self.model.fit_generator(
            generator=self.generate_rdd_data(),
            steps_per_epoch=self.steps_per_epoch,
            epochs=self.epochs + self.initial_epoch,
            callbacks=callbacks,
            workers=0,
            initial_epoch=self.initial_epoch)
        self.save_model()
        ModelDir.write_result(out_path, self.get_results(history), self.go_on)
    self.tf_feed.terminate()
def execute(self):
    """Facenet-style training worker.

    Enqueues one epoch of image paths/labels into the graph's input
    queue, runs the model's fetch ops for each step, streams summaries
    to TensorBoard and appends per-epoch results to the result file.
    """
    result_file = os.path.join(
        self.result_dir, "train_result_{}.txt".format(self.task_index))
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge_all()
    # Start running operations on the Graph.
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=self.gpu_memory_fraction)
    with tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options, log_device_placement=False)) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)
        steps = 0
        # NOTE(review): a plain tf.Session has no should_stop(); that API
        # belongs to MonitoredSession -- confirm the actual session type,
        # otherwise this line raises AttributeError at runtime.
        while not sess.should_stop() and not self.tf_feed.should_stop():
            if self.go_on:
                self.restore_model(sess)
            # Training and validation loop
            print('Running training')
            image_list, label_list = self.get_data()
            # Enqueue one epoch of image paths and labels
            # NOTE(review): the two assignments below look swapped --
            # labels_array is built from image_list and image_paths_array
            # from label_list; verify against get_data()'s return order.
            labels_array = np.expand_dims(np.array(image_list), 1)
            image_paths_array = np.expand_dims(np.array(label_list), 1)
            # bitmask of per-image augmentation flags applied by facenet
            control_value = facenet.RANDOM_ROTATE * self.random_rotate + \
                facenet.RANDOM_CROP * self.random_crop + \
                facenet.RANDOM_FLIP * self.random_flip + \
                facenet.FIXED_STANDARDIZATION * self.use_fixed_image_standardization
            control_array = np.ones_like(labels_array) * control_value
            enqueue_op = tf.get_collection(OUTPUTS)[0]
            feed_dict = dict(
                zip(tf.get_collection(INPUTS),
                    [image_paths_array, labels_array, control_array]))
            sess.run(enqueue_op, feed_dict)
            self.model.add_params(batch_size=self.batch_size,
                                  steps_per_epoch=self.steps_per_epoch,
                                  phase_train=True,
                                  n_classes=self.n_classes)
            keys = ["_task_index", "_epoch"]
            for epoch in range(1, self.epochs + 1):
                for _ in range(self.steps_per_epoch - 1):
                    sess.run(self.model.fetches,
                             feed_dict=self.model.feed_dict)
                # last step of the epoch also evaluates merged summaries
                res = sess.run(self.model.fetches + [summary_op],
                               feed_dict=self.model.feed_dict)
                steps = sess.run(self.global_step)
                summary_writer.add_summary(res[-1], global_step=steps)
                # NOTE(review): keys has 2 entries while res holds every
                # fetch plus the summary -- zip truncates silently; confirm
                # the intended key/value pairing here.
                results = [dict(zip(keys, res))]
                ModelDir.write_result(result_file, results, True)
            # flush an (empty) summary marking the end of the epoch batch
            summary = tf.Summary()
            summary_writer.add_summary(summary, global_step=steps)
    self.tf_feed.terminate()
def train(self, save_dir, result_dir, checkpoint_dir, log_dir):
    """Train the YOLO model for one epoch and persist the model and results.

    Removes dead commented-out code (try/except, validation args, unused
    ``as sess`` binding) while preserving behavior.

    Args:
        save_dir: directory where the final ``model.h5`` is written.
        result_dir: directory for the ``train_result.txt`` history file.
        checkpoint_dir: directory for per-epoch weight checkpoints.
        log_dir: directory for TensorBoard event files.
    """
    result_file = os.path.join(result_dir, "train_result.txt")
    train_set = self.train_set
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.compat.v1.Session(config=config):
        if self.go_on:
            # resume from the latest checkpoint when continuing a run
            self.restore_model(checkpoint_dir)
        tb_callback = TensorBoard(log_dir=log_dir, write_images=True)
        checkpoint_file = os.path.join(checkpoint_dir,
                                       self.name + '_checkpoint_{epoch}')
        ckpt_callback = ModelCheckpoint(checkpoint_file,
                                        save_weights_only=True)
        reduce_lr = ReduceLROnPlateau(monitor='loss',
                                      factor=0.1,
                                      patience=3,
                                      verbose=1)
        early_stopping = EarlyStopping(monitor='loss',
                                       min_delta=0,
                                       patience=10,
                                       verbose=1)
        # checkpoint/tensorboard plus LR scheduling and early stopping
        callbacks = [tb_callback, ckpt_callback]
        # YOLO packs its full loss into the 'yolo_loss' output, so the
        # Keras-level loss simply passes y_pred through
        self.model.compile(optimizer=Adam(lr=1e-4),
                           loss={
                               'yolo_loss': lambda y_true, y_pred: y_pred
                           })
        callbacks.extend([reduce_lr, early_stopping])
        steps_per_epoch = len(train_set) // self.batch_size
        # trains exactly one epoch per call, continuing from initial_epoch
        his = self.model.fit_generator(
            self.train_generate_data(train_set),
            steps_per_epoch=steps_per_epoch,
            epochs=self.initial_epoch + 1,
            initial_epoch=self.initial_epoch,
            workers=1,
            callbacks=callbacks)
        logger.debug(str(his.history))
        save_model_path = os.path.join(save_dir, 'model.h5')
        self.model.save(save_model_path)
        ModelDir.write_result(result_file, self.get_results(his))
def execute(self):
    """Run batch prediction over the RDD feed and persist the results."""
    out_file = os.path.join(
        self.result_dir, "predict_result_{}.txt".format(self.task_index))
    with tf.Session(self.server.target) as sess:
        K.set_session(sess)
        self.load_model()
        predictions = self.model.predict_generator(
            self.generate_rdd_data(), steps=self.steps_per_epoch)
        ModelDir.write_result(out_file, self.get_results(predictions))
    self.tf_feed.terminate()
    self.delete_tmp_dir()
def train(self, data_rdd, model_rdd, batch_size, epochs, model_dir, go_on=False):
    """Launch a TensorFlowOnSpark training cluster and return its results.

    Splits the sample count evenly across workers to derive the per-epoch
    step count, prepares the model directory (rebuilding it unless the run
    is a continuation), trains, and collects the written results into a
    DataFrame.
    """
    sample_count = data_rdd.count()
    per_worker_steps = math.ceil(sample_count / batch_size / self.num_workers)
    assert per_worker_steps > 0
    md = ModelDir(model_dir, 'train*')
    if go_on:
        md.create_model_dir()
    else:
        # fresh run: wipe and recreate the directory tree
        md = md.rebuild_model_dir()
    worker = TFTrainWorker(model_rdd,
                           go_on=go_on,
                           batch_size=batch_size,
                           epochs=epochs,
                           steps_per_epoch=per_worker_steps,
                           **md.to_dict())
    cluster = TFCluster.run(self.sc, worker, self.tf_args,
                            self.cluster_size, self.num_ps,
                            input_mode=self.input_mode)
    cluster.train(data_rdd.rdd, num_epochs=epochs, feed_timeout=60000)
    cluster.shutdown()
    return self.sqlc.createDataFrame(md.read_result())
def execute(self):
    """Raw-TF prediction worker.

    Runs ``steps_per_epoch`` batches through the model's 'y' output,
    derives argmax predictions (and true labels when labels are present)
    and appends the formatted results to the per-task result file.
    """
    result_file = os.path.join(
        self.result_dir, "predict_result_{}.txt".format(self.task_index))
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(self.server.target, config=config) as sess:
        self.load_model(sess)
        for _ in range(self.steps_per_epoch):
            # NOTE(review): generate_rdd_data is not called with () here,
            # unlike the generator usage elsewhere in this file --
            # presumably a property yielding one (x, y) batch per access;
            # confirm, otherwise this unpacks the method object.
            x, y = self.generate_rdd_data
            if len(x) == 0:
                # feed exhausted; stop early
                break
            predictions = sess.run(self.model.outputs['y'],
                                   self.feed_dict(x=x))
            y_pred = np.argmax(predictions, 1)
            # y may be absent for unlabeled prediction data
            y_true = np.argmax(y, 1) if y is not None else None
            logger.debug(predictions)
            results = self.get_results(y_pred, y_true)
            ModelDir.write_result(result_file, results, True)
    self.tf_feed.terminate()
def execute(self):
    """Raw-TF training worker.

    Runs the model's fetch ops batch-by-batch, writes per-epoch results
    and checkpoints (with summaries), then saves the model config and
    final model.
    """
    result_file = os.path.join(
        self.result_dir, "train_result_{}.txt".format(self.task_index))
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    # merged summary op; None when no summaries were registered
    summary_op = tf.summary.merge_all()
    with tf.Session(self.server.target, config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        self.summary_writer = tf.summary.FileWriter(
            self.log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)
        if self.go_on:
            # continue training from the latest checkpoint
            self.restore_checkpoint(sess)
        # split the fetch dict into parallel name/op lists so run() output
        # can be re-associated with names afterwards
        names, values = zip(*self.model.fetches.items())
        names = list(names)
        values = list(values)
        res, summary_str = None, None
        for epoch in range(1, self.epochs + 1):
            for _ in range(self.steps_per_epoch):
                # NOTE(review): generate_rdd_data accessed without () --
                # presumably a property returning one (x, y) batch per
                # access; confirm.
                x, y = self.generate_rdd_data
                if len(x) == 0:
                    break
                if summary_op is not None:
                    # evaluate fetches and summaries in a single run
                    *res, summary_str = sess.run(values + [summary_op],
                                                 self.feed_dict(x=x, y=y))
                else:
                    res = sess.run(values, self.feed_dict(x=x, y=y))
            # keep only fetches that produced a value for this epoch
            result = dict(
                (k, v) for k, v in zip(names, res) if v is not None)
            result.update(self.common_dict(epoch + self.initial_epoch))
            ModelDir.write_result(result_file, [result], True)
            self.save_checkpoint(sess, epoch + self.initial_epoch,
                                 summary_str)
        self.model.write_model(self.model_config_path, False)
        self.save_model(sess)
    self.tf_feed.terminate()
def main(self):
    """Entry point: prepare the model directory, build the model, train."""
    workspace = ModelDir(self.model_dir, 'train*')
    if not self.go_on:
        # fresh run: wipe and recreate the directory tree
        workspace = workspace.rebuild_model_dir()
    else:
        # continuation: keep existing artifacts, just ensure the dir exists
        workspace.create_model_dir()
    self.build_model()
    self.train(**workspace.to_dict())
def evaluate(self, data_rdd, steps, model_dir):
    """Evaluate the saved model on ``data_rdd`` via a TFoS cluster.

    ``steps <= 0`` means evaluate over the whole dataset; the step budget
    is split evenly across workers. Returns a DataFrame of results.
    """
    md = ModelDir(model_dir, 'evaluate*')
    total = steps if steps > 0 else data_rdd.count()
    per_worker = math.ceil(total / self.num_workers)
    worker = EvaluateWorker(steps_per_epoch=per_worker, **md.to_dict())
    md.delete_result_file()
    cluster = TFCluster.run(self.sc, worker, self.tf_args,
                            self.cluster_size, self.num_ps,
                            input_mode=self.input_mode)
    cluster.train(data_rdd.rdd, num_epochs=1)
    cluster.shutdown()
    return self.sqlc.createDataFrame(md.read_result())
def recurrent_predict(self, data_rdd, units, steps, feature_type, model_dir):
    """Run recurrent (auto-regressive) prediction on the cluster.

    Returns a DataFrame with one ``result`` row per written result line.
    """
    md = ModelDir(model_dir, 'recurrent_predict*')
    worker = RecurrentPredictWorker(units=units,
                                    steps=steps,
                                    feature_type=feature_type,
                                    **md.to_dict())
    md.delete_result_file()
    cluster = TFCluster.run(self.sc, worker, self.tf_args,
                            self.cluster_size, self.num_ps,
                            input_mode=self.input_mode)
    cluster.train(data_rdd.rdd, num_epochs=1, feed_timeout=6000)
    cluster.shutdown()
    rows = [{"result": line} for line in md.read_result(True)]
    return self.sqlc.createDataFrame(rows)
def yolov3_tiny_train(self, model_rdd, batch_size, epochs, classes_path,
                      anchors_path, train_path, val_path, image_size,
                      model_dir, weights_path=None, freeze_body=2,
                      go_on=False):
    """Distributed YOLOv3-tiny training; returns the history DataFrame."""
    assert "model_config" in model_rdd.columns, "not exists model layer config!"
    assert tf.io.gfile.exists(train_path), "train dataset path not exists!"
    samples = self.sc.textFile(train_path)
    # split the per-epoch step budget evenly across workers
    per_worker_steps = math.ceil(
        samples.count() / batch_size / self.num_workers)
    md = ModelDir(model_dir, 'train*')
    if go_on:
        md.create_model_dir()
    else:
        md = md.rebuild_model_dir()
    worker = YOLOV3TinyModelTrainWorker(model_rdd,
                                        go_on=go_on,
                                        batch_size=batch_size,
                                        epochs=epochs,
                                        classes_path=classes_path,
                                        anchors_path=anchors_path,
                                        weights_path=weights_path,
                                        val_path=val_path,
                                        image_size=image_size,
                                        steps_per_epoch=per_worker_steps,
                                        freeze_body=freeze_body,
                                        **md.to_dict())
    cluster = TFCluster.run(self.sc, worker, self.tf_args,
                            self.cluster_size, self.num_ps,
                            input_mode=self.input_mode)
    cluster.train(samples, num_epochs=epochs, feed_timeout=60000)
    cluster.shutdown()
    return self.sqlc.createDataFrame(md.read_result())
def predict(self, data_rdd, steps, model_dir, output_prob=False):
    """Run distributed prediction and return the results as a DataFrame.

    ``steps <= 0`` means predict over the entire RDD; the step budget is
    split evenly across workers.
    """
    md = ModelDir(model_dir, 'predict*')
    total = steps if steps > 0 else data_rdd.count()
    per_worker = math.ceil(total / self.num_workers)
    worker = PredictWorker(steps_per_epoch=per_worker,
                           output_prob=output_prob,
                           **md.to_dict())
    md.delete_result_file()
    cluster = TFCluster.run(self.sc, worker, self.tf_args,
                            self.cluster_size, self.num_ps,
                            input_mode=self.input_mode)
    cluster.train(data_rdd.rdd, num_epochs=1, feed_timeout=6000)
    cluster.shutdown()
    return self.sqlc.createDataFrame(md.read_result())
def yolov3_train(self, model_rdd, data_dir, batch_size, epochs, image_size,
                 model_dir, weights_path=None, freeze_body=2, go_on=False):
    """Distributed YOLOv3 training driven from ``data_dir/train.txt``.

    Returns a DataFrame of training results, or implicitly None when no
    result files were produced.
    """
    train_path = os.path.join(data_dir, 'train.txt')
    assert tf.io.gfile.exists(train_path), "train dataset path not exists!"
    samples = self.sc.textFile(train_path)
    per_worker_steps = math.ceil(
        samples.count() / batch_size / self.num_workers)
    md = ModelDir(model_dir, 'train*')
    if go_on:
        md.create_model_dir()
    else:
        md = md.rebuild_model_dir()
    worker = YOLOV3ModelTrainWorker(model_rdd,
                                    data_dir,
                                    go_on=go_on,
                                    batch_size=batch_size,
                                    epochs=epochs,
                                    image_size=image_size,
                                    steps_per_epoch=per_worker_steps,
                                    freeze_body=freeze_body,
                                    **md.to_dict())
    cluster = TFCluster.run(self.sc, worker, self.tf_args,
                            self.cluster_size, self.num_ps,
                            input_mode=self.input_mode)
    cluster.train(samples, num_epochs=epochs, feed_timeout=60000)
    cluster.shutdown()
    results = md.read_result()
    # implicitly returns None when no results were written
    if results:
        return self.sqlc.createDataFrame(results)