def remove_training_directory(self, train_dir):
  """Removes the training directory."""
  try:
    logging.info("%s: Removing existing train directory.",
                 task_as_string(self.task))
    gfile.DeleteRecursively(train_dir)
  except Exception:  # Narrowed from a bare except; still logs and continues.
    logging.error(
        "%s: Failed to delete directory %s when starting a new model. "
        "Please delete it manually and try again.",
        task_as_string(self.task), train_dir)
def remove_training_directory(self, train_dir):
  """Removes the training directory."""
  if tf.gfile.Exists(train_dir):
    try:
      logging.info("{}: Removing existing train dir.".format(
          task_as_string(self.task)))
      gfile.DeleteRecursively(train_dir)
    except Exception:  # Narrowed from a bare except.
      logging.error(
          "{}: Failed to delete dir {} when starting a new model. "
          "Delete it manually and try again.".format(
              task_as_string(self.task), train_dir))
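# A minimal standalone sketch of the same "start_new_model" cleanup pattern, for
# reference outside these classes. The helper name `reset_train_dir` and the use
# of `tf.io.gfile` (the TF 2.x path; `tf.gfile` in TF 1.x) are assumptions, not
# taken from the snippets above.
import tensorflow as tf
from absl import logging

def reset_train_dir(train_dir):
  """Deletes train_dir if it exists, then recreates it empty."""
  if tf.io.gfile.exists(train_dir):
    try:
      tf.io.gfile.rmtree(train_dir)
    except tf.errors.OpError as e:
      # Surface the failure but let the caller decide whether to abort.
      logging.error("Failed to delete %s: %s. Delete it manually and retry.",
                    train_dir, e)
      return False
  tf.io.gfile.makedirs(train_dir)
  return True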
def setup_model_checkpoint_and_embeddings(self):
  """Copy embeddings over and precompute theorem database embeddings.

  This function makes sure that we have at least one model checkpoint file
  present. Also it copies over the latest new embeddings when they become
  available and precomputes the embedding store for them.
  """
  logging.info('Setting up model checkpoint and embeddings %s %s',
               str(self.config.copy_model_checkpoints),
               str(self.checkpoint_monitor.has_checkpoint()))
  # We can prohibit copying checkpoints by setting copy_model_checkpoints
  # to false, unless we don't have any checkpoint yet, in which case
  # we try to copy a new checkpoint over.
  while self.config.copy_model_checkpoints or not (
      self.checkpoint_monitor.has_checkpoint()):
    # Whether we have a pre-existing checkpoint.
    has_checkpoint = self.checkpoint_monitor.has_checkpoint()
    logging.info('has checkpoint: %s', has_checkpoint)
    # new_checkpoint is None if the training directory does not
    # have a more recent checkpoint than the one stored in the loop
    # directory. Otherwise it refers to the current newest checkpoint.
    new_checkpoint = self.checkpoint_monitor.new_checkpoint()
    logging.info('new checkpoint: %s', new_checkpoint)
    if new_checkpoint is not None:
      # We have a more recent checkpoint than in our local directory.
      logging.info('New checkpoint: "%s"', new_checkpoint)
      self.checkpoint_monitor.copy_latest_checkpoint()
      chkpt = os.path.join(self.loop_meta.checkpoints_path(), new_checkpoint)
      logging.info('Copied checkpoint: "%s"', chkpt)
      # We try to compute embeddings until we succeed.
      while not gfile.Exists(chkpt + '.npy'):
        runner.Runner().run(
            self.embedding_store_pipeline(chkpt)).wait_until_finish()
        if not gfile.Exists(chkpt + '.npy'):
          logging.error('Could not generate embeddings for the latest '
                        'checkpoint %s.', chkpt)
        else:
          self.checkpoint_monitor.update_latest_checkpoint(new_checkpoint)
          break
    # If we had a pre-existing checkpoint or we managed to copy over
    # a new one, then we have succeeded. Let's not check the checkpoint
    # unless we had none.
    if has_checkpoint or self.checkpoint_monitor.has_checkpoint():
      break
    else:
      # We don't have a checkpoint and never had one. Let's wait for
      # one to appear in the training directory.
      logging.info('Waiting for the first model checkpoint to appear.')
      time.sleep(10)
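# A minimal sketch of the wait-and-poll pattern used above, extracted into a
# standalone helper. The helper name `wait_for_file` and the timeout/interval
# parameters are assumptions for illustration; the original code polls inline.
import time
import tensorflow as tf

def wait_for_file(path, poll_interval_secs=10, timeout_secs=None):
  """Blocks until `path` exists; returns False if the timeout expires."""
  waited = 0
  while not tf.io.gfile.exists(path):
    if timeout_secs is not None and waited >= timeout_secs:
      return False
    time.sleep(poll_interval_secs)
    waited += poll_interval_secs
  return True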
def GetListOfFeatureNamesAndSizes(feature_names, feature_sizes):
  list_of_feature_names = [name.strip() for name in feature_names.split(",")]
  list_of_feature_sizes = [int(size) for size in feature_sizes.split(",")]
  if len(list_of_feature_names) != len(list_of_feature_sizes):
    logging.error("length of the feature names (=%d) != length of feature "
                  "sizes (=%d)", len(list_of_feature_names),
                  len(list_of_feature_sizes))
  return list_of_feature_names, list_of_feature_sizes
def remove_training_directory(self, train_dir):
  """Exits if the training directory already exists (does not remove it)."""
  try:
    logging.info(("{}: Train dir already exists and start_new_model is "
                  "set to True. To restart the model from scratch, "
                  "delete the directory.").format(task_as_string(self.task)))
    # Deliberately not deleting automatically:
    # gfile.DeleteRecursively(train_dir)
    sys.exit()
  except Exception:  # Narrowed so that sys.exit() above is not swallowed here.
    logging.error(
        "{}: Failed to delete directory {} when starting a new "
        "model. Please delete it manually and try again.".format(
            task_as_string(self.task), train_dir))
    sys.exit()
def watch_threads(self):
  """Watch example queue and batch queue threads and restart if dead."""
  while True:
    time.sleep(60)
    for idx, t in enumerate(self._example_q_threads):
      if not t.is_alive():  # if the thread is dead
        log.error('Found example queue thread dead. Restarting.')
        new_t = Thread(target=self.fill_example_queue)
        self._example_q_threads[idx] = new_t
        new_t.daemon = True
        new_t.start()
    for idx, t in enumerate(self._batch_q_threads):
      if not t.is_alive():  # if the thread is dead
        log.error('Found batch queue thread dead. Restarting.')
        new_t = Thread(target=self.fill_batch_queue)
        self._batch_q_threads[idx] = new_t
        new_t.daemon = True
        new_t.start()
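# A generic sketch of the same watchdog idea, factored out so both loops above
# could share one helper. The helper name `restart_dead_threads` is an
# assumption; the original code keeps the two loops inline.
from threading import Thread

def restart_dead_threads(threads, target):
  """Replaces dead threads in `threads` (in place) with fresh daemon threads."""
  for idx, t in enumerate(threads):
    if not t.is_alive():
      new_t = Thread(target=target)
      new_t.daemon = True
      threads[idx] = new_t
      new_t.start()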
def GetListOfFeatureNamesAndSizes(feature_names, feature_sizes):
  """Extract the list of feature names and the dimensionality of each feature
  from strings of comma separated values.

  Args:
    feature_names: string containing comma separated list of feature names
    feature_sizes: string containing comma separated list of feature sizes

  Returns:
    List of the feature names and list of the dimensionality of each feature.
    Elements in the first/second list are strings/integers.
  """
  list_of_feature_names = [name.strip() for name in feature_names.split(',')]
  list_of_feature_sizes = [int(size) for size in feature_sizes.split(',')]
  if len(list_of_feature_names) != len(list_of_feature_sizes):
    logging.error("length of the feature names (=%d) != length of feature "
                  "sizes (=%d)", len(list_of_feature_names),
                  len(list_of_feature_sizes))
  return list_of_feature_names, list_of_feature_sizes
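# Example of the expected inputs and outputs; the feature names and sizes below
# are illustrative values, not taken from the snippet above.
names, sizes = GetListOfFeatureNamesAndSizes("rgb, audio", "1024,128")
# names == ["rgb", "audio"], sizes == [1024, 128]. A length mismatch only logs
# an error and still returns both lists, so callers should validate if needed.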
def recover_session(self):
  # Recover session
  saver = None
  latest_checkpoint = tf.train.latest_checkpoint(self.train_dir)
  if self.config.start_new_model:
    logging.info("'start_new_model' flag is set. Removing existing train dir.")
    try:
      gfile.DeleteRecursively(self.train_dir)
    except Exception:  # Narrowed from a bare except.
      logging.error(
          "Failed to delete directory %s when starting a new model. "
          "Please delete it manually and try again.", self.train_dir)
  elif not latest_checkpoint:
    logging.info("No checkpoint file found. Building a new model.")
  else:
    meta_filename = latest_checkpoint + ".meta"
    if not gfile.Exists(meta_filename):
      logging.info("No meta graph file found. Building a new model.")
    else:
      logging.info("Restoring from meta graph file %s", meta_filename)
      saver = tf.train.import_meta_graph(meta_filename)
  return saver
def __init__(self,
             num_classes=4800,
             feature_sizes=[1024],
             feature_names=["inc3"],
             max_frames=300):
  """Construct a YT8MFrameFeatureReader.

  Args:
    num_classes: a positive integer for the number of classes.
    feature_sizes: positive integer(s) for the feature dimensions as a list.
    feature_names: the feature name(s) in the tensorflow record as a list.
    max_frames: the maximum number of frames to process.
  """
  if len(feature_names) != len(feature_sizes):
    logging.error("length of the feature names "
                  "(=" + str(len(feature_names)) + ") != length of "
                  "feature sizes (=" + str(len(feature_sizes)) + ")")
  self.num_classes = num_classes
  self.feature_sizes = feature_sizes
  self.feature_names = feature_names
  self.max_frames = max_frames
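# Illustrative construction only: the two-feature configuration below (rgb plus
# audio) mirrors the public YT8M frame-level features and is an assumption, not
# part of the snippet above, whose defaults are a single 1024-d "inc3" feature.
reader = YT8MFrameFeatureReader(num_classes=3862,
                                feature_names=["rgb", "audio"],
                                feature_sizes=[1024, 128],
                                max_frames=300)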
def text_generator(self, example_generator):
  """Generates article and abstract text from tf.Example.

  Args:
    example_generator: a generator of tf.Examples from file.
      See data.example_generator"""
  while True:
    e = next(example_generator)  # e is a tf.Example
    try:
      # the article text was saved under the key 'article' in the data files
      article_text = e.features.feature['article'].bytes_list.value[0].decode()
      # the abstract text was saved under the key 'abstract' in the data files
      abstract_text = e.features.feature['abstract'].bytes_list.value[0].decode()
    except ValueError:
      log.error('Failed to get article or abstract from example')
      continue
    if len(article_text) == 0:
      # See https://github.com/abisee/pointer-generator/issues/1
      log.warning('Found an example with empty article text. Skipping it.')
    else:
      yield (article_text, abstract_text)
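# A minimal sketch of the tf.Example layout this generator expects. The feature
# keys 'article' and 'abstract' come from the snippet above; the helper name
# `make_example` is an assumption for illustration.
import tensorflow as tf

def make_example(article, abstract):
  """Builds a tf.Example with the 'article'/'abstract' byte features."""
  return tf.train.Example(features=tf.train.Features(feature={
      'article': tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[article.encode()])),
      'abstract': tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[abstract.encode()])),
  }))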
def __init__(self,
             root: str,
             config: loop_pb2.LoopConfig,
             controller_fingerprint: int,
             read_only=None):
  self.root = os.path.join(root, config.name)
  self.config = config
  self.read_only = read_only
  self.controller_fingerprint = controller_fingerprint
  self.status = None
  self.error = None
  if self.layout_exists():
    self.error = self.check_layout()
    self.read_status()
    assert self.status, 'Could not read status %s.' % self.status_filename()
  else:
    if read_only:
      self.error = 'Non-existent loop layout at %s' % self.root
    else:
      self.status = self.new_status()
      self.make_layout()
      self.error = self.check_layout()
  if self.error is not None:
    logging.error('%s', self.error)
def run_model(target, num_workers, global_step): ########################## # Config learning_rate # ########################## learning_rate = optimizer_utils.configure_learning_rate( FLAGS.num_sample_per_epoch, global_step) ########################################################## # Config optimizer and Wrapper optimizer with PAI-Soar # ########################################################## samples_per_step = FLAGS.batch_size optimizer = optimizer_utils.configure_optimizer(learning_rate) if FLAGS.enable_paisoar: import paisoar optimizer = paisoar.ReplicatedVarsOptimizer( optimizer, clip_norm=FLAGS.max_gradient_norm) ctx = paisoar.Config.get() samples_per_step *= len(ctx.device_indices) * num_workers ####################### # Config model func # ####################### model_fn = model_factory.get_model_fn(FLAGS.model_name, num_classes=FLAGS.num_classes, weight_decay=FLAGS.weight_decay, is_training=True) ############################# # Config dataset iterator # ############################# with tf.device('/cpu:0'): train_image_size = model_fn.default_image_size # split dataset by worker data_sources = get_tfrecord_files( _DATASET_TRAIN_FILES[FLAGS.dataset_name] or FLAGS.train_files, num_workers) # select the preprocessing func preprocessing_fn = preprocessing_factory.get_preprocessing( FLAGS.preprocessing_name or FLAGS.model_name, is_training=True) if (FLAGS.preprocessing_name or FLAGS.model_name) else None dataset_iterator = dataset_factory.get_dataset_iterator( FLAGS.dataset_name, train_image_size, preprocessing_fn, data_sources, FLAGS.reader) ############################################### # Config loss_func and Wrapper with PAI-Soar # ############################################### accuracy = [] def loss_fn(): with tf.device('/cpu:0'): images, labels = dataset_iterator.get_next() logits, end_points = model_fn(images) loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=tf.cast( logits, tf.float32), weights=1.0) if 'AuxLogits' in end_points: loss += tf.losses.sparse_softmax_cross_entropy( labels=labels, logits=tf.cast(end_points['AuxLogits'], tf.float32), weights=0.4) per_accuracy = tf.reduce_mean( tf.cast(tf.equal(tf.argmax(logits, axis=1), labels), tf.float32)) accuracy.append(per_accuracy) return loss # wrapper loss_fn with PAI-Soar 2.0 loss = optimizer.compute_loss(loss_fn, loss_scale=FLAGS.loss_scale) if FLAGS.enable_paisoar \ else loss_fn() ######################## # Config train tensor # ######################## train_op = optimizer.minimize(loss, global_step=global_step) ############################################### # Log trainable or optimizer variables info, # # including name and size. # ############################################### log_trainable_or_optimizer_vars_info() ################ # Restore ckpt # ################ if FLAGS.model_dir and FLAGS.task_type == 'finetune': utils.load_checkpoint() ######################### # Config training hooks # ######################### params = dict() if FLAGS.log_loss_every_n_iters > 0: tensors_to_log = { 'loss': loss if isinstance(loss, tf.Tensor) else loss.replicas[0], 'accuracy': tf.reduce_mean(accuracy), 'lrate': learning_rate } params['tensors_to_log'] = tensors_to_log params['samples_per_step'] = samples_per_step hooks = get_hooks(params=params) ########################### # Kicks off the training. 
  # ###########################
  logging.info('training starts.')
  with tf.train.MonitoredTrainingSession(target,
                                         is_chief=(FLAGS.task_index == 0),
                                         hooks=hooks) as sess:
    try:
      while not sess.should_stop():
        sess.run(train_op)
    except tf.errors.OutOfRangeError:
      print('All threads done.')
    except Exception as e:
      import sys
      import traceback
      logging.error(str(e))  # Exception.message does not exist in Python 3.
      traceback.print_exc(file=sys.stdout)
  logging.info('training ends.')
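# The `get_hooks(params=...)` helper used above is not shown in this snippet.
# A minimal sketch, under the assumption that it wraps the standard TF 1.x
# session hooks (the hook choices and the 100-step interval are illustrative):
import tensorflow as tf

def get_hooks(params):
  hooks = []
  if 'tensors_to_log' in params:
    hooks.append(tf.train.LoggingTensorHook(params['tensors_to_log'],
                                            every_n_iter=100))
  if 'max_steps' in params:
    hooks.append(tf.train.StopAtStepHook(last_step=params['max_steps']))
  return hooks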
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, } flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if os.path.exists(flags_json_path): existing_flags = json.load(open(flags_json_path)) if existing_flags != model_flags_dict: logging.error("Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. with open(flags_json_path, "w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] labels_origin = tf.get_collection("original_labels")[0] sampling_distribution = tf.get_collection("sampling_probs")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() sv = tf.train.Supervisor( graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, #save_model_secs=15 * 60, save_model_secs=int(FLAGS.save_checkpoint_every_n_hour * 3600), #save_summaries_secs=120, save_summaries_secs=int(FLAGS.save_checkpoint_every_n_hour * 3600), saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val, labels_origin_val, sampling_distribution_val = sess.run( [train_op, global_step, loss, predictions, labels, labels_origin, sampling_distribution]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[0] / seconds_per_batch ## check on the correctness of label sampling #if not np.all(labels_val[0]==labels_origin_val[0]): # print(np.where(labels_val[0])[0], " --- ", np.where(labels_origin_val[0])[0]) #print(np.all(labels_val == labels_origin_val, 1).mean()) #temp_bool = np.sum(labels_val, 1)<1 #if np.any(temp_bool): # print("\n\n\nFOUND!!", np.where(labels_val[temp_bool])[0], "\n\n\n") #print(sampling_distribution_val[:5, :5]) if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True #if self.is_master and global_step_val % 10 == 0 and self.train_dir: if self.is_master and global_step_val % FLAGS.validate_every_n_training_steps == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_origin_val) perr = 
eval_util.calculate_precision_at_equal_recall_rate(predictions_val, labels_origin_val) gap = eval_util.calculate_gap(predictions_val, labels_origin_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() with open(FLAGS.train_dir + '/global_step_{%d}_training_GAP_{%.6f}.txt' % (global_step_val, gap), 'w') as f: f.write('\n') # Exporting the model every x steps time_to_export = ((self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: #logging.info("training step " + str(global_step_val) + " | Loss: " + #("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) continue except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
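# The `utils.MakeSummary` helper used throughout these training loops is not
# shown here; in the YT8M starter code it is a thin wrapper around tf.Summary.
# A sketch of that wrapper, for reference:
import tensorflow as tf

def MakeSummary(name, value):
  """Creates a tf.Summary proto holding a single scalar value."""
  summary = tf.Summary()
  val = summary.value.add()
  val.tag = str(name)
  val.simple_value = float(value)
  return summary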
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) logging.info( '############## PARAMETERS ##############################') logging.info("feature_names {}".format(FLAGS.feature_names)) logging.info("feature_sizes {}".format(FLAGS.feature_sizes)) logging.info("frame_features {}".format(FLAGS.frame_features)) logging.info("model {}".format(FLAGS.model)) logging.info("start_new_model {}".format(FLAGS.start_new_model)) logging.info("num_gpu {}".format(FLAGS.num_gpu)) logging.info("batch_size {}".format(FLAGS.batch_size)) logging.info("label_loss {}".format(FLAGS.label_loss)) logging.info("regularization_penalty {}".format( FLAGS.regularization_penalty)) logging.info("base_learning_rate {}".format(FLAGS.base_learning_rate)) logging.info("learning_rate_decay {}".format( FLAGS.learning_rate_decay)) logging.info("learning_rate_decay_examples {}".format( FLAGS.learning_rate_decay_examples)) logging.info("num_epochs {}".format(FLAGS.num_epochs)) logging.info("max_steps {}".format(FLAGS.max_steps)) logging.info("export_model_steps {}".format(FLAGS.export_model_steps)) logging.info("num_readers {}".format(FLAGS.num_readers)) logging.info("optimizer {}".format(FLAGS.optimizer)) logging.info("clip_gradient_norm {}".format(FLAGS.clip_gradient_norm)) logging.info( '########################################################') logging.info(' '.join([x for x in sys.argv])) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, } flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if os.path.exists(flags_json_path): existing_flags = json.load(open(flags_json_path)) if existing_flags != model_flags_dict: logging.error( "Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. 
with open(flags_json_path, "w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: with tf.device(device_fn): saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op, init_fn = None, None if meta_filename: saver = tf.train.Saver(tf.global_variables(), max_to_keep=0, keep_checkpoint_every_n_hours=0.25) def init_fn(sess): return saver.restore(sess, meta_filename) else: init_op = tf.global_variables_initializer() sv = tf.train.Supervisor(graph, logdir=self.train_dir, init_op=init_op, init_fn=init_fn, is_chief=self.is_master, global_step=global_step, save_model_secs=40 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[ 0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % 10 == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ( (self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, } flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if os.path.exists(flags_json_path): existing_flags = json.load(open(flags_json_path)) if existing_flags != model_flags_dict: logging.error("Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. with open(flags_json_path, "w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() if FLAGS.ema_source: # Here the variables still exsist ema_op = tf.get_collection("ema_op")[0] def_vars = tf.get_collection("updatable_vars") ema_vars = tf.get_collection("ema_vars") sv = tf.train.Supervisor( graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=15 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: if FLAGS.ema_source: logging.info("%s: Entering training loop.", task_as_string(self.task)) sess.graph._unsafe_unfinalize() ckpt_reader = pywrap_tensorflow.NewCheckpointReader(FLAGS.ema_source) for xtensor, ematensor in zip(def_vars, ema_vars): src_tensor = ckpt_reader.get_tensor(xtensor.name.split(":")[0]) # Loading does not take up graoh space xtensor.load(src_tensor, session=sess) ematensor.load(src_tensor, session=sess) # sess.run(tf.assign(xtensor, src_tensor)) # sess.run(tf.assign(ematensor, src_tensor)) try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) if FLAGS.ema_source: # Update EMA if needed _ = sess.run(ema_op) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % 10 == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val, labels_val) gap = 
eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ((self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: # self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: save_name = "{0}/model.ckpt".format(self.train_dir) saver.save(sess, save_name, global_step_val) logging.info("Final model export.") logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def fit(self, train_data_pipeline, start_new_model=False, tr_data_fn=None, tr_data_paras=None, validate_set=None, validate_fn=None, bootstrap=False, init_learning_rate=0.01, decay_steps=40000, decay_rate=0.95, epochs=None, l1_reg_rate=None, l2_reg_rate=0.01, pos_weights=None, initial_weights=None, initial_biases=None): """ Logistic regression fit function. Args: train_data_pipeline: A namedtuple consisting of reader, data_pattern, batch_size and num_readers. start_new_model: If True, start a new model instead of restoring from existing checkpoints. tr_data_fn: a function that transforms input data. tr_data_paras: Other parameters should be passed to tr_data_fn. A dictionary. validate_set: If not None, check validation loss regularly. Else, ignored. validate_fn: The function to check the performance of learned model parameters on validate set. bootstrap: If True, sampling training examples with replacement by differential weighting. init_learning_rate: Decayed gradient descent parameter. decay_steps: Decayed gradient descent parameter. decay_rate: Decayed gradient descent parameter. epochs: Maximal epochs to use. l1_reg_rate: None, not impose l1 regularization. l2_reg_rate: l2 regularization rate. pos_weights: For imbalanced binary classes. Here, num_pos << num_neg, the weights should be > 1.0. If None, treated as 1.0 for all binary classifiers. initial_weights: If not None, the weights will be initialized with it. initial_biases: If not None, the biases will be initialized with it. Returns: None. """ reader = train_data_pipeline.reader batch_size = train_data_pipeline.batch_size num_classes = reader.num_classes feature_names = reader.feature_names feature_sizes = reader.feature_sizes logging.info( 'Logistic regression uses {} features with dims {}.'.format( feature_names, feature_sizes)) raw_feature_size = sum(feature_sizes) self.train_data_pipeline = train_data_pipeline self.raw_feature_size = raw_feature_size self.feature_size = raw_feature_size self.num_classes = num_classes self.batch_size = batch_size self.tr_data_fn = tr_data_fn self.tr_data_paras = tr_data_paras self.bootstrap = bootstrap self.init_learning_rate = init_learning_rate self.decay_steps = decay_steps self.decay_rate = decay_rate self.epochs = epochs self.l1_reg_rate = l1_reg_rate self.l2_reg_rate = l2_reg_rate self.pos_weights = pos_weights self.initial_weights = initial_weights self.initial_biases = initial_biases # Check extra data transform function arguments. # If transform changes the features size, change it. if self.tr_data_fn is not None: if self.tr_data_paras is None: self.tr_data_paras = dict() else: if ('reshape' in self.tr_data_paras) and ( self.tr_data_paras['reshape'] is True): self.feature_size = self.tr_data_paras['size'] logging.warn( 'Data transform changes the features size to {}.'. format(self.feature_size)) logging.debug('Data transform arguments are {}.'.format( self.tr_data_paras)) else: self.tr_data_paras = dict() start_new_model = start_new_model or (not tf.gfile.Exists(self.logdir)) # This is NECESSARY to avoid contaminating default graph. # Alternatively, we can define a member graph variable. When building a new graph or # restoring a graph, wrap the code into a similar contextmanager. self.graph = tf.Graph() with self.graph.as_default(): if start_new_model: logging.info('Starting a new model...') # Start new model, delete existing checkpoints. 
if tf.gfile.Exists(self.logdir): try: tf.gfile.DeleteRecursively(self.logdir) except tf.errors.OpError: logging.error('Failed to delete dir {}.'.format( self.logdir)) else: logging.info( 'Succeeded to delete train dir {}.'.format( self.logdir)) else: # Do nothing. pass # Build graph, namely building a graph and initialize member variables associated with graph. self.saver = self._build_graph() else: self.saver = self._restore_graph() # After either building a graph or restoring a graph, graph is CONSTRUCTED successfully. # Get collections to be used in training. self.global_step = tf.get_collection('global_step')[0] self.init_op = tf.get_collection('init_op')[0] self.train_op = tf.get_collection('train_op')[0] self.summary_op = tf.get_collection('summary_op')[0] self.raw_features_batch = tf.get_collection( 'raw_features_batch')[0] self.labels_batch = tf.get_collection('labels_batch')[0] self.loss = tf.get_collection('loss')[0] self.pred_prob = tf.get_collection('predictions')[0] if self._check_graph_initialized(): logging.info('Succeeded to initialize logistic regression Graph.') else: logging.error('Failed to initialize logistic regression Graph.') # Start or restore training. # To avoid summary causing memory usage peak, manually save summaries. sv = tf.train.Supervisor(graph=self.graph, init_op=self.init_op, logdir=self.logdir, global_step=self.global_step, summary_op=None, save_model_secs=600, saver=self.saver) with sv.managed_session() as sess: logging.info("Entering training loop...") for step in range(self.max_train_steps): if sv.should_stop(): # Save the final model and break. self.saver.save(sess, save_path='{}_{}'.format( sv.save_path, 'final')) break if step % 500 == 0: if validate_fn is not None: _, summary, train_pred_prob_batch, train_labels_batch, global_step_val = sess.run( [ self.train_op, self.summary_op, self.pred_prob, self.labels_batch, self.global_step ]) # Evaluate on train data. train_per = validate_fn( predictions=train_pred_prob_batch, labels=train_labels_batch) sv.summary_writer.add_summary( MakeSummary( 'train/{}'.format(validate_fn.func_name), train_per), global_step_val) logging.info('Step {}, train {}: {}.'.format( global_step_val, validate_fn.func_name, train_per)) else: _, summary, global_step_val = sess.run( [self.train_op, self.summary_op, self.global_step]) # Add train summary. sv.summary_computed(sess, summary, global_step=global_step_val) # Compute validate loss and performance (validate_fn). if validate_set is not None: validate_data, validate_labels = validate_set # Compute validation loss. 
num_validate_videos = validate_data.shape[0] split_indices = np.linspace( 0, num_validate_videos + 1, num=max( num_validate_videos // (2 * batch_size) + 1, 2), dtype=np.int32) validate_loss_vals, predictions = [], [] for i in range(len(split_indices) - 1): start_ind = split_indices[i] end_ind = split_indices[i + 1] if validate_fn is not None: ith_validate_loss_val, ith_predictions = sess.run( [self.loss, self.pred_prob], feed_dict={ self.raw_features_batch: validate_data[start_ind:end_ind], self.labels_batch: validate_labels[start_ind:end_ind] }) validate_loss_vals.append( ith_validate_loss_val * (end_ind - start_ind)) predictions.append(ith_predictions) else: ith_validate_loss_val = sess.run( self.loss, feed_dict={ self.raw_features_batch: validate_data[start_ind:end_ind], self.labels_batch: validate_labels[start_ind:end_ind] }) validate_loss_vals.append( ith_validate_loss_val * (end_ind - start_ind)) validate_loss_val = sum( validate_loss_vals) / num_validate_videos # Add validate summary. sv.summary_writer.add_summary( MakeSummary('validate/xentropy', validate_loss_val), global_step_val) if validate_fn is not None: validate_per = validate_fn( predictions=np.concatenate(predictions, axis=0), labels=validate_labels) sv.summary_writer.add_summary( MakeSummary( 'validate/{}'.format( validate_fn.func_name), validate_per), global_step_val) logging.info('Step {}, validate {}: {}.'.format( global_step_val, validate_fn.func_name, validate_per)) elif step % 200 == 0: _, summary, global_step_val = sess.run( [self.train_op, self.summary_op, self.global_step]) sv.summary_computed(sess, summary, global_step=global_step_val) else: sess.run(self.train_op) logging.info("Exited training loop.") # Session will close automatically when with clause exits. # sess.close() sv.stop()
from ffn.inference import movement
from ffn.training import mask
from ffn.training.import_util import import_symbol
from ffn.training import inputs
from ffn.training import augmentation
# Necessary so that optimizer flags are defined.
# pylint: disable=unused-import
from ffn.training import optimizer
# pylint: enable=unused-import

try:
  import horovod.tensorflow as hvd
  horovodworks = True
except Exception as e:
  horovodworks = False
  logging.error('Horovod not found')
  logging.error(e)

  class hvd():
    __version__ = '0.0.0'

    def rank():
      return 0

    def local_rank():
      return 0

    def size():
      return 1

    def init():
      pass

FLAGS = flags.FLAGS

# Options related to training data.
flags.DEFINE_string('train_coords', None,
def main(unused_argv): """ Train the rbf network. """ logging.set_verbosity(logging.INFO) start_new_model = FLAGS.start_new_model output_dir = FLAGS.output_dir # The ratio of examples to sample as centers (prototypes). num_centers_ratio = FLAGS.num_centers_ratio model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes reader = get_reader(model_type, feature_names, feature_sizes) train_data_pattern = FLAGS.train_data_pattern validate_data_pattern = FLAGS.validate_data_pattern batch_size = FLAGS.batch_size num_readers = FLAGS.num_readers # distance metric, cosine or euclidean. dist_metric = FLAGS.dist_metric init_with_linear_clf = FLAGS.init_with_linear_clf init_learning_rate = FLAGS.init_learning_rate decay_steps = FLAGS.decay_steps decay_rate = FLAGS.decay_rate train_epochs = FLAGS.train_epochs l1_reg_rate = FLAGS.l1_reg_rate l2_reg_rate = FLAGS.l2_reg_rate # ....Start rbf network... logging.info('Entering rbf network...') # Validate set is not stored in graph or meta data. Re-create it any way. # Sample validate set for logistic regression early stopping. validate_data_pipeline = DataPipeline(reader=reader, data_pattern=validate_data_pattern, batch_size=batch_size, num_readers=num_readers) if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')): with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f: validate_data = pickle.load(f) with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f: validate_labels = pickle.load(f) else: # Sample validate set. _, validate_data, validate_labels, _ = random_sample( 0.05, mask=(False, True, True, False), data_pipeline=validate_data_pipeline, name_scope='sample_validate') with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f: pickle.dump(validate_data, f) with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f: pickle.dump(validate_labels, f) # DataPipeline consists of reader, batch size, no. of readers and data pattern. train_data_pipeline = DataPipeline(reader=reader, data_pattern=train_data_pattern, batch_size=batch_size, num_readers=num_readers) # If start a new model or output dir does not exist, truly start a new model. start_new_model = start_new_model or (not tf.gfile.Exists(output_dir)) if start_new_model: # PHASE ONE - selecting prototypes c, computing scaling factors sigma. # num_centers = FLAGS.num_centers # num_centers_ratio = float(num_centers) / NUM_TRAIN_EXAMPLES # metric is euclidean or cosine. If cosine, alpha=1.0, otherwise can be less than 1.0. if 'cosine' == dist_metric: # 200 will lead to decreasing drastically and increasing slowly. alpha = 1.0 else: alpha = 1.0 centers, sigmas = initialize(num_centers_ratio, data_pipeline=train_data_pipeline, method='kmeans', metric=dist_metric, scaling_method=4, alpha=alpha) # PHASE TWO - computing linear regression weights and biases. num_centers = centers.shape[0] # Compute mean and variance after data transform. tr_data_fn = rbf_transform tr_data_paras = { 'centers': centers, 'sigmas': sigmas, 'metric': dist_metric, 'reshape': True, 'size': num_centers } """ # Include standard scale to rbf transform. tr_data_mean, tr_data_var = compute_data_mean_var(train_data_pipeline, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras) logging.debug('tr_data_mean: {}\ntr_data_var: {}'.format(tr_data_mean, tr_data_var)) tr_data_paras.update({'mean': tr_data_mean, 'variance': tr_data_var}) """ if init_with_linear_clf: # Call linear classification to get a good initial values of weights and biases. 
linear_clf = LinearClassifier( logdir=path_join(output_dir, 'linear_classifier')) linear_clf.fit(data_pipeline=train_data_pipeline, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras, l2_regs=[ 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0 ], validate_set=(validate_data, validate_labels), line_search=True) linear_clf_weights, linear_clf_biases = linear_clf.weights, linear_clf.biases else: linear_clf_weights, linear_clf_biases = None, None # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers. try: # Load sum_labels in training set, numpy float format to compute pos_weights. train_sum_labels = load_sum_labels() # num_neg / num_pos, assuming neg_weights === 1.0. pos_weights = np.sqrt( float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0) logging.info( 'Computing pos_weights based on sum_labels in train set successfully.' ) except IOError: logging.error('Cannot load train sum_labels. Use default value.') pos_weights = None finally: pos_weights = None else: linear_clf_weights, linear_clf_biases = None, None tr_data_fn, tr_data_paras = None, None pos_weights = None # PHASE THREE - fine tuning prototypes c, scaling factors sigma and weights and biases. log_reg_clf = LogisticRegression(logdir=path_join(output_dir, 'log_reg')) log_reg_clf.fit(train_data_pipeline=train_data_pipeline, start_new_model=start_new_model, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras, validate_set=(validate_data, validate_labels), validate_fn=gap_fn, init_learning_rate=init_learning_rate, decay_steps=decay_steps, decay_rate=decay_rate, epochs=train_epochs, l1_reg_rate=l1_reg_rate, l2_reg_rate=l2_reg_rate, pos_weights=pos_weights, initial_weights=linear_clf_weights, initial_biases=linear_clf_biases) # ....Exit rbf network... logging.info('Exit rbf network.')
def main(unused_argv): logging.set_verbosity(logging.INFO) start_new_model = FLAGS.start_new_model output_dir = FLAGS.output_dir init_learning_rate = FLAGS.init_learning_rate decay_steps = FLAGS.decay_steps decay_rate = FLAGS.decay_rate l1_reg_rate = FLAGS.l1_reg_rate l2_reg_rate = FLAGS.l2_reg_rate is_bootstrap = FLAGS.is_bootstrap train_epochs = FLAGS.train_epochs model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes reader = get_reader(model_type, feature_names, feature_sizes) train_data_pattern = FLAGS.train_data_pattern validate_data_pattern = FLAGS.validate_data_pattern batch_size = FLAGS.batch_size num_readers = FLAGS.num_readers if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')): with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f: validate_data = pickle.load(f) with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f: validate_labels = pickle.load(f) else: # Increase num_readers. validate_data_pipeline = DataPipeline( reader=reader, data_pattern=validate_data_pattern, batch_size=batch_size, num_readers=num_readers) # Sample validate set. _, validate_data, validate_labels, _ = random_sample( 0.05, mask=(False, True, True, False), data_pipeline=validate_data_pipeline, name_scope='sample_validate') with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f: pickle.dump(validate_data, f) with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f: pickle.dump(validate_labels, f) train_data_pipeline = DataPipeline(reader=reader, data_pattern=train_data_pattern, batch_size=batch_size, num_readers=num_readers) model_save_path = path_join(output_dir, 'mlp_fuse') if start_new_model and tf.gfile.Exists(model_save_path): logging.info('Starting a new model...') # Start new model, delete existing checkpoints. try: tf.gfile.DeleteRecursively(model_save_path) except tf.errors.OpError: logging.error('Failed to delete dir {}.'.format(model_save_path)) else: logging.info( 'Succeeded to delete train dir {}.'.format(model_save_path)) # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers. try: # Load sum_labels in training set, numpy float format to compute pos_weights. train_sum_labels = load_sum_labels() # num_neg / num_pos, assuming neg_weights === 1.0. pos_weights = np.sqrt( float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0) logging.info( 'Computing pos_weights based on sum_labels in train set successfully.' ) except IOError: logging.error('Cannot load train sum_labels. Use default value.') pos_weights = None finally: logging.warn('Not to use positive weights.') pos_weights = None train(train_data_pipeline, epochs=train_epochs, pos_weights=pos_weights, l1_reg_rate=l1_reg_rate, l2_reg_rate=l2_reg_rate, init_learning_rate=init_learning_rate, bootstrap=is_bootstrap, validate_set=(validate_data, validate_labels), validate_fn=gap_fn, logdir=model_save_path)
arg_parser.add_argument("--preprocess", "-p", type=bool, default=False) arg_parser.add_argument("--data", "-d", type=str, default="./data") args = arg_parser.parse_args() mode = args.mode if_preprocess = args.preprocess data_dir = args.data logging.set_verbosity(logging.INFO) train_x, train_y, test_x, train_data, test_data = load_data(data_dir=data_dir, if_preprocess=if_preprocess) logging.info("building model...") model = attention() restored = model.restore() if mode == "train": logging.info("training...") model.train(train_x, train_y,epochs=100,batch_size=150) elif mode == "evaluate": logging.info("evaluating...") if restored: for name, value in model.evaluate(train_x,train_y,batch_size=150): print("name: %s, value: %f" % (name, value)) else: logging.error("error: model weights not exist!") elif mode == "submit": logging.info("predicting final result...") test_data[LABEL_LIST] = model.predict(test_x, batch_size=150) test_data = test_data[["id"]+LABEL_LIST] test_data.to_csv("submission.csv", index=False)
if msg == 'reset':
    for data_path in paths:
        filename = os.path.split(data_path)[1]
        filepath = os.path.join(data_dir, filename)
        if os.path.exists(filepath):
            logging.debug(filename + " already exists locally")
            data_msg_q.put_data(filepath)
            logging.debug("reuse local data " + filepath + " done")
            continue
        count, ret_code = 0, -1
        command = ["hadoop", "fs", "-get", data_path, data_dir]
        while count < retry_times and ret_code != 0:
            try:
                ret_code = subprocess.check_call(command)
            except subprocess.CalledProcessError as e:  # Python 3 except syntax.
                logging.error(e)
            count += 1
        if ret_code == 0:
            data_msg_q.put_data(filepath)
            logging.debug("fetch " + data_path + " done")
        else:
            logging.debug("fetch " + data_path + " failed")
            data_msg_q.put_data(None)
elif msg == 'shuffle':
    random.shuffle(paths)
elif msg == 'Done':
    break
msg = data_msg_q.get_msg()
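# The retry-and-report pattern above, condensed into a reusable helper for
# reference; the helper name `fetch_with_retry` is an assumption.
import logging
import subprocess

def fetch_with_retry(command, retry_times=3):
    """Runs `command`, retrying up to `retry_times` times; True on success."""
    for _ in range(retry_times):
        try:
            subprocess.check_call(command)
            return True
        except subprocess.CalledProcessError as e:
            logging.error(e)
    return False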
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, } flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if os.path.exists(flags_json_path): existing_flags = json.load(open(flags_json_path)) if existing_flags != model_flags_dict: logging.error("Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. with open(flags_json_path, "w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() sv = tf.train.Supervisor( graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=15 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % 10 == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) 
sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ((self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def main(unused_argv): """ Training. init_learning_rate: Initial learning rate. decay_steps: How many training steps to decay learning rate once. decay_rate: How much to decay learning rate. l2_reg_rate: l2 regularization rate. epochs: The maximal epochs to pass all training data. """ logging.set_verbosity(logging.INFO) output_dir = FLAGS.output_dir start_new_model = FLAGS.start_new_model init_learning_rate = FLAGS.init_learning_rate decay_steps = FLAGS.decay_steps decay_rate = FLAGS.decay_rate l2_reg_rate = FLAGS.l2_reg_rate train_epochs = FLAGS.train_epochs model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes reader = get_reader(model_type, feature_names, feature_sizes) train_data_pattern = FLAGS.train_data_pattern validate_data_pattern = FLAGS.validate_data_pattern batch_size = FLAGS.batch_size num_readers = FLAGS.num_readers init_with_linear_clf = FLAGS.init_with_linear_clf is_bootstrap = FLAGS.is_bootstrap # Increase num_readers. validate_data_pipeline = DataPipeline(reader=reader, data_pattern=validate_data_pattern, batch_size=batch_size, num_readers=num_readers) if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')): with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f: validate_data = pickle.load(f) with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f: validate_labels = pickle.load(f) else: # Sample validate set for line search in linear classifier or logistic regression early stopping. _, validate_data, validate_labels, _ = random_sample( 0.05, mask=(False, True, True, False), data_pipeline=validate_data_pipeline) with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f: pickle.dump(validate_data, f) with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f: pickle.dump(validate_labels, f) start_new_model = start_new_model or (not tf.gfile.Exists(output_dir)) # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers. try: # Load sum_labels in training set, numpy float format to compute pos_weights. train_sum_labels = load_sum_labels() # num_neg / num_pos, assuming neg_weights === 1.0. pos_weights = np.sqrt( (float(NUM_TRAIN_EXAMPLES) - train_sum_labels) / train_sum_labels) logging.info( 'Computing pos_weights based on sum_labels in train set successfully.' ) except IOError: logging.error('Cannot load train sum_labels. Use default value.') pos_weights = None finally: logging.error('Disable pos_weights.') # Set it as None to disable pos_weights. pos_weights = None train_data_pipeline = DataPipeline(reader=reader, data_pattern=train_data_pattern, batch_size=batch_size, num_readers=num_readers) if start_new_model: # Load train data mean and std. train_features_mean, train_features_var = load_features_mean_var( reader) tr_data_fn = standard_scale tr_data_paras = { 'mean': train_features_mean, 'variance': train_features_var, 'reshape': False, 'size': None } if init_with_linear_clf: # ...Start linear classifier... # Compute weights and biases of linear classifier using normal equation. # Linear search helps little. 
linear_clf = LinearClassifier( logdir=path_join(output_dir, 'linear_classifier')) linear_clf.fit(data_pipeline=train_data_pipeline, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras, l2_regs=[ 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0 ], validate_set=(validate_data, validate_labels), line_search=True) linear_clf_weights, linear_clf_biases = linear_clf.weights, linear_clf.biases logging.info( 'linear classifier weights and biases with shape {}, {}'. format(linear_clf_weights.shape, linear_clf_biases.shape)) logging.debug( 'linear classifier weights and {} biases: {}.'.format( linear_clf_weights, linear_clf_biases)) # ...Exit linear classifier... else: linear_clf_weights, linear_clf_biases = None, None else: linear_clf_weights, linear_clf_biases = None, None tr_data_fn = None tr_data_paras = None # Run logistic regression. log_reg = LogisticRegression(logdir=path_join(output_dir, 'log_reg')) log_reg.fit(train_data_pipeline, start_new_model=start_new_model, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras, validate_set=(validate_data, validate_labels), validate_fn=gap_fn, bootstrap=is_bootstrap, init_learning_rate=init_learning_rate, decay_steps=decay_steps, decay_rate=decay_rate, epochs=train_epochs, l2_reg_rate=l2_reg_rate, pos_weights=pos_weights, initial_weights=linear_clf_weights, initial_biases=linear_clf_biases)
def main(unused_argv): logging.set_verbosity(tf.logging.INFO) print("tensorflow version: %s" % tf.__version__) is_chief = (FLAGS.task == 0) # Recover session saver = None latest_checkpoint = tf.train.latest_checkpoint(FLAGS.train_dir) if FLAGS.start_new_model: logging.info("'start_new_model' flag is set. Removing existing train dir.") try: gfile.DeleteRecursively(FLAGS.train_dir) except: logging.error( "Failed to delete directory " + FLAGS.train_dir + " when starting a new model. Please delete it manually and" + " try again.") elif not latest_checkpoint: logging.info("No checkpoint file found. Building a new model.") else: meta_filename = latest_checkpoint + ".meta" if not gfile.Exists(meta_filename): logging.info("No meta graph file found. Building a new model.") else: logging.info("Restoring from meta graph file %s", meta_filename) saver = tf.train.import_meta_graph(meta_filename) if not saver: # convert feature_names and feature_sizes to lists of values feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes( FLAGS.feature_names, FLAGS.feature_sizes) if FLAGS.frame_features: reader = readers.YT8MFrameFeatureReader( feature_names=feature_names, feature_sizes=feature_sizes) else: reader = readers.YT8MAggregatedFeatureReader( feature_names=feature_names, feature_sizes=feature_sizes) model = find_class_by_name(FLAGS.model, [frame_level_models, video_level_models])() label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])() optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train]) build_graph(reader=reader, model=model, optimizer_class=optimizer_class, clip_gradient_norm=FLAGS.clip_gradient_norm, train_data_pattern=FLAGS.train_data_pattern, label_loss_fn=label_loss_fn, base_learning_rate=FLAGS.base_learning_rate, learning_rate_decay=FLAGS.learning_rate_decay, learning_rate_decay_examples=FLAGS.learning_rate_decay_examples, regularization_penalty=FLAGS.regularization_penalty, num_readers=FLAGS.num_readers, batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs) logging.info("built graph") saver = tf.train.Saver(max_to_keep=0, keep_checkpoint_every_n_hours=0.25) train_loop(is_chief=is_chief, train_dir=FLAGS.train_dir, saver=saver, master=FLAGS.master)
def define_data_input(model, queue_batch=None):
  """Adds TF ops to load input data."""
  label_volume_map = {}
  for vol in FLAGS.label_volumes.split(','):
    volname, path, dataset = vol.split(':')
    label_volume_map[volname] = h5py.File(path)[dataset]

  image_volume_map = {}
  for vol in FLAGS.data_volumes.split(','):
    volname, path, dataset = vol.split(':')
    image_volume_map[volname] = h5py.File(path)[dataset]

  if queue_batch is None:
    queue_batch = FLAGS.batch_size

  # Fetch sizes of images and labels.
  label_size = train_labels_size(model)
  image_size = train_image_size(model)

  label_radii = (label_size // 2).tolist()
  label_size = label_size.tolist()
  image_radii = (image_size // 2).tolist()
  image_size = image_size.tolist()

  # Fetch a single coordinate and volume name from a queue reading the
  # coordinate files or from saved hard/important examples.
  import os.path  # local import kept from the original; ideally at module top
  if os.path.isfile(FLAGS.train_coords):
    logging.info('{} exists.'.format(FLAGS.train_coords))
  else:
    logging.error('{} does not exist.'.format(FLAGS.train_coords))

  if FLAGS.sharding_rule == 0:
    coord, volname = inputs.load_patch_coordinates(FLAGS.train_coords)
  elif FLAGS.sharding_rule == 1 and 'horovod' in sys.modules:
    d = tf.data.TFRecordDataset(FLAGS.train_coords, compression_type='GZIP')
    d = d.shard(hvd.size(), hvd.rank())
    d = d.map(parser_fn)
    iterator = d.make_one_shot_iterator()
    coord, volname = iterator.get_next()
  else:
    logging.warning('You need to install Horovod to use sharding. '
                    'Turning sharding off...')
    FLAGS.sharding_rule = 0
    coord, volname = inputs.load_patch_coordinates(FLAGS.train_coords)

  # Load object labels (segmentation).
  labels = inputs.load_from_numpylike(
      coord, volname, label_size, label_volume_map)

  label_shape = [1] + label_size[::-1] + [1]
  # label_shape = [1] + [1] + label_size[::-1]  # NCDHW
  labels = tf.reshape(labels, label_shape)

  loss_weights = tf.constant(np.ones(label_shape, dtype=np.float32))

  # Load image data.
  patch = inputs.load_from_numpylike(
      coord, volname, image_size, image_volume_map)
  data_shape = [1] + image_size[::-1] + [1]
  patch = tf.reshape(patch, shape=data_shape)

  if ((FLAGS.image_stddev is None or FLAGS.image_mean is None) and
      not FLAGS.image_offset_scale_map):
    raise ValueError('--image_mean, --image_stddev or '
                     '--image_offset_scale_map need to be defined')

  # Convert segmentation into a soft object mask.
  lom = tf.logical_and(
      labels > 0,
      tf.equal(labels, labels[0, label_radii[2], label_radii[1],
                              label_radii[0], 0]))
  labels = inputs.soften_labels(lom)

  # Apply basic augmentations.
  transform_axes = augmentation.PermuteAndReflect(
      rank=5, permutable_axes=_get_permutable_axes(),
      reflectable_axes=_get_reflectable_axes())
  labels = transform_axes(labels)
  patch = transform_axes(patch)
  loss_weights = transform_axes(loss_weights)

  # Normalize image data.
  patch = inputs.offset_and_scale_patches(
      patch, volname[0],
      offset_scale_map=_get_offset_and_scale_map(),
      default_offset=FLAGS.image_mean,
      default_scale=FLAGS.image_stddev)

  # Create a batch of examples. Note that any TF operation before this line
  # will be hidden behind a queue, so expensive/slow ops can take advantage
  # of multithreading.
  # MK TODO: check num_threads usage here.
  patches, labels, loss_weights = tf.train.shuffle_batch(
      [patch, labels, loss_weights], queue_batch,
      num_threads=max(1, FLAGS.batch_size // 2),
      capacity=32 * FLAGS.batch_size,
      min_after_dequeue=4 * FLAGS.batch_size,
      enqueue_many=True)

  return patches, labels, loss_weights, coord, volname
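The sharded tf.data branch above maps a parser_fn that is not shown in this snippet. A plausible sketch follows; the feature names and shapes ('center', 'label_volume_name') are assumptions modeled on inputs.load_patch_coordinates and must be adjusted to the actual record schema.

def parser_fn(serialized_example):
  # Decode one GZIP-compressed coordinate record into (coord, volname).
  features = tf.parse_single_example(
      serialized_example,
      features={
          'center': tf.FixedLenFeature(shape=[1, 3], dtype=tf.int64),
          'label_volume_name': tf.FixedLenFeature(shape=[1], dtype=tf.string),
      })
  return features['center'], features['label_volume_name']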
def run(self, start_new_model=False):
  """Performs training on the currently defined Tensorflow graph.

  Returns:
    A tuple of the training Hit@1 and the training PERR.
  """
  if self.is_master and start_new_model:
    self.remove_training_directory(self.train_dir)

  if not os.path.exists(self.train_dir):
    os.makedirs(self.train_dir)

  model_flags_dict = {
      "model": FLAGS.model,
      "feature_sizes": FLAGS.feature_sizes,
      "feature_names": FLAGS.feature_names,
      "frame_features": FLAGS.frame_features,
      "label_loss": FLAGS.label_loss,
  }
  flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
  if file_io.file_exists(flags_json_path):
    existing_flags = json.load(file_io.FileIO(flags_json_path, mode="r"))
    if existing_flags != model_flags_dict:
      logging.error(
          "Model flags do not match existing file %s. Please "
          "delete the file, change --train_dir, or pass flag "
          "--start_new_model", flags_json_path)
      logging.error("Ran model with flags: %s", str(model_flags_dict))
      logging.error("Previously ran with flags: %s", str(existing_flags))
      exit(1)
  else:
    # Write the file.
    with file_io.FileIO(flags_json_path, mode="w") as fout:
      fout.write(json.dumps(model_flags_dict))

  target, device_fn = self.start_server_if_distributed()

  meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

  with tf.Graph().as_default() as graph:
    if meta_filename:
      saver = self.recover_model(meta_filename)

    with tf.device(device_fn):
      if not meta_filename:
        saver = self.build_model()

      global_step = tf.get_collection("global_step")[0]
      loss = tf.get_collection("loss")[0]
      predictions = tf.get_collection("predictions")[0]
      labels = tf.get_collection("labels")[0]
      train_op = tf.get_collection("train_op")[0]
      init_op = tf.global_variables_initializer()
      if FLAGS.dropout:
        keep_prob_tensor = tf.get_collection("keep_prob")[0]
      if FLAGS.noise_level > 0:
        noise_level_tensor = tf.get_collection("noise_level")[0]
      if FLAGS.reweight:
        weights_input, weights_assignment = None, None
        if len(tf.get_collection("weights_input")) > 0:
          weights_input = tf.get_collection("weights_input")[0]
          weights_assignment = tf.get_collection("weights_assignment")[0]

  sv = tf.train.Supervisor(
      graph,
      logdir=self.train_dir,
      init_op=init_op,
      is_chief=self.is_master,
      global_step=global_step,
      save_model_secs=FLAGS.keep_checkpoint_interval * 60,
      save_summaries_secs=120,
      saver=saver)

  logging.info("%s: Starting managed session.", task_as_string(self.task))
  with sv.managed_session(target, config=self.config) as sess:
    # Re-assign weights if requested.
    if FLAGS.reweight:
      optional_assign_weights(sess, weights_input, weights_assignment)

    steps = 0
    try:
      logging.info("%s: Entering training loop.", task_as_string(self.task))
      while not sv.should_stop():
        steps += 1
        batch_start_time = time.time()
        custom_feed = {}
        if FLAGS.dropout:
          custom_feed[keep_prob_tensor] = FLAGS.keep_prob
        if FLAGS.noise_level > 0:
          custom_feed[noise_level_tensor] = FLAGS.noise_level
        _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
            [train_op, global_step, loss, predictions, labels],
            feed_dict=custom_feed)
        seconds_per_batch = time.time() - batch_start_time

        if self.is_master:
          examples_per_second = labels_val.shape[0] / seconds_per_batch
          hit_at_one = eval_util.calculate_hit_at_one(
              predictions_val, labels_val)
          perr = eval_util.calculate_precision_at_equal_recall_rate(
              predictions_val, labels_val)
          recall = "N/A"
          if False:  # Recall@N computation is disabled.
            recall = eval_util.calculate_recall_at_n(
                predictions_val, labels_val, FLAGS.recall_at_n)
            sv.summary_writer.add_summary(
                utils.MakeSummary(
                    "model/Training_Recall@%d" % FLAGS.recall_at_n, recall),
                global_step_val)
            recall = "%.2f" % recall
          gap = eval_util.calculate_gap(predictions_val, labels_val)

          logging.info(
              "%s: training step %d | Hit@1: %.2f PERR: %.2f GAP: %.2f "
              "Recall@%d: %s Loss: %s",
              task_as_string(self.task), global_step_val, hit_at_one, perr,
              gap, FLAGS.recall_at_n, recall, str(loss_val))

          sv.summary_writer.add_summary(
              utils.MakeSummary("model/Training_Hit@1", hit_at_one),
              global_step_val)
          sv.summary_writer.add_summary(
              utils.MakeSummary("model/Training_Perr", perr), global_step_val)
          sv.summary_writer.add_summary(
              utils.MakeSummary("model/Training_GAP", gap), global_step_val)
          sv.summary_writer.add_summary(
              utils.MakeSummary("global_step/Examples/Second",
                                examples_per_second), global_step_val)
          sv.summary_writer.flush()

        if FLAGS.max_steps is not None and steps > FLAGS.max_steps:
          logging.info("%s: Done training -- max_steps limit reached.",
                       task_as_string(self.task))
          break
    except tf.errors.OutOfRangeError:
      logging.info("%s: Done training -- epoch limit reached.",
                   task_as_string(self.task))

    logging.info("%s: Exited training loop.", task_as_string(self.task))
    sv.Stop()
def makeTFRecordBatches(save_path, xml_path, img_path, batch_size,
                        words=False, bounded_size_h=25, bounded_size_w=350):
  if not os.path.exists(save_path):
    os.makedirs(save_path)

  totalT, totalI, totalL = [], [], []
  nr = 0
  xml_files = sorted(glob.glob(xml_path + "*.xml"))
  for i, xml in enumerate(xml_files):
    print("=============================== FILE: {} / {} ==============================="
          .format(i + 1, len(xml_files)))
    name = xml.split('.xml')[0].split('/')[-1]
    img = img_path + name + '.png'
    if not os.path.isfile(img):
      continue
    ret = process_page(xml, img, words)
    if ret is None:
      continue
    texts, imgs, lengths = ret
    totalT.extend(texts)
    totalI.extend(imgs)
    totalL.extend(lengths)

    if len(totalT) >= batch_size:
      record_path = save_path + "/iam_data-{}.tfrecords".format(nr)
      writer = tf.python_io.TFRecordWriter(path=record_path)
      for ii in range(batch_size):
        imshape_1_bad = False
        imshape_2_bad = False
        old_shape = totalI[ii].shape
        if totalI[ii].shape[0] > bounded_size_h:
          imshape_1_bad = True
          totalI[ii] = cv2.resize(totalI[ii],
                                  (totalI[ii].shape[1], bounded_size_h))
        if totalI[ii].shape[1] > bounded_size_w:
          imshape_2_bad = True
          totalI[ii] = cv2.resize(totalI[ii],
                                  (bounded_size_w, totalI[ii].shape[0]))
        if imshape_1_bad or imshape_2_bad:
          logging.error("Image shape was bounded: {} -> {}".format(
              old_shape, totalI[ii].shape))

        imgI = add_padding(totalI[ii], pad_to=(bounded_size_h, bounded_size_w))

        example = tf.train.Example(features=tf.train.Features(feature={
            'seq_len': tf.train.Feature(
                int64_list=tf.train.Int64List(value=[totalL[ii]])),
            'target': tf.train.Feature(
                int64_list=tf.train.Int64List(
                    value=totalT[ii].astype("int64"))),
            'imageInput': tf.train.Feature(
                float_list=tf.train.FloatList(
                    value=imgI.reshape(-1).astype("float"))),
        }))
        writer.write(example.SerializeToString())

      writer.flush()
      writer.close()
      nr += 1
      # Report the file that was actually written (the previous code re-built
      # the filename with the already-incremented counter).
      print("Batch file #({}) saved successfully to: {}!".format(
          nr, record_path))
      totalT = totalT[batch_size:]
      totalI = totalI[batch_size:]
      totalL = totalL[batch_size:]
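For completeness, a matching reader for the records written above might look like the following. Treating 'target' as a VarLenFeature is an assumption (label lengths vary), and the fixed image size mirrors the writer's bounds.

def parse_iam_example(serialized, bounded_size_h=25, bounded_size_w=350):
  # Feature names mirror the writer above.
  features = tf.parse_single_example(
      serialized,
      features={
          'seq_len': tf.FixedLenFeature([1], tf.int64),
          'target': tf.VarLenFeature(tf.int64),
          'imageInput': tf.FixedLenFeature(
              [bounded_size_h * bounded_size_w], tf.float32),
      })
  image = tf.reshape(features['imageInput'], [bounded_size_h, bounded_size_w])
  target = tf.sparse_tensor_to_dense(features['target'])
  return image, target, features['seq_len']

# Usage sketch:
# dataset = tf.data.TFRecordDataset(
#     glob.glob(save_path + '/iam_data-*.tfrecords')).map(parse_iam_example)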
def run(self, start_new_model=False):
  """Performs training on the currently defined Tensorflow graph.

  Returns:
    A tuple of the training Hit@1 and the training PERR.
  """
  if self.is_master and start_new_model:
    self.remove_training_directory(self.train_dir)

  if not os.path.exists(self.train_dir):
    os.makedirs(self.train_dir)

  model_flags_dict = {
      "model": FLAGS.model,
      "feature_sizes": FLAGS.feature_sizes,
      "feature_names": FLAGS.feature_names,
      "frame_features": FLAGS.frame_features,
      "label_loss": FLAGS.label_loss,
      "netvlad_cluster_size": FLAGS.netvlad_cluster_size,
      "netvlad_hidden_size": FLAGS.netvlad_hidden_size,
      "moe_l2": FLAGS.moe_l2,
      "iterations": FLAGS.iterations,
      "netvlad_relu": FLAGS.netvlad_relu,
      "gating": FLAGS.gating,
      "moe_num_mixtures": FLAGS.moe_num_mixtures,
      "moe_prob_gating": FLAGS.moe_prob_gating,
  }
  # Alternatively, dump all flags:
  # model_flags_dict = {}
  # for k in FLAGS.__flags:
  #   model_flags_dict[k] = FLAGS[k].value

  flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json")
  if os.path.exists(flags_json_path):
    with open(flags_json_path) as fin:
      existing_flags = json.load(fin)
    if existing_flags != model_flags_dict:
      logging.error(
          "Model flags do not match existing file %s. Please "
          "delete the file, change --train_dir, or pass flag "
          "--start_new_model", flags_json_path)
      logging.error("Ran model with flags: %s", str(model_flags_dict))
      logging.error("Previously ran with flags: %s", str(existing_flags))
      exit(1)
  else:
    # Write the file.
    with open(flags_json_path, "w") as fout:
      fout.write(json.dumps(model_flags_dict))

  target, device_fn = self.start_server_if_distributed()

  meta_filename = self.get_meta_filename(start_new_model, self.train_dir)

  with tf.Graph().as_default() as graph:
    if meta_filename:
      saver = self.recover_model(meta_filename)

    with tf.device(device_fn):
      if not meta_filename:
        saver = self.build_model(self.model, self.reader)

      global_step = tf.get_collection("global_step")[0]
      loss = tf.get_collection("loss")[0]
      predictions = tf.get_collection("predictions")[0]
      labels = tf.get_collection("labels")[0]
      train_op = tf.get_collection("train_op")[0]
      init_op = tf.global_variables_initializer()

  sv = tf.train.Supervisor(graph,
                           logdir=self.train_dir,
                           init_op=init_op,
                           is_chief=self.is_master,
                           global_step=global_step,
                           save_model_secs=15 * 60,
                           save_summaries_secs=120,
                           saver=saver)

  logging.info("%s: Starting managed session.", task_as_string(self.task))
  with sv.managed_session(target, config=self.config) as sess:
    try:
      logging.info("%s: Entering training loop.", task_as_string(self.task))
      while (not sv.should_stop()) and (not self.max_steps_reached):
        batch_start_time = time.time()
        _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
            [train_op, global_step, loss, predictions, labels])
        seconds_per_batch = time.time() - batch_start_time
        examples_per_second = labels_val.shape[0] / seconds_per_batch

        if self.max_steps and self.max_steps <= global_step_val:
          self.max_steps_reached = True

        if self.is_master and global_step_val % 10 == 0 and self.train_dir:
          eval_start_time = time.time()
          hit_at_one = eval_util.calculate_hit_at_one(
              predictions_val, labels_val)
          perr = eval_util.calculate_precision_at_equal_recall_rate(
              predictions_val, labels_val)
          gap = eval_util.calculate_gap(predictions_val, labels_val)
          eval_end_time = time.time()
          eval_time = eval_end_time - eval_start_time

          logging.info(
              "training step %d | Loss: %.2f Examples/sec: %.2f | "
              "Hit@1: %.2f PERR: %.2f GAP: %.2f",
              global_step_val, loss_val, examples_per_second, hit_at_one,
              perr, gap)

          sv.summary_writer.add_summary(
              utils.MakeSummary("model/Training_Hit@1", hit_at_one),
              global_step_val)
          sv.summary_writer.add_summary(
              utils.MakeSummary("model/Training_Perr", perr), global_step_val)
          sv.summary_writer.add_summary(
              utils.MakeSummary("model/Training_GAP", gap), global_step_val)
          sv.summary_writer.add_summary(
              utils.MakeSummary("global_step/Examples/Second",
                                examples_per_second), global_step_val)
          sv.summary_writer.flush()

          # Export the model every x steps.
          time_to_export = (
              (self.last_model_export_step == 0) or
              (global_step_val - self.last_model_export_step >=
               self.export_model_steps))

          if self.is_master and time_to_export:
            self.export_model(global_step_val, sv.saver, sv.save_path, sess)
            self.last_model_export_step = global_step_val
        else:
          logging.info(
              "training step %d | Loss: %.2f Examples/sec: %.2f",
              global_step_val, loss_val, examples_per_second)
    except tf.errors.OutOfRangeError:
      logging.info("%s: Done training -- epoch limit reached.",
                   task_as_string(self.task))

    logging.info("%s: Exited training loop.", task_as_string(self.task))
    sv.Stop()
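Hit@1 as logged above is simply the fraction of examples whose single highest-scoring class is a true label. A numpy sketch, intended as an approximation of eval_util.calculate_hit_at_one rather than its exact code:

import numpy as np

def hit_at_one(predictions, labels):
  # predictions, labels: (num_examples, num_classes); labels are 0/1.
  top1 = np.argmax(predictions, axis=1)
  return float(np.mean(labels[np.arange(len(top1)), top1] > 0))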
def train_model(model_name, model, row_start=None, row_end=None, step=None,
                initial_epoch=0, end_epoch=1, time_limit=None):
  if initial_epoch >= end_epoch:
    logging.error('initial_epoch(%d) >= end_epoch(%d).',
                  initial_epoch, end_epoch)
    return None

  if 'batch_size' not in config:
    config['batch_size'] = default_batch_size
  if 'does_shuffle' not in config:
    config['does_shuffle'] = default_does_shuffle
  if 'callbacks' not in config:
    config['callbacks'] = default_callbacks
  if 'monitored_loss_name' not in config:
    config['monitored_loss_name'] = default_monitored_loss_name
  if 'max_queue_size' not in config:
    config['max_queue_size'] = default_max_queue_size
  if 'does_use_multiprocessing' not in config:
    config['does_use_multiprocessing'] = default_does_use_multiprocessing
  if 'worker_number' not in config:
    config['worker_number'] = default_worker_number
  if 'verbose' not in config:
    config['verbose'] = default_verbose

  callbacks = list() if config['callbacks'] is not None else None
  if callbacks is not None:
    for cb in config['callbacks']:
      if isinstance(cb, keras.callbacks.Callback):
        if isinstance(cb, TimeLimiter) and time_limit is not None:
          logging.warning('train_model: parameter time_limit is not None, '
                          'ignored TimeLimiter in config.')
          continue
        callbacks.append(cb)
      elif isinstance(cb, str):
        cb_str = cb.lower()
        cb_str = re.sub(pattern=_remove_pattern, repl='', string=cb_str)
        sep_idx = cb_str.find(':')
        cb_params = dict()
        if sep_idx >= 0:
          cb_name = cb_str[:sep_idx]
          cb_params_strs = cb_str[sep_idx + 1:].split(',')
          for cb_param_str in cb_params_strs:
            eq_idx = cb_param_str.find('=')
            if eq_idx >= 0:
              cb_params[cb_param_str[:eq_idx]] = cb_param_str[eq_idx + 1:]
            else:
              cb_params[cb_param_str] = '1'
        else:
          cb_name = cb_str

        if cb_name == 'earlystopping':
          es_monitor = (config['monitored_loss_name']
                        if 'monitor' not in cb_params else cb_params['monitor'])
          if 'baseline' not in cb_params:
            _, es_baseline = load_best_info(model_name=model_name,
                                            monitor_name=es_monitor)
          else:
            es_baseline = float(cb_params['baseline'])
          callbacks.append(keras.callbacks.EarlyStopping(
              monitor=es_monitor,
              min_delta=(EPSILON if 'min_delta' not in cb_params
                         else float(cb_params['min_delta'])),
              patience=(2 if 'patience' not in cb_params
                        else int(cb_params['patience'])),
              verbose=(1 if 'verbose' not in cb_params
                       else int(cb_params['verbose'])),
              mode=('min' if 'mode' not in cb_params else cb_params['mode']),
              baseline=es_baseline,
          ))
        elif cb_name == 'tensorboard':
          callbacks.append(keras.callbacks.TensorBoard(
              log_dir=(os.path.join(LOG_DIRECTORY, model_name)
                       if 'log_dir' not in cb_params else cb_params['log_dir']),
              batch_size=config['batch_size'],
              write_graph=(True if 'write_graph' not in cb_params
                           else str_to_bool(cb_params['write_graph'])),
          ))
        elif cb_name == 'modelsaver':
          callbacks.append(ModelSaver(
              model_name=model_name,
              period=(1 if 'period' not in cb_params
                      else int(cb_params['period'])),
              verbose=(1 if 'verbose' not in cb_params
                       else int(cb_params['verbose'])),
          ))
        elif cb_name == 'epochnumbersaver':
          callbacks.append(EpochNumberSaver(
              model_name=model_name,
              verbose=(1 if 'verbose' not in cb_params
                       else int(cb_params['verbose'])),
          ))
        elif cb_name == 'bestinfosaver':
          bi_monitor = (config['monitored_loss_name']
                        if 'monitor' not in cb_params else cb_params['monitor'])
          if 'baseline' not in cb_params:
            _, bi_baseline = load_best_info(model_name=model_name,
                                            monitor_name=bi_monitor)
          else:
            bi_baseline = float(cb_params['baseline'])
          callbacks.append(BestInfoSaver(
              model_name=model_name,
              monitor=bi_monitor,
              mode=('min' if 'mode' not in cb_params else cb_params['mode']),
              baseline=bi_baseline,
              verbose=(1 if 'verbose' not in cb_params
                       else int(cb_params['verbose'])),
          ))
        elif cb_name == 'timelimiter':
          if time_limit is not None:
            logging.warning('train_model: parameter time_limit is not None, '
                            'ignored TimeLimiter in config.')
            continue
          if 'limit' not in cb_params:
            raise ValueError("TimeLimiter's parameter limit is missing.")
          callbacks.append(TimeLimiter(
              limit=cb_params['limit'],
              verbose=(1 if 'verbose' not in cb_params
                       else int(cb_params['verbose'])),
          ))
        else:
          raise UnknownCallbackNameException(cb)
      else:
        raise TypeError('Callback must be an instance of '
                        'keras.callbacks.Callback or a callback name (string).')

  if time_limit is not None:
    callbacks.append(TimeLimiter(limit=time_limit, verbose=1))

  rolling_window_size = get_rolling_window_size(model_name)
  generator = SquareExDataGenerator(
      dataset_name=DATASET_NAME_TRAIN,
      rolling_window_size=rolling_window_size,
      row_start=row_start,
      row_end=row_end,
      step=step,
      max_batch_size=config['batch_size'],
      does_shuffle=config['does_shuffle'],
  )
  history = model.fit_generator(
      generator=generator,
      epochs=end_epoch,
      verbose=config['verbose'],
      callbacks=callbacks,
      max_queue_size=config['max_queue_size'],
      use_multiprocessing=config['does_use_multiprocessing'],
      workers=config['worker_number'],
      initial_epoch=initial_epoch,
  )
  return history
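The string-based callback specs parsed above can be exercised as in the sketch below. The keys, the monitored metric name, and the assumption that _remove_pattern strips whitespace are illustrative only; unknown callback names raise UnknownCallbackNameException.

# Hypothetical configuration; names are lower-cased by the parser, so
# capitalization does not matter.
config['callbacks'] = [
    'earlystopping: monitor=val_loss, patience=3, min_delta=0.0001',
    'tensorboard',
    'modelsaver: period=1, verbose=1',
    'bestinfosaver: monitor=val_loss, mode=min',
]
history = train_model('my_model', model, initial_epoch=0, end_epoch=10)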