def cli(**args):
    """Main control for the experiments.

    Sets up file logging, loads the experiment's configuration,
    preprocessing, model, summary and output modules from the experiment
    directory, optionally restores a checkpoint, and then either runs a
    single evaluation pass (modes `val`, `test`, `sample`, `transform`) or
    the training loop (modes `train`, `trainval`).

    Keys read from ``args``:
        mode: Name of the mode to run; must be listed in the experiment
            config's ``supp_modes``.
        exp_name: Experiment directory; must start with
            ``experiments/config``.
        override_dset_suffix: If not None, replaces ``dset_suffix`` in the
            experiment config.
        custom_options: String holding a Python literal (parsed with
            ``ast.literal_eval``) used to update the config; ``''`` for
            none.
        num_threads: Stored in the config as ``num_threads``.
        n_samples: Stored in the config as ``n_samples``; also determines
            the number of batches in `sample` mode.
        no_checkpoint: If true, no checkpoint is looked up or restored.
        checkpoint: Optional explicit checkpoint path (with or without the
            ``.meta`` suffix); must be None if ``no_checkpoint`` is set.
        out_fp: If not None, overrides the output directory.
        no_output: If true, no outputs are written by the output module.

    Raises:
        AssertionError: If the experiment path, the mode, or the
            checkpoint options are inconsistent.
        Exception: If a non-training mode is requested and no checkpoint
            could be loaded.

    NOTE(review): several file-name literals below are empty strings, so the
    joined paths point at directories rather than files. They are kept
    unchanged because the intended names are not visible here; restore them
    before running.
    """
    mode = args['mode']
    exp_name = args['exp_name'].strip("/")
    assert exp_name.startswith(path.join("experiments", "config"))
    exp_purename = path.basename(exp_name)
    exp_feat_fp = path.join("experiments", "features", exp_purename)
    exp_log_fp = path.join("experiments", "states", exp_purename)
    for required_dir in (exp_feat_fp, exp_log_fp):
        if not path.exists(required_dir):
            os.makedirs(required_dir)

    # Set up file logging.
    # NOTE(review): empty log file name -- restore the intended name.
    fh = logging.FileHandler(path.join(exp_log_fp, ''))
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter(LOGFORMAT)
    fh.setFormatter(formatter)
    LOGGER.addHandler(fh)
    LOGGER.info("Running on host: %s", socket.getfqdn())
    if "JOB_ID" in os.environ:
        LOGGER.info("Condor job id: %s", os.environ["JOB_ID"])
    LOGGER.info("Running mode `%s` for experiment `%s`.", mode, exp_name)

    # Configuration.
    # NOTE(review): empty module file name -- presumably the experiment's
    # config script; confirm and restore.
    exp_config_mod = imp.load_source('_exp_config',
                                     path.join(exp_name, ''))
    exp_config = exp_config_mod.adjust_config(
        exp_config_mod.get_config(), mode)
    assert mode in exp_config["supp_modes"], (
        "Unsupported mode by this model: %s, available: %s." % (
            mode, str(exp_config["supp_modes"])))
    if args["override_dset_suffix"] is not None:
        LOGGER.warning("Overriding dset suffix to `%s`!",
                       args["override_dset_suffix"])
        exp_config["dset_suffix"] = args["override_dset_suffix"]
    if args['custom_options'] != '':
        custom_options = ast.literal_eval(args["custom_options"])
        exp_config.update(custom_options)
    exp_config['num_threads'] = args["num_threads"]
    exp_config['n_samples'] = args["n_samples"]
    LOGGER.info("Configuration:")
    for key, val in exp_config.items():
        LOGGER.info("%s = %s", key, val)

    # Data setup.
    with tf.device('/cpu:0'):
        nsamples, steps_per_epoch, loader_list = \
            data.get_dataset(EXP_DATA_FP, mode, exp_config)
    LOGGER.info("%d examples prepared, %d steps per epoch.",
                nsamples, steps_per_epoch)
    LOGGER.info("Setting up preprocessing...")
    # NOTE(review): empty module file name -- presumably the experiment's
    # preprocessing script; confirm and restore.
    exp_prep_mod = imp.load_source('_exp_preprocessing',
                                   path.join(exp_name, ''))
    with tf.device('/cpu:0'):
        examples = exp_prep_mod.preprocess(nsamples, loader_list, mode,
                                           exp_config)

    # Checkpointing.
    if args['no_checkpoint']:
        assert args['checkpoint'] is None
    checkpoint, load_session, load_graph = None, None, None
    if not args["no_checkpoint"]:
        LOGGER.info("Looking for checkpoints...")
        if args['checkpoint'] is not None:
            checkpoint = args['checkpoint']
            # Only strip the `.meta` suffix when it is really there; an
            # unconditional `[:-5]` would truncate any other path.
            if checkpoint.endswith('.meta'):
                checkpoint = checkpoint[:-len('.meta')]
        else:
            checkpoint = tf.train.latest_checkpoint(exp_log_fp)
        if checkpoint is None:
            LOGGER.info("No checkpoint found. Continuing without.")
        else:
            load_graph = tf.Graph()
            with load_graph.as_default():
                meta_file = checkpoint + '.meta'
                LOGGER.info("Restoring graph from `%s`...", meta_file)
                rest_saver = tf.train.import_meta_graph(meta_file,
                                                        clear_devices=True)
                LOGGER.info("Graph restored. Loading checkpoint `%s`...",
                            checkpoint)
                load_session = tf.Session()
                rest_saver.restore(load_session, checkpoint)
    if mode not in ['train', 'trainval'] and load_session is None:
        raise Exception("The mode %s requires a checkpoint!" % (mode))

    # Build model.
    # NOTE(review): empty module file name -- presumably the experiment's
    # model script; confirm and restore.
    model_mod = imp.load_source('_model', path.join(exp_name, ''))
    model = model_mod.create_model(mode, examples, exp_config,
                                   (load_session, load_graph))
    del load_graph  # Free graph.
    if load_session is not None:
        load_session.close()

    # Setup summaries.
    # NOTE(review): empty module file name -- presumably the experiment's
    # summaries script; confirm and restore.
    summary_mod = imp.load_source('_summaries', path.join(exp_name, ''))
    display_fetches, test_fetches = summary_mod.create_summaries(
        mode, examples, model, exp_config)

    # Stats.
    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    # Prepare output.
    # NOTE(review): empty module file name -- presumably the experiment's
    # output-writing script; confirm and restore.
    out_mod = imp.load_source("_write_output", path.join(exp_name, ''))

    # Preparing session.
    if mode in ['train', 'trainval']:
        saver = tf.train.Saver(max_to_keep=exp_config["kept_saves"])
    else:
        saver = None
    sess_config = tf.ConfigProto(log_device_placement=False)
    sess_config.gpu_options.allow_growth = True
    sw = tf.summary.FileWriter(path.join(exp_log_fp, mode))
    summary_op = tf.summary.merge_all()
    prepared_session = tf.Session(config=sess_config)
    initializer = tf.global_variables_initializer()
    epoch = 0
    with prepared_session as sess:
        # from tensorflow.python import debug as tf_debug
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        LOGGER.info("Parameter count: %d.", sess.run(parameter_count))
        LOGGER.info("Starting queue runners...")
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        LOGGER.info("Initializing variables...")
        # NOTE(review): `initializer` is otherwise unused, so it is assumed
        # to be run here; confirm this does not clobber restored weights.
        sess.run(initializer)
        fetches = {}
        fetches["global_step"] = model.global_step
        # NOTE(review): the run call on this line is a restored assumption;
        # confirm that `fetches` is what was evaluated here.
        global_step = sess.run(fetches)["global_step"][0]
        LOGGER.info("On global step: %d.", global_step)
        if not glob(path.join(exp_log_fp, mode, 'events.*')):
            LOGGER.info("Summarizing graph...")
            sw.add_graph(sess.graph, global_step=global_step)

        # Decide where output images go.
        if mode in ['val', 'test']:
            image_dir = path.join(exp_feat_fp, 'step_' + str(global_step))
        elif mode in ['sample']:
            image_dir = path.join(
                exp_feat_fp,
                time.strftime("%Y-%m-%d_%H-%M-%S", time.gmtime()))
        else:
            image_dir = exp_log_fp
        if args["out_fp"] is not None:
            image_dir = args["out_fp"]
        if not args["no_output"]:
            LOGGER.info("Writing image status to `%s`.", image_dir)
        else:
            image_dir = None

        if mode in ['val', 'test', 'sample', 'transform']:
            shutdown_requested = [False]  # Needs to be mutable to access.

            def SIGINT_handler(signum, frame):  # noqa: E306
                LOGGER.warning("Received SIGINT.")
                shutdown_requested[0] = True
            signal.signal(signal.SIGINT, SIGINT_handler)

            # run a single epoch over all input data
            if mode in ['val', 'test', 'transform']:
                num_ex = steps_per_epoch
            else:
                # TODO figure out what is happening here
                # if exp_config['model_version'] == 'cvae':
                #     num_ex = (steps_per_epoch *
                #               exp_config["cvae_nsamples_per_image"])
                # else:
                #     num_ex = int(np.ceil(float(args["n_samples"]) /
                #                          exp_config["batch_size"]))
                num_ex = int(np.ceil(float(args["n_samples"]) /
                                     exp_config["batch_size"]))
            av_results = {name: [] for name in test_fetches}
            av_placeholders = {name: tf.placeholder(tf.float32)
                               for name in test_fetches}
            for name in test_fetches:
                tf.summary.scalar(name, av_placeholders[name],
                                  collections=['evaluation'])
            test_summary = tf.summary.merge_all('evaluation')
            display_fetches.update(test_fetches)
            # Pre-bind so the final log line cannot hit an unbound name
            # when the loop below runs zero iterations.
            index_fp = None
            for b_id in tqdm.tqdm(range(num_ex)):
                # NOTE(review): the run call is a restored assumption;
                # `display_fetches` is the dict merged just above.
                results = sess.run(display_fetches)
                if not args['no_output']:
                    index_fp = out_mod.save_images(results, image_dir, mode,
                                                   exp_config, batch=b_id)
                # Check for problems with this result.
                results_valid = True
                for key in test_fetches:
                    if not np.isfinite(results[key]):
                        if 'paths' in results:
                            LOGGER.warning(
                                "There's a problem with results for "
                                "%s! Skipping.", results['paths'][0])
                        else:
                            LOGGER.warning(
                                "Erroneous result for batch %d!", b_id)
                        results_valid = False
                        break
                if results_valid:
                    for key in test_fetches:
                        av_results[key].append(results[key])
                if shutdown_requested[0]:
                    break
            LOGGER.info("Results:")
            feed_results = dict()
            for key in test_fetches:
                av_results[key] = np.mean(av_results[key])
                feed_results[av_placeholders[key]] = av_results[key]
                LOGGER.info(" %s: %s", key, av_results[key])
            if not shutdown_requested[0]:
                # NOTE(review): the evaluated op is a restored assumption;
                # `test_summary` is the only summary fed by `feed_results`.
                sw.add_summary(
                    sess.run(test_summary, feed_dict=feed_results),
                    global_step)
            else:
                LOGGER.warning("Not writing results to tf summary due to "
                               "incomplete evaluation.")
            if not args['no_output']:
                LOGGER.info("Wrote index at `%s`.", index_fp)
        elif mode in ["train", "trainval"]:
            # Training.
            max_steps = 2**32
            last_summary_written = time.time()
            if exp_config["max_epochs"] is not None:
                max_steps = steps_per_epoch * exp_config["max_epochs"]
            # An explicit step limit takes precedence over the epoch limit.
            if exp_config["max_steps"] is not None:
                max_steps = exp_config["max_steps"]
            shutdown_requested = [False]  # Needs to be mutable to access.

            # Register signal handler to save on Ctrl-C.
            def SIGINT_handler(signum, frame):  # noqa: E306
                LOGGER.warning("Received SIGINT. Saving model...")
                # NOTE(review): the save call is a restored assumption.
                saver.save(sess,
                           path.join(exp_log_fp, "model"),
                           global_step=model.global_step)
                shutdown_requested[0] = True
            signal.signal(signal.SIGINT, SIGINT_handler)

            pbar = tqdm.tqdm(
                total=(max_steps - global_step) * exp_config["batch_size"])
            for step in range(global_step, max_steps):
                def should(freq, epochs=False):
                    # True when an action with frequency `freq` (in steps,
                    # or in epochs if `epochs`) is due; always true on the
                    # last step as long as `freq` is positive.
                    if epochs:
                        return freq > 0 and (
                            (epoch + 1) % freq == 0 and
                            (step + 1) % steps_per_epoch == 0 or
                            step == max_steps - 1)
                    else:
                        return freq > 0 and ((step + 1) % freq == 0 or
                                             step == max_steps - 1)

                options = None
                run_metadata = None
                if should(exp_config["trace_freq"]):
                    options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                # Setup fetches.
                fetches = {
                    "train": model.train,
                    "global_step": model.global_step,
                }
                # `summary_freq` is compared against elapsed wall-clock
                # seconds, unlike the step/epoch based frequencies.
                if ((time.time() - last_summary_written) >
                        exp_config["summary_freq"]):
                    fetches["summary"] = summary_op
                if (should(exp_config["display_freq"], epochs=True) or
                        should(exp_config["save_freq"], epochs=True) or
                        step == max_steps - 1):
                    fetches["display"] = display_fetches

                # Run!
                # NOTE(review): the run call is a restored assumption;
                # `fetches` is the dict assembled just above.
                results = sess.run(fetches,
                                   options=options,
                                   run_metadata=run_metadata)

                # Write.
                if (should(exp_config["save_freq"], epochs=True) or
                        results["global_step"] == 1 or
                        step == max_steps - 1):
                    # Save directly at first iteration to make sure this is
                    # working.
                    LOGGER.info("Saving model...")
                    gs = model.global_step
                    # NOTE(review): the save call is a restored assumption.
                    saver.save(sess,
                               path.join(exp_log_fp, "model"),
                               global_step=gs)
                if "summary" in results:
                    sw.add_summary(results["summary"],
                                   results["global_step"])
                    last_summary_written = time.time()
                if "display" in results:
                    LOGGER.info("saving display images")
                    out_mod.save_images(results["display"],
                                        image_dir,
                                        mode,
                                        exp_config,
                                        step=results["global_step"][0])
                if should(exp_config["trace_freq"]):
                    LOGGER.info("recording trace")
                    sw.add_run_metadata(
                        run_metadata, "step_%d" % results["global_step"])
                    trace = timeline.Timeline(
                        step_stats=run_metadata.step_stats)
                    with open(path.join(exp_log_fp, "timeline.json"),
                              "w") as trace_file:
                        trace_file.write(
                            trace.generate_chrome_trace_format())
                    # Enter 'chrome://tracing' in chrome to open the file.
                epoch = results["global_step"] // steps_per_epoch
                pbar.update(exp_config["batch_size"])
                if shutdown_requested[0]:
                    break
            pbar.close()
        LOGGER.info("Shutting down...")
        coord.request_stop()
        coord.join(threads)
    # NOTE(review): placement relative to the session block is assumed.
    data.cleanup()
    LOGGER.info("Done.")
def on_closing():
    """Run `cleanup()` and then destroy `window`."""
    # Order matters as written: cleanup happens before the window goes away.
    cleanup()
    window.destroy()