def __init__(self, trainpath, testpath, log_path='data', n_layers=150):
    """Store the data locations, build the Keras callbacks and start training.

    Construction is not passive: it finishes by calling ``self.transform()``
    followed by ``self.fit()``.

    Args:
        trainpath: Stored as ``self.train_path``.
        testpath: Stored as ``self.test_path``.
        log_path: Stored as ``self.data_folder``.
        n_layers: Stored as ``self.n_layers``.
    """
    self.train_path, self.test_path = trainpath, testpath
    self.n_layers = n_layers
    self.data_folder = log_path
    self.callbacks_list = []

    # TensorBoard events go to a fresh, timestamped run directory.
    logdir = os.path.join(
        config.LOG_PATH,
        "logs/image/" + datetime.now().strftime("%Y%m%d-%H%M%S"),
    )
    # Writer on the "cm" sub-directory; it is not referenced again here.
    file_writer_cm = create_file_writer(logdir + '/cm')
    tensorboard_callback = TensorBoard(log_dir=logdir)

    # Weights-only checkpoint that keeps the best ``val_loss``.
    weights_file = os.path.join(config.MODEL_PATH,
                                config.BASE_MODEL + "_training/cp.ckpt")
    cp_callback = ModelCheckpoint(
        filepath=weights_file,
        save_weights_only=True,
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
    )

    # NOTE(review): the early stopper is constructed but is not part of
    # ``callbacks_list`` below - confirm whether that is intentional.
    early_stopper = EarlyStopping(
        monitor='val_loss',
        min_delta=0.0001,
        patience=6,
        verbose=1,
        mode='auto',
    )

    self.callbacks_list = [cp_callback, tensorboard_callback]
    self.transform()
    self.fit()
def host_call_fn(**kwargs):
    """Write the mean of every keyword tensor as a scalar summary.

    Each keyword name is used as the summary name and its value is reduced
    with ``tf.reduce_mean`` before being recorded through a file writer
    created on ``summary_dir``.

    Returns:
        Whatever ``contrib_summary.all_summary_ops()`` returns.
    """
    summary_writer = contrib_summary.create_file_writer(summary_dir,
                                                        max_queue=1000)
    recording = contrib_summary.always_record_summaries()
    with summary_writer.as_default(), recording:
        for tag in kwargs:
            mean_value = tf.reduce_mean(input_tensor=kwargs[tag])
            contrib_summary.scalar(tag, mean_value)
        return contrib_summary.all_summary_ops()
def main(env_name="CartPole-v0",
         n_steps=1000000,
         random_actions=False,
         verbose=False,
         visualise=False,
         reward_style=None,
         testing=False):
    """Run a ``Q_Learn`` agent on a Gym environment for a fixed number of steps.

    Args:
        env_name: Environment id handed to ``gym.make``.
        n_steps: How many times ``agent.step()`` is called.
        random_actions: Forwarded to ``Q_Learn``.
        verbose: Forwarded to ``Q_Learn``.
        visualise: Forwarded to ``Q_Learn``.
        reward_style: Forwarded to ``Q_Learn``.
        testing: When truthy, ``-test`` is appended to the log directory name.
    """
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    suffix = "-test" if testing else ""
    summary_writer = tf_summary.create_file_writer(
        f"logs/dqn/{stamp}{suffix}")

    env_ = gym.make(env_name)
    agent = Q_Learn(
        env_,
        Epsilon(0.1, 0.9, 0.99),
        obs_precision=1,
        tf_writer=summary_writer,
        random_actions=random_actions,
        verbose=verbose,
        visualise=visualise,
        reward_style=reward_style,
    )

    for _ in range(n_steps):
        agent.step()

    env_.env.close()
def __init__(self,
             base_dir,
             create_agent_fn,
             create_environment_fn=unity_lib.create_otc_environment,
             checkpoint_file_prefix='ckpt',
             logging_file_prefix='log',
             log_every_n=1,
             num_iterations=200,
             training_steps=2500,
             evaluation_steps=1250,
             max_steps_per_episode=2700):
    """Initialize the Runner object in charge of running a full experiment.

    Args:
      base_dir: str, the base directory to host all required sub-directories.
        Must not be None.
      create_agent_fn: A function called as
        `create_agent_fn(environment, base_dir, summary_writer=...)` that
        returns an agent.
      create_environment_fn: A function called with no arguments that returns
        the environment.
      checkpoint_file_prefix: str, the prefix to use for checkpoint files.
      logging_file_prefix: str, prefix to use for the log files.
      log_every_n: int, the frequency for writing logs.
      num_iterations: int, the iteration number threshold.
      training_steps: int, the number of training steps to perform.
      evaluation_steps: int, the number of evaluation steps to perform.
      max_steps_per_episode: int, maximum number of steps after which an
        episode terminates.

    This constructor will take the following actions:
    - Create the experiment directories.
    - Create a summary writer on `base_dir` and install it as the default
      summary writer.
    - Initialize an environment.
    - Initialize an agent.
    - Reload from the latest checkpoint, if available, and initialize the
      Checkpointer object.
    """
    assert base_dir is not None
    self._logging_file_prefix = logging_file_prefix
    self._log_every_n = log_every_n
    self._num_iterations = num_iterations
    self._training_steps = training_steps
    self._evaluation_steps = evaluation_steps
    self._max_steps_per_episode = max_steps_per_episode
    self._base_dir = base_dir
    self._create_directories()

    self._summary_writer = tf_summary.create_file_writer(self._base_dir)
    # `as_default()` merely builds a context manager; unless it is entered
    # with `with`, nothing happens. `set_as_default()` is the call that
    # actually installs the writer as the default.
    self._summary_writer.set_as_default()

    self.experiment_data = {}
    self._environment = create_environment_fn()
    self._agent = create_agent_fn(self._environment,
                                  self._base_dir,
                                  summary_writer=self._summary_writer)
    self._initialize_checkpointer_and_maybe_resume(checkpoint_file_prefix)
def _get_SummaryWriter(self): if not self.args.debug and not self.args.do_test: ensure_dir(os.path.join('./summary/', self.experiment_name)) self.summarywriter = summary.create_file_writer( logdir='./summary/{}/{}/train'.format( self.experiment_name, time.strftime("%m%d-%H-%M-%S", time.localtime( time.time()))))
def prepare_log(name):
    """Create the log directory for ``name`` and open a summary writer on it.

    A ``TensorBoard`` callback is also created on the same directory and
    attached to the module-level ``discriminator``; the callback itself is
    not returned.

    Returns:
        A ``(log_dir, writer)`` tuple.
    """
    run_dir = f"./logs/{name}"
    os.makedirs(run_dir, exist_ok=True)

    board = TensorBoard(log_dir=run_dir)
    board.set_model(discriminator)

    return run_dir, create_file_writer(run_dir)
def __init__(self, logdir="./tensorboard_logs/", run_id=None):
    """
    :param logdir: dir where TensorBoard events will be written
    :param run_id: name for the log run; when None, the current local time
        formatted as ``YYYY-MM-DD HH_MM_SS`` is used
    """
    # Imported lazily so TensorFlow is only required once a logger is built.
    from tensorflow import summary
    self.summary = summary

    if run_id is None:
        # strftime always yields second precision. Slicing a fixed number of
        # characters off isoformat() is unsafe because isoformat() omits the
        # fractional part whenever microsecond == 0.
        run_id = datetime.now().strftime("%Y-%m-%d %H_%M_%S")

    self._path = path.join(logdir, run_id)
    self.writer = summary.create_file_writer(self._path)
def log_test_results(cfg, model, test_generator, test_metrics, log_dir):
    '''
    Plot test-set performance of a trained model and log it to TensorBoard.

    A ROC curve and a confusion matrix are plotted, then both images are
    written at step 0 together with two text tables: the test metrics and the
    training / model configuration values.
    :param cfg: Project config
    :param model: A trained Keras model
    :param test_generator: A Keras generator for the test set
    :param test_metrics: Dict of test set performance metrics
    :param log_dir: Path to write TensorBoard logs
    '''
    predictions = model.predict(test_generator, verbose=0)
    labels = test_generator.labels

    # Each plot is captured as an image tensor right after it is drawn.
    plot_roc(labels, predictions,
             list(test_generator.class_indices.keys()),
             dir_path=cfg['PATHS']['IMAGES'])
    roc_img = plot_to_tensor()
    plot_confusion_matrix(labels, predictions,
                          list(test_generator.class_indices.keys()),
                          dir_path=cfg['PATHS']['IMAGES'])
    cm_img = plot_to_tensor()

    writer = tf_summary.create_file_writer(logdir=log_dir)

    # Table of test set metrics (header row first).
    test_summary_str = [['**Metric**', '**Value**']]
    test_summary_str.extend(
        [metric, str(test_metrics[metric])] for metric in test_metrics)

    # Table of training settings followed by the selected model's settings.
    hparam_summary_str = [['**Variable**', '**Value**']]
    train_cfg = cfg['TRAIN']
    hparam_summary_str.extend([key, str(train_cfg[key])] for key in train_cfg)
    model_cfg = cfg['NN'][cfg['TRAIN']['MODEL_DEF'].upper()]
    hparam_summary_str.extend([key, str(model_cfg[key])] for key in model_cfg)

    with writer.as_default():
        tf_summary.text(name='Test set metrics',
                        data=tf.convert_to_tensor(test_summary_str),
                        step=0)
        tf_summary.text(name='Run hyperparameters',
                        data=tf.convert_to_tensor(hparam_summary_str),
                        step=0)
        tf_summary.image(name='ROC Curve (Test Set)', data=roc_img, step=0)
        tf_summary.image(name='Confusion Matrix (Test Set)',
                         data=cm_img,
                         step=0)
    return
def test_filewriter():
    """Write ten random L1-loss scalars through a summary file writer.

    The scalar tag is ``"loss"`` followed by a random integer from 1 to 10,
    and the events are written under ``test_log_dir``.
    """
    writer = summary.create_file_writer('test_log_dir')
    tag = f"loss{random.randint(1, 10)}"
    with writer.as_default():
        for step in range(10):
            l1 = F.l1_loss(torch.rand(1), torch.rand(1))
            tf.summary.scalar(tag, l1.item(), step=step)
def __init__(self, model, train_log_dir, test_log_dir, manager):
    """Set up the loss function, metrics and summary writers.

    Args:
        model: Stored as ``self._model``.
        train_log_dir: Directory for training summaries; created if missing.
        test_log_dir: Directory for test summaries; created if missing.
        manager: Stored as ``self._manager``.
    """
    self._model = model
    self._manager = manager
    self._loss_fn = tf.nn.sparse_softmax_cross_entropy_with_logits

    self._train_loss = Mean(name='train_loss')
    self._test_loss = Mean(name='test_loss')
    self._train_acc = SparseCategoricalAccuracy(name='train_acc')
    self._test_acc = SparseCategoricalAccuracy(name='test_acc')
    for metric in (self._train_loss, self._test_loss,
                   self._train_acc, self._test_acc):
        metric.reset_states()

    for log_dir in (train_log_dir, test_log_dir):
        os.makedirs(log_dir, exist_ok=True)
    self._train_summary_writer = create_file_writer(train_log_dir)
    self._test_summary_writer = create_file_writer(test_log_dir)
def log_test_results(cfg, model, test_generator, test_metrics, log_dir):
    '''
    Visualize performance of a trained model on the test set and log it to
    TensorBoard.

    A ROC curve and a confusion matrix are plotted for the 'COVID-19' class,
    then both images are written at step 0 together with two text tables: the
    test metrics and the training / model configuration values.
    :param cfg: Project config
    :param model: A trained Keras model
    :param test_generator: A Keras generator for the test set
    :param test_metrics: Dict of test set performance metrics. When
        'precision' or 'recall' holds a list/tuple and the configured
        thresholds are a list/tuple too, the values are shown per threshold.
    :param log_dir: Path to write TensorBoard logs
    '''

    # Visualization of test results
    test_predictions = model.predict_generator(test_generator, verbose=0)
    test_labels = test_generator.labels
    covid_idx = test_generator.class_indices['COVID-19']
    plot_roc("Test set", test_labels, test_predictions, class_id=covid_idx)
    roc_img = plot_to_tensor()
    plot_confusion_matrix(test_labels, test_predictions, class_id=covid_idx)
    cm_img = plot_to_tensor()

    # Log test set results and plots in TensorBoard
    writer = tf_summary.create_file_writer(logdir=log_dir)

    # Create table of test set metrics
    test_summary_str = [['**Metric**', '**Value**']]
    thresholds = cfg['TRAIN']['THRESHOLDS']    # Load classification thresholds
    for metric in test_metrics:
        metric_values = test_metrics[metric]
        # The check must look at the metric's value, not its name: the name
        # is a dict key and is never a list, which would make this pairing
        # unreachable.
        if (metric in ['precision', 'recall']
                and isinstance(metric_values, (list, tuple))
                and isinstance(thresholds, (list, tuple))):
            metric_values = dict(zip(thresholds, metric_values))
        test_summary_str.append([metric, str(metric_values)])

    # Create table of model and train config values
    hparam_summary_str = [['**Variable**', '**Value**']]
    for key in cfg['TRAIN']:
        hparam_summary_str.append([key, str(cfg['TRAIN'][key])])
    # NOTE(review): the 'DCNN_BINARY' section is logged for every class mode;
    # confirm whether multi-class runs should log a different section.
    for key in cfg['NN']['DCNN_BINARY']:
        hparam_summary_str.append([key, str(cfg['NN']['DCNN_BINARY'][key])])

    # Write to TensorBoard logs
    with writer.as_default():
        tf_summary.text(name='Test set metrics',
                        data=tf.convert_to_tensor(test_summary_str),
                        step=0)
        tf_summary.text(name='Run hyperparameters',
                        data=tf.convert_to_tensor(hparam_summary_str),
                        step=0)
        tf_summary.image(name='ROC Curve (Test Set)', data=roc_img, step=0)
        tf_summary.image(name='Confusion Matrix (Test Set)',
                         data=cm_img,
                         step=0)
    return
def __init__(self, run_name, save_every, base_dir="experiments"):
    """Create the run's directory tree and install its summary writer.

    Under ``base_dir/run_name`` a ``models`` and a ``logs`` directory are
    created (existing directories are reused). A file writer opened on
    ``logs`` is set as the default summary writer.

    Args:
        run_name: Name of the run's sub-directory inside ``base_dir``.
        save_every: Stored as ``self.save_every``;
            ``self.iters_since_last_model_save`` starts at ``save_every + 1``.
        base_dir: Parent directory for all runs.
    """
    super().__init__()
    self.save_counter = 0
    self.least_loss = -1

    os.makedirs(base_dir, exist_ok=True)
    self.base_dir = os.path.join(base_dir, run_name)
    for directory in (self.base_dir,
                      os.path.join(self.base_dir, "models"),
                      os.path.join(self.base_dir, "logs")):
        os.makedirs(directory, exist_ok=True)

    logs_dir = os.path.join(self.base_dir, "logs")
    self.summary_writer = create_file_writer(logs_dir)
    self.summary_writer.set_as_default()

    self.iters_since_last_model_save = save_every + 1
    self.save_every = save_every
def start_model_manager_training(self,
                                 epoch_start=0,
                                 *,
                                 logdir,
                                 hparam,
                                 other_loggers=[],
                                 **runtime_options):
    """Record the hyper-parameters, then delegate training to the parent class.

    The hyper-parameters are saved through the model manager and also written
    as an hparams summary into ``logdir``. Training itself is started by the
    parent implementation, with ``summary.scalar`` placed in front of
    ``other_loggers`` as the logger functions.

    Returns:
        Whatever the parent ``start_model_manager_training`` returns.
    """
    assert hasattr(self.model_manager, "save_hparams")
    self.model_manager.save_hparams(hparam=hparam, logdir=logdir)

    hparam_writer = summary.create_file_writer(logdir)
    with hparam_writer.as_default():
        hp.hparams(hparam)

    # TODO:
    # For now the call to the training function of LoggingExperimentManager
    # is quite superfluous. However the idea is to have
    # LoggingExperimentManager specify one of the user-facing API with
    # detailed specifications on requirements.
    parent = super(TBExperimentManager, self)
    # A new list is built here, so the shared default is never mutated.
    logger_functions = [summary.scalar] + other_loggers
    return parent.start_model_manager_training(
        logdir=logdir,
        hparam=hparam,
        epoch_start=epoch_start,
        logger_functions=logger_functions,
        **runtime_options,
    )
def prepare_dirs(self):
    """Prepare the log directory, summary writer and checkpointing for this run.

    Sets ``self.log_dir``, ``self.writer``, ``self.checkpoint_prefix`` and
    ``self.checkpoint``. If the checkpoint manager reports a latest
    checkpoint, it is restored and a message is printed.
    """
    self.log_dir = f"./logs/{self.name}"
    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs(self.log_dir, exist_ok=True)

    tensorboard = TensorBoard(log_dir=self.log_dir)
    tensorboard.set_model(self.discriminator)
    self.writer = create_file_writer(self.log_dir)

    # Built with os.path.join so the separator fits the host OS; hard-coded
    # backslashes would name a single oddly-named folder on POSIX systems.
    checkpoint_dir = os.path.join('.', 'checkpoints', self.name)
    self.checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
    self.checkpoint = tf.train.Checkpoint(generator=self.generator,
                                          discriminator=self.discriminator)

    manager = tf.train.CheckpointManager(self.checkpoint,
                                         checkpoint_dir,
                                         max_to_keep=5)
    if manager.latest_checkpoint:
        self.checkpoint.restore(manager.latest_checkpoint)
        print(f"Restored from {manager.latest_checkpoint}")
def set_callbacks(self, checkpoints=True, tensorboard=True):
    """Append the requested callbacks to ``self.callbacks``.

    Args:
        checkpoints: When truthy, add a best-``val_accuracy`` model
            checkpoint saved under ``checkpoints/``.
        tensorboard: When truthy, install a default metrics file writer and
            add a TensorBoard callback.

    A learning-rate scheduler is added as well when
    ``self.lr_schedule_config`` names a ``'polynomial'`` or ``'linear'``
    schedule.
    """
    if checkpoints:
        if not os.path.exists('checkpoints'):
            os.mkdir('checkpoints')
        self.callbacks.append(
            ModelCheckpoint(filepath='checkpoints/' + self.filename(),
                            monitor='val_accuracy',
                            verbose=1,
                            save_best_only=True,
                            mode='max'))

    if tensorboard:
        # The last three characters of the file name are dropped to name the
        # run's log folder.
        log_dir = os.path.join(self.model_log_dir, self.filename()[:-3])
        self.file_writer = create_file_writer(log_dir + '/metrics')
        self.file_writer.set_as_default()
        self.callbacks.append(
            TensorBoard(
                log_dir=log_dir,
                write_graph=True,
                write_images=True,
                histogram_freq=0,
                profile_batch=0,
            ))

    config = self.lr_schedule_config
    if not config:
        return

    schedule_name = config.get('lr_schedule')
    if schedule_name == 'polynomial':
        power = config.get('lr_power')
    elif schedule_name == 'linear':
        # A linear schedule is a polynomial decay of power 1.
        power = 1
    else:
        return

    lr_schedule = PolynomialDecay(maxEpochs=self.epochs,
                                  initAlpha=self.lr,
                                  power=power)
    if lr_schedule:
        self.callbacks.append(LearningRateScheduler(lr_schedule))
def _write_aggregate_summaries(model_dir, global_step, eval_tag,
                               aggregates_dict):
    """Writes text metrics as summaries.

    For every metric family, the entries of ``aggregates_dict[family]`` are
    visited in sorted key order and selected attributes of each entry's
    ``mid`` value are written as scalars at ``global_step``. Events go to
    ``model_dir/eval_tag``.
    """
    # (family key, ((tag template, attribute read from ``.mid``), ...))
    layout = (
        (_ROUGE_METRIC, (("text_eval/%s-R", "recall"),
                         ("text_eval/%s-P", "precision"),
                         ("text_eval/%s-F", "fmeasure"))),
        (_BLEU_METRIC, (("text_eval/%s", "bleu"),)),
        (_REPETITION_METRIC, (("text_eval/%s-T", "target_ratio"),
                              ("text_eval/%s-P", "prediction_ratio"))),
        (_LENGTH_METRIC, (("text_eval/%s-T", "target_length"),
                          ("text_eval/%s-P", "prediction_length"),
                          ("text_eval/%s-R", "relative_length"))),
    )

    eval_dir = os.path.join(model_dir, eval_tag)
    summary_writer = contrib_summary.create_file_writer(eval_dir)
    with summary_writer.as_default(), \
            contrib_summary.always_record_summaries():
        for family, fields in layout:
            for name, aggregate in sorted(aggregates_dict[family].items()):
                for template, attribute in fields:
                    contrib_summary.scalar(template % name,
                                           getattr(aggregate.mid, attribute),
                                           step=global_step)
def boss(env, nb_AP, nb_Users, action_queues, matrix_queues, logger_folder,
         max_epsiode_steps):
    """Collect one action per queue, broadcast the normalised matrix, log the reward.

    Runs forever. On every round it blocks on each of the first ``nb_AP``
    action queues, assembles the results into ``W``, divides each row by its
    norm, puts ``W`` on every queue in ``matrix_queues``, hands it to
    ``env.set_W`` and logs ``sum(log2(1 + env.sinr()))`` as
    ``Episode/Reward``. The step counter grows by ``max_epsiode_steps`` each
    round.
    """
    step = 0
    writer = summary.create_file_writer('logs/' + logger_folder + '/boss')
    writer.set_as_default()
    summary.experimental.set_step(step)

    while True:
        step += max_epsiode_steps

        W = np.zeros((nb_AP, nb_Users)).astype('float32')
        for ap_index in range(nb_AP):
            # The assignment covers rows ap_index and below; later iterations
            # overwrite the lower rows again.
            W[ap_index:] = action_queues[ap_index].get()
        row_norms = np.linalg.norm(W, axis=1).reshape(W.shape[0], 1)
        W = W / row_norms

        for matrix_queue in matrix_queues:
            matrix_queue.put(W)
        env.set_W(W)

        r = np.sum(np.log2(1 + env.sinr()))
        summary.scalar(name='Episode/Reward', data=r, step=step)
        # The printed reward is evaluated again from env.sinr().
        banner = "********* \nReward {0:5.6f} Step {1: 6} norm {2: 4.5f}\n************"
        print(banner.format(np.sum(np.log2(1 + env.sinr())), step,
                            np.linalg.norm(W)))
def _get_active_writer(self): if self.mode not in self._writers: self._writers[self.mode] = create_file_writer( os.path.join(self._log_dir, self.mode.value)) return self._writers[self.mode]
def eval_metrics_host_call_fn(policy_output,
                              value_output,
                              pi_tensor,
                              value_tensor,
                              policy_cost,
                              value_cost,
                              l2_cost,
                              combined_cost,
                              step,
                              est_mode=tf.estimator.ModeKeys.TRAIN):
    """Build streaming-mean metrics for the policy and value outputs.

    Returns:
        In EVAL mode, the dict of metric ops. Otherwise a one-element list
        holding an op that re-initialises the local variables under the
        ``metrics`` scope when ``eval_step % params['summary_steps'] == 1``.
    """
    v1 = tf.compat.v1

    policy_entropy = -tf.reduce_mean(
        tf.reduce_sum(policy_output * v1.log(policy_output), axis=1))

    # pi_tensor is one_hot when generated from sgfs (for supervised learning)
    # and soft-max when using self-play records. argmax normalizes the two.
    target_top1 = tf.argmax(pi_tensor, axis=1)
    in_top1 = v1.to_float(v1.nn.in_top_k(policy_output, target_top1, k=1))
    in_top3 = v1.to_float(v1.nn.in_top_k(policy_output, target_top1, k=3))

    top1_confidence = tf.reduce_max(policy_output, axis=1)
    target_top1_confidence = tf.boolean_mask(
        policy_output,
        tf.one_hot(target_top1, tf.shape(policy_output)[1]))

    value_cost_normalized = value_cost / params['value_cost_weight']
    avg_value_observed = tf.reduce_mean(value_tensor)

    with v1.variable_scope('metrics'):
        tracked = (
            ('policy_cost', policy_cost),
            ('value_cost', value_cost),
            ('value_cost_normalized', value_cost_normalized),
            ('l2_cost', l2_cost),
            ('policy_entropy', policy_entropy),
            ('combined_cost', combined_cost),
            ('avg_value_observed', avg_value_observed),
            ('policy_accuracy_top_1', in_top1),
            ('policy_accuracy_top_3', in_top3),
            ('policy_top_1_confidence', top1_confidence),
            ('policy_target_top_1_confidence', target_top1_confidence),
            ('value_confidence', tf.abs(value_output)),
        )
        metric_ops = {name: v1.metrics.mean(tensor)
                      for name, tensor in tracked}

    if est_mode == tf.estimator.ModeKeys.EVAL:
        return metric_ops

    # NOTE: global_step is rounded to a multiple of FLAGS.summary_steps.
    eval_step = tf.reduce_min(step)

    # Only the writer is created here; no scalar summaries are emitted from
    # this function and none are returned.
    summary_writer = contrib_summary.create_file_writer(FLAGS.work_dir)

    # Reset metrics occasionally so that they are mean of recent batches.
    reset_op = v1.variables_initializer(v1.local_variables('metrics'))
    should_reset = tf.equal(eval_step % params['summary_steps'],
                            v1.to_int64(1))
    cond_reset_op = tf.cond(should_reset,
                            lambda: reset_op,
                            lambda: tf.no_op())

    return [cond_reset_op]
def __init__(self, **kwargs):
    """Forward all keyword arguments to the parent, then open a summary writer.

    ``self.step`` starts at 1 and ``self.writer`` writes to ``self.log_dir``,
    which is read after the parent initialiser has run.
    """
    super().__init__(**kwargs)
    self.step = 1
    target_dir = self.log_dir
    self.writer = summary.create_file_writer(target_dir)
def train(self,
          lr: float = 1e-5,
          num_epochs: int = 3,
          eval_every: int = None,
          best_valid_loss=float("Inf")):
    """
    Train ``self.model`` with Adam, validating and logging at a fixed step
    interval.

    Every ``eval_every`` optimisation steps the model is run over
    ``self.valid_iter``; the averaged losses are printed and recorded, two
    scalars are written to TensorBoard, and a checkpoint plus metrics file is
    saved whenever the average validation loss improves on
    ``best_valid_loss``. The metrics file is written once more after the
    final epoch.

    :param lr: learning rate handed to ``optim.Adam``
    :param num_epochs: number of passes over ``self.train_iter``
    :param eval_every: optimisation steps between validation passes; when
        ``None`` it defaults to half the number of training batches, but
        never less than 1
    :param best_valid_loss: average validation loss that must be beaten
        before a checkpoint is saved
    :return: None
    """
    self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    # instantiate tensorboard writer
    pathstr = str(Path(self.TRAIN_LOG_DIR / f"lr={lr}-epochs={num_epochs}"))
    self.writer = summary.create_file_writer(pathstr)

    # initialize running values
    if eval_every is None:
        # Floor at 1 so a single-batch iterator cannot produce a zero
        # interval, which would fail on the modulo below.
        eval_every = max(1, len(self.train_iter) // 2)
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    self.model.train()
    for epoch in range(num_epochs):
        for (label, text), _ in self.train_iter:
            label = label.type(torch.LongTensor).to(self.device)
            text = text.type(torch.LongTensor).to(self.device)
            loss, _ = self.model(text, label)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Capture the training loss now: the validation pass below
            # computes its own losses and must not be mistaken for it.
            curr_train_loss = loss.item()
            running_loss += curr_train_loss
            global_step += 1

            # evaluation step
            if global_step % eval_every != 0:
                continue

            self.model.eval()
            with torch.no_grad():
                # validation loop
                for (val_label, val_text), _ in self.valid_iter:
                    val_label = val_label.type(torch.LongTensor).to(
                        self.device)
                    val_text = val_text.type(torch.LongTensor).to(
                        self.device)
                    val_loss, _ = self.model(val_text, val_label)
                    curr_val_loss = val_loss.item()
                    valid_running_loss += curr_val_loss

            # evaluation
            average_train_loss = running_loss / eval_every
            average_valid_loss = valid_running_loss / len(self.valid_iter)
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0
            valid_running_loss = 0.0
            self.model.train()

            # print progress
            print(
                "Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}"
                .format(epoch + 1, num_epochs, global_step,
                        num_epochs * len(self.train_iter),
                        average_train_loss, average_valid_loss))

            # write to tensorboard logs: the loss of the training batch that
            # triggered this evaluation and of the last validation batch
            with self.writer.as_default():
                tf.summary.scalar('train loss',
                                  curr_train_loss,
                                  step=global_step)
                tf.summary.scalar('validation loss',
                                  curr_val_loss,
                                  step=global_step)

            # checkpoint
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
                self.save_checkpoint(self.OUTPUT_DIR / 'model.pt',
                                     best_valid_loss)
                self.save_metrics(self.OUTPUT_DIR / 'metrics.pt',
                                  train_loss_list, valid_loss_list,
                                  global_steps_list)

    self.save_metrics(self.OUTPUT_DIR / 'metrics.pt', train_loss_list,
                      valid_loss_list, global_steps_list)
    print("Finished Training!")
def train_fit(data_set):
    """
    function: start the model training process

    Wraps ``data_set`` in a ``CustomDataset`` / ``DataLoader``, builds a
    ``BertClass`` model with an Adam optimizer and runs ``EPOCHS`` epochs of
    gradient-tape training, printing progress, saving checkpoints and writing
    loss / accuracy summaries along the way.

    NOTE(review): no return statement is visible in this function even though
    a return value is documented below - confirm.
    :return: Model, Log
    """
    summary_writer = summary.create_file_writer(log_dir)
    data_params = {
        'batch_size': BATCH_SIZE,
        'shuffle': SHUFFLE,
        'num_workers': NUM_WORKS
    }
    devices = 'cuda' if cuda.is_available() else 'cpu'
    # The result of this call is not kept.
    device(devices)
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.99)
    train_set = CustomDataset(data_set, token, TEXT_LEN)
    train_set_pt = DataLoader(train_set, **data_params)
    model = BertClass()
    ckpt = train.Checkpoint(transformer=model.trainable_variables,
                            optimizer=optimizer)
    ckpt_manager = train.CheckpointManager(ckpt,
                                           checkpoint_path,
                                           max_to_keep=MAX_TO_KEEP)

    def train_step(model_, id_, mk_, type_ids_, optimizer_, target_):
        # One optimisation step; returns (loss, mean accuracy, predictions).
        with GradientTape() as tp:
            y_pred = model_(id_, mk_, type_ids_)
            loss_value = loss_fn(target=target_, output=y_pred)
            # y_pred = [round(y_p) for y_p in y_pred]
            acc = accuracy_fn(target_, y_pred)
        # Gradients use the enclosing ``model``'s variables, not ``model_``'s.
        gradient = tp.gradient(loss_value, model.trainable_variables)
        optimizer_.apply_gradients(zip(gradient, model.trainable_variables))
        return loss_value, np.array(acc).mean(), y_pred

    for epoch in range(1, EPOCHS + 1):
        for _, batch_data in enumerate(train_set_pt):
            # Batches arrive as torch tensors and are converted via numpy.
            ids = convert_to_tensor(batch_data['ids'].detach().numpy())
            mask = convert_to_tensor(batch_data['mask'].detach().numpy())
            token_type_ids = convert_to_tensor(
                batch_data['token_type_ids'].detach().numpy())
            targets = convert_to_tensor(
                batch_data['targets'].detach().numpy())
            loss, accuracy, pred = train_step(model_=model,
                                              id_=ids,
                                              mk_=mask,
                                              type_ids_=token_type_ids,
                                              optimizer_=optimizer,
                                              target_=targets)
            if _ % 20 == 0 and _ > 0:
                # Write loss and accuracy to the log file.
                # The log is saved once every ten training batches.
                # NOTE(review): the condition above fires every 20 batches,
                # not 10 - confirm which is intended.
                print("epoch: {}, fit step: {}, loss: {}, accuracy: {}".
                      format(epoch, _, loss, accuracy))
                print("epoch is {}, predict: {}".format(epoch, pred))
                if epoch % 2 == 0:
                    # The model is saved once every two training epochs.
                    ckpt_manager.save(check_interval=True)
                # The epoch number is part of each scalar's name, so every
                # epoch writes under its own tag.
                with summary_writer.as_default():
                    summary.scalar(name="loss_value_step:{}".format(epoch),
                                   data=loss,
                                   step=epoch)
                with summary_writer.as_default():
                    summary.scalar(name='accuracy_value_step:{}'.format(epoch),
                                   data=accuracy,
                                   step=epoch)
from cpprb import ReplayBuffer, PrioritizedReplayBuffer

# Training settings. Their consumers are not visible in this part of the file;
# NOTE(review): gamma is presumably the discount factor and egreedy the
# exploration rate - confirm against the training loop.
gamma = 0.99
batch_size = 1024
N_iteration = int(1e+5)
target_update_freq = 1000
eval_freq = 100
egreedy = 0.1

# Log
# Each run writes to its own timestamped folder; the "metrics" writer is
# installed as the default summary writer.
dir_name = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", dir_name)
writer = create_file_writer(logdir + "/metrics")
writer.set_as_default()

# Env
# Two separate CartPole-v1 instances are created.
env = gym.make('CartPole-v1')
eval_env = gym.make('CartPole-v1')

# For CartPole: input 4, output 2
# Two hidden layers of 64 ReLU units; the output layer has one unit per action.
model = Sequential([
    Dense(64, activation='relu', input_shape=(env.observation_space.shape)),
    Dense(64, activation='relu'),
    Dense(env.action_space.n)
])
# Second network with the same architecture, cloned from ``model``.
target_model = clone_model(model)

# Loss Function
def run(_run):
    """Train a model from the experiment config carried by ``_run``.

    Loads the data, builds the training / validation / test datasets, makes
    sure the label CSV files exist, compiles the configured model, sets up
    the callbacks and finally calls ``model.fit``.
    """
    # Load configs, if parameters are unspecified, fill in a default
    config = _run.config
    fit_params = config.get('fit_params')
    model_params = config.get('model_params')
    data_params = config.get('data_params')

    batch_size = data_params.get('batch_size')
    augmentations = data_params.get('augmentations')
    buffer_size = data_params.get('buffer_size')  # the buffer sizes for shuffling
    use_sampling = data_params.get('use_sampling')
    class_target_prob = 1 / model_params.get('num_classes')

    print("[!] list of parameter configurations")
    pprint(config)

    # Load data and define generators ------------------------------------------
    print("[!] loading datasets \n")
    x_train, x_val, x_test, probs = load_data()

    # get a rough estimate: there are 100 files per TFRecord
    # except for one TFRecord per item, so this estimate might not be 100% correct
    num_training = len(x_train) * 100

    # TF parsing functions
    print("[!] Creating dataset iterators \n")
    # Load the dataset iterators
    train_dataset = create_training_dataset(x_train, batch_size, buffer_size,
                                            augmentations, use_sampling,
                                            probs, class_target_prob,
                                            **model_params)
    val_dataset = validate(x_val, batch_size, **model_params)
    test_dataset = validate(x_test, batch_size, **model_params)

    # we need the actual labels from the TFRecords, but they take INCREDIBLY long to parse
    # parse through them once, and create a csv file with a list of all the labels
    # note: the tf parsing requires that there is no randomness (shuffling) in the validation/test labels
    val_csv = '../datasets/data/valid/val_labels.csv'
    test_csv = '../datasets/data/test/test_labels.csv'

    if os.path.exists(val_csv):
        print("[!] validation labels csv exist")
    else:
        print(os.path.exists(val_csv))
        print("[!] creating validation label file in ../datasets/data/valid/val_labels.csv")
        create_label_csv(val_dataset, val_csv)

    if os.path.exists(test_csv):
        print("[!] test labels csv exist")
    else:
        print("[!] creating test label file in ../datasets/data/test/test_labels.csv")
        create_label_csv(test_dataset, test_csv)

    # load the file with validation labels
    # getting labels from a TFRecords with lots of other data is horribly slow...
    print("[!] Loading validation labels for callbacks")
    val_labels = np.squeeze(pd.read_csv(val_csv).to_numpy())

    # Model definitions --------------------------------------------------------
    print("[!] compiling model and adding callbacks \n")
    # function for building the model
    model_func = model_dict[fit_params.get('model')]

    # invoke the user function
    model = model_func(**model_params)
    model.summary()

    # compile the model with catcrossentropy: one hot encoded labels!!
    model.compile(optimizer=tf.keras.optimizers.Adam(fit_params.get('lr')),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Model callbacks ----------------------------------------------------------

    # ReduceLRonPlateau; a plain Callback stands in when it is disabled
    reduce_lr = (ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.1,
                                   patience=3,
                                   min_lr=10e-7,
                                   verbose=1)
                 if fit_params.get('reduce_lr_on_plateau') else Callback())

    # Model checkpoints
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    aug_string = 'aug' if augmentations == True else 'noaug'

    def modelcheckpoint_name(x):
        # Checkpoint file name for the given tag (e.g. 'best_loss').
        return "checkpoints/model-{}-{}-{}-{}-{}.hdf5".format(
            fit_params.get('model'), x, aug_string,
            'ch_' + str(len(model_params.get('channels'))), now)

    modelcheckpoint = ModelCheckpoint(modelcheckpoint_name('best_loss'),
                                      monitor='val_loss',
                                      verbose=1,
                                      save_best_only=True,
                                      save_weights_only=True)

    # Model early stopping
    earlystopping = EarlyStopping(monitor='val_loss', patience=10)

    # tensorboard and metric callbacks
    log_dir = "logs/fit/{}-{}-{}-{}".format(
        fit_params.get('model'), aug_string,
        'ch_' + str(len(model_params.get('channels'))), now)
    file_writer = tfsum.create_file_writer(log_dir)
    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                    histogram_freq=1,
                                                    profile_batch=0)
    f1_metric = Metrics(val_dataset,
                        val_labels,
                        save_best=True,
                        save_name=modelcheckpoint_name('best_f1'),
                        writer=file_writer)

    # Model Training and evaluation --------------------------------------------
    print("[!] fitting model \n")
    callbacks = [
        tensorboard_cb, f1_metric, LogMetrics(), modelcheckpoint,
        earlystopping, reduce_lr, MemoryCallback()
    ]
    model.fit(
        train_dataset.repeat(),
        epochs=fit_params.get('epochs'),
        steps_per_epoch=int(num_training / batch_size),
        validation_data=val_dataset,
        validation_steps=None,
        shuffle=True,
        verbose=1,
        callbacks=callbacks,
    )

    print("[!] done running, terminating program")
def __init__(self, log_dir):
    """Open a summary file writer on ``log_dir`` and keep it as ``self.writer``."""
    file_writer = tf_summary.create_file_writer(log_dir)
    self.writer = file_writer
# update weights self.weights -= learning_rate * gradient / np.sqrt(self.grad_magnitude) return loss / self.batch_size if __name__ == "__main__": time_string = datetime.now().strftime("%Y%m%d-%H%M%S") # base_dir = tempfile.TemporaryDirectory().name base_dir = "/Users/bert/Desktop" log_dir = '{}/logs/'.format(base_dir) print("Storing logs in {}".format(log_dir)) writer = summary.create_file_writer(log_dir + time_string) step = 0 start_time = time.time() # Use this next line to load bot weights from disk #engine_white = LearningEngine(None, None, sys.stderr) # Use this next line to re-initialize bot engine_white = LearningEngine(None, None, sys.stderr, weights=None, weight_file=None) board = chess.Board()
if USE_FLOW is None: prefix = "true" elif USE_FLOW == True: prefix = "flow" else: prefix = "mse" if len(sys.argv) > 1: prefix = sys.argv[1] + prefix prefix += "w" + str(WIDTH) + "h" + str(HEIGHT) log_dir = "./tfboard/" + prefix + str(time.time()) tf_summary_writer = tf_summary.create_file_writer(log_dir) train_samples = gen_dataset(NUM_SAMPLES, WIDTH, HEIGHT, min_val=MIN_VAL, max_val=MAX_VAL) print("generated dataset!") if NORMALIZE_FEATURES: trainX, trainY = get_training_features(train_samples, MIN_VAL, MAX_VAL, MAX_COST) else: trainX, trainY = get_training_features(train_samples, None, None, None) if USE_FLOW is None:
'').split(','))) for item in args.cnn_kernels ] assert args.evaluate_dev_steps % args.print_loss_steps == 0 train_summary_writer, test_summary_writer = None, None if args.use_tensorboard: import tensorflow as tf from tensorflow import summary # !rm -rf logs current_time = str(datetime.datetime.now().timestamp()) train_log_dir = '../logs/tensorboard/train/' + current_time test_log_dir = '../logs/tensorboard/test/' + current_time train_summary_writer = summary.create_file_writer(train_log_dir) test_summary_writer = summary.create_file_writer(test_log_dir) print(f"log file: {current_time}") torch.cuda.is_available() def evaluate_on_dev(model, corpus_dev_reader, dictionary_word, dictionary_char): loss_values = [] batch_generator_dev = corpus_dev_reader.batchify(dictionary_word, args.batch_size, args.seq_len) with torch.no_grad():
target = sample['target'].long().to(device) pred = model(input) pred_loss = criterion(pred, target) top3_val, top3_idx = torch.topk(pred, 3) num_correct = torch.sum(top3_idx == target.view(-1, 1)) return pred_loss.item(), num_correct.item() """### Prepare the Tensorboard""" train_log_dir = './runs/train' train_summary_writer = summary.create_file_writer(train_log_dir) val_log_dir = './runs/validate' val_summary_writer = summary.create_file_writer(val_log_dir) # Commented out IPython magic to ensure Python compatibility. # %tensorboard --logdir runs """### Run Training""" max_epoch = 200 save_stride = 10 tmp_path = './checkpoint.pth' max_accu = -1 for epoch in tqdm(range(max_epoch)): ### Train Phase
# Collect the test files in random order as an (n, 1) column of paths.
testImage = list(glob(join(relativeData, 'Test', '*')))
shuffle(testImage)
testImage = array(testImage).reshape((len(testImage), 1))
testLabels, totalTime, times = predict(currentPath, testImage, bestModel,
                                       classNames)


def plotImages(figure):
    """Save the active plot as PNG, close ``figure`` and return it as an image batch.

    The PNG bytes are decoded with four channels and a leading axis is added.
    """
    pngBuffer = BytesIO()
    plt.savefig(pngBuffer, format='png')
    plt.close(figure)
    pngBuffer.seek(0)
    decoded = _image.decode_png(pngBuffer.getvalue(), channels=4)
    return expand_dims(decoded, 0)


fileWriter = create_file_writer(logDir)
fileWriter.set_as_default()
with fileWriter.as_default():
    # The summary name carries the total and the mean per-image timing.
    summaryTag = ('Test:\nTotal time: ' + str(totalTime) +
                  '\nTime per image: ' + str(sum(times) / len(times)))
    gridFigure = imageGrid(testImage,
                           testLabels,
                           classNames=classNames,
                           perRow=4,
                           imageDimensions=(32, 32),
                           nImages=16)
    image(summaryTag, plotImages(gridFigure), step=0)