def dump_configs(x, data_dir, rank=0, local_rank=0):
    """Save configs `x` separately for each rank."""
    xfile = os.path.join(data_dir, f'x_rank{rank}-{local_rank}.z')
    io.log('Saving configs from rank '
           f'{rank}-{local_rank} to: {xfile}.')
    head, _ = os.path.split(xfile)
    io.check_else_make_dir(head)
    joblib.dump(x, xfile)
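# Example usage of `dump_configs` (a minimal sketch; the path and tensor
# shape below are hypothetical, not values taken from this repo):
#   >>> x = tf.random.normal((128, 1024))  # batch of flattened configs
#   >>> dump_configs(x, data_dir='/tmp/train_data', rank=0, local_rank=0)
#   # writes /tmp/train_data/x_rank0-0.z via joblib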
def plot_charges(steps, charges, title=None, out_dir=None):
    """Plot (up to four) topological charge histories vs. MC step."""
    charges = charges.T
    if charges.shape[0] > 4:
        charges = charges[:4, :]

    fig, ax = plt.subplots()
    for idx, q in enumerate(charges):
        # Offset each chain vertically so the integer charges don't overlap
        ax.plot(steps, np.around(q) + 5 * idx, marker='', ls='-')

    ax.set_yticks([])
    ax.set_yticklabels([])
    ax.set_xmargin(0)
    ax.yaxis.set_label_coords(-0.03, 0.5)
    ax.set_ylabel(r"$\mathcal{Q}$", rotation='horizontal')
    ax.set_xlabel('MC Step')
    if title is not None:
        ax.set_title(title)

    plt.tight_layout()
    if out_dir is not None:
        fpath = os.path.join(out_dir, 'charge_chains.png')
        io.log(f'Saving figure to: {fpath}.')
        plt.savefig(fpath, dpi=400, bbox_inches='tight')

    return fig, ax
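# Example: plot a few charge chains from inference data (sketch only; the
# shapes below are assumptions, charges are expected as (steps, chains)):
#   >>> charges = np.random.randint(-3, 4, size=(2000, 8))
#   >>> steps = np.arange(charges.shape[0])
#   >>> fig, ax = plot_charges(steps, charges, title='HMC', out_dir='/tmp')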
def load_and_run(
        args: AttrDict,
        x: tf.Tensor = None,
        runs_dir: str = None,
) -> (GaugeDynamics, DataContainer, tf.Tensor):
    """Load trained model from checkpoint and run inference."""
    if not IS_CHIEF:
        return None, None, None

    io.print_dict(args)
    ckpt_dir = os.path.join(args.log_dir, 'training', 'checkpoints')
    flags = restore_from_train_flags(args)
    eps_file = os.path.join(args.log_dir, 'training', 'train_data', 'eps.z')
    flags.eps = io.loadz(eps_file)[-1]
    dynamics = build_dynamics(flags)

    ckpt = tf.train.Checkpoint(dynamics=dynamics,
                               optimizer=dynamics.optimizer)
    manager = tf.train.CheckpointManager(ckpt, max_to_keep=5,
                                         directory=ckpt_dir)
    if manager.latest_checkpoint:
        io.log(f'Restored model from: {manager.latest_checkpoint}')
        status = ckpt.restore(manager.latest_checkpoint)
        status.assert_existing_objects_matched()
        if x is None:  # only load saved configs if `x` wasn't passed in
            xfile = os.path.join(args.log_dir, 'training',
                                 'train_data', 'x_rank0.z')
            io.log(f'Restored x from: {xfile}.')
            x = io.loadz(xfile)

    dynamics, run_data, x = run(dynamics, args, x=x, runs_dir=runs_dir)

    return dynamics, run_data, x
def restore_flags(flags, train_dir):
    """Update `flags` with values restored from `train_dir/FLAGS.z`."""
    rf_file = os.path.join(train_dir, 'FLAGS.z')
    restored = AttrDict(dict(io.loadz(rf_file)))
    io.log(f'Restoring FLAGS from: {rf_file}...')
    flags.update(restored)

    return flags
def load_data(data_dir):
    """Load data from `data_dir` and return it as an `AttrDict`."""
    contents = os.listdir(data_dir)
    fnames = [i for i in contents if i.endswith('.z')]
    # Use `os.path.splitext` to drop the '.z' extension; `str.rstrip('.z')`
    # strips *characters* and would mangle names ending in 'z'
    keys = [os.path.splitext(i)[0] for i in fnames]
    data_files = [os.path.join(data_dir, i) for i in fnames]
    data = {}
    for key, val in zip(keys, data_files):
        if 'x_rank' in key:
            continue

        io.log(f'Restored {key} from {val}.')
        data[key] = io.loadz(val)

    return AttrDict(data)
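# Sketch of the expected `data_dir` layout and the resulting keys
# (illustrative; actual contents depend on what was saved during training):
#   train_data/
#   ├── loss.z         -> data['loss']
#   ├── accept_prob.z  -> data['accept_prob']
#   └── x_rank0-0.z    -> skipped ('x_rank' files hold per-rank configs)
#   >>> data = load_data('/path/to/train_data')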
def on_epoch_end(self, step, logs=None):
    """Check the monitored metric and reduce the learning rate on plateau."""
    if step < self.warmup_steps:
        return

    logs = logs or {}
    current = logs.get(self.monitor)
    if current is None:
        io.log('ReduceLROnPlateau conditioned on metric'
               f' {self.monitor} which is not available.'
               f' Available metrics are: {",".join(list(logs.keys()))}')
    else:
        if self.in_cooldown():
            self.cooldown_counter -= 1
            self.wait = 0

        if self.monitor_op(current, self.best):
            self.best = current
            self.wait = 0
        elif not self.in_cooldown():
            self.wait += 1
            if self.wait >= self.patience:
                step = self.model.optimizer.iterations
                old_lr = self.model._get_lr(step)
                if old_lr > self.min_lr:
                    new_lr = old_lr * self.factor
                    new_lr = max(new_lr, self.min_lr)
                    K.set_value(self.model.optimizer.lr, new_lr)
                    if self.verbose > 0:
                        print(f'ReduceLROnPlateau (step {step}):'
                              ' Reducing learning rate from:'
                              f' {old_lr} to {new_lr}.')
                        print(f'current: {current}, best: {self.best}')

                    self.cooldown_counter = self.cooldown
                    self.wait = 0
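# Worked example of the reduction rule above (numbers are illustrative):
# with factor=0.5 and min_lr=1e-5, once the monitored loss fails to improve
# for `patience` consecutive checks, the learning rate steps down as
#   >>> old_lr = 1e-3
#   >>> new_lr = max(old_lr * 0.5, 1e-5)   # -> 5e-4
# and a cooldown window starts before the counter can trigger again.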
def save_networks(self, log_dir):
    """Save networks to disk."""
    models_dir = os.path.join(log_dir, 'training', 'models')
    io.check_else_make_dir(models_dir)
    eps_file = os.path.join(models_dir, 'eps.z')
    io.savez(self.eps.numpy(), eps_file, name='eps')
    if self.config.separate_networks:
        xnet_paths = [
            os.path.join(models_dir, f'dynamics_xnet{i}')
            for i in range(self.config.num_steps)
        ]
        vnet_paths = [
            os.path.join(models_dir, f'dynamics_vnet{i}')
            for i in range(self.config.num_steps)
        ]
        for idx, (xf, vf) in enumerate(zip(xnet_paths, vnet_paths)):
            xnet = self.xnet[idx]  # type: tf.keras.models.Model
            vnet = self.vnet[idx]  # type: tf.keras.models.Model
            io.log(f'Saving `xnet{idx}` to {xf}.')
            io.log(f'Saving `vnet{idx}` to {vf}.')
            xnet.save(xf)
            vnet.save(vf)
    else:
        xnet_paths = os.path.join(models_dir, 'dynamics_xnet')
        vnet_paths = os.path.join(models_dir, 'dynamics_vnet')
        io.log(f'Saving `xnet` to {xnet_paths}.')
        io.log(f'Saving `vnet` to {vnet_paths}.')
        self.xnet.save(xnet_paths)
        self.vnet.save(vnet_paths)
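# Resulting on-disk layout (sketch; the per-leapfrog-step networks appear
# only when `config.separate_networks` is True):
#   <log_dir>/training/models/
#   ├── eps.z
#   ├── dynamics_xnet{0..num_steps-1}  (or a single `dynamics_xnet`)
#   └── dynamics_vnet{0..num_steps-1}  (or a single `dynamics_vnet`)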
def short_training(
        train_steps: int,
        beta: float,
        log_dir: str,
        dynamics: GaugeDynamics,
        x: tf.Tensor = None,
):
    """Perform a brief training run prior to running inference."""
    ckpt_dir = os.path.join(log_dir, 'training', 'checkpoints')
    ckpt = tf.train.Checkpoint(dynamics=dynamics,
                               optimizer=dynamics.optimizer)
    manager = tf.train.CheckpointManager(ckpt, ckpt_dir, max_to_keep=5)
    current_step = 0
    if manager.latest_checkpoint:
        io.log(f'Restored model from: {manager.latest_checkpoint}')
        ckpt.restore(manager.latest_checkpoint)
        current_step = dynamics.optimizer.iterations.numpy()

    if x is None:
        x = convert_to_angle(tf.random.normal(dynamics.x_shape))

    train_data = DataContainer(current_step + train_steps, print_steps=1)
    dynamics.compile(loss=dynamics.calc_losses,
                     optimizer=dynamics.optimizer,
                     experimental_run_tf_function=False)

    x, metrics = dynamics.train_step((x, tf.constant(beta)))
    header = train_data.get_header(metrics, skip=SKEYS,
                                   prepend=['{:^12s}'.format('step')])
    io.log(header.split('\n'))
    for step in range(current_step, current_step + train_steps):
        start = time.time()
        x, metrics = dynamics.train_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        train_data.update(step, metrics)
        data_str = train_data.print_metrics(metrics)
        logger.info(data_str)

    return dynamics, train_data, x
def run_hmc(
        args: AttrDict,
        hmc_dir: str = None,
        skip_existing: bool = False,
) -> (GaugeDynamics, DataContainer, tf.Tensor):
    """Run HMC using `inference_args` on a model specified by `params`.

    NOTE:
    -----
    `args` should be a dict with the following keys:
      - 'hmc'
      - 'eps'
      - 'beta'
      - 'num_steps'
      - 'run_steps'
      - 'lattice_shape'
    """
    if not IS_CHIEF:
        return None, None, None

    if hmc_dir is None:
        root_dir = os.path.join(GAUGE_LOGS_DIR, 'hmc_logs')
        month_str = io.get_timestamp('%Y_%m')
        hmc_dir = os.path.join(root_dir, month_str)

    io.check_else_make_dir(hmc_dir)

    def get_run_fstr(run_dir):
        """Extract the run-parameter string from a run directory name."""
        _, tail = os.path.split(run_dir)
        fstr = tail.split('-')[0]
        return fstr

    if skip_existing:
        run_dirs = [os.path.join(hmc_dir, i) for i in os.listdir(hmc_dir)]
        run_fstrs = [get_run_fstr(i) for i in run_dirs]
        run_fstr = io.get_run_dir_fstr(args)
        if run_fstr in run_fstrs:
            io.log('ERROR: Existing run found! Skipping.')
            return None, None, None

    dynamics = build_dynamics(args)
    dynamics, run_data, x = run(dynamics, args, runs_dir=hmc_dir)

    return dynamics, run_data, x
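# Example `args` for a standalone HMC run, mirroring the keys listed in the
# docstring above (the values themselves are purely illustrative):
#   >>> args = AttrDict({
#   ...     'hmc': True,
#   ...     'eps': 0.1,
#   ...     'beta': 4.0,
#   ...     'num_steps': 10,
#   ...     'run_steps': 5000,
#   ...     'lattice_shape': (128, 16, 16, 2),
#   ... })
#   >>> dynamics, run_data, x = run_hmc(args, skip_existing=True)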
def load_test_configs(json_file: Union[str, Path] = None):
    """Load test configs, if specified.

    If not specified, load from `BIN_DIR/test_configs.json`.

    Returns:
        configs (dict): Configs parsed from the json file.
    """
    if json_file is None:
        json_file = os.path.join(BIN_DIR, 'test_configs.json')

    try:
        with open(json_file, 'rt') as f:
            configs = json.load(f)
    except FileNotFoundError:
        io.log(f'Unable to load configs from: {json_file}. Exiting.')
        raise

    return configs
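# Usage sketch (the default path comes from `BIN_DIR`; the custom file name
# below is hypothetical):
#   >>> configs = load_test_configs()  # loads BIN_DIR/test_configs.json
#   >>> configs = load_test_configs('custom_configs.json')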
def write_to_csv(self, log_dir, run_dir, hmc=False):
    """Write data averages to bulk csv file for comparing runs."""
    _, run_str = os.path.split(run_dir)
    avg_data = {
        'log_dir': log_dir,
        'run_dir': run_str,
        'hmc': hmc,
    }
    for key, val in self.data.items():
        tensor = tf.convert_to_tensor(val)
        arr, steps = therm_arr(tensor.numpy(), therm_frac=0.2)
        if 'steps' not in avg_data:
            avg_data['steps'] = len(steps)

        avg_data[key] = np.mean(arr)

    avg_df = pd.DataFrame(avg_data, index=[0])
    csv_file = os.path.join(BASE_DIR, 'logs', 'GaugeModel_logs',
                            'inference_results.csv')
    io.log(f'Appending inference results to {csv_file}.')
    if not os.path.isfile(csv_file):
        avg_df.to_csv(csv_file, header=True, index=False, mode='w')
    else:
        avg_df.to_csv(csv_file, header=False, index=False, mode='a')
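# The accumulated csv can then be compared across runs, e.g. (sketch; the
# metric columns depend on whatever keys `self.data` contains):
#   >>> df = pd.read_csv(os.path.join(BASE_DIR, 'logs', 'GaugeModel_logs',
#   ...                               'inference_results.csv'))
#   >>> df.groupby('hmc').mean()  # compare HMC vs. L2HMC averages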
def main(args, random_start=True):
    """Run inference on trained model from `log_dir/checkpoints/`."""
    if not IS_CHIEF:
        return

    io.print_flags(args)
    skip = not args.get('overwrite', False)

    # If no `log_dir` specified, run generic HMC
    log_dir = args.get('log_dir', None)
    if log_dir is None:
        io.log('`log_dir` not specified, running generic HMC...')
        _ = run_hmc(args=args, hmc_dir=None, skip_existing=skip)
        return

    # Otherwise, load training flags
    train_flags_file = os.path.join(log_dir, 'training', 'FLAGS.z')
    train_flags = io.loadz(train_flags_file)

    beta = args.get('beta', None)
    eps = args.get('eps', None)
    if beta is None:
        io.log('Using `beta_final` from training flags')
        beta = train_flags['beta_final']

    if eps is None:
        eps_file = os.path.join(log_dir, 'training', 'train_data', 'eps.z')
        io.log(f'Loading `eps` from {eps_file}')
        eps_arr = io.loadz(eps_file)
        eps = tf.cast(eps_arr[-1], TF_FLOAT)

    # Update `args` with values from training flags
    args.update({
        'eps': eps,
        'beta': beta,
        'num_steps': int(train_flags['num_steps']),
        'lattice_shape': train_flags['lattice_shape'],
    })

    # Run generic HMC using the trained step size loaded above
    _ = run_hmc(args=args, hmc_dir=None, skip_existing=skip)

    # `x` will be randomly initialized if passed as `None`
    x = None
    if not random_start:
        # Load the last configuration from the end of the training run
        x_file = os.path.join(args.log_dir, 'training',
                              'train_data', 'x_rank0.z')
        x = io.loadz(x_file) if os.path.isfile(x_file) else None

    # Run inference on trained model from `args.log_dir`
    args['hmc'] = False  # Ensure we're running L2HMC
    _ = load_and_run(args, x=x)
def train(flags: AttrDict, x: tf.Tensor = None, restore_x: bool = False):
    """Train model.

    Returns:
        x (tf.Tensor): Batch of configurations.
        dynamics (GaugeDynamics): Dynamics object.
        train_data (DataContainer): Object containing train data.
        flags (AttrDict): AttrDict containing flags used.
    """
    dirs = io.setup_directories(flags)
    flags.update({'dirs': dirs})

    if restore_x:
        x = None
        xfile = os.path.join(dirs.train_dir, 'train_data',
                             f'x_rank{RANK}-{LOCAL_RANK}.z')
        try:
            x = io.loadz(xfile)
        except FileNotFoundError:
            io.log(f'Unable to restore x from {xfile}. Using random init.')

    if x is None:
        x = tf.random.normal(flags.dynamics_config['lattice_shape'])
        x = tf.reshape(x, (x.shape[0], -1))

    dynamics = build_dynamics(flags)
    dynamics.save_config(dirs.config_dir)

    io.log('\n'.join([120 * '*', 'Training L2HMC sampler...']))
    x, train_data = train_dynamics(dynamics, flags, dirs, x=x)

    if IS_CHIEF:
        output_dir = os.path.join(dirs.train_dir, 'outputs')
        train_data.save_data(output_dir)

        params = {
            'beta_init': train_data.data.beta[0],
            'beta_final': train_data.data.beta[-1],
            'eps': dynamics.eps.numpy(),
            'lattice_shape': dynamics.config.lattice_shape,
            'num_steps': dynamics.config.num_steps,
            'net_weights': dynamics.net_weights,
        }
        plot_data(train_data, dirs.train_dir, flags,
                  thermalize=True, params=params)

    io.log('\n'.join(['Done training model', 120 * '*']))
    io.save_dict(dict(flags), dirs.log_dir, 'configs')

    return x, dynamics, train_data, flags
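# Typical entry-point usage (sketch; in practice `flags` comes from the CLI
# parser, and the FLAGS.z path below is hypothetical):
#   >>> flags = AttrDict(dict(io.loadz('/path/to/training/FLAGS.z')))
#   >>> x, dynamics, train_data, flags = train(flags, restore_x=True)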
def get_observables(self, run_dir=None):
    """Get all observables from inference data in `run_dir`."""
    run_params = io.loadz(os.path.join(run_dir, 'run_params.pkl'))
    beta = run_params['beta']
    net_weights = tuple([int(i) for i in run_params['net_weights']])
    keep = True
    if self._nw_include is not None:
        keep = net_weights in self._nw_include

    # If almost none (< 10%) of the proposed configs are accepted,
    # don't bother loading data and calculating statistics.
    px = self._load_sqz('px.pkl')
    avg_px = np.mean(px)
    if avg_px < 0.1 or not keep:
        io.log(f'Skipping! nw: {net_weights}, avg_px: {avg_px:.3g}')
        return None, run_params

    io.log(f'Loading data for net_weights: {net_weights}...')
    io.log(f'  run_dir: {run_dir}')

    # Load charges, plaqs data
    charges = self._load_sqz('charges.pkl')
    plaqs = self._load_sqz('plaqs.pkl')
    dplq = u1_plaq_exact(beta) - plaqs

    # Thermalize configs
    px, _ = therm_arr(px, self._therm_frac)
    dplq, _ = therm_arr(dplq, self._therm_frac)
    charges = np.insert(charges, 0, 0, axis=0)
    charges, _ = therm_arr(charges)
    dq, _ = calc_tunneling_rate(charges)
    dq = dq.T

    dx = self._get_dx('dx.pkl')
    dxf = self._get_dx('dxf.pkl')
    dxb = self._get_dx('dxb.pkl')

    observables = {
        'plaqs_diffs': dplq,
        'accept_prob': px,
        'tunneling_rate': dq,
    }
    for name, val in zip(['dx', 'dxf', 'dxb'], [dx, dxf, dxb]):
        if val is not None:
            observables[name] = val

    return observables, run_params
def run_from_log_dir(log_dir: str, net_weights: NetWeights, run_steps=5000):
    """Run inference from `log_dir` using the specified `net_weights`."""
    configs = load_configs_from_log_dir(log_dir)
    if 'x_shape' not in configs['dynamics_config'].keys():
        x_shape = configs['dynamics_config'].pop('lattice_shape')
        configs['dynamics_config']['x_shape'] = x_shape

    beta = configs['beta_final']
    nwstr = 'nw' + ''.join([f'{int(i)}' for i in net_weights])
    run_dir = os.path.join(PROJECT_DIR, 'l2hmc_function_tests', 'inference',
                           f'beta{beta}', f'{nwstr}')
    if os.path.isdir(run_dir):
        io.log(f'EXISTING RUN FOUND AT: {run_dir}, SKIPPING!',
               style='bold red')
        return None

    io.check_else_make_dir(run_dir)
    log_dir = configs.get('log_dir', None)
    configs['log_dir_orig'] = log_dir
    configs['log_dir'] = run_dir
    configs['run_steps'] = run_steps
    configs = AttrDict(configs)

    dynamics = build_dynamics(configs)
    xnet, vnet = dynamics._load_networks(log_dir)
    dynamics.xnet = xnet
    dynamics.vnet = vnet

    io.log(f'Original dynamics.net_weights: {dynamics.net_weights}')
    io.log(f'Setting `dynamics.net_weights` to: {net_weights}')
    dynamics._set_net_weights(net_weights)
    dynamics.net_weights = net_weights
    io.log(f'Now, dynamics.net_weights: {dynamics.net_weights}')

    dynamics, train_data, x = short_training(1000, beta, log_dir=log_dir,
                                             dynamics=dynamics, x=None)
    inference_results = run(dynamics, configs, beta=beta, runs_dir=run_dir,
                            md_steps=500, make_plots=True,
                            therm_frac=0.2, num_chains=16)

    return inference_results
def restore(self, data_dir, rank=0, local_rank=0, step=None, x_shape=None):
    """Restore `self.data` from `data_dir`."""
    if step is not None:
        self.steps += step

    x_file = os.path.join(data_dir, f'x_rank{rank}-{local_rank}.z')
    try:
        x = io.loadz(x_file)
        io.log(f'Restored `x` from: {x_file}.', should_print=True)
    except FileNotFoundError:
        io.log(f'Unable to load `x` from {x_file}.', level='WARNING')
        io.log('Using random normal init.', level='WARNING')
        x = tf.random.normal(x_shape)

    data = self.load_data(data_dir)
    for key, val in data.items():
        self.data[key] = np.array(val).tolist()

    return x
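# Restore flow (sketch, mirroring the call made from `setup` below): after
# a checkpoint restore, the trainer re-seeds its configs and metrics via
#   >>> x = train_data.restore(dirs.data_dir, rank=RANK,
#   ...                        local_rank=LOCAL_RANK,
#   ...                        step=current_step, x_shape=dynamics.x_shape)
# so training picks up where the previous run left off.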
def savefig(fig, fpath):
    """Save `fig` to `fpath`, creating parent directories as needed."""
    io.check_else_make_dir(os.path.dirname(fpath))
    io.log(f'Saving figure to: {fpath}.')
    fig.savefig(fpath, dpi=400, bbox_inches='tight')
def _savefig(fig, out_file):
    """Save `fig` to `out_file`."""
    io.log(f'Saving figure to: {out_file}.')
    fig.savefig(out_file, dpi=200, bbox_inches='tight')
def main(args):
    """Main method for training."""
    hmc_steps = args.get('hmc_steps', 0)
    tf.keras.backend.set_floatx('float32')
    log_file = os.path.join(os.getcwd(), 'log_dirs.txt')

    x = None
    log_dir = args.get('log_dir', None)
    beta_init = args.get('beta_init', None)
    beta_final = args.get('beta_final', None)
    if log_dir is not None:  # we want to restore from latest checkpoint
        train_steps = args.get('train_steps', None)
        args = restore_flags(args, os.path.join(args.log_dir, 'training'))
        args.train_steps = train_steps  # use newly passed value
        args.restore = True
        if beta_init != args.get('beta_init', None):
            args.beta_init = beta_init
        if beta_final != args.get('beta_final', None):
            args.beta_final = beta_final
    else:  # New training session
        timestamps = AttrDict({
            'month': io.get_timestamp('%Y_%m'),
            'time': io.get_timestamp('%Y-%m-%d-%H%M%S'),
            'hour': io.get_timestamp('%Y-%m-%d-%H'),
            'minute': io.get_timestamp('%Y-%m-%d-%H%M'),
            'second': io.get_timestamp('%Y-%m-%d-%H%M%S'),
        })
        args.log_dir = io.make_log_dir(args, 'GaugeModel', log_file,
                                       timestamps=timestamps)
        io.write(f'{args.log_dir}', log_file, 'a')
        args.restore = False

    if hmc_steps > 0:
        x, _, eps = train_hmc(args)
        args.dynamics_config['eps'] = eps

    dynamics_config = args.get('dynamics_config', None)
    if dynamics_config is not None:
        log_dir = dynamics_config.get('log_dir', None)
        if log_dir is not None:
            eps_file = os.path.join(log_dir, 'training', 'models', 'eps.z')
            if os.path.isfile(eps_file):
                io.log(f'Loading eps from: {eps_file}')
                eps = io.loadz(eps_file)
                args.dynamics_config['eps'] = eps

    _, dynamics, _, args = train(args, x=x)

    # Run inference on trained model
    if args.get('run_steps', 5000) > 0:
        # Run with random start
        dynamics, _, _ = run(dynamics, args)

        # Run HMC
        args.hmc = True
        args.dynamics_config['eps'] = 0.15
        hmc_dir = os.path.join(args.log_dir, 'inference_hmc')
        _ = run_hmc(args=args, hmc_dir=hmc_dir)
def __init__(self,
             params: AttrDict,
             config: GaugeDynamicsConfig,
             network_config: Optional[NetworkConfig] = None,
             lr_config: Optional[LearningRateConfig] = None,
             conv_config: Optional[ConvolutionConfig] = None):
    # Set attributes from `config`
    self.aux_weight = config.get('aux_weight', 0.)
    self.plaq_weight = config.get('plaq_weight', 0.)
    self.charge_weight = config.get('charge_weight', 0.01)
    self._gauge_eq_masks = config.get('gauge_eq_masks', False)
    self.lattice_shape = config.get('lattice_shape', None)
    self._combined_updates = config.get('combined_updates', False)
    self._alpha = tf.constant(1.)

    self.lattice = GaugeLattice(self.lattice_shape)
    self.batch_size = self.lattice_shape[0]
    self.xdim = np.cumprod(self.lattice_shape[1:])[-1]

    self.config = config
    self.lr_config = lr_config
    self.conv_config = conv_config
    self.net_config = network_config
    if not self.config.use_conv_net:
        self.conv_config = None

    params.update({
        'batch_size': self.lattice_shape[0],
        'xdim': np.cumprod(self.lattice_shape[1:])[-1],
    })

    super().__init__(
        params=params,
        config=config,
        name='GaugeDynamics',
        normalizer=convert_to_angle,
        network_config=network_config,
        lr_config=lr_config,
        potential_fn=self.lattice.calc_actions,
        should_build=False,
    )

    self._has_trainable_params = True
    if self.config.hmc:
        net_weights = NetWeights(0., 0., 0., 0., 0., 0.)
        self.config.use_ncp = False
        self.config.separate_networks = False
        self.config.use_conv_net = False
        self.conv_config = None
        self.xnet, self.vnet = self._build_hmc_networks()
        if self.config.eps_fixed:
            self._has_trainable_params = False
    else:
        if self.config.use_ncp:
            net_weights = NetWeights(1., 1., 1., 1., 1., 1.)
        else:
            net_weights = NetWeights(0., 1., 1., 1., 1., 1.)

        log_dir = self.config.get('log_dir', None)
        if log_dir is None:
            self.xnet, self.vnet = self._build_networks(
                self.net_config, self.conv_config)
        else:
            io.log(f'Loading `xnet`, `vnet` from {log_dir} !!')
            self.xnet, self.vnet = self._load_networks(log_dir)

    self.net_weights = self._parse_net_weights(net_weights)
    if self._has_trainable_params:
        self.lr_config = lr_config
        self.lr = self._create_lr(lr_config, auto=True)
        self.optimizer = self._create_optimizer()
def train_dynamics(
        dynamics: Union[BaseDynamics, GaugeDynamics],
        flags: AttrDict,
        dirs: AttrDict = None,
        x: tf.Tensor = None,
        betas: tf.Tensor = None,
):
    """Train model."""
    # Setup
    factor = flags.get('reduce_lr_factor', 0.5)
    patience = flags.get('patience', 10)
    min_lr = flags.get('min_lr')
    warmup_steps = dynamics.lr_config.get('warmup_steps', 1000)
    reduce_lr = ReduceLROnPlateau(monitor='loss', mode='min',
                                  warmup_steps=warmup_steps,
                                  factor=factor, min_lr=min_lr,
                                  verbose=1, patience=patience)
    reduce_lr.set_model(dynamics)

    config = setup(dynamics, flags, dirs, x, betas)
    x = config.x
    steps = config.steps
    betas = config.betas
    train_step = config.train_step
    ckpt = config.checkpoint
    manager = config.manager
    train_data = config.train_data
    writer = config.writer if IS_CHIEF else None
    if writer is not None:
        writer.set_as_default()

    # +-----------------------------------------------------------------+
    # | Try running compiled `train_step` fn otherwise run imperatively |
    # +-----------------------------------------------------------------+
    io.log(120 * '*')
    try:
        if flags.profiler:
            tf.summary.trace_on(graph=True, profiler=True)

        x, metrics = train_step((x, tf.constant(betas[0])))
        io.log('Compiled `dynamics.train_step` using tf.function!')
        if IS_CHIEF and flags.profiler:
            tf.summary.trace_export(name='train_step_trace', step=0,
                                    profiler_outdir=dirs.summary_dir)
            tf.summary.trace_off()
    except Exception as exception:
        io.log(exception, level='CRITICAL')
        train_step = dynamics.train_step
        x, metrics = train_step((x, tf.constant(betas[0])))
        lstr = '\n'.join([
            '`tf.function(dynamics.train_step)` failed!',
            'Running `dynamics.train_step` imperatively...',
        ])
        io.log(lstr, level='CRITICAL')

    io.log(120 * '*')
    if IS_CHIEF:
        xf = os.path.join(dirs.log_dir, 'dynamics_xnet.png')
        vf = os.path.join(dirs.log_dir, 'dynamics_vnet.png')
        try:
            xnet = dynamics.xnet
            vnet = dynamics.vnet
            if dynamics.config.separate_networks:
                xnet = xnet[0]
                vnet = vnet[0]

            tf.keras.utils.plot_model(xnet, show_shapes=True, to_file=xf)
            tf.keras.utils.plot_model(vnet, show_shapes=True, to_file=vf)
        except Exception as exception:
            print(exception)

    # +--------------------------------+
    # | Run MD update to not get stuck |
    # +--------------------------------+
    md_steps = flags.get('md_steps', 0)
    if md_steps > 0:
        io.log(f'Running {md_steps} MD updates...')
        for _ in range(md_steps):
            mc_states, _ = dynamics.md_update((x, tf.constant(betas[0])),
                                              training=True)
            x = mc_states.out.x

    # +--------------------------------------------------------------+
    # | Final setup; create timing wrapper for `train_step` function |
    # | and get formatted header string to display during training.  |
    # +--------------------------------------------------------------+
    ps_ = flags.get('print_steps', None)
    ls_ = flags.get('logging_steps', None)

    def timed_step(x: tf.Tensor, beta: tf.Tensor):
        start = time.time()
        x, metrics = train_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        return x, metrics

    def should_print(step):
        return IS_CHIEF and step % ps_ == 0

    def should_log(step):
        return IS_CHIEF and step % ls_ == 0

    def should_save(step):
        return step % flags.save_steps == 0 and ckpt is not None

    header = train_data.get_header(metrics, skip=['charges'],
                                   prepend=['{:^12s}'.format('step')])
    if IS_CHIEF:
        io.log(header.split('\n'), should_print=True)

    if NUM_NODES == 1:
        ctup = (CBARS['blue'], CBARS['yellow'],
                CBARS['blue'], CBARS['reset'])
        steps = tqdm(steps, desc='training', unit='step',
                     bar_format=("%s{l_bar}%s{bar}%s{r_bar}%s" % ctup))

    # +---------------+
    # | Training loop |
    # +---------------+
    warmup_steps = dynamics.lr_config.get('warmup_steps', 100)
    steps_per_epoch = flags.get('steps_per_epoch', 1000)
    for step, beta in zip(steps, betas):
        # Perform a single training step
        x, metrics = timed_step(x, beta)

        # Check for a loss plateau once per "epoch", after warmup
        if (step + 1) > warmup_steps and (step + 1) % steps_per_epoch == 0:
            reduce_lr.on_epoch_end(step + 1, {'loss': metrics.loss})

        # Save checkpoints and dump configs `x` from each rank
        if should_save(step + 1):
            train_data.dump_configs(x, dirs.data_dir,
                                    rank=RANK, local_rank=LOCAL_RANK)
            if IS_CHIEF:
                manager.save()
                dynamics.save_networks(dirs.log_dir)
                train_data.save_and_flush(dirs.data_dir, dirs.log_file,
                                          rank=RANK, mode='a')

        # Print current training state and metrics
        if should_print(step):
            data_str = train_data.get_fstr(step, metrics, skip=['charges'])
            io.log(data_str, should_print=True)

        # Update summary objects
        if should_log(step):
            train_data.update(step, metrics)
            if writer is not None:
                update_summaries(step, metrics, dynamics)
                writer.flush()

        # Print header every so often
        if IS_CHIEF and (step + 1) % (50 * flags.print_steps) == 0:
            io.log(header.split('\n'), should_print=True)

    train_data.dump_configs(x, dirs.data_dir,
                            rank=RANK, local_rank=LOCAL_RANK)
    if IS_CHIEF:
        manager.save()
        io.log(f'Checkpoint saved to: {manager.latest_checkpoint}')
        train_data.save_and_flush(dirs.data_dir, dirs.log_file,
                                  rank=RANK, mode='a')
        if writer is not None:
            writer.flush()
            writer.close()

    return x, train_data
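# How `setup` and `train_dynamics` fit together (sketch; `flags` is the
# usual AttrDict of training flags):
#   >>> dirs = io.setup_directories(flags)
#   >>> dynamics = build_dynamics(flags)
#   >>> x, train_data = train_dynamics(dynamics, flags, dirs=dirs)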
def setup(dynamics, flags, dirs=None, x=None, betas=None):
    """Setup training."""
    train_data = DataContainer(flags.train_steps, dirs=dirs,
                               print_steps=flags.print_steps)
    ckpt = tf.train.Checkpoint(dynamics=dynamics,
                               optimizer=dynamics.optimizer)
    manager = tf.train.CheckpointManager(ckpt, dirs.ckpt_dir, max_to_keep=5)
    if manager.latest_checkpoint:  # restore from checkpoint
        io.log(f'Restored model from: {manager.latest_checkpoint}')
        ckpt.restore(manager.latest_checkpoint)
        current_step = dynamics.optimizer.iterations.numpy()
        x = train_data.restore(dirs.data_dir, step=current_step,
                               rank=RANK, local_rank=LOCAL_RANK,
                               x_shape=dynamics.x_shape)
    else:
        io.log('Starting new training run...')

    # Create initial samples if not restoring from ckpt
    if x is None:
        x = np.pi * tf.random.normal(shape=dynamics.x_shape)

    # Setup summary writer
    writer = None
    make_summaries = flags.get('make_summaries', True)
    if IS_CHIEF and make_summaries and TF_VERSION == 2:
        writer = tf.summary.create_file_writer(dirs.summary_dir)

    current_step = dynamics.optimizer.iterations.numpy()  # get global step
    num_steps = max([flags.train_steps + 1, current_step + 1])
    steps = tf.range(current_step, num_steps, dtype=tf.int64)
    train_data.steps = steps[-1]

    def _get_betas():
        if flags.beta_init == flags.beta_final:  # train at fixed beta
            return flags.beta_init * np.ones(len(steps))
        # otherwise, build annealing schedule w/ same length as `steps`
        return get_betas(len(steps), flags.beta_init, flags.beta_final)

    if betas is None:
        betas = _get_betas()[current_step:]
    if len(betas) == 0:
        # e.g. restoring an already-completed run; use the full schedule
        betas = _get_betas()

    betas = tf.constant(betas, dtype=TF_FLOAT)
    dynamics.compile(loss=dynamics.calc_losses,
                     optimizer=dynamics.optimizer,
                     experimental_run_tf_function=False)
    inputs = (x, tf.constant(betas[0]))
    _ = dynamics.apply_transition(inputs, training=True)

    if flags.get('compile', True):
        train_step = tf.function(dynamics.train_step)
    else:
        train_step = dynamics.train_step

    pstart = 0
    pstop = 0
    if flags.profiler:
        pstart = len(betas) // 2
        pstop = pstart + 10

    output = AttrDict({
        'x': x,
        'betas': betas,
        'steps': steps,
        'writer': writer,
        'manager': manager,
        'checkpoint': ckpt,
        'train_step': train_step,
        'train_data': train_data,
        'pstart': pstart,
        'pstop': pstop,
    })

    if dynamics.config.separate_networks:
        xnet_files = [
            os.path.join(dirs.models_dir, f'dynamics_xnet{i}')
            for i in range(dynamics.config.num_steps)
        ]
        vnet_files = [
            os.path.join(dirs.models_dir, f'dynamics_vnet{i}')
            for i in range(dynamics.config.num_steps)
        ]
        for idx, (xf, vf) in enumerate(zip(xnet_files, vnet_files)):
            xnet = dynamics.xnet[idx]
            vnet = dynamics.vnet[idx]
            io.log(f'Saving `GaugeDynamics.xnet{idx}` to {xf}.')
            io.log(f'Saving `GaugeDynamics.vnet{idx}` to {vf}.')
            xnet.save(xf)
            vnet.save(vf)
    else:
        # Save only if not running generic HMC
        if not dynamics.config.get('hmc', False):
            xnet_file = os.path.join(dirs.models_dir, 'dynamics_xnet')
            vnet_file = os.path.join(dirs.models_dir, 'dynamics_vnet')
            io.log(f'Saving `GaugeDynamics.xnet` to {xnet_file}.')
            io.log(f'Saving `GaugeDynamics.vnet` to {vnet_file}.')
            dynamics.xnet.save(xnet_file)
            dynamics.vnet.save(vnet_file)

    return output
from tqdm.auto import tqdm
import numpy as np
import tensorflow as tf

import utils.file_io as io

try:
    import horovod.tensorflow as hvd
    HAS_HOROVOD = True
    RANK = hvd.rank()
    LOCAL_RANK = hvd.local_rank()
    NUM_NODES = hvd.size()
except (ImportError, ModuleNotFoundError):
    HAS_HOROVOD = False
    RANK = 0
    LOCAL_RANK = 0
    NUM_NODES = 1

IS_CHIEF = (RANK == 0)
io.log(f'Number of devices: {NUM_NODES}')

from config import CBARS, NET_WEIGHTS_HMC, TF_FLOAT
from network.config import LearningRateConfig
from utils.file_io import timeit
from utils.attr_dict import AttrDict
from utils.summary_utils import update_summaries
from utils.learning_rate import ReduceLROnPlateau
from utils.plotting_utils import plot_data
def run_dynamics(
        dynamics: GaugeDynamics,
        flags: dict[str, Any],
        writer: tf.summary.SummaryWriter = None,
        x: tf.Tensor = None,
        beta: float = None,
        save_x: bool = False,
        md_steps: int = 0,
) -> InferenceResults:
    """Run inference on trained dynamics."""
    if not IS_CHIEF:
        return InferenceResults(None, None, None, None, None)

    # -- Setup -----------------------------
    print_steps = flags.get('print_steps', 5)
    if beta is None:
        beta = flags.get('beta', flags.get('beta_final', None))  # type: float
    if beta is None:
        logger.warning('beta unspecified! setting to 1')
        beta = 1.

    assert beta is not None and isinstance(beta, float)

    test_step = dynamics.test_step
    if flags.get('compile', True):
        test_step = tf.function(dynamics.test_step)
        io.log('Compiled `dynamics.test_step` using tf.function!')

    if x is None:
        x = tf.random.uniform(shape=dynamics.x_shape,
                              minval=-PI, maxval=PI, dtype=TF_FLOAT)

    assert tf.is_tensor(x)

    run_steps = flags.get('run_steps', 20000)
    run_data = DataContainer(run_steps)

    template = '\n'.join([f'beta={beta}',
                          f'net_weights={dynamics.net_weights}'])
    logger.info(f'Running inference with {template}')

    # Run `md_steps` MD updates (w/o accept/reject)
    # to ensure chains don't get stuck
    if md_steps > 0:
        for _ in range(md_steps):
            mc_states, _ = dynamics.md_update((x, beta), training=False)
            x = mc_states.out.x

    try:
        x, metrics = test_step((x, tf.constant(beta)))
    except Exception as err:  # pylint:disable=broad-except
        logger.warning(err)
        test_step = dynamics.test_step
        x, metrics = test_step((x, tf.constant(beta)))

    x_arr = []

    def timed_step(x: tf.Tensor, beta: tf.Tensor):
        start = time.time()
        x, metrics = test_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        if 'sin_charges' not in metrics:
            charges = dynamics.lattice.calc_both_charges(x=x)
            metrics['charges'] = charges.intQ
            metrics['sin_charges'] = charges.sinQ
        if save_x:
            x_arr.append(x.numpy())

        return x, metrics

    summary_steps = max(run_steps // 100, 50)
    if writer is not None:
        writer.set_as_default()

    steps = tf.range(run_steps, dtype=tf.int64)
    keep_ = ['step', 'dt', 'loss', 'accept_prob', 'beta',
             'dq_int', 'dq_sin', 'dQint', 'dQsin', 'plaqs', 'p4x4']
    beta = tf.constant(beta, dtype=TF_FLOAT)  # type: tf.Tensor
    data_strs = []
    for step in steps:
        x, metrics = timed_step(x, beta)
        run_data.update(step, metrics)  # update data after every acc/rej
        if step % summary_steps == 0:
            update_summaries(step, metrics, dynamics)

        if step % print_steps == 0:
            pre = [f'{step}/{steps[-1]}']
            ms = run_data.print_metrics(metrics, pre=pre, keep=keep_)
            data_strs.append(ms)

    return InferenceResults(dynamics=dynamics, x=x, x_arr=x_arr,
                            run_data=run_data, data_strs=data_strs)
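# Minimal inference sketch using this entry point (flag values are
# illustrative; `dynamics` is a trained GaugeDynamics instance):
#   >>> flags = {'run_steps': 2000, 'beta': 4.0, 'print_steps': 10}
#   >>> results = run_dynamics(dynamics, flags, md_steps=50)
#   >>> results.run_data  # DataContainer with the accumulated metrics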
def run_dynamics(
        dynamics: GaugeDynamics,
        flags: AttrDict,
        x: tf.Tensor = None,
        save_x: bool = False,
        md_steps: int = 0,
) -> (DataContainer, tf.Tensor, list):
    """Run inference on trained dynamics."""
    if not IS_CHIEF:
        return None, None, None

    # Setup
    print_steps = flags.get('print_steps', 5)
    beta = flags.get('beta', flags.get('beta_final', None))
    test_step = dynamics.test_step
    if flags.get('compile', True):
        test_step = tf.function(dynamics.test_step)
        io.log('Compiled `dynamics.test_step` using tf.function!')

    if x is None:
        x = tf.random.uniform(shape=dynamics.x_shape,
                              minval=-PI, maxval=PI, dtype=TF_FLOAT)

    run_data = DataContainer(flags.run_steps)
    template = '\n'.join([f'beta: {beta}',
                          f'eps: {dynamics.eps.numpy():.4g}',
                          f'net_weights: {dynamics.net_weights}'])
    io.log(f'Running inference with:\n {template}')

    # Run `md_steps` MD updates (w/o accept/reject)
    # to ensure chains don't get stuck
    if md_steps > 0:
        for _ in range(md_steps):
            # pass inputs as an (x, beta) tuple, matching `test_step`
            mc_states, _ = dynamics.md_update((x, beta), training=False)
            x = mc_states.out.x

    try:
        x, metrics = test_step((x, tf.constant(beta)))
    except Exception as exception:  # pylint:disable=broad-except
        io.log(f'Exception: {exception}')
        test_step = dynamics.test_step
        x, metrics = test_step((x, tf.constant(beta)))

    header = run_data.get_header(metrics, skip=['charges'],
                                 prepend=['{:^12s}'.format('step')])
    io.log(header.split('\n'), should_print=True)

    x_arr = []

    def timed_step(x: tf.Tensor, beta: tf.Tensor):
        start = time.time()
        x, metrics = test_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        if save_x:
            x_arr.append(x.numpy())

        return x, metrics

    steps = tf.range(flags.run_steps, dtype=tf.int64)
    if NUM_NODES == 1:
        ctup = (CBARS['red'], CBARS['green'], CBARS['red'], CBARS['reset'])
        steps = tqdm(steps, desc='running', unit='step',
                     bar_format=("%s{l_bar}%s{bar}%s{r_bar}%s" % ctup))

    for step in steps:
        x, metrics = timed_step(x, beta)
        run_data.update(step, metrics)
        if step % print_steps == 0:
            summarize_dict(metrics, step, prefix='testing')
            data_str = run_data.get_fstr(step, metrics, skip=['charges'])
            io.log(data_str, should_print=True)

        if (step + 1) % 1000 == 0:
            io.log(header, should_print=True)

    return run_data, x, x_arr