Example #1
def dump_configs(x, data_dir, rank=0, local_rank=0):
    """Save configs `x` separately for each rank."""
    xfile = os.path.join(data_dir, f'x_rank{rank}-{local_rank}.z')
    io.log(f'Saving configs from rank {rank}-{local_rank} to: {xfile}.')
    head, _ = os.path.split(xfile)
    io.check_else_make_dir(head)
    joblib.dump(x, xfile)
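A minimal round-trip sketch for `dump_configs`; the directory, array shape, and rank values are illustrative, and `joblib.load` simply reverses the `joblib.dump` call above:

import os
import joblib
import numpy as np

data_dir = '/tmp/train_data'                # illustrative directory
x = np.random.normal(size=(4, 8))           # stand-in for a batch of configs
dump_configs(x, data_dir, rank=0, local_rank=0)
x_restored = joblib.load(os.path.join(data_dir, 'x_rank0-0.z'))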
Example #2
def plot_charges(steps, charges, title=None, out_dir=None):
    charges = charges.T
    if charges.shape[0] > 4:
        charges = charges[:4, :]
    fig, ax = plt.subplots()
    for idx, q in enumerate(charges):
        ax.plot(steps, np.around(q) + 5 * idx, marker='', ls='-')
    ax.set_yticks([])
    ax.set_yticklabels([])
    ax.set_xmargin(0)
    ax.yaxis.set_label_coords(-0.03, 0.5)
    ax.set_ylabel(
        r"$\mathcal{Q}$",  # , fontsize='x-large',
        rotation='horizontal')
    ax.set_xlabel('MC Step')  # , fontsize='x-large')
    if title is not None:
        ax.set_title(title)  # , fontsize='x-large')
    plt.tight_layout()

    if out_dir is not None:
        fpath = os.path.join(out_dir, 'charge_chains.png')
        io.log(f'Saving figure to: {fpath}.')
        plt.savefig(fpath, dpi=400, bbox_inches='tight')

    return fig, ax
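A hedged usage sketch for `plot_charges` with synthetic data; the (num_steps, num_chains) layout is inferred from the transpose on the first line, and all values are illustrative:

import numpy as np

steps = np.arange(2000)
charges = np.random.randint(-3, 4, size=(2000, 8))   # (MC steps, chains)
fig, ax = plot_charges(steps, charges, title='Topological charge', out_dir=None)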
Example #3
def load_and_run(
        args: AttrDict,
        x: tf.Tensor = None,
        runs_dir: str = None,
) -> (GaugeDynamics, DataContainer, tf.Tensor):
    """Load trained model from checkpoint and run inference."""
    if not IS_CHIEF:
        return None, None, None

    io.print_dict(args)
    ckpt_dir = os.path.join(args.log_dir, 'training', 'checkpoints')
    flags = restore_from_train_flags(args)
    eps_file = os.path.join(args.log_dir, 'training', 'train_data', 'eps.z')
    flags.eps = io.loadz(eps_file)[-1]
    dynamics = build_dynamics(flags)

    ckpt = tf.train.Checkpoint(dynamics=dynamics,
                               optimizer=dynamics.optimizer)
    manager = tf.train.CheckpointManager(ckpt, max_to_keep=5,
                                         directory=ckpt_dir)
    if manager.latest_checkpoint:
        io.log(f'Restored model from: {manager.latest_checkpoint}')
        status = ckpt.restore(manager.latest_checkpoint)
        status.assert_existing_objects_matched()
        xfile = os.path.join(args.log_dir, 'training',
                             'train_data', 'x_rank0.z')
        io.log(f'Restored x from: {xfile}.')
        x = io.loadz(xfile)

    dynamics, run_data, x = run(dynamics, args, x=x, runs_dir=runs_dir)

    return dynamics, run_data, x
Example #4
def restore_flags(flags, train_dir):
    """Update `FLAGS` using restored flags from `log_dir`."""
    rf_file = os.path.join(train_dir, 'FLAGS.z')
    restored = AttrDict(dict(io.loadz(rf_file)))
    io.log(f'Restoring FLAGS from: {rf_file}...')
    flags.update(restored)

    return flags
Example #5
    def load_data(data_dir):
        """Load data from `data_dir` and populate `self.data`."""
        contents = os.listdir(data_dir)
        fnames = [i for i in contents if i.endswith('.z')]
        keys = [i.rstrip('.z') for i in fnames]
        data_files = [os.path.join(data_dir, i) for i in fnames]
        data = {}
        for key, val in zip(keys, data_files):
            if 'x_rank' in key:
                continue
            io.log(f'Restored {key} from {val}.')
            data[key] = io.loadz(val)

        return AttrDict(data)
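A hedged round-trip sketch for the loader above; it assumes `io.loadz` reads the `.z` files written by `joblib.dump` (as Example #1 suggests) and that the method is usable as a staticmethod, since it takes no `self`:

import os
import joblib

data_dir = '/tmp/train_data'                # illustrative directory
os.makedirs(data_dir, exist_ok=True)
joblib.dump([0.5, 0.4, 0.3], os.path.join(data_dir, 'loss.z'))
data = load_data(data_dir)                  # -> AttrDict({'loss': [0.5, 0.4, 0.3]})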
Example #6
    def on_epoch_end(self, step, logs=None):
        if step < self.warmup_steps:
            return

        #  def _get_lr():
        #      try:
        #          lr = K.get_value(self.model.optimizer.lr)
        #      except ValueError:
        #          lr = self.model.lr(self.model.optimizer.iterations)
        #      finally:
        #          lr = None
        #
        #      return lr

        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            #  logging.warning(
            io.log(f'ReduceLROnPlateau conditioned on metric'
                   f' {self.monitor} which is not available.'
                   f' Available metrics are: {",".join(list(logs.keys()))}')

        else:
            if self.in_cooldown():
                self.cooldown_counter -= 1
                self.wait = 0
            if self.monitor_op(current, self.best):
                self.best = current
                self.wait = 0
            elif not self.in_cooldown():
                self.wait += 1
                if self.wait >= self.patience:
                    step = self.model.optimizer.iterations
                    old_lr = self.model._get_lr(step)
                    if old_lr > self.min_lr:
                        new_lr = old_lr * self.factor
                        new_lr = max(new_lr, self.min_lr)
                        K.set_value(self.model.optimizer.lr, new_lr)
                        if self.verbose > 0:
                            #  logging.warning(
                            print(
                                f'ReduceLROnPlateau (step {step}):'
                                ' Reducing learning rate from:'
                                f' {old_lr} to {new_lr}.', )
                            print(f'current: {current}, best: {self.best}')
                            #  print(f'\nstep {epoch}: ReduceLROnPlateau'
                            #        ' reducing learning rate from:'
                            #        f' {old_lr} to {new_lr:g}.')
                        self.cooldown_counter = self.cooldown
                        self.wait = 0
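Example #21 below shows how this `on_epoch_end` hook is driven from the training loop; a compressed sketch of that wiring, where `dynamics`, `steps`, `betas`, and `x` are as in that example and the numeric values are illustrative:

warmup_steps, steps_per_epoch = 1000, 1000
reduce_lr = ReduceLROnPlateau(monitor='loss', mode='min', factor=0.5,
                              patience=10, min_lr=1e-5,
                              warmup_steps=warmup_steps)
reduce_lr.set_model(dynamics)
for step, beta in zip(steps, betas):
    x, metrics = dynamics.train_step((x, tf.constant(beta)))
    if (step + 1) > warmup_steps and (step + 1) % steps_per_epoch == 0:
        reduce_lr.on_epoch_end(step + 1, {'loss': metrics.loss})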
Example #7
    def save_networks(self, log_dir):
        """Save networks to disk."""
        models_dir = os.path.join(log_dir, 'training', 'models')
        io.check_else_make_dir(models_dir)
        eps_file = os.path.join(models_dir, 'eps.z')
        io.savez(self.eps.numpy(), eps_file, name='eps')
        if self.config.separate_networks:
            xnet_paths = [
                os.path.join(models_dir, f'dynamics_xnet{i}')
                for i in range(self.config.num_steps)
            ]
            vnet_paths = [
                os.path.join(models_dir, f'dynamics_vnet{i}')
                for i in range(self.config.num_steps)
            ]
            for idx, (xf, vf) in enumerate(zip(xnet_paths, vnet_paths)):
                xnet = self.xnet[idx]  # type: tf.keras.models.Model
                vnet = self.vnet[idx]  # type: tf.keras.models.Model
                io.log(f'Saving `xnet{idx}` to {xf}.')
                io.log(f'Saving `vnet{idx}` to {vf}.')
                xnet.save(xf)
                vnet.save(vf)
        else:
            xnet_paths = os.path.join(models_dir, 'dynamics_xnet')
            vnet_paths = os.path.join(models_dir, 'dynamics_vnet')
            io.log(f'Saving `xnet` to {xnet_paths}.')
            io.log(f'Saving `vnet` to {vnet_paths}.')
            self.xnet.save(xnet_paths)
            self.vnet.save(vnet_paths)
Example #8
def short_training(
        train_steps: int,
        beta: float,
        log_dir: str,
        dynamics: GaugeDynamics,
        x: tf.Tensor = None,
):
    """Perform a brief training run prior to running inference."""
    ckpt_dir = os.path.join(log_dir, 'training', 'checkpoints')
    ckpt = tf.train.Checkpoint(dynamics=dynamics, optimizer=dynamics.optimizer)
    manager = tf.train.CheckpointManager(ckpt, ckpt_dir, max_to_keep=5)
    current_step = 0
    if manager.latest_checkpoint:
        io.log(f'Restored model from: {manager.latest_checkpoint}')
        ckpt.restore(manager.latest_checkpoint)
        current_step = dynamics.optimizer.iterations.numpy()

    if x is None:
        x = convert_to_angle(tf.random.normal(dynamics.x_shape))

    train_data = DataContainer(current_step+train_steps, print_steps=1)

    dynamics.compile(loss=dynamics.calc_losses,
                     optimizer=dynamics.optimizer,
                     experimental_run_tf_function=False)

    x, metrics = dynamics.train_step((x, tf.constant(beta)))

    header = train_data.get_header(metrics, skip=SKEYS,
                                   prepend=['{:^12s}'.format('step')])
    io.log(header.split('\n'))
    for step in range(current_step, current_step + train_steps):
        start = time.time()
        x, metrics = dynamics.train_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        train_data.update(step, metrics)
        data_str = train_data.print_metrics(metrics)
        logger.info(data_str)

        # logger.print_metrics(metrics)
        #  data_str = train_data.get_fstr(step, metrics, skip=SKEYS)
        #  io.log(data_str)

    return dynamics, train_data, x
Example #9
def run_hmc(
        args: AttrDict,
        hmc_dir: str = None,
        skip_existing: bool = False,
) -> (GaugeDynamics, DataContainer, tf.Tensor):
    """Run HMC using `inference_args` on a model specified by `params`.

    NOTE:
    -----
    args should be a dict with the following keys:
        - 'hmc'
        - 'eps'
        - 'beta'
        - 'num_steps'
        - 'run_steps'
        - 'lattice_shape'
    """
    if not IS_CHIEF:
        return None, None, None

    if hmc_dir is None:
        root_dir = os.path.join(GAUGE_LOGS_DIR, 'hmc_logs')
        month_str = io.get_timestamp('%Y_%m')
        hmc_dir = os.path.join(root_dir, month_str)

    io.check_else_make_dir(hmc_dir)

    def get_run_fstr(run_dir):
        _, tail = os.path.split(run_dir)
        fstr = tail.split('-')[0]
        return fstr

    if skip_existing:
        run_dirs = [os.path.join(hmc_dir, i) for i in os.listdir(hmc_dir)]
        run_fstrs = [get_run_fstr(i) for i in run_dirs]
        run_fstr = io.get_run_dir_fstr(args)
        if run_fstr in run_fstrs:
            io.log('ERROR: Existing run found! Skipping.')
            return None, None, None

    dynamics = build_dynamics(args)
    dynamics, run_data, x = run(dynamics, args, runs_dir=hmc_dir)

    return dynamics, run_data, x
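A hedged sketch of the `args` dict that `run_hmc` expects; the keys follow the NOTE in the docstring and every value is illustrative:

args = AttrDict({
    'hmc': True,
    'eps': 0.1,
    'beta': 4.0,
    'num_steps': 10,
    'run_steps': 5000,
    'lattice_shape': (128, 16, 16, 2),
})
dynamics, run_data, x = run_hmc(args, hmc_dir=None, skip_existing=True)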
Example #10
def load_test_configs(json_file: Union[str, Path] = None):
    """Load test configs, if specified.

    If not specified, load from `BIN_DIR/test_configs.json`.

    Returns:
        configs (AttrDict): Configs.
    """
    if json_file is None:
        json_file = os.path.join(BIN_DIR, 'test_configs.json')

    try:
        with open(json_file, 'rt') as f:
            configs = json.load(f)
    except FileNotFoundError:
        io.log(f'Unable to load configs from: {json_file}. Exiting.')
        raise

    return configs
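A hedged usage sketch; the explicit path in the second call is hypothetical:

configs = load_test_configs()                                 # falls back to BIN_DIR/test_configs.json
debug_configs = load_test_configs('bin/debug_configs.json')   # hypothetical explicit path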
Example #11
    def write_to_csv(self, log_dir, run_dir, hmc=False):
        """Write data averages to bulk csv file for comparing runs."""
        _, run_str = os.path.split(run_dir)
        avg_data = {
            'log_dir': log_dir,
            'run_dir': run_str,
            'hmc': hmc,
        }
        for key, val in self.data.items():
            tensor = tf.convert_to_tensor(val)
            arr, steps = therm_arr(tensor.numpy(), therm_frac=0.2)
            if 'steps' not in avg_data:
                avg_data['steps'] = len(steps)
            avg_data[key] = np.mean(arr)
            #  avg_data[key] = tf.reduce_mean(arr)

        avg_df = pd.DataFrame(avg_data, index=[0])
        csv_file = os.path.join(BASE_DIR, 'logs', 'GaugeModel_logs',
                                'inference_results.csv')
        io.log(f'Appending inference results to {csv_file}.')
        if not os.path.isfile(csv_file):
            avg_df.to_csv(csv_file, header=True, index=False, mode='w')
        else:
            avg_df.to_csv(csv_file, header=False, index=False, mode='a')
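`therm_arr` is not shown in these examples; a plausible minimal sketch consistent with how it is called here, dropping the leading `therm_frac` fraction of the chain and returning the kept array plus the surviving step indices (the default fraction and axis convention are assumptions):

import numpy as np

def therm_arr(arr, therm_frac=0.2):
    """Drop the leading `therm_frac` fraction (thermalization) along axis 0."""
    arr = np.asarray(arr)
    therm_steps = int(therm_frac * arr.shape[0])
    steps = np.arange(therm_steps, arr.shape[0])
    return arr[therm_steps:], steps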
Example #12
def main(args, random_start=True):
    """Run inference on trained model from `log_dir/checkpoints/`."""
    if not IS_CHIEF:
        return

    io.print_flags(args)
    skip = not args.get('overwrite', False)

    # If no `log_dir` specified, run generic HMC
    log_dir = args.get('log_dir', None)
    if log_dir is None:
        io.log('`log_dir` not specified, running generic HMC...')
        _ = run_hmc(args=args, hmc_dir=None, skip_existing=skip)
        return

    # Otherwise, load training flags
    train_flags_file = os.path.join(log_dir, 'training', 'FLAGS.z')
    train_flags = io.loadz(train_flags_file)

    beta = args.get('beta', None)
    eps = args.get('eps', None)

    if beta is None:
        io.log('Using `beta_final` from training flags')
        beta = train_flags['beta_final']
    if eps is None:
        eps_file = os.path.join(log_dir, 'training', 'train_data', 'eps.z')
        io.log(f'Loading `eps` from {eps_file}')
        eps_arr = io.loadz(eps_file)
        eps = tf.cast(eps_arr[-1], TF_FLOAT)

    # Update `args` with values from training flags
    args.update({
        'eps': eps,
        'beta': beta,
        'num_steps': int(train_flags['num_steps']),
        'lattice_shape': train_flags['lattice_shape'],
    })

    # Run generic HMC using the trained step size `eps` loaded above
    _ = run_hmc(args=args, hmc_dir=None, skip_existing=skip)

    # `x` will be randomly initialized if passed as `None`
    x = None
    if not random_start:
        # Load the last configuration from the end of training run
        x_file = os.path.join(args.log_dir, 'training',
                              'train_data', 'x_rank0.z')
        x = io.loadz(x_file) if os.path.isfile(x_file) else None

    # Run inference on trained model from `args.log_dir`
    args['hmc'] = False  # Ensure we're running L2HMC
    _ = load_and_run(args, x=x)

    return
Example #13
def train(flags: AttrDict, x: tf.Tensor = None, restore_x: bool = False):
    """Train model.

    Returns:
        x (tf.Tensor): Batch of configurations
        dynamics (GaugeDynamics): Dynamics object.
        train_data (DataContainer): Object containing train data.
        flags (AttrDict): AttrDict containing flags used.
    """
    dirs = io.setup_directories(flags)
    flags.update({'dirs': dirs})

    if restore_x:
        x = None
        try:
            xfile = os.path.join(dirs.train_dir, 'train_data',
                                 f'x_rank{RANK}-{LOCAL_RANK}.z')
            x = io.loadz(xfile)
        except FileNotFoundError:
            io.log(f'Unable to restore x from {xfile}. Using random init.')

    if x is None:
        x = tf.random.normal(flags.dynamics_config['lattice_shape'])
        x = tf.reshape(x, (x.shape[0], -1))

    dynamics = build_dynamics(flags)
    dynamics.save_config(dirs.config_dir)

    io.log('\n'.join([120 * '*', 'Training L2HMC sampler...']))
    x, train_data = train_dynamics(dynamics, flags, dirs, x=x)

    if IS_CHIEF:
        output_dir = os.path.join(dirs.train_dir, 'outputs')
        train_data.save_data(output_dir)

        params = {
            'beta_init': train_data.data.beta[0],
            'beta_final': train_data.data.beta[-1],
            'eps': dynamics.eps.numpy(),
            'lattice_shape': dynamics.config.lattice_shape,
            'num_steps': dynamics.config.num_steps,
            'net_weights': dynamics.net_weights,
        }
        plot_data(train_data,
                  dirs.train_dir,
                  flags,
                  thermalize=True,
                  params=params)

    io.log('\n'.join(['Done training model', 120 * '*']))
    io.save_dict(dict(flags), dirs.log_dir, 'configs')

    return x, dynamics, train_data, flags
Example #14
    def get_observables(self, run_dir=None):
        """Get all observables from inference_data in `run_dir`."""
        run_params = io.loadz(os.path.join(run_dir, 'run_params.pkl'))
        beta = run_params['beta']
        net_weights = tuple([int(i) for i in run_params['net_weights']])

        keep = True
        if self._nw_include is not None:
            keep = net_weights in self._nw_include

        # If fewer than 10% of the proposed configs are accepted,
        # don't bother loading data and calculating statistics.
        px = self._load_sqz('px.pkl')
        avg_px = np.mean(px)
        if avg_px < 0.1 or not keep:
            io.log(f'Skipping! nw: {net_weights}, avg_px: {avg_px:.3g}')
            return None, run_params

        io.log(f'Loading data for net_weights: {net_weights}...')
        io.log(f'  run_dir: {run_dir}')

        # load charges, plaqs data
        charges = self._load_sqz('charges.pkl')
        plaqs = self._load_sqz('plaqs.pkl')
        dplq = u1_plaq_exact(beta) - plaqs

        # thermalize configs
        px, _ = therm_arr(px, self._therm_frac)
        dplq, _ = therm_arr(dplq, self._therm_frac)
        charges = np.insert(charges, 0, 0, axis=0)
        charges, _ = therm_arr(charges)
        dq, _ = calc_tunneling_rate(charges)
        dq = dq.T

        dx = self._get_dx('dx.pkl')
        dxf = self._get_dx('dxf.pkl')
        dxb = self._get_dx('dxb.pkl')
        observables = {
            'plaqs_diffs': dplq,
            'accept_prob': px,
            'tunneling_rate': dq,
        }
        _names = ['dx', 'dxf', 'dxb']
        _vals = [dx, dxf, dxb]
        for name, val in zip(_names, _vals):
            if val is not None:
                observables[name] = val

        return observables, run_params
Example #15
def run_from_log_dir(log_dir: str, net_weights: NetWeights, run_steps=5000):
    configs = load_configs_from_log_dir(log_dir)
    if 'x_shape' not in configs['dynamics_config'].keys():
        x_shape = configs['dynamics_config'].pop('lattice_shape')
        configs['dynamics_config']['x_shape'] = x_shape

    beta = configs['beta_final']
    nwstr = 'nw' + ''.join([f'{int(i)}' for i in net_weights])
    run_dir = os.path.join(PROJECT_DIR, 'l2hmc_function_tests',
                           'inference', f'beta{beta}', f'{nwstr}')
    if os.path.isdir(run_dir):
        io.log(f'EXISTING RUN FOUND AT: {run_dir}, SKIPPING!', style='bold red')

    io.check_else_make_dir(run_dir)
    log_dir = configs.get('log_dir', None)
    configs['log_dir_orig'] = log_dir
    configs['log_dir'] = run_dir
    configs['run_steps'] = run_steps
    configs = AttrDict(configs)

    dynamics = build_dynamics(configs)
    xnet, vnet = dynamics._load_networks(log_dir)
    dynamics.xnet = xnet
    dynamics.vnet = vnet
    io.log(f'Original dynamics.net_weights: {dynamics.net_weights}')
    io.log(f'Setting `dynamics.net_weights` to: {net_weights}')
    dynamics._set_net_weights(net_weights)
    dynamics.net_weights = net_weights
    io.log(f'Now, dynamics.net_weights: {dynamics.net_weights}')
    dynamics, train_data, x = short_training(1000, beta, log_dir=log_dir,
                                             dynamics=dynamics, x=None)
    inference_results = run(dynamics, configs, beta=beta, runs_dir=run_dir,
                            md_steps=500, make_plots=True, therm_frac=0.2,
                            num_chains=16)

    return inference_results
Example #16
    def restore(self, data_dir, rank=0, local_rank=0, step=None, x_shape=None):
        """Restore `self.data` from `data_dir`."""
        if step is not None:
            self.steps += step

        x_file = os.path.join(data_dir, f'x_rank{rank}-{local_rank}.z')
        try:
            x = io.loadz(x_file)
            io.log(f'Restored `x` from: {x_file}.', should_print=True)
        except FileNotFoundError:
            io.log(f'Unable to load `x` from {x_file}.', level='WARNING')
            io.log('Using random normal init.', level='WARNING')
            x = tf.random.normal(x_shape)

        data = self.load_data(data_dir)
        for key, val in data.items():
            self.data[key] = np.array(val).tolist()

        return x
Example #17
def savefig(fig, fpath):
    io.check_else_make_dir(os.path.dirname(fpath))
    io.log(f'Saving figure to: {fpath}.')
    fig.savefig(fpath, dpi=400, bbox_inches='tight')
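A hedged usage sketch for `savefig`; the output path is illustrative, and the parent directory is created by `io.check_else_make_dir`:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(range(10), range(10))
savefig(fig, '/tmp/l2hmc_plots/line.png')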
Example #18
def _savefig(fig, out_file):
    """Save `fig` to `out_file`."""
    io.log(f'Saving figure to: {out_file}.')
    fig.savefig(out_file, dpi=200, bbox_inches='tight')
Example #19
def main(args):
    """Main method for training."""
    hmc_steps = args.get('hmc_steps', 0)
    tf.keras.backend.set_floatx('float32')
    log_file = os.path.join(os.getcwd(), 'log_dirs.txt')

    x = None
    log_dir = args.get('log_dir', None)
    beta_init = args.get('beta_init', None)
    beta_final = args.get('beta_final', None)
    if log_dir is not None:  # we want to restore from latest checkpoint
        train_steps = args.get('train_steps', None)
        args = restore_flags(args, os.path.join(args.log_dir, 'training'))
        args.train_steps = train_steps  # use newly passed value
        args.restore = True
        if beta_init != args.get('beta_init', None):
            args.beta_init = beta_init
        if beta_final != args.get('beta_final', None):
            args.beta_final = beta_final

    else:  # New training session
        timestamps = AttrDict({
            'month': io.get_timestamp('%Y_%m'),
            'time': io.get_timestamp('%Y-%m-%d-%H%M%S'),
            'hour': io.get_timestamp('%Y-%m-%d-%H'),
            'minute': io.get_timestamp('%Y-%m-%d-%H%M'),
            'second': io.get_timestamp('%Y-%m-%d-%H%M%S'),
        })
        args.log_dir = io.make_log_dir(args,
                                       'GaugeModel',
                                       log_file,
                                       timestamps=timestamps)
        io.write(f'{args.log_dir}', log_file, 'a')
        args.restore = False
        if hmc_steps > 0:
            x, _, eps = train_hmc(args)
            args.dynamics_config['eps'] = eps

    dynamics_config = args.get('dynamics_config', None)
    if dynamics_config is not None:
        log_dir = dynamics_config.get('log_dir', None)
        if log_dir is not None:
            eps_file = os.path.join(log_dir, 'training', 'models', 'eps.z')
            if os.path.isfile(eps_file):
                io.log(f'Loading eps from: {eps_file}')
                eps = io.loadz(eps_file)
                args.dynamics_config['eps'] = eps

    _, dynamics, _, args = train(args, x=x)

    # ====
    # Run inference on trained model
    if args.get('run_steps', 5000) > 0:
        # ====
        # Run with random start
        dynamics, _, _ = run(dynamics, args)

        # ====
        # Run HMC
        args.hmc = True
        args.dynamics_config['eps'] = 0.15
        hmc_dir = os.path.join(args.log_dir, 'inference_hmc')
        _ = run_hmc(args=args, hmc_dir=hmc_dir)
Example #20
    def __init__(self,
                 params: AttrDict,
                 config: GaugeDynamicsConfig,
                 network_config: Optional[NetworkConfig] = None,
                 lr_config: Optional[LearningRateConfig] = None,
                 conv_config: Optional[ConvolutionConfig] = None):
        # ====
        # Set attributes from `config`
        self.aux_weight = config.get('aux_weight', 0.)
        self.plaq_weight = config.get('plaq_weight', 0.)
        self.charge_weight = config.get('charge_weight', 0.01)
        self._gauge_eq_masks = config.get('gauge_eq_masks', False)
        self.lattice_shape = config.get('lattice_shape', None)
        self._combined_updates = config.get('combined_updates', False)
        self._alpha = tf.constant(1.)
        #  self._alpha = tf.Variable(initial_value=1., trainable=False)

        self.lattice = GaugeLattice(self.lattice_shape)
        self.batch_size = self.lattice_shape[0]
        self.xdim = np.cumprod(self.lattice_shape[1:])[-1]

        self.config = config
        self.lr_config = lr_config
        self.conv_config = conv_config
        self.net_config = network_config
        if not self.config.use_conv_net:
            self.conv_config = None

        params.update({
            'batch_size': self.lattice_shape[0],
            'xdim': np.cumprod(self.lattice_shape[1:])[-1],
        })

        super().__init__(
            params=params,
            config=config,
            name='GaugeDynamics',
            normalizer=convert_to_angle,
            network_config=network_config,
            lr_config=lr_config,
            potential_fn=self.lattice.calc_actions,
            should_build=False,
        )
        self._has_trainable_params = True
        if self.config.hmc:
            net_weights = NetWeights(0., 0., 0., 0., 0., 0.)
            self.config.use_ncp = False
            self.config.separate_networks = False
            self.config.use_conv_net = False
            self.conv_config = None
            self.xnet, self.vnet = self._build_hmc_networks()
            if self.config.eps_fixed:
                self._has_trainable_params = False
        else:
            if self.config.use_ncp:
                net_weights = NetWeights(1., 1., 1., 1., 1., 1.)
            else:
                net_weights = NetWeights(0., 1., 1., 1., 1., 1.)

            log_dir = self.config.get('log_dir', None)
            if log_dir is None:
                self.xnet, self.vnet = self._build_networks(
                    self.net_config, self.conv_config)
            else:
                io.log(f'Loading `xnet`, `vnet`, from {log_dir} !!')
                self.xnet, self.vnet = self._load_networks(log_dir)
            # ============

        self.net_weights = self._parse_net_weights(net_weights)
        if self._has_trainable_params:
            self.lr_config = lr_config
            self.lr = self._create_lr(lr_config, auto=True)
            self.optimizer = self._create_optimizer()
Example #21
def train_dynamics(
    dynamics: Union[BaseDynamics, GaugeDynamics],
    flags: AttrDict,
    dirs: str = None,
    x: tf.Tensor = None,
    betas: tf.Tensor = None,
):
    """Train model."""
    # setup...
    factor = flags.get('reduce_lr_factor', 0.5)
    patience = flags.get('patience', 10)
    min_lr = flags.get('min_lr')
    warmup_steps = dynamics.lr_config.get('warmup_steps', 1000)
    reduce_lr = ReduceLROnPlateau(monitor='loss',
                                  mode='min',
                                  warmup_steps=warmup_steps,
                                  factor=factor,
                                  min_lr=min_lr,
                                  verbose=1,
                                  patience=patience)
    reduce_lr.set_model(dynamics)

    config = setup(dynamics, flags, dirs, x, betas)
    x = config.x
    steps = config.steps
    betas = config.betas
    train_step = config.train_step
    ckpt = config.checkpoint
    manager = config.manager
    train_data = config.train_data
    if IS_CHIEF:
        writer = config.writer
        if writer is not None:
            writer.set_as_default()

    # +-----------------------------------------------------------------+
    # | Try running compiled `train_step` fn otherwise run imperatively |
    # +-----------------------------------------------------------------+
    io.log(120 * '*')
    try:
        if flags.profiler:
            tf.summary.trace_on(graph=True, profiler=True)
        x, metrics = train_step((x, tf.constant(betas[0])))
        io.log('Compiled `dynamics.train_step` using tf.function!')
        if IS_CHIEF and flags.profiler:
            tf.summary.trace_export(name='train_step_trace',
                                    step=0,
                                    profiler_outdir=dirs.summary_dir)
            tf.summary.trace_off()
    except Exception as exception:
        io.log(exception, level='CRITICAL')
        train_step = dynamics.train_step
        x, metrics = train_step((x, tf.constant(betas[0])))
        lstr = '\n'.join([
            '`tf.function(dynamics.train_step)` failed!',
            'Running `dynamics.train_step` imperatively...'
        ])
        io.log(lstr, level='CRITICAL')
    io.log(120 * '*')

    if IS_CHIEF:
        xf = os.path.join(dirs.log_dir, 'dynamics_xnet.png')
        vf = os.path.join(dirs.log_dir, 'dynamics_vnet.png')
        try:
            xnet = dynamics.xnet
            vnet = dynamics.vnet
            if dynamics.config.separate_networks:
                xnet = xnet[0]
                vnet = vnet[0]

            tf.keras.utils.plot_model(xnet, show_shapes=True, to_file=xf)
            tf.keras.utils.plot_model(vnet, show_shapes=True, to_file=vf)

        except Exception as exception:
            print(exception)

    # +--------------------------------+
    # | Run MD update to not get stuck |
    # +--------------------------------+
    md_steps = flags.get('md_steps', 0)
    if md_steps > 0:
        io.log(f'Running {md_steps} MD updates...')
        for _ in range(md_steps):
            mc_states, _ = dynamics.md_update((x, tf.constant(betas[0])),
                                              training=True)
            x = mc_states.out.x

    # +--------------------------------------------------------------+
    # | Final setup; create timing wrapper for `train_step` function |
    # | and get formatted header string to display during training.  |
    # +--------------------------------------------------------------+
    ps_ = flags.get('print_steps', None)
    ls_ = flags.get('logging_steps', None)

    def timed_step(x: tf.Tensor, beta: tf.Tensor):
        start = time.time()
        x, metrics = train_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        return x, metrics

    def should_print(step):
        if IS_CHIEF and step % ps_ == 0:
            return True
        return False

    def should_log(step):
        if IS_CHIEF and step % ls_ == 0:
            return True
        return False

    def should_save(step):
        if step % flags.save_steps == 0 and ckpt is not None:
            return True
        return False

    header = train_data.get_header(metrics,
                                   skip=['charges'],
                                   prepend=['{:^12s}'.format('step')])
    if IS_CHIEF:
        io.log(header.split('\n'), should_print=True)
        if NUM_NODES == 1:
            ctup = (CBARS['blue'], CBARS['yellow'], CBARS['blue'],
                    CBARS['reset'])
            steps = tqdm(steps,
                         desc='training',
                         unit='step',
                         bar_format=("%s{l_bar}%s{bar}%s{r_bar}%s" % ctup))

    # +---------------+
    # | Training loop |
    # +---------------+
    warmup_steps = dynamics.lr_config.get('warmup_steps', 100)
    steps_per_epoch = flags.get('steps_per_epoch', 1000)
    print(f'steps_per_epoch: {steps_per_epoch}')
    for step, beta in zip(steps, betas):
        # Perform a single training step
        x, metrics = timed_step(x, beta)

        #  if step % 10 == 0:
        if (step + 1) > warmup_steps and (step + 1) % steps_per_epoch == 0:
            #  logs = {'loss': train_data.data.get('loss', None)}
            reduce_lr.on_epoch_end(step + 1, {'loss': metrics.loss})

        # Save checkpoints and dump configs `x` from each rank
        if should_save(step + 1):
            train_data.dump_configs(x,
                                    dirs.data_dir,
                                    rank=RANK,
                                    local_rank=LOCAL_RANK)
            if IS_CHIEF:
                manager.save()
                dynamics.save_networks(dirs.log_dir)
                #  save_models(dynamics, dirs)
                train_data.save_and_flush(dirs.data_dir,
                                          dirs.log_file,
                                          rank=RANK,
                                          mode='a')

        # Print current training state and metrics
        if should_print(step):
            data_str = train_data.get_fstr(step, metrics, skip=['charges'])
            io.log(data_str, should_print=True)

        # Update summary objects
        if should_log(step):
            train_data.update(step, metrics)
            if writer is not None:
                update_summaries(step, metrics, dynamics)
                writer.flush()

        # Print header every so often
        if IS_CHIEF and (step + 1) % (50 * flags.print_steps) == 0:
            io.log(header.split('\n'), should_print=True)

    train_data.dump_configs(x, dirs.data_dir, rank=RANK, local_rank=LOCAL_RANK)
    if IS_CHIEF:
        manager.save()
        io.log(f'Checkpoint saved to: {manager.latest_checkpoint}')
        train_data.save_and_flush(dirs.data_dir,
                                  dirs.log_file,
                                  rank=RANK,
                                  mode='a')
        if writer is not None:
            writer.flush()
            writer.close()

    return x, train_data
Example #22
0
def setup(dynamics, flags, dirs=None, x=None, betas=None):
    """Setup training."""
    train_data = DataContainer(flags.train_steps,
                               dirs=dirs,
                               print_steps=flags.print_steps)
    ckpt = tf.train.Checkpoint(dynamics=dynamics, optimizer=dynamics.optimizer)
    manager = tf.train.CheckpointManager(ckpt, dirs.ckpt_dir, max_to_keep=5)
    if manager.latest_checkpoint:  # restore from checkpoint
        io.log(f'Restored model from: {manager.latest_checkpoint}')
        ckpt.restore(manager.latest_checkpoint)
        current_step = dynamics.optimizer.iterations.numpy()
        x = train_data.restore(dirs.data_dir,
                               step=current_step,
                               rank=RANK,
                               local_rank=LOCAL_RANK,
                               x_shape=dynamics.x_shape)
    else:
        io.log('Starting new training run...')

    # Create initial samples if not restoring from ckpt
    if x is None:
        x = np.pi * tf.random.normal(shape=dynamics.x_shape)

    # Setup summary writer
    writer = None
    make_summaries = flags.get('make_summaries', True)
    if IS_CHIEF and make_summaries and TF_VERSION == 2:
        writer = tf.summary.create_file_writer(dirs.summary_dir)

    current_step = dynamics.optimizer.iterations.numpy()  # get global step
    num_steps = max([flags.train_steps + 1, current_step + 1])
    steps = tf.range(current_step, num_steps, dtype=tf.int64)
    train_data.steps = steps[-1]
    if betas is None:
        if flags.beta_init == flags.beta_final:  # train at fixed beta
            betas = flags.beta_init * np.ones(len(steps))
        else:  # get annealing schedule w/ same length as `steps`
            betas = get_betas(len(steps), flags.beta_init, flags.beta_final)
        betas = betas[current_step:]

    if len(betas) == 0:
        if flags.beta_init == flags.beta_final:  # train at fixed beta
            betas = flags.beta_init * np.ones(len(steps))
        else:  # get annealing schedule w/ same length as `steps`
            betas = get_betas(len(steps), flags.beta_init, flags.beta_final)
            betas = betas[current_step:]

    betas = tf.constant(betas, dtype=TF_FLOAT)
    dynamics.compile(loss=dynamics.calc_losses,
                     optimizer=dynamics.optimizer,
                     experimental_run_tf_function=False)
    #  x_tspec = tf.TensorSpec(dynamics.x_shape, dtype=x.dtype, name='x')
    #  beta_tspec = tf.TensorSpec([], dtype=TF_FLOAT, name='beta')
    #  input_signature=[x_tspec, beta_tspec])

    try:
        inputs = (x, tf.constant(betas[0]))
    except IndexError:
        if flags.beta_init == flags.beta_final:  # train at fixed beta
            betas = flags.beta_init * np.ones(len(steps))
        else:  # get annealing schedule w/ same length as `steps`
            betas = get_betas(len(steps), flags.beta_init, flags.beta_final)
            betas = betas[current_step:]
        inputs = (x, tf.constant(betas[0]))  # rebuild `inputs` with the regenerated betas

    _ = dynamics.apply_transition(inputs, training=True)

    if flags.get('compile', True):
        train_step = tf.function(dynamics.train_step)
    else:
        train_step = dynamics.train_step

    pstart = 0
    pstop = 0
    if flags.profiler:
        pstart = len(betas) // 2
        pstop = pstart + 10

    output = AttrDict({
        'x': x,
        'betas': betas,
        'steps': steps,
        'writer': writer,
        'manager': manager,
        'checkpoint': ckpt,
        'train_step': train_step,
        'train_data': train_data,
        'pstart': pstart,
        'pstop': pstop,
    })

    if dynamics.config.separate_networks:
        xnet_files = [
            os.path.join(dirs.models_dir, f'dynamics_xnet{i}')
            for i in range(dynamics.config.num_steps)
        ]
        vnet_files = [
            os.path.join(dirs.models_dir, f'dynamics_vnet{i}')
            for i in range(dynamics.config.num_steps)
        ]
        for idx, (xf, vf) in enumerate(zip(xnet_files, vnet_files)):
            xnet = dynamics.xnet[idx]
            vnet = dynamics.vnet[idx]
            io.log(f'Saving `GaugeDynamics.xnet{idx}` to {xf}.')
            io.log(f'Saving `GaugeDynamics.vnet{idx}` to {vf}.')
            xnet.save(xf)
            vnet.save(vf)
    else:
        # Save only if not running generic HMC
        if not dynamics.config.get('hmc', False):
            xnet_files = os.path.join(dirs.models_dir, 'dynamics_xnet')
            vnet_files = os.path.join(dirs.models_dir, 'dynamics_vnet')
            io.log(f'Saving `GaugeDynamics.xnet` to {xnet_files}.')
            io.log(f'Saving `GaugeDynamics.vnet` to {vnet_files}.')
            dynamics.xnet.save(xnet_files)
            dynamics.vnet.save(vnet_files)

    return output
Example #23
from tqdm.auto import tqdm

import numpy as np
import tensorflow as tf

import utils.file_io as io

try:
    import horovod.tensorflow as hvd

    HAS_HOROVOD = True
    RANK = hvd.rank()
    LOCAL_RANK = hvd.local_rank()
    IS_CHIEF = (RANK == 0)
    NUM_NODES = hvd.size()
    io.log(f'Number of devices: {NUM_NODES}')
except (ImportError, ModuleNotFoundError):
    HAS_HOROVOD = False
    RANK = 0
    LOCAL_RANK = 0
    IS_CHIEF = (RANK == 0)
    NUM_NODES = 1
    io.log(f'Number of devices: {NUM_NODES}')

from config import CBARS, NET_WEIGHTS_HMC, TF_FLOAT
from network.config import LearningRateConfig
from utils.file_io import timeit
from utils.attr_dict import AttrDict
from utils.summary_utils import update_summaries
from utils.learning_rate import ReduceLROnPlateau
from utils.plotting_utils import plot_data
Example #24
def run_dynamics(
        dynamics: GaugeDynamics,
        flags: dict[str, Any],
        writer: tf.summary.SummaryWriter = None,
        x: tf.Tensor = None,
        beta: float = None,
        save_x: bool = False,
        md_steps: int = 0,
        # window: int = 0,
        #  should_track: bool = False,
) -> (InferenceResults):
    """Run inference on trained dynamics."""
    if not IS_CHIEF:
        return InferenceResults(None, None, None, None, None)

    # -- Setup -----------------------------
    print_steps = flags.get('print_steps', 5)
    if beta is None:
        beta = flags.get('beta', flags.get('beta_final', None))  # type: float
        if beta is None:
            logger.warning('beta unspecified! setting to 1.0')
            beta = 1.
        assert beta is not None and isinstance(beta, float)

    test_step = dynamics.test_step
    if flags.get('compile', True):
        test_step = tf.function(dynamics.test_step)
        io.log('Compiled `dynamics.test_step` using tf.function!')

    if x is None:
        x = tf.random.uniform(shape=dynamics.x_shape,
                              minval=-PI, maxval=PI,
                              dtype=TF_FLOAT)
    assert tf.is_tensor(x)

    run_steps = flags.get('run_steps', 20000)
    run_data = DataContainer(run_steps)

    template = '\n'.join([f'beta={beta}',
                          f'net_weights={dynamics.net_weights}'])
    logger.info(f'Running inference with {template}')

    # Run `md_steps` MD updates (w/o accept/reject)
    # to ensure chains don't get stuck
    if md_steps > 0:
        for _ in range(md_steps):
            mc_states, _ = dynamics.md_update((x, beta), training=False)
            x = mc_states.out.x

    try:
        x, metrics = test_step((x, tf.constant(beta)))
    except Exception as err:  # pylint:disable=broad-except
        logger.warning(err)
        #  io.log(f'Exception: {exception}')
        test_step = dynamics.test_step
        x, metrics = test_step((x, tf.constant(beta)))

    x_arr = []

    def timed_step(x: tf.Tensor, beta: tf.Tensor):
        start = time.time()
        x, metrics = test_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        if 'sin_charges' not in metrics:
            charges = dynamics.lattice.calc_both_charges(x=x)
            metrics['charges'] = charges.intQ
            metrics['sin_charges'] = charges.sinQ
        if save_x:
            x_arr.append(x.numpy())

        return x, metrics

    summary_steps = max(run_steps // 100, 50)

    if writer is not None:
        writer.set_as_default()

    steps = tf.range(run_steps, dtype=tf.int64)
    keep_ = ['step', 'dt', 'loss', 'accept_prob', 'beta',
             'dq_int', 'dq_sin', 'dQint', 'dQsin', 'plaqs', 'p4x4']

    beta = tf.constant(beta, dtype=TF_FLOAT)  # type: tf.Tensor
    data_strs = []
    for idx, step in enumerate(steps):
        x, metrics = timed_step(x, beta)
        run_data.update(step, metrics)  # update data after every accept/reject

        if step % summary_steps == 0:
            update_summaries(step, metrics, dynamics)
            # summarize_dict(metrics, step, prefix='testing')

        if step % print_steps == 0:
            pre = [f'{step}/{steps[-1]}']
            ms = run_data.print_metrics(metrics,
                                        pre=pre, keep=keep_)
            data_strs.append(ms)

    return InferenceResults(dynamics=dynamics, x=x, x_arr=x_arr,
                            run_data=run_data, data_strs=data_strs)
Example #25
def run_dynamics(
        dynamics: GaugeDynamics,
        flags: AttrDict,
        x: tf.Tensor = None,
        save_x: bool = False,
        md_steps: int = 0,
) -> (DataContainer, tf.Tensor, list):
    """Run inference on trained dynamics."""
    if not IS_CHIEF:
        return None, None, None

    # Setup
    print_steps = flags.get('print_steps', 5)
    beta = flags.get('beta', flags.get('beta_final', None))

    test_step = dynamics.test_step
    if flags.get('compile', True):
        test_step = tf.function(dynamics.test_step)
        io.log('Compiled `dynamics.test_step` using tf.function!')

    if x is None:
        x = tf.random.uniform(shape=dynamics.x_shape,
                              minval=-PI, maxval=PI,
                              dtype=TF_FLOAT)

    run_data = DataContainer(flags.run_steps)

    template = '\n'.join([f'beta: {beta}',
                          f'eps: {dynamics.eps.numpy():.4g}',
                          f'net_weights: {dynamics.net_weights}'])
    io.log(f'Running inference with:\n {template}')

    # Run 50 MD updates (w/o accept/reject) to ensure chains don't get stuck
    if md_steps > 0:
        for _ in range(md_steps):
            mc_states, _ = dynamics.md_update(x, beta, training=False)
            x = mc_states.out.x

    try:
        x, metrics = test_step((x, tf.constant(beta)))
    except Exception as exception:  # pylint:disable=broad-except
        io.log(f'Exception: {exception}')
        test_step = dynamics.test_step
        x, metrics = test_step((x, tf.constant(beta)))

    header = run_data.get_header(metrics,
                                 skip=['charges'],
                                 prepend=['{:^12s}'.format('step')])
    #  io.log(header)
    io.log(header.split('\n'), should_print=True)
    # -------------------------------------------------------------

    x_arr = []

    def timed_step(x: tf.Tensor, beta: tf.Tensor):
        start = time.time()
        x, metrics = test_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        if save_x:
            x_arr.append(x.numpy())

        return x, metrics

    steps = tf.range(flags.run_steps, dtype=tf.int64)
    if NUM_NODES == 1:
        ctup = (CBARS['red'], CBARS['green'], CBARS['red'], CBARS['reset'])
        steps = tqdm(steps, desc='running', unit='step',
                     bar_format=("%s{l_bar}%s{bar}%s{r_bar}%s" % ctup))

    for step in steps:
        x, metrics = timed_step(x, beta)
        run_data.update(step, metrics)

        if step % print_steps == 0:
            summarize_dict(metrics, step, prefix='testing')
            data_str = run_data.get_fstr(step, metrics, skip=['charges'])
            io.log(data_str, should_print=True)

        if (step + 1) % 1000 == 0:
            io.log(header, should_print=True)

    return run_data, x, x_arr