Example #1
def grad_clipper_AVT(
        gradients,  # gradients to clip
        clip_value=None,  # clipping value, for None clips with avt
        avt_SVal=0.1,  # start value for AVT (smaller value makes warmup)
        avt_window=100,  # width of averaging window (number of steps)
        avt_max_upd=1.5,  # single step max factor of avt update
        do_clip=True,  # False disables clipping (just GN calculations)
        verb=0):

    gg_norm = tf.global_norm(gradients)  # gradients global norm
    avt_gg_norm = tf.get_variable(  # time averaged gradients global norm variable
        name='avt_gg_norm',
        shape=[],
        trainable=False,
        initializer=tf.constant_initializer(avt_SVal),
        dtype=tf.float32)

    avt_update = tf.reduce_min(
        [gg_norm, avt_max_upd * avt_gg_norm]
    )  # single value to update AVTG with (current GNorm or clipped to max value)
    # assign new value
    avt_gg_norm = tf.assign(ref=avt_gg_norm,
                            value=(avt_gg_norm *
                                   (avt_window - 1) + avt_update) / avt_window)
    if verb > 0:
        print(
            f'grad_clipper_AVT: avt_SVal {avt_SVal:.1f}, avt_window {avt_window}, avt_max_upd {avt_max_upd:.1f}'
        )

    if do_clip:
        gradients, _ = tf.clip_by_global_norm(
            t_list=gradients,
            clip_norm=clip_value if clip_value else avt_gg_norm,
            use_norm=gg_norm)
        if verb > 0:
            print(
                f' >> is clipping gradients {"with value" if clip_value else "with AVT"}'
            )
    elif verb > 0:
        print(' >> not doing clipping')

    return {
        'gradients': gradients,
        'gg_norm': gg_norm,
        'avt_gg_norm': avt_gg_norm
    }
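
A minimal usage sketch for grad_clipper_AVT (assuming TensorFlow 1.x and that the function above is in scope; the toy graph x, y, w is purely illustrative):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 8])
y = tf.placeholder(tf.float32, [None, 1])
w = tf.get_variable('w', shape=[8, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

grads = tf.gradients(loss, [w])               # raw gradients
clip_out = grad_clipper_AVT(gradients=grads,  # clip_value=None >> clips with the time-averaged global norm
                            avt_window=100,
                            do_clip=True)

train_op = tf.train.AdamOptimizer(1e-3).apply_gradients(
    list(zip(clip_out['gradients'], [w])))    # apply the clipped gradients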
Example #2
def lay_res(
    lay_in,  # layer input
    lay_out,  # layer output
    name='residual',
    use_RCW=False,  # use residual connection weights
    use_PWRCW=False,  # pointwise weights
    match_dims=True):  # concatenates zeros to input when thinner

    # TODO: not working for higher-dim tensors
    with tf.variable_scope(name):

        output = lay_out
        iW = int(lay_in.shape[-1])
        oW = int(output.shape[-1])
        matchedDims = iW == oW

        # pad input with zeros to match dimension of output
        if iW < oW and match_dims:
            lay_in = tf.pad(tensor=lay_in,
                            paddings=tf.constant([[0, 0], [0, oW - iW]]))
            matchedDims = True

        if matchedDims:
            if use_RCW:
                if use_PWRCW: shape = [oW]
                else: shape = []

                convRCW = tf.get_variable(
                    name='rcw',
                    shape=shape,
                    initializer=tf.constant_initializer(0))

                output = lay_in * (
                    1 - tf.sigmoid(convRCW)) + output * tf.sigmoid(convRCW)
            else:
                output = lay_in + output

    return output
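
A minimal usage sketch for lay_res (assuming TensorFlow 1.x; layer widths are illustrative and the wider output exercises the zero-padding branch):

import tensorflow as tf

inp = tf.placeholder(tf.float32, [None, 64])
hidden = tf.layers.dense(inp, 128, activation=tf.nn.relu)  # output wider than input

res = lay_res(lay_in=inp,      # input gets zero-padded from 64 to 128 features
              lay_out=hidden,
              use_RCW=True)    # learnable scalar residual connection weight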
Example #3
def lr_scaler(
        iLR,                       # initial learning rate
        g_step: tf.Tensor = None,  # global step tf.Variable of tf.int type, for None creates one
        warm_up: int = 1000,       # warmup steps, None or 0 turns it off
        ann_base: float = 0.999,   # annealing base, None or 1 turns it off
        ann_step: float = 1.0,     # annealing step, higher value speeds up annealing
        n_wup_off: float = 2.0,    # N warmUp offset of annealing
        verb=0):

    if verb > 0: print(f'*** lr_scaler *** initial LR: {iLR}')
    iLR = tf.convert_to_tensor(iLR)

    # create global step variable if not given
    if g_step is None:
        g_step = tf.get_variable(name='g_step',
                                 shape=[],
                                 trainable=False,
                                 initializer=tf.constant_initializer(0),
                                 dtype=tf.int32)

    g_step_fl = tf.cast(g_step, dtype=tf.float32)
    if warm_up is None: warm_up = 0
    lR = iLR
    if warm_up:
        ratioWm = tf.reduce_min([g_step_fl, warm_up]) / warm_up  # warmUp ratio
        lR = iLR * ratioWm  # learning rate with warmup
        if verb > 0: print(f'applied warmUp ({warm_up}) to lR')
    if ann_base is not None and ann_base != 1:
        gStep_offs = tf.reduce_max([0, g_step_fl - warm_up * n_wup_off
                                    ])  # offset by warmUpSteps
        lR *= ann_base**(gStep_offs * ann_step)  # learning rate with annealing
        if verb > 0:
            print(f'applied annealing to lR ({ann_base:.5f},{ann_step:.5f})')
    return {'scaled_LR': lR, 'g_step': g_step}
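
A minimal usage sketch for lr_scaler (assuming TensorFlow 1.x; the schedule values are illustrative):

import tensorflow as tf

sc = lr_scaler(iLR=1e-3,
               warm_up=1000,    # linear warmup over the first 1000 steps
               ann_base=0.999,  # exponential annealing afterwards
               verb=1)
scaled_lr = sc['scaled_LR']     # learning-rate tensor to feed an optimizer
g_step = sc['g_step']           # int32 global-step variable created by lr_scaler

optimizer = tf.train.AdamOptimizer(learning_rate=scaled_lr)
# train_op = optimizer.minimize(loss, global_step=g_step)  # 'loss' comes from the model graph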
Example #4
def enc_TNS(
        in_seq,  # input sequence embeddings [batch, seq, emb], for TAT in_seq should be LNormalized
        name='enc_TNS',
        seq_out: bool = True,  # transformer seq2seq, if False seq2one (Task Attention Transformer)
        add_PE: bool = True,  # add positional embeddings
        do_LN: bool = True,  # do layer norm
        shared_lays: bool = False,  # shared variables in blocks
        n_blocks=12,
        n_heads=8,
        dense_mul: int or float = 4,  # dense (after att) scale
        activation=tf.nn.relu,
        max_seq_len=100,  # used only to set shape (axis 0) of positional embeddings
        dropout=0.0,  # dropout of FC after attention
        dropout_att=0.0,  # dropout of attention probabilities
        training_flag: tf.Tensor or bool = None,  # dropout training flag (bool or tensor)
        initializer=None,
        seed=12321,
        n_hist=4,  # number of histogram layers
        verb=0):

    if initializer is None:
        initializer = tf.truncated_normal_initializer(stddev=0.01, seed=seed)

    # split feats(-1) for heads
    def split_heads(x):
        x = tf.split(x, n_heads, axis=-1)  # list of tensors
        return tf.stack(x, axis=-3)  # [batch, head, seq, feats]

    # merge heads over feats(-1)
    def merge_heads(x):
        x = tf.unstack(x, axis=-3)
        return tf.concat(x, axis=-1)

    # multi_head_attention for input
    def mh_attn(
            in_seq,  # input sequence [batch, seq, feats]
            query=None,  # None for self attention, otherwise TAT [batch, n_queries, feats]
            activation=None,  # activation of KQV dense
            dropout_att=0.0,
            drop_flag=None,
            seed=seed):

        # input projection of in_seq for KQV or KV(if query)
        width = in_seq.shape[-1].value
        proj_size = 3 if query is None else 2
        c = lay_dense(
            input=in_seq,  # [batch, seq, feats]
            units=width * proj_size,
            name='mhProj',
            activation=activation,
            initializer=initializer,
            seed=seed)
        ins_split = tf.split(c, proj_size, axis=-1)  # split projected

        if query is not None:
            q = query  # projection for Q is not needed (at least with 1 head)
            k, v = ins_split
        else:
            q, k, v = ins_split
        q, k, v = map(split_heads, [q, k, v])

        # attention
        att_out = attn(q, k, v, dropout_att, drop_flag, seed)
        a = att_out['attention']
        a = merge_heads(a)
        return {'attention': a, 'att_vals': att_out['att_weights']}

    # transformer block
    def tblock(in_seq, seed, task_query=None):

        hist_summ = []

        output = in_seq
        taskQueryNorm = None
        if task_query is None:
            hist_summ.append(
                tf.summary.histogram('a_inputSeq', output, family=name))
            # layer norm 1 on seq
            if do_LN:
                output = tf.keras.layers.LayerNormalization(axis=-1)(output)
                hist_summ.append(
                    tf.summary.histogram('b_inputSeqLN', output, family=name))
        else:
            hist_summ.append(
                tf.summary.histogram('a_inTaskQuery', task_query, family=name))
            taskQueryNorm = task_query
            # layer norm 1 on taskQuery
            if do_LN:
                taskQueryNorm = tf.keras.layers.LayerNormalization(
                    axis=-1)(task_query)
                hist_summ.append(
                    tf.summary.histogram('b_taskQueryLN',
                                         taskQueryNorm,
                                         family=name))

        # multi head self attention
        mha_out = mh_attn(in_seq=output,
                          query=taskQueryNorm,
                          dropout_att=dropout_att,
                          drop_flag=training_flag,
                          seed=seed)
        output = mha_out['attention']
        att_vals = mha_out['att_vals']
        hist_summ.append(tf.summary.histogram('c_mhAttn', output, family=name))

        # dense without activation
        output = lay_dense(input=output,
                           units=output.shape[-1].value,
                           name='afterAttProj',
                           initializer=initializer,
                           seed=seed)
        hist_summ.append(
            tf.summary.histogram('d_denseAftAtt', output, family=name))

        if dropout:
            output = tf.layers.dropout(inputs=output,
                                       rate=dropout,
                                       training=training_flag,
                                       seed=seed)

        # residual 1
        if task_query is None:
            res1_out = in_seq + output
            hist_summ.append(
                tf.summary.histogram('e_res_onInputSeq', res1_out,
                                     family=name))
        else:
            res1_out = task_query + output
            hist_summ.append(
                tf.summary.histogram('e_res_onTaskQuery',
                                     res1_out,
                                     family=name))

        output = res1_out
        # layer norm 2
        if do_LN:
            output = tf.keras.layers.LayerNormalization(axis=-1)(output)
            hist_summ.append(
                tf.summary.histogram('f_layNorm', output, family=name))

        # 2x dense
        base_width = output.shape[-1].value
        output = lay_dense(input=output,
                           units=int(base_width * dense_mul),
                           name='dense1afterAtt',
                           activation=activation,
                           initializer=initializer,
                           seed=seed)
        zsL = [zeroes(output)]
        hist_summ.append(
            tf.summary.histogram('g_1denseOut', output, family=name))
        output = lay_dense(input=output,
                           units=base_width,
                           name='dense2afterAtt',
                           initializer=initializer,
                           seed=seed)
        hist_summ.append(
            tf.summary.histogram('h_2denseOut', output, family=name))

        if dropout:
            output = tf.layers.dropout(inputs=output,
                                       rate=dropout,
                                       training=training_flag,
                                       seed=seed)

        # residual2
        output += res1_out
        hist_summ.append(tf.summary.histogram('i_res', output, family=name))

        return {
            'output': output,
            'hist_summ': hist_summ,
            'att_vals': att_vals,
            'zeroes': zsL
        }

    width = in_seq.shape[-1]  # sequence width (feats)
    seq_len = tf.shape(in_seq)[-2]  # sequence length (time)

    if verb > 0:
        print('\nBuilding %s (transformer encoder) (%dx%d, denseMul %.1f), ' %
              (name, n_blocks, width, dense_mul))
        print(' > dropout: %.2f %.2f(att)' % (dropout, dropout_att))
        print(' > seq2seq mode...') if seq_out else print(
            ' > task attention mode...')

    hist_layers = list_of_layers(n_blocks, n_select=n_hist)
    if verb > 1:
        print(' > histogram layers of transformer encoder:', hist_layers)

    with tf.variable_scope(name):

        hist_summ = []  # list of histogram summaries

        if verb > 1: print(' > transformer input', in_seq)
        hist_summ.append(
            tf.summary.histogram('a_transformerInput', in_seq, family=name))

        # init task_query (for first block - input averaged over time (seq))
        task_query = None
        if not seq_out:
            task_query = tf.reduce_mean(in_seq, axis=-2,
                                        keep_dims=True)  # [batch,1,feats]
            if verb > 1:
                print(' > first task_query (reduced input) for TAT',
                      task_query)

        # positional embedding
        if add_PE:
            pos_emb_var = tf.get_variable(name='tnsPosEmb',
                                          shape=[max_seq_len, width],
                                          initializer=initializer)
            in_seq += tf.nn.embedding_lookup(params=pos_emb_var,
                                             ids=tf.range(seq_len))
            if verb > 1: print(' > added positional embedding to the input...')
            hist_summ.append(
                tf.summary.histogram('b_transformerPosEmbInput',
                                     in_seq,
                                     family=name))

        if verb > 1:
            print(' > building %d blocks of transformer...' % n_blocks)
        att_vals = []  # list of block attention values
        zsL = []
        block_output = None
        for nB in range(n_blocks):
            hist_lay = nB in hist_layers
            lay_name = f'block_{nB}' if not shared_lays else 'block_shared'
            with tf.variable_scope(lay_name, reuse=tf.AUTO_REUSE):
                bo_dict = tblock(in_seq=in_seq,
                                 seed=seed,
                                 task_query=task_query)
                block_output = bo_dict['output']
                if task_query is None: in_seq = block_output
                else: task_query = block_output

                zsL += bo_dict['zeroes']
                if hist_lay: hist_summ += bo_dict['hist_summ']
                att_block_vals = bo_dict[
                    'att_vals']  #[batch,head,query_n or seq,seq]
                att_vals.append(att_block_vals)

        if task_query is None: output = block_output
        else: output = tf.squeeze(task_query, axis=-2)

        if do_LN: output = tf.keras.layers.LayerNormalization(axis=-1)(output)

        hist_summ.append(
            tf.summary.histogram('c_transformer_out', output, family=name))

    if verb > 1: print(' > %s output' % name, output)
    return {
        'output': output,
        'hist_summ': hist_summ,
        'att_vals': att_vals,
        'zeroes': zsL
    }
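
A minimal usage sketch for enc_TNS (assuming TensorFlow 1.x and that the helpers it calls, e.g. lay_dense, attn, zeroes, list_of_layers, are importable from the same package; the shapes below are illustrative):

import tensorflow as tf

seq = tf.placeholder(tf.float32, [None, 50, 64])           # [batch, seq, feats]
train_flag = tf.placeholder_with_default(False, shape=[])  # dropout on/off at run time

enc = enc_TNS(in_seq=seq,
              seq_out=True,        # seq2seq; set False for seq2one (task attention)
              n_blocks=4,
              n_heads=8,           # 64 feats split into 8 heads of 8
              dropout=0.1,
              training_flag=train_flag,
              verb=1)
enc_out = enc['output']            # [batch, seq, feats]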
Example #5
    def __init__(
        self,
        fwd_func: GRAPH_FUNC,           # function building forward graph (from PH to loss)
        mdict: DNA,                     # model(graph) parameters dictionary
        devices=-1,                     # check neuralmess.dev_manager.ft_devices for details
        do_optimization: bool = True,   # add optimization part to the graph (for training)
        # values below complement mdict
        name='NEM',
        name_timestamp=False,           # adds timestamp to name
        seed=12321,
        opt_class=tf.train.AdamOptimizer,  # default optimizer, other examples: tf.train.GradientDescentOptimizer, partial(tf.train.AdamOptimizer, beta1=0.7, beta2=0.7)
        iLR=1e-3,
        warm_up=None,
        ann_base=None,
        ann_step=1,
        n_wup_off: float = 1,
        avt_SVal=1,
        avt_window=100,
        avt_max_upd=1.5,
        do_clip=False,
        # save
        read_only=False,                # sets model to be read only (..still may log)
        save_TFD: str = SAVE_TFD,       # top folder of model_FD
        savers_names: tuple = (None,),  # names of savers for MultiSaver  # TODO: what is this default value for?
        load_saver: bool or str = True, # for None does not load, for True loads default
        do_logfile=True,                # enables saving log file in save_TFD
        # GPU management
        sep_device=True,                # separate first device for variables, gradients_avg, optimizer (otherwise those are placed on the first FWD calculations tower)
        collocate_GWO=False,            # collocates gradient calculations with tf.OPs (gradients are calculated on every tower with its operations, but remember that vars are on one device...) (otherwise with first FWD calculations tower)
        # other
        verb: int = 0):                 # verb of NEModel (object/constructor), fwd_func has own verb in mdict

        dict.__init__(self)  # init self as a dict

        self.verb = verb
        if self.verb > 0: print('\n*** NEModel *** initializes...')

        self_args_dict = {  # params dict from NEModel constructor
            'name': name,
            'seed': seed,
            'opt_class': opt_class,
            'iLR': iLR,
            'warm_up': warm_up,
            'ann_base': ann_base,
            'ann_step': ann_step,
            'n_wup_off': n_wup_off,
            'avt_SVal': avt_SVal,
            'avt_window': avt_window,
            'avt_max_upd': avt_max_upd,
            'do_clip': do_clip
        }

        fwdf_mdict = get_defaults(
            function=fwd_func)  # params dict with defaults of fwd_func

        # resolve model name and extend with timestamp when needed
        resolved_name = self_args_dict['name']
        if 'name' in fwdf_mdict: resolved_name = fwdf_mdict['name']
        if 'name' in mdict: resolved_name = mdict['name']
        if name_timestamp: resolved_name += '.' + stamp()
        mdict['name'] = resolved_name
        self.model_dir = f'{save_TFD}/{mdict["name"]}'  # here goes everything from the model
        if self.verb > 0:
            print(f' > NEModel name: {mdict["name"]}')
            print(f' > NEModel dir: {self.model_dir}')

        # build folder-managed dna (created empty, it gets dna FROM FOLDER)
        self.__dna = ParaDict(dna_TFD=save_TFD,
                              dna_SFD=mdict['name'],
                              fn_pfx=NEMODEL_DNA_PFX,
                              verb=self.verb)

        # set logfile
        if do_logfile:
            set_logger(log_folder=self.model_dir,
                       custom_name=mdict['name'],
                       verb=self.verb)

        # resolve model dict (dna) in proper order
        md = {}
        md.update(self_args_dict
                  )  # 1 update with params dict from NEModel constructor
        md.update(fwdf_mdict)  # 2 update with defaults of fwd_func
        md.update(self.__dna)  # 3 update with params from folder
        md.update(mdict)  # 4 update with given mdict
        self.__dna.update(md)
        self.__dna.check_params_sim(SPEC_KEYS)  # safety check
        self.readonly = read_only
        if self.model_dir and not self.readonly: self.__dna.save()

        self.update(
            self.__dna)  # finally update self with all model building params

        devices = tf_devices(devices, verb=self.verb)

        # report devices
        if self.verb > 0:
            print()
            if len(devices) == 1:
                if 'CPU' in devices[0]:
                    print(f'NEModel builds CPU device setup')
                else:
                    print(f'NEModel builds single-GPU setup')
            else:
                print(
                    f'NEModel builds multi-dev setup for {len(devices)} devices'
                )

        if len(devices) < 3:
            sep_device = False  # SEP is available for 3 or more devices

        # build FWD graph(s) >> manage variables >> build OPT graph
        self.gFWD = []  # list of dicts of all FWD graphs (from all devices)
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self['seed'])  # set graph seed
            np.random.seed(self['seed'])
            if self.verb > 0:
                print(f'\nNEModel set TF & NP seed to {self["seed"]}')

            # builds graph @SEP, this graph won't be run, it is only needed to place variables; if not sep_device >> variables will be placed with the first tower
            if sep_device:
                if self.verb > 0:
                    print(
                        f'\nNEModel places {self["name"]} VARs on {devices[0]}...'
                    )
                with tf.device(devices[0]):
                    fwd_func(**self)

            tower_devices = [] + devices
            if sep_device: tower_devices = tower_devices[1:]  # trim SEP
            for dev in tower_devices:
                if self.verb > 0:
                    print(
                        f'\nNEModel builds FWD graph of {self["name"]} model @device: {dev}'
                    )
                with tf.device(dev):
                    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                        self.gFWD.append(fwd_func(**self))

            self.update(self.gFWD[0]
                        )  # update self with dictionary returned by fwd_func

            # get FWD variables returned by fwd_func (for saver)
            train_vars = []  # variables to train
            saver_vars = {}  # dict of variables to save
            for key in self.keys():
                if 'var' in key.lower():
                    if key == 'train_vars':
                        train_vars = self[key]
                        if type(train_vars) is not list:
                            train_vars = [train_vars]
                    else:
                        if type(self[key]) is not list:
                            saver_vars[key] = [self[key]]
                        else:
                            saver_vars[key] = self[key]

            all_vars = tf.global_variables()

            # variables were returned >> assert that all graph variables are present in the returned lists
            if saver_vars:
                all_vars_returned = []
                for key in saver_vars:
                    all_vars_returned += saver_vars[key]
                there_are_all = True
                for var in all_vars:
                    if var not in all_vars_returned:
                        print(
                            f' *** variable {var.name} not returned by fwd_func'
                        )
                        there_are_all = False
                assert there_are_all, 'ERR: there are some variables not returned by fwd_func in lists!'

            else:
                saver_vars['fwd_vars'] = all_vars  # put all

            if self.verb > 0:
                print('\nNEModel variables to save from fwd_func:')
                for key in sorted(list(saver_vars.keys())):
                    varList = saver_vars[key]
                    if varList:
                        print(
                            f' ### vars @{key} - num: {len(varList)}, floats: {short_scin(num_var_floats(varList))} ({varList[0].device})'
                        )
                    else:
                        print(' ### no vars')
                    if self.verb > 1: log_vars(varList)

            if 'loss' not in self:
                do_optimization = False
                if self.verb > 0:
                    print(
                        '\nthere is no loss in FWD graph, OPT graph will not be built'
                    )

            if not do_optimization:
                if self.verb > 0: print('\nOPT graph will not be built')
            # build optimization graph
            else:
                if self.verb > 0:
                    print(f'\nPreparing OPT part with {self["opt_class"]}')
                # select trainable variables for OPT
                all_tvars = tf.trainable_variables()
                if train_vars:
                    # check if all train_vars are trainable:
                    for var in train_vars:
                        if var not in all_tvars:
                            if self.verb > 0:
                                print(
                                    f'variable {var.name} is not trainable but is in train_vars, please check the graph!'
                                )
                else:
                    for key in saver_vars:
                        for var in saver_vars[key]:
                            if var in all_tvars:
                                train_vars.append(var)
                    assert train_vars, 'ERR: there are no trainable variables at the graph!'
                # log train_vars
                if self.verb > 0:
                    print('\nNEModel trainable variables:')
                    print(
                        f' ### train_vars: {len(train_vars)} floats: {short_scin(num_var_floats(train_vars))}'
                    )
                    if self.verb > 1: log_vars(train_vars)

                # build gradients for towers
                for ix in range(len(self.gFWD)):
                    tower = self.gFWD[ix]
                    tower['gradients'] = tf.gradients(
                        ys=tower['loss'],
                        xs=train_vars,
                        colocate_gradients_with_ops=not collocate_GWO
                    )  # TF default is False; True tries to colocate gradient calculations with the corresponding forward OPs

                    # log gradients
                    if self.verb > 0:
                        nGrad = len(tower['gradients'])

                        # None_as_gradient case
                        device = 'UNKNOWN'
                        for t in tower['gradients']:
                            if t is not None:
                                device = t.device
                                break

                        print(
                            f' > gradients for {ix} tower got {nGrad} tensors ({device})'
                        )
                        if self.verb > 1:
                            print('NEModel variables and their gradients:')
                            for gix in range(len(tower['gradients'])):
                                grad = tower['gradients'][gix]
                                var = train_vars[gix]
                                print(var, var.device)
                                print(
                                    f' > {grad}'
                                )  # grad as a tensor displays device when printed (unless colocated with OP!)

                self['gradients'] = self.gFWD[0]['gradients']

                # None @gradients check
                none_grads = 0
                for grad in self['gradients']:
                    if grad is None: none_grads += 1
                if none_grads and self.verb > 0:
                    print(
                        f'There are None gradients: {none_grads}/{len(self["gradients"])}, some trainVars may be unrelated to loss, please check the graph!'
                    )

                # average gradients
                if len(devices) > 1:

                    if self.verb > 0:
                        print(
                            f'\nNEModel builds gradients averaging graph with device {devices[0]} for {len(self.gFWD)} towers'
                        )
                    with tf.device(devices[0]):
                        towerGrads = [
                            tower['gradients'] for tower in self.gFWD
                        ]
                        avgGrads = []
                        for mGrads in zip(*towerGrads):
                            grads = []
                            for grad in mGrads:
                                if grad is not None:  # None for variables not used while training now...
                                    expandedG = tf.expand_dims(input=grad,
                                                               axis=-1)
                                    grads.append(expandedG)
                            if grads:
                                grad = tf.concat(values=grads, axis=-1)
                                grad = tf.reduce_mean(input_tensor=grad,
                                                      axis=-1)
                                avgGrads.append(grad)
                            else:
                                avgGrads.append(None)

                        self[
                            'gradients'] = avgGrads  # update with averaged gradients
                        if self.verb > 0:
                            print(
                                f' > NEModel averaged gradients ({self["gradients"][0].device})'
                            )

                # build OPT graph
                with tf.variable_scope('OPT', reuse=tf.AUTO_REUSE):

                    if self.verb > 0:
                        print(
                            f'\nBuilding OPT graph for {self["name"]} model @device: {devices[0]}'
                        )
                    with tf.device(devices[0]):

                        self['g_step'] = tf.get_variable(  # global step
                            name='g_step',
                            shape=[],
                            trainable=False,
                            initializer=tf.constant_initializer(0),
                            dtype=tf.int32)

                        self['iLR_var'] = tf.get_variable(  # base LR variable
                            name='iLR',
                            shape=[],
                            trainable=False,
                            initializer=tf.constant_initializer(self['iLR']),
                            dtype=tf.float32)

                        self['scaled_LR'] = lr_scaler(
                            iLR=self['iLR_var'],
                            g_step=self['g_step'],
                            warm_up=self['warm_up'],
                            ann_base=self['ann_base'],
                            ann_step=self['ann_step'],
                            n_wup_off=self['n_wup_off'],
                            verb=self.verb)['scaled_LR']

                        # updates with: optimizer, gg_norm, avt_gg_norm
                        self.update(
                            gc_loss_reductor(optimizer=self['opt_class'](
                                learning_rate=self['scaled_LR']),
                                             vars=train_vars,
                                             g_step=self['g_step'],
                                             gradients=self['gradients'],
                                             avt_SVal=self['avt_SVal'],
                                             avt_window=self['avt_window'],
                                             avt_max_upd=self['avt_max_upd'],
                                             do_clip=self['do_clip'],
                                             verb=self.verb))

                        # select OPT vars
                        saver_vars['opt_vars'] = tf.global_variables(
                            scope=tf.get_variable_scope().name)
                        if self.verb > 0:
                            print(
                                f' ### opt_vars: {len(saver_vars["opt_vars"])} floats: {short_scin(num_var_floats(saver_vars["opt_vars"]))} ({saver_vars["opt_vars"][0].device})'
                            )
                            if self.verb > 1: log_vars(saver_vars['opt_vars'])

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(graph=self.graph, config=config)

        # remove keys with no variables (corner case, for proper saver)
        sKeys = list(saver_vars.keys())
        for key in sKeys:
            if not saver_vars[key]: saver_vars.pop(key)
        # TODO: saver_vars, savers_names, load_saver - need a little refactor!!!
        # add saver and load
        self.__saver = MultiSaver(model_name=self['name'],
                                  vars=saver_vars,
                                  save_TFD=save_TFD,
                                  savers=savers_names,
                                  session=self.session,
                                  verb=self.verb)
        if load_saver:
            if type(load_saver) is bool: load_saver = None
            self.__saver.load(saver=load_saver)
            self.update_LR(self['iLR'])  # safety update of iLR

        self.__summ_writer = tf.summary.FileWriter(
            logdir=self.model_dir,
            #graph=          self.graph, # you can call add_graph() later
            flush_secs=10) if not self.readonly else None

        if self.verb > 0: print(f'{self["name"]} (NEModel) build finished!')
        if self.verb > 2: print(self)
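
A minimal usage sketch for the constructor above (it belongs to the NEModel class; assumes its module-level dependencies such as GRAPH_FUNC, DNA, SAVE_TFD, ParaDict, MultiSaver and tf_devices come with the package; my_fwd and its parameter names are purely illustrative):

import tensorflow as tf

def my_fwd(name='my_model', in_width=8, seed=12321, **kwargs):
    # forward graph: placeholders >> loss, returned as a dict
    x = tf.placeholder(tf.float32, [None, in_width], name='x')
    y = tf.placeholder(tf.float32, [None, 1], name='y')
    pred = tf.layers.dense(x, 1)
    loss = tf.reduce_mean(tf.square(pred - y))
    return {'x_PH': x, 'y_PH': y, 'loss': loss}

model = NEModel(fwd_func=my_fwd,
                mdict={'name': 'my_model', 'in_width': 8},
                devices=-1,            # resolved by tf_devices()
                do_optimization=True,  # builds the OPT graph since my_fwd returns 'loss'
                verb=1)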