Example #1
def lay_conv2D(
        input,
        name='conv2d',
        kernels=(3, 5, 7),  # layer kernels
        filters=(36, 12, 6),  # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        useBias=True,
        gatedLU=False,  # Gated Linear Unit architecture
        initializer=None,
        seed=12321,
        verbLev=0):

    if initializer is None: initializer = my_initializer(seed)
    with tf.variable_scope(name):
        variables = []
        subOutList = []
        if type(kernels) is not tuple: kernels = (kernels, )
        if verbLev > 0:
            print(' > %s: kernels %s, filters %s, dilation %s' %
                  (name, kernels, filters, dilation))
        for k in range(len(kernels)):
            with tf.variable_scope('kernel_%d' % k):
                subKernel = kernels[k]
                if type(filters) is not tuple:
                    subFilters = filters // len(kernels)
                else:
                    subFilters = filters[k]
                if gatedLU: subFilters *= 2

                convLay = tf.layers.Conv2D(filters=subFilters,
                                           kernel_size=subKernel,
                                           dilation_rate=dilation,
                                           activation=None,
                                           use_bias=useBias,
                                           kernel_initializer=initializer,
                                           padding='valid',
                                           data_format='channels_last')
                subOutput = convLay(input)
                for var in convLay.variables:
                    variables.append(var)

                if verbLev > 1:
                    print(' >> subConv: filters %s, kernel %s' %
                          (subFilters, subKernel))
                subOutList.append(subOutput)

        output = tf.concat(subOutList, axis=-1)
        if gatedLU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        else:
            if activation: output = activation(output)

        variables = flatten_LOTens(variables)

    return output, variables
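A minimal usage sketch for lay_conv2D, assuming TensorFlow 1.x graph mode and that the module helpers above (my_initializer, flatten_LOTens) are importable; the placeholder shape and names are illustrative, not from the source. Since padding is fixed to 'valid', branches with different kernel sizes produce different spatial shapes, so the sketch uses a single kernel.

import tensorflow as tf

# hypothetical NHWC image batch
images = tf.placeholder(tf.float32, shape=[None, 32, 32, 3], name='images')
feat_map, conv_vars = lay_conv2D(
    input=images,
    name='conv2d_example',
    kernels=3,              # single kernel avoids concat of mismatched 'valid' shapes
    filters=16,
    activation=tf.nn.relu,
    verbLev=1)
# feat_map: [None, 30, 30, 16]; conv_vars: flat list of the layer variables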
Example #2
def lay_conv1D(
        input,
        name='conv1D',
        kernels=(3, 5, 7),  # layer kernels
        filters=(36, 12, 6),  # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        use_bias=True,
        gated_LU=False,  # Gated Linear Unit architecture
        initializer=None,
        padding='valid',  # 'same' adds padding, 'valid' does not
        seed=12321,
        verb=0):

    if initializer is None: initializer = my_initializer(seed)
    with tf.variable_scope(name):
        sub_out_list = []
        if type(kernels) is not tuple: kernels = (kernels, )
        if verb > 1:
            print(' > %s: kernels %s, filters %s, dilation %s' %
                  (name, kernels, filters, dilation))
        for k in range(len(kernels)):
            with tf.variable_scope('kernel_%d' % k):
                sub_kernel = kernels[k]
                if type(filters) is not tuple:
                    sub_filters = filters // len(kernels)
                else:
                    sub_filters = filters[k]
                if gated_LU: sub_filters *= 2

                conv_lay = tf.layers.Conv1D(filters=sub_filters,
                                            kernel_size=sub_kernel,
                                            dilation_rate=dilation,
                                            activation=None,
                                            use_bias=use_bias,
                                            kernel_initializer=initializer,
                                            padding=padding,
                                            data_format='channels_last')
                sub_output = conv_lay(input)

                if verb > 1:
                    print(' >> sub_conv: filters %s, kernel %s' %
                          (sub_filters, sub_kernel))
                sub_out_list.append(sub_output)

        output = tf.concat(sub_out_list, axis=-1)
        if gated_LU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        elif activation:
            output = activation(output)

    return output
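A hedged usage sketch for lay_conv1D under TF1.x; tensor shapes and names below are assumptions. With padding='same' every kernel branch keeps the sequence length, so the branch outputs can be concatenated along the feature axis.

import tensorflow as tf

# hypothetical embedding sequence [batch, seq, feats]
seq = tf.placeholder(tf.float32, shape=[None, None, 64], name='seq')
conv_out = lay_conv1D(
    input=seq,
    kernels=(3, 5, 7),
    filters=(32, 16, 16),   # tuple of len(kernels)
    activation=tf.nn.relu,
    padding='same',         # keeps seq length equal across kernel sizes
    verb=1)
# conv_out: [batch, seq, 32 + 16 + 16]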
Example #3
def lay_res(
    lay_in,  # layer input
    lay_out,  # layer output
    name='residual',
    use_RCW=False,  # use residual connection weights
    use_PWRCW=False,  # pointwise weights
    match_dims=True):  # concatenates zeros to input when thinner

    # TODO: not working for higher dim tensors
    with tf.variable_scope(name):

        output = lay_out
        iW = int(lay_in.shape[-1])
        oW = int(output.shape[-1])
        matchedDims = iW == oW

        # pad input with zeros to match dimension of output
        if iW < oW and match_dims:
            lay_in = tf.pad(tensor=lay_in,
                            paddings=tf.constant([[0, 0], [0, oW - iW]]))
            matchedDims = True

        if matchedDims:
            if use_RCW:
                if use_PWRCW: shape = [oW]
                else: shape = []

                convRCW = tf.get_variable(
                    name='rcw',
                    shape=shape,
                    initializer=tf.constant_initializer(0))

                output = lay_in * (
                    1 - tf.sigmoid(convRCW)) + output * tf.sigmoid(convRCW)
            else:
                output = lay_in + output

    return output
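A short sketch of lay_res; per the TODO above, the zero-padding path assumes rank-2 tensors ([batch, feats]). Shapes here are illustrative only.

import tensorflow as tf

lay_in = tf.placeholder(tf.float32, shape=[None, 64])    # thinner input
lay_out = tf.placeholder(tf.float32, shape=[None, 128])  # layer output
res = lay_res(lay_in, lay_out, use_RCW=True, use_PWRCW=True)
# lay_in is zero-padded to width 128, then mixed with lay_out via sigmoid(rcw)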
Example #4
def enc_DRT(
        input,
        name='enc_DRT',
        shared_lays: bool = False,  # shared variables in enc_layers
        n_layers=12,
        lay_width: int = None,  # for None matches input width
        dns_scale=6,  # scale(*) of first dense
        activation=tf.nn.relu,  # gelu is really worth a try
        dropout=0.0,  # dropout after two denses
        training_flag=None,  # training flag tensor (for dropout)
        initializer=None,
        seed=12321,
        n_hist=4,  # number of histogram layers (for TB)
        verb=0):

    lay_width_matched = ''
    if lay_width is None:
        lay_width = input.shape.as_list()[-1]
        lay_width_matched = '(lay_width taken from input width)'
    if verb > 0:
        drp = 0.0 if not dropout else dropout
        print(
            f'\nBuilding DRTencoder ({n_layers}x{lay_width} drop:{drp:.2f}) {lay_width_matched}...'
        )

    if initializer is None: initializer = my_initializer(seed)

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(' > histogram layers of DRTencoder:', hist_layers)

    zsL = []  # zeroes list
    with tf.variable_scope(name):

        # input projection
        iW = input.shape[-1]
        if iW != lay_width:
            input = lay_dense(input=input,
                              units=lay_width,
                              use_bias=False,
                              initializer=initializer,
                              seed=seed)
            if verb > 0:
                print('projected input to layWidth(%d) since it differs(%d)' %
                      (lay_width, iW))

        input = tf.keras.layers.LayerNormalization(axis=-1)(input)  # input layer_norm

        output = input  # for 0 layers case
        for nL in range(n_layers):

            lay_name = f'DRLay_{nL}' if not shared_lays else 'DRLay_shared'
            lay_out = lay_DRT(input=output,
                              name=lay_name,
                              hist_name=name,
                              dns_scale=dns_scale,
                              activation=activation,
                              dropout=dropout,
                              training_flag=training_flag,
                              initializer=initializer,
                              seed=seed)

            output = lay_out['output']
            if nL in hist_layers: hist_summ.append(lay_out['hist_summ'])
            zsL += lay_out['zeroes']

    return {'output': output, 'hist_summ': hist_summ, 'zeroes': zsL}
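A hedged example of wiring enc_DRT into a graph, assuming the module helpers it relies on (lay_dense, lay_DRT, my_initializer, list_of_layers) are available; the width and flag below are assumptions.

import tensorflow as tf

feats = tf.placeholder(tf.float32, shape=[None, 256])
train_flag = tf.placeholder_with_default(False, shape=[], name='train_flag')
drt = enc_DRT(
    input=feats,
    n_layers=4,
    dns_scale=4,
    dropout=0.1,
    training_flag=train_flag,
    verb=1)
encoded = drt['output']            # [None, 256]
hist_summaries = drt['hist_summ']  # histograms of the selected layers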
Example #5
def enc_TNS(
        in_seq,  # input sequence embeddings [batch, seq, emb], for TAT in_seq should be LNormalized
        name='enc_TNS',
        seq_out: bool = True,  # transformer seq2seq, if False seq2one (Task Attention Transformer)
        add_PE: bool = True,  # add positional embeddings
        do_LN: bool = True,  # do layer norm
        shared_lays: bool = False,  # shared variables in blocks
        n_blocks=12,
        n_heads=8,
        dense_mul: int or float = 4,  # dense (after att) scale
        activation=tf.nn.relu,
        max_seq_len=100,  # used only to set shape (axis 0) of positional embeddings
        dropout=0.0,  # dropout of FC after attention
        dropout_att=0.0,  # dropout of attention probabilities
        training_flag: tf.Tensor or bool = None,  # dropout training flag (bool or tensor)
        initializer=None,
        seed=12321,
        n_hist=4,  # number of histogram layers
        verb=0):

    if initializer is None:
        initializer = tf.truncated_normal_initializer(stddev=0.01, seed=seed)

    # split feats(-1) for heads
    def split_heads(x):
        x = tf.split(x, n_heads, axis=-1)  # list of tensors
        return tf.stack(x, axis=-3)  # [batch, head, seq, feats]

    # merge heads over feats(-1)
    def merge_heads(x):
        x = tf.unstack(x, axis=-3)
        return tf.concat(x, axis=-1)

    # multi_head_attention for input
    def mh_attn(
            in_seq,  # input sequence [batch, seq, feats]
            query=None,  # None for self attention, otherwise TAT [batch, n_queries, feats]
            activation=None,  # activation of KQV dense
            dropout_att=0.0,
            drop_flag=None,
            seed=seed):

        # input projection of in_seq for KQV or KV(if query)
        width = in_seq.shape[-1].value
        proj_size = 3 if query is None else 2
        c = lay_dense(
            input=in_seq,  # [batch, seq, feats]
            units=width * proj_size,
            name='mhProj',
            activation=activation,
            initializer=initializer,
            seed=seed)
        ins_split = tf.split(c, proj_size, axis=-1)  # split projected

        if query is not None:
            q = query  # projection for Q is not needed (at least with 1 head)
            k, v = ins_split
        else:
            q, k, v = ins_split
        q, k, v = map(split_heads, [q, k, v])

        # attention
        att_out = attn(q, k, v, dropout_att, drop_flag, seed)
        a = att_out['attention']
        a = merge_heads(a)
        return {'attention': a, 'att_vals': att_out['att_weights']}

    # transformer block
    def tblock(in_seq, seed, task_query=None):

        hist_summ = []

        output = in_seq
        taskQueryNorm = None
        if task_query is None:
            hist_summ.append(
                tf.summary.histogram('a_inputSeq', output, family=name))
            # layer norm 1 on seq
            if do_LN:
                output = tf.keras.layers.LayerNormalization(axis=-1)(output)
                hist_summ.append(
                    tf.summary.histogram('b_inputSeqLN', output, family=name))
        else:
            hist_summ.append(
                tf.summary.histogram('a_inTaskQuery', task_query, family=name))
            taskQueryNorm = task_query
            # layer norm 1 on taskQuery
            if do_LN:
                taskQueryNorm = tf.keras.layers.LayerNormalization(
                    axis=-1)(task_query)
                hist_summ.append(
                    tf.summary.histogram('b_taskQueryLN',
                                         taskQueryNorm,
                                         family=name))

        # multi head self attention
        mha_out = mh_attn(in_seq=output,
                          query=taskQueryNorm,
                          dropout_att=dropout_att,
                          drop_flag=training_flag,
                          seed=seed)
        output = mha_out['attention']
        att_vals = mha_out['att_vals']
        hist_summ.append(tf.summary.histogram('c_mhAttn', output, family=name))

        # dense without activation
        output = lay_dense(input=output,
                           units=output.shape[-1].value,
                           name='afterAttProj',
                           initializer=initializer,
                           seed=seed)
        hist_summ.append(
            tf.summary.histogram('d_denseAftAtt', output, family=name))

        if dropout:
            output = tf.layers.dropout(inputs=output,
                                       rate=dropout,
                                       training=training_flag,
                                       seed=seed)

        # residual 1
        if task_query is None:
            res1_out = in_seq + output
            hist_summ.append(
                tf.summary.histogram('e_res_onInputSeq', res1_out,
                                     family=name))
        else:
            res1_out = task_query + output
            hist_summ.append(
                tf.summary.histogram('e_res_onTaskQuery',
                                     res1_out,
                                     family=name))

        output = res1_out
        # layer norm 2
        if do_LN:
            output = tf.keras.layers.LayerNormalization(axis=-1)(output)
            hist_summ.append(
                tf.summary.histogram('f_layNorm', output, family=name))

        # 2x dense
        base_width = output.shape[-1].value
        output = lay_dense(input=output,
                           units=int(base_width * dense_mul),
                           name='dense1afterAtt',
                           activation=activation,
                           initializer=initializer,
                           seed=seed)
        zsL = [zeroes(output)]
        hist_summ.append(
            tf.summary.histogram('g_1denseOut', output, family=name))
        output = lay_dense(input=output,
                           units=base_width,
                           name='dense2afterAtt',
                           initializer=initializer,
                           seed=seed)
        hist_summ.append(
            tf.summary.histogram('h_2denseOut', output, family=name))

        if dropout:
            output = tf.layers.dropout(inputs=output,
                                       rate=dropout,
                                       training=training_flag,
                                       seed=seed)

        # residual2
        output += res1_out
        hist_summ.append(tf.summary.histogram('i_res', output, family=name))

        return {
            'output': output,
            'hist_summ': hist_summ,
            'att_vals': att_vals,
            'zeroes': zsL
        }

    width = in_seq.shape[-1]  # sequence width (feats)
    seq_len = tf.shape(in_seq)[-2]  # sequence length (time)

    if verb > 0:
        print('\nBuilding %s (transformer encoder) (%dx%d, denseMul %.1f), ' %
              (name, n_blocks, width, dense_mul))
        print(' > dropout: %.2f %.2f(att)' % (dropout, dropout_att))
        print(' > seq2seq mode...' if seq_out else ' > task attention mode...')

    hist_layers = list_of_layers(n_blocks, n_select=n_hist)
    if verb > 1:
        print(' > histogram layers of transformer encoder:', hist_layers)

    with tf.variable_scope(name):

        hist_summ = []  # list of histogram summaries

        if verb > 1: print(' > transformer input', in_seq)
        hist_summ.append(
            tf.summary.histogram('a_transformerInput', in_seq, family=name))

        # init task_query (for first block - input averaged over time (seq))
        task_query = None
        if not seq_out:
            task_query = tf.reduce_mean(in_seq, axis=-2,
                                        keep_dims=True)  # [batch,1,feats]
            if verb > 1:
                print(' > first task_query (reduced input) for TAT',
                      task_query)

        # positional embedding
        if add_PE:
            pos_emb_var = tf.get_variable(name='tnsPosEmb',
                                          shape=[max_seq_len, width],
                                          initializer=initializer)
            in_seq += tf.nn.embedding_lookup(params=pos_emb_var,
                                             ids=tf.range(seq_len))
            if verb > 1: print(' > added positional embedding to the input...')
            hist_summ.append(
                tf.summary.histogram('b_transformerPosEmbInput',
                                     in_seq,
                                     family=name))

        if verb > 1:
            print(' > building %d blocks of transformer...' % n_blocks)
        att_vals = []  # list of block attention values
        zsL = []
        block_output = None
        for nB in range(n_blocks):
            hist_lay = nB in hist_layers
            lay_name = f'block_{nB}' if not shared_lays else 'block_shared'
            with tf.variable_scope(lay_name, reuse=tf.AUTO_REUSE):
                bo_dict = tblock(in_seq=in_seq,
                                 seed=seed,
                                 task_query=task_query)
                block_output = bo_dict['output']
                if task_query is None: in_seq = block_output
                else: task_query = block_output

                zsL += bo_dict['zeroes']
                if hist_lay: hist_summ += bo_dict['hist_summ']
                att_block_vals = bo_dict[
                    'att_vals']  #[batch,head,query_n or seq,seq]
                att_vals.append(att_block_vals)

        if task_query is None: output = block_output
        else: output = tf.squeeze(task_query, axis=-2)

        if do_LN: output = tf.keras.layers.LayerNormalization(axis=-1)(output)

        hist_summ.append(
            tf.summary.histogram('c_transformer_out', output, family=name))

    if verb > 1: print(' > %s output' % name, output)
    return {
        'output': output,
        'hist_summ': hist_summ,
        'att_vals': att_vals,
        'zeroes': zsL
    }
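A minimal sketch of enc_TNS in seq2seq mode under TF1.x, assuming the module helpers it calls (attn, lay_dense, zeroes, list_of_layers) are importable; the embedding width must be divisible by n_heads, and all shapes and counts below are assumptions, not values from the source.

import tensorflow as tf

emb_seq = tf.placeholder(tf.float32, shape=[None, None, 128])  # [batch, seq, emb]
tns = enc_TNS(
    in_seq=emb_seq,
    n_blocks=2,
    n_heads=8,          # 128 feats / 8 heads = 16 feats per head
    max_seq_len=100,    # positional embedding table size, must cover the real seq len
    dropout=0.1,
    training_flag=tf.placeholder_with_default(False, shape=[]),
    verb=1)
seq_encoded = tns['output']   # [batch, seq, 128], layer-normalized
att_maps = tns['att_vals']    # per-block attention weights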
Example #6
def lay_DRT(
        input,
        name='lay_DRT',  # scope name, be careful when stacked since auto_reuse
        hist_name=None,  # family name of histogram
        dns_scale=4,
        activation=tf.nn.relu,  # gelu is really worth a try
        dropout=None,  # dropout (after two denses)
        training_flag=None,  # training flag tensor (for dropout)
        initializer=None,
        seed=12321):

    if not hist_name: hist_name = name
    lay_width = input.shape[-1]
    if initializer is None: initializer = my_initializer(seed)
    hist_summ = []

    with tf.variable_scope(name_or_scope=name, reuse=tf.AUTO_REUSE):

        hist_summ.append(
            tf.summary.histogram('a_denseSin', input, family=hist_name))

        # dense (scale up)
        output = lay_dense(input=input,
                           units=int(lay_width * dns_scale),
                           activation=None,
                           use_bias=True,
                           initializer=initializer,
                           seed=seed,
                           name='denseS')
        hist_summ.append(
            tf.summary.histogram('b_denseSout', output, family=hist_name))

        # activation
        output = activation(output)
        zsL = [zeroes(output)]  # zeroes list
        hist_summ.append(
            tf.summary.histogram('c_activation', output, family=hist_name))

        # dense (scale down) no activ
        output = lay_dense(input=output,
                           units=lay_width,
                           name='DRTdenseNA',
                           use_bias=True,
                           initializer=initializer,
                           seed=seed)
        hist_summ.append(
            tf.summary.histogram('d_denseNAout', output, family=hist_name))

        # layer dropout
        if dropout:
            output = tf.layers.dropout(inputs=output,
                                       rate=dropout,
                                       training=training_flag,
                                       seed=seed)

        # residual
        output = lay_res(input, output)
        hist_summ.append(
            tf.summary.histogram('e_residual', output, family=hist_name))

        # layer_norm
        output = tf.keras.layers.LayerNormalization(axis=-1)(output)
        hist_summ.append(
            tf.summary.histogram('f_LAYout', output, family=hist_name))

    return {'output': output, 'hist_summ': hist_summ, 'zeroes': zsL}
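A hedged standalone call to lay_DRT, assuming lay_dense, lay_res, zeroes and my_initializer from this module are available; the width below is illustrative.

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 128])
drt_lay = lay_DRT(
    input=x,
    name='drt_example',
    dns_scale=4,            # inner dense widens to 128 * 4 = 512
    dropout=0.1,
    training_flag=tf.placeholder_with_default(False, shape=[]))
y = drt_lay['output']       # [None, 128], residual + layer norm applied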
Example #7
def enc_CNN(
        input: tf.Tensor,
        history: tf.Tensor = None,  # optional history(state) tensor with shape [bsz, n_layers, kernel-1, n_filters], >> masked cnn
        name='enc_CNN',
        # layer params
        shared_lays: bool = False,  # shared variables in enc_layers
        n_layers: int = 12,  # num of layers
        kernel: int = 3,  # layer kernel
        n_filters: int = 128,  # num of filters
        activation=tf.nn.relu,  # global enc activation func, gelu is really worth a try
        lay_drop: float or None = 0.0,
        ldrt_scale: int or None = 0,  # DRT @enc_lay - scale(*) of first dense, for None or 0 DRT @lay won't be built
        ldrt_drop: float or None = 0.0,  # DRT @enc_lay - dropout
        # other
        training_flag: tf.Tensor or bool = None,  # dropout training flag tensor
        initializer=None,
        seed: int = 12321,
        n_hist: int = 4,  # number of histogram layers
        verb=0):

    if verb > 0:
        print(
            f'\n *** enc_CNN *** Building {name} ({n_layers}x{n_filters})...')

    if initializer is None: initializer = my_initializer(seed)

    # manage history
    history_lays = None
    if history is not None:
        history_lays = tf.unstack(history, axis=-3)
        if verb > 1:
            print(
                f' > state_lays len {len(history_lays)} of: {history_lays[0]}')

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(f' > histogram layers of cnn encoder: {hist_layers}')

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        input_lays = []  # here we will store inputs of the following layers to extract the state (history)
        zsL = []  # zeroes

        # input projection - to match n_filters and input width
        if verb > 1: print(f' > encoder input: {input}')
        if input.shape[-1] != n_filters:
            input = lay_dense(input=input,
                              units=n_filters,
                              name='enc_input_projection',
                              initializer=initializer)
            if verb > 1: print(f' > encoder projected input: {input}')

        output = input  # for 0 layers case
        sub_output = input  # first input
        for depth in range(n_layers):

            lay_name = f'enc_CNN_lay_{depth}' if not shared_lays else 'enc_CNN_lay_shared'
            if verb > 1: print(f'<< layer {lay_name}:')

            lay_input = tf.concat([history_lays[depth], sub_output],
                                  axis=-2) if history_lays else sub_output
            if verb > 1:
                print(f' > sub_output (previous): {sub_output}')
                print(f' > lay_input (eventually padded): {lay_input}')
            input_lays.append(lay_input)

            hist_lay = depth in hist_layers

            with tf.variable_scope(lay_name):

                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('a_lay_in',
                                             lay_input,
                                             family=name))

                # LN
                lay_input = tf.keras.layers.LayerNormalization(
                    axis=-1)(lay_input)
                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('b_LN', lay_input, family=name))

                # conv no activation
                output = lay_conv1D(
                    input=lay_input,
                    name='conv1D',
                    kernels=kernel,
                    filters=n_filters,
                    activation=None,
                    initializer=initializer,
                    padding='same' if history is None else 'valid',
                    seed=seed,
                    verb=0)
                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('c_cnn', output, family=name))

                # activation
                if activation:
                    output = activation(output)
                    zsL += [zeroes(output)]  # catch zeroes
                    if hist_lay:
                        hist_summ.append(
                            tf.summary.histogram('d_activation',
                                                 output,
                                                 family=name))

                # dropout
                if lay_drop:
                    output = tf.layers.dropout(inputs=output,
                                               rate=lay_drop,
                                               training=training_flag,
                                               seed=seed)
                    if hist_lay:
                        hist_summ.append(
                            tf.summary.histogram('e_drop', output,
                                                 family=name))

                # RES, here we take sub_output, since lay_input may be padded by history
                output += sub_output
                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('f_residual', output,
                                             family=name))

                if verb > 1: print(f' > output (layer): {output}')

                if ldrt_scale:
                    lay_out = lay_DRT(input=output,
                                      name=lay_name + '_lay_DRT',
                                      hist_name=name,
                                      dns_scale=ldrt_scale,
                                      activation=activation,
                                      dropout=ldrt_drop,
                                      training_flag=training_flag,
                                      initializer=initializer,
                                      seed=seed)
                    output = lay_out['output']
                    zsL += lay_out['zeroes']
                    if hist_lay: hist_summ.append(lay_out['hist_summ'])

                sub_output = output

    output = tf.keras.layers.LayerNormalization(axis=-1)(output)  # final LN

    # prepare fin_state
    fin_state = None
    if history is not None:
        state = tf.stack(input_lays, axis=-3)
        if verb > 1: print(f' > state (stacked): {state}')
        fin_state = tf.split(state,
                             num_or_size_splits=[-1, kernel - 1],
                             axis=-2)[1]
        if verb > 1: print(f' > fin_state (split): {fin_state}')

    if verb > 1: print(f' > {name} output: {output}')
    return {
        'output': output,
        'state': fin_state,  # history for next
        'hist_summ': hist_summ,
        'zeroes': zsL
    }
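A short usage sketch for enc_CNN without a history tensor (the masked-CNN/state path is skipped when history is None); shapes and the layer count below are assumptions.

import tensorflow as tf

seq = tf.placeholder(tf.float32, shape=[None, None, 96])  # [batch, seq, feats]
cnn = enc_CNN(
    input=seq,
    n_layers=4,
    kernel=3,
    n_filters=64,     # input is projected from 96 to 64 before the first layer
    lay_drop=0.1,
    training_flag=tf.placeholder_with_default(False, shape=[]),
    verb=1)
cnn_out = cnn['output']   # [batch, seq, 64]
state = cnn['state']      # None, since no history was given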
Example #8
def mrg_ckpts(
        ckptA: str,  # checkpoint A (folder name)
        ckptA_FD: str,  # root folder of ckptA (absolute or relative)
        ckptB: str or None,  # checkpoint B (folder name), for None takes 100% ckptA
        ckptB_FD: str or None,  # root folder of ckptB (absolute or relative)
        ckptM: str,  # checkpoint merged (folder name)
        ckptM_FD: str,  # root folder of ckptM (absolute or relative)
        mrgF: float = 0.5,  # merge factor (weight)
        noiseF: float = 0.0,  # noise factor, amount of noise added to new value (0.0-1.0...)
        replace_scope: str = None,  # replaces outer scope with given string
        verb=0):

    if ckptA_FD[-1] != '/': ckptA_FD += '/'
    if ckptB_FD and ckptB_FD[-1] != '/': ckptB_FD += '/'
    if ckptM_FD[-1] != '/': ckptM_FD += '/'

    var_namesA = sorted(
        [v[0] for v in tf.train.list_variables(ckptA_FD + ckptA)])
    if verb > 0:
        print(f'variables from ckptA ({len(var_namesA):4d}): {var_namesA}')
    var_namesB = sorted(
        [v[0]
         for v in tf.train.list_variables(ckptB_FD + ckptB)]) if ckptB else []
    if verb > 0:
        print(f'variables from ckptB ({len(var_namesB):4d}): {var_namesB}')

    oscope_len = 0
    if replace_scope:
        for c in var_namesA[0]:
            if c == '/': break
            oscope_len += 1
    if verb > 0:
        print(f'oscope_len {oscope_len}')
        if oscope_len:
            print(
                f' > will replace {var_namesA[0][:oscope_len]} with {replace_scope}'
            )

    avL = []
    with tf.variable_scope('av'):
        for var_name in var_namesA:
            var = tf.train.load_variable(f'{ckptA_FD}{ckptA}', var_name)
            avL.append(tf.Variable(var, name=var_name))

    bvL = []
    if ckptB:
        with tf.variable_scope('bv'):
            for var_name in var_namesB:
                var = tf.train.load_variable(f'{ckptB_FD}{ckptB}', var_name)
                bvL.append(tf.Variable(var, name=var_name))

    cvL = []
    for ix in range(len(var_namesA)):
        var_name = var_namesA[ix]
        if verb > 0: print(f'old var_name: {var_name}')
        if replace_scope: var_name = replace_scope + var_name[oscope_len:]

        varA = avL[ix]
        if bvL and varA.dtype == 'float32':
            varB = bvL[ix]
            noise = tf.random.truncated_normal(  # random values from a normal distribution truncated by 2stddev
                shape=varA.shape,
                stddev=tf.math.reduce_std(varA))  # stddev of varA
            var = tf.Variable(mrgF * varA + (1 - mrgF) * varB + noiseF * noise,
                              name=var_name)
        else:
            var = tf.Variable(varA, name=var_name)
        cvL.append(var)

    # save
    if verb > 0: print('\nWriting checkpoint... ', end='')
    child_saver = tf.train.Saver(cvL)
    #config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    with tf.Session(
            #config=config
    ) as sess:
        sess.run(tf.global_variables_initializer())
        child_saver.save(sess,
                         f'{ckptM_FD}{ckptM}/{ckptM}',
                         write_meta_graph=False)
    tf.reset_default_graph()
    if verb > 0: print('done!')
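A hedged call to mrg_ckpts averaging two checkpoints 50/50; the folder and checkpoint names are hypothetical, and both checkpoints must share the same variable structure.

# merges _models/model_A and _models/model_B into _models/model_avg
mrg_ckpts(
    ckptA='model_A', ckptA_FD='_models',
    ckptB='model_B', ckptB_FD='_models',
    ckptM='model_avg', ckptM_FD='_models',
    mrgF=0.5,      # 50% A + 50% B
    noiseF=0.0,    # no extra noise
    verb=1)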
Example #9
    def __init__(
        self,
        fwd_func: GRAPH_FUNC,  # function building forward graph (from PH to loss)
        mdict: DNA,  # model(graph) parameters dictionary
        devices=-1,  # check neuralmess.dev_manager.ft_devices for details
        do_optimization: bool = True,  # add optimization part to the graph (for training)
        # values below complement mdict
        name='NEM',
        name_timestamp=False,  # adds timestamp to name
        seed=12321,
        opt_class=tf.train.AdamOptimizer,  # default optimizer, other examples: tf.train.GradientDescentOptimizer, partial(tf.train.AdamOptimizer, beta1=0.7, beta2=0.7)
        iLR=1e-3,
        warm_up=None,
        ann_base=None,
        ann_step=1,
        n_wup_off: float = 1,
        avt_SVal=1,
        avt_window=100,
        avt_max_upd=1.5,
        do_clip=False,
        # save
        read_only=False,  # sets model to be read only (..still may log)
        save_TFD: str = SAVE_TFD,  # top folder of model_FD
        savers_names: tuple = (None,),  # names of savers for MultiSaver  # TODO: what is this default value for?
        load_saver: bool or str = True,  # for None does not load, for True loads default
        do_logfile=True,  # enables saving log file in save_TFD
        # GPU management
        sep_device=True,  # separate first device for variables, gradients_avg, optimizer (otherwise those are placed on the first FWD calculations tower)
        collocate_GWO=False,  # collocates gradient calculations with tf.OPs (gradients are calculated on every tower with its operations, but remember that vars are on one device...) (otherwise with first FWD calculations tower)
        # other
        verb: int = 0
    ):  # verb of NEModel (object/constructor), fwd_func has own verb in mdict

        dict.__init__(self)  # init self as a dict

        self.verb = verb
        if self.verb > 0: print('\n*** NEModel *** initializes...')

        self_args_dict = {  # params dict from NEModel constructor
            'name': name,
            'seed': seed,
            'opt_class': opt_class,
            'iLR': iLR,
            'warm_up': warm_up,
            'ann_base': ann_base,
            'ann_step': ann_step,
            'n_wup_off': n_wup_off,
            'avt_SVal': avt_SVal,
            'avt_window': avt_window,
            'avt_max_upd': avt_max_upd,
            'do_clip': do_clip
        }

        fwdf_mdict = get_defaults(
            function=fwd_func)  # params dict with defaults of fwd_func

        # resolve model name and extend with timestamp when needed
        resolved_name = self_args_dict['name']
        if 'name' in fwdf_mdict: resolved_name = fwdf_mdict['name']
        if 'name' in mdict: resolved_name = mdict['name']
        if name_timestamp: resolved_name += '.' + stamp()
        mdict['name'] = resolved_name
        self.model_dir = f'{save_TFD}/{mdict["name"]}'  # here goes everything from the model
        if self.verb > 0:
            print(f' > NEModel name: {mdict["name"]}')
            print(f' > NEModel dir: {self.model_dir}')

        # build folder-managed dna with empty dna, it gets dna FROM FOLDER
        self.__dna = ParaDict(dna_TFD=save_TFD,
                              dna_SFD=mdict['name'],
                              fn_pfx=NEMODEL_DNA_PFX,
                              verb=self.verb)

        # set logfile
        if do_logfile:
            set_logger(log_folder=self.model_dir,
                       custom_name=mdict['name'],
                       verb=self.verb)

        # resolve model dict (dna) in proper order
        md = {}
        md.update(self_args_dict
                  )  # 1 update with params dict from NEModel constructor
        md.update(fwdf_mdict)  # 2 update with defaults of fwd_func
        md.update(self.__dna)  # 3 update with params from folder
        md.update(mdict)  # 4 update with given mdict
        self.__dna.update(md)
        self.__dna.check_params_sim(SPEC_KEYS)  # safety check
        self.readonly = read_only
        if self.model_dir and not self.readonly: self.__dna.save()

        self.update(
            self.__dna)  # finally update self with all model building params

        devices = tf_devices(devices, verb=self.verb)

        # report devices
        if self.verb > 0:
            print()
            if len(devices) == 1:
                if 'CPU' in devices[0]:
                    print(f'NEModel builds CPU device setup')
                else:
                    print(f'NEModel builds single-GPU setup')
            else:
                print(
                    f'NEModel builds multi-dev setup for {len(devices)} devices'
                )

        if len(devices) < 3:
            sep_device = False  # SEP is available for 3 or more devices

        # build FWD graph(s) >> manage variables >> build OPT graph
        self.gFWD = []  # list of dicts of all FWD graphs (from all devices)
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self['seed'])  # set graph seed
            np.random.seed(self['seed'])
            if self.verb > 0:
                print(f'\nNEModel set TF & NP seed to {self["seed"]}')

            # builds graph @SEP; this graph won't be run, it is only needed to place variables; if not sep_device >> variables will be placed with the first tower
            if sep_device:
                if self.verb > 0:
                    print(
                        f'\nNEModel places {self["name"]} VARs on {devices[0]}...'
                    )
                with tf.device(devices[0]):
                    fwd_func(**self)

            tower_devices = [] + devices
            if sep_device: tower_devices = tower_devices[1:]  # trim SEP
            for dev in tower_devices:
                if self.verb > 0:
                    print(
                        f'\nNEModel builds FWD graph of {self["name"]} model @device: {dev}'
                    )
                with tf.device(dev):
                    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                        self.gFWD.append(fwd_func(**self))

            self.update(self.gFWD[0]
                        )  # update self with dictionary returned by fwd_func

            # get FWD variables returned by fwd_func (4 saver)
            train_vars = []  # variables to train
            saver_vars = {}  # dict of variables to save
            for key in self.keys():
                if 'var' in key.lower():
                    if key == 'train_vars':
                        train_vars = self[key]
                        if type(train_vars) is not list:
                            train_vars = [train_vars]
                    else:
                        if type(self[key]) is not list:
                            saver_vars[key] = [self[key]]
                        else:
                            saver_vars[key] = self[key]

            all_vars = tf.global_variables()

            # if some variables were returned >> assert that all graph variables are present in the returned lists
            if saver_vars:
                all_vars_returned = []
                for key in saver_vars:
                    all_vars_returned += saver_vars[key]
                there_are_all = True
                for var in all_vars:
                    if var not in all_vars_returned:
                        print(
                            f' *** variable {var.name} not returned by fwd_func'
                        )
                        there_are_all = False
                assert there_are_all, 'ERR: there are some variables not returned by fwd_func in lists!'

            else:
                saver_vars['fwd_vars'] = all_vars  # put all

            if self.verb > 0:
                print('\nNEModel variables to save from fwd_func:')
                for key in sorted(list(saver_vars.keys())):
                    varList = saver_vars[key]
                    if varList:
                        print(
                            f' ### vars @{key} - num: {len(varList)}, floats: {short_scin(num_var_floats(varList))} ({varList[0].device})'
                        )
                    else:
                        print(' ### no vars')
                    if self.verb > 1: log_vars(varList)

            if 'loss' not in self:
                do_optimization = False
                if self.verb > 0:
                    print(
                        "\nthere is no loss in the FWD graph, the OPT graph won't be built"
                    )

            if not do_optimization:
                if self.verb > 0: print("\nOPT graph won't be built")
            # build optimization graph
            else:
                if self.verb > 0:
                    print(f'\nPreparing OPT part with {self["opt_class"]}')
                # select trainable variables for OPT
                all_tvars = tf.trainable_variables()
                if train_vars:
                    # check if all train_vars are trainable:
                    for var in train_vars:
                        if var not in all_tvars:
                            if self.verb > 0:
                                print(
                                    f'variable {var.name} is not trainable but is in train_vars, please check the graph!'
                                )
                else:
                    for key in saver_vars:
                        for var in saver_vars[key]:
                            if var in all_tvars:
                                train_vars.append(var)
                    assert train_vars, 'ERR: there are no trainable variables at the graph!'
                # log train_vars
                if self.verb > 0:
                    print('\nNEModel trainable variables:')
                    print(
                        f' ### train_vars: {len(train_vars)} floats: {short_scin(num_var_floats(train_vars))}'
                    )
                    if self.verb > 1: log_vars(train_vars)

                # build gradients for towers
                for ix in range(len(self.gFWD)):
                    tower = self.gFWD[ix]
                    tower['gradients'] = tf.gradients(
                        ys=tower['loss'],
                        xs=train_vars,
                        colocate_gradients_with_ops=not collocate_GWO
                    )  # TF default is False >> calculates gradients where OPS, for True >> where train_vars

                    # log gradients
                    if self.verb > 0:
                        nGrad = len(tower['gradients'])

                        # None_as_gradient case
                        device = 'UNKNOWN'
                        for t in tower['gradients']:
                            if t is not None:
                                device = t.device
                                break

                        print(
                            f' > gradients for {ix} tower got {nGrad} tensors ({device})'
                        )
                        if self.verb > 1:
                            print('NEModel variables and their gradients:')
                            for gix in range(len(tower['gradients'])):
                                grad = tower['gradients'][gix]
                                var = train_vars[gix]
                                print(var, var.device)
                                print(
                                    f' > {grad}'
                                )  # grad as a tensor displays device when printed (unless colocated with OP!)

                self['gradients'] = self.gFWD[0]['gradients']

                # None @gradients check
                none_grads = 0
                for grad in self['gradients']:
                    if grad is None: none_grads += 1
                if none_grads and self.verb > 0:
                    print(
                        f'There are None gradients: {none_grads}/{len(self["gradients"])}, some trainVars may be unrelated to loss, please check the graph!'
                    )

                # average gradients
                if len(devices) > 1:

                    if self.verb > 0:
                        print(
                            f'\nNEModel builds gradients averaging graph with device {devices[0]} for {len(self.gFWD)} towers'
                        )
                    with tf.device(devices[0]):
                        towerGrads = [
                            tower['gradients'] for tower in self.gFWD
                        ]
                        avgGrads = []
                        for mGrads in zip(*towerGrads):
                            grads = []
                            for grad in mGrads:
                                if grad is not None:  # None for variables not used while training now...
                                    expandedG = tf.expand_dims(input=grad,
                                                               axis=-1)
                                    grads.append(expandedG)
                            if grads:
                                grad = tf.concat(values=grads, axis=-1)
                                grad = tf.reduce_mean(input_tensor=grad,
                                                      axis=-1)
                                avgGrads.append(grad)
                            else:
                                avgGrads.append(None)

                        self[
                            'gradients'] = avgGrads  # update with averaged gradients
                        if self.verb > 0:
                            print(
                                f' > NEModel averaged gradients ({self["gradients"][0].device})'
                            )

                # build OPT graph
                with tf.variable_scope('OPT', reuse=tf.AUTO_REUSE):

                    if self.verb > 0:
                        print(
                            f'\nBuilding OPT graph for {self["name"]} model @device: {devices[0]}'
                        )
                    with tf.device(devices[0]):

                        self['g_step'] = tf.get_variable(  # global step
                            name='g_step',
                            shape=[],
                            trainable=False,
                            initializer=tf.constant_initializer(0),
                            dtype=tf.int32)

                        self['iLR_var'] = tf.get_variable(  # base LR variable
                            name='iLR',
                            shape=[],
                            trainable=False,
                            initializer=tf.constant_initializer(self['iLR']),
                            dtype=tf.float32)

                        self['scaled_LR'] = lr_scaler(
                            iLR=self['iLR_var'],
                            g_step=self['g_step'],
                            warm_up=self['warm_up'],
                            ann_base=self['ann_base'],
                            ann_step=self['ann_step'],
                            n_wup_off=self['n_wup_off'],
                            verb=self.verb)['scaled_LR']

                        # updates with: optimizer, gg_norm, avt_gg_norm
                        self.update(
                            gc_loss_reductor(optimizer=self['opt_class'](
                                learning_rate=self['scaled_LR']),
                                             vars=train_vars,
                                             g_step=self['g_step'],
                                             gradients=self['gradients'],
                                             avt_SVal=self['avt_SVal'],
                                             avt_window=self['avt_window'],
                                             avt_max_upd=self['avt_max_upd'],
                                             do_clip=self['do_clip'],
                                             verb=self.verb))

                        # select OPT vars
                        saver_vars['opt_vars'] = tf.global_variables(
                            scope=tf.get_variable_scope().name)
                        if self.verb > 0:
                            print(
                                f' ### opt_vars: {len(saver_vars["opt_vars"])} floats: {short_scin(num_var_floats(saver_vars["opt_vars"]))} ({saver_vars["opt_vars"][0].device})'
                            )
                            if self.verb > 1: log_vars(saver_vars['opt_vars'])

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(graph=self.graph, config=config)

        # remove keys with no variables (corner case, for proper saver)
        sKeys = list(saver_vars.keys())
        for key in sKeys:
            if not saver_vars[key]: saver_vars.pop(key)
        # TODO: saver_vars, savers_names, load_saver - need a little refactor!!!
        # add saver and load
        self.__saver = MultiSaver(model_name=self['name'],
                                  vars=saver_vars,
                                  save_TFD=save_TFD,
                                  savers=savers_names,
                                  session=self.session,
                                  verb=self.verb)
        if load_saver:
            if type(load_saver) is bool: load_saver = None
            self.__saver.load(saver=load_saver)
            self.update_LR(self['iLR'])  # safety update of iLR

        self.__summ_writer = tf.summary.FileWriter(
            logdir=self.model_dir,
            #graph=          self.graph, # you can call add_graph() later
            flush_secs=10) if not self.readonly else None

        if self.verb > 0: print(f'{self["name"]} (NEModel) build finished!')
        if self.verb > 2: print(self)
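A hedged sketch of constructing NEModel; my_fwd_graph below is a hypothetical graph-building function (it must accept the extra keys NEModel passes via **self and return at least a 'loss' tensor), and all names and widths are assumptions, not values from the source.

import tensorflow as tf

def my_fwd_graph(name='my_model', in_width=16, seed=123, **kwargs):
    # hypothetical minimal FWD graph: placeholders >> dense >> loss
    x = tf.placeholder(tf.float32, shape=[None, in_width], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')
    pred = tf.layers.dense(x, 1)
    loss = tf.reduce_mean(tf.square(pred - y))
    return {'x': x, 'y': y, 'loss': loss}

model = NEModel(
    fwd_func=my_fwd_graph,
    mdict={'name': 'my_model', 'in_width': 16},
    devices=-1,       # resolved by tf_devices()
    iLR=1e-3,
    verb=1)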