Example #1
def lay_conv2D(
        input,
        name='conv2d',
        kernels=(3, 5, 7),  # layer kernels
        filters=(36, 12, 6),  # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        useBias=True,
        gatedLU=False,  # Gated Linear Unit architecture
        initializer=None,
        seed=12321,
        verbLev=0):

    if initializer is None: initializer = my_initializer(seed)
    with tf.variable_scope(name):
        variables = []
        subOutList = []
        if type(kernels) is not tuple: kernels = (kernels, )
        if verbLev > 0:
            print(' > %s: kernels %s, filters %s, dilation %s' %
                  (name, kernels, filters, dilation))
        for k in range(len(kernels)):
            with tf.variable_scope('kernel_%d' % k):
                subKernel = kernels[k]
                if type(filters) is not tuple:
                    subFilters = filters // len(kernels)  # integer division, Conv2D expects an int filter count
                else:
                    subFilters = filters[k]
                if gatedLU: subFilters *= 2

                convLay = tf.layers.Conv2D(filters=subFilters,
                                           kernel_size=subKernel,
                                           dilation_rate=dilation,
                                           activation=None,
                                           use_bias=useBias,
                                           kernel_initializer=initializer,
                                           padding='valid',
                                           data_format='channels_last')
                subOutput = convLay(input)
                for var in convLay.variables:
                    variables.append(var)

                if verbLev > 1:
                    print(' >> subConv: filters %s, kernel %s' %
                          (subFilters, subKernel))
                subOutList.append(subOutput)

        output = tf.concat(subOutList, axis=-1)
        if gatedLU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        else:
            if activation: output = activation(output)

        variables = flatten_LOTens(variables)

    return output, variables
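
A minimal usage sketch (hypothetical names; assumes a TensorFlow 1.x graph and that my_initializer is importable from the surrounding module). A single kernel size is used here so the hard-coded 'valid' padding yields a single, consistently shaped sub-output:

import tensorflow as tf

images = tf.placeholder(tf.float32, shape=[None, 32, 32, 3], name='images')  # hypothetical input
conv_out, conv_vars = lay_conv2D(
    input=images,
    name='demo_conv2d',
    kernels=3,              # single kernel -> filters stays a plain int
    filters=12,
    activation=tf.nn.relu,
    verbLev=1)
# conv_out: [batch, 30, 30, 12] ('valid' padding), conv_vars: all layer weights flattened into one 1-D tensor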
Example #2
def lay_conv1D(
        input,
        name='conv1D',
        kernels=(3, 5, 7),  # layer kernels
        filters=(36, 12, 6),  # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        use_bias=True,
        gated_LU=False,  # Gated Linear Unit architecture
        initializer=None,
        padding='valid',  # 'same' adds padding, 'valid' does not
        seed=12321,
        verb=0):

    if initializer is None: initializer = my_initializer(seed)
    with tf.variable_scope(name):
        sub_out_list = []
        if type(kernels) is not tuple: kernels = (kernels, )
        if verb > 1:
            print(' > %s: kernels %s, filters %s, dilation %s' %
                  (name, kernels, filters, dilation))
        for k in range(len(kernels)):
            with tf.variable_scope('kernel_%d' % k):
                sub_kernel = kernels[k]
                if type(filters) is not tuple:
                    sub_filters = filters // len(kernels)
                else:
                    sub_filters = filters[k]
                if gated_LU: sub_filters *= 2

                conv_lay = tf.layers.Conv1D(filters=sub_filters,
                                            kernel_size=sub_kernel,
                                            dilation_rate=dilation,
                                            activation=None,
                                            use_bias=use_bias,
                                            kernel_initializer=initializer,
                                            padding=padding,
                                            data_format='channels_last')
                sub_output = conv_lay(input)

                if verb > 1:
                    print(' >> sub_conv: filters %s, kernel %s' %
                          (sub_filters, sub_kernel))
                sub_out_list.append(sub_output)

        output = tf.concat(sub_out_list, axis=-1)
        if gated_LU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        elif activation:
            output = activation(output)

    return output
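
A minimal usage sketch (hypothetical names; assumes TensorFlow 1.x and my_initializer from the surrounding module). With padding='same' every sub-convolution keeps the sequence length, so outputs of different kernel sizes can be concatenated along the last axis:

import tensorflow as tf

sequence = tf.placeholder(tf.float32, shape=[None, 100, 64], name='sequence')  # hypothetical input
seq_out = lay_conv1D(
    input=sequence,
    kernels=(3, 5, 7),
    filters=(36, 12, 6),    # one filter count per kernel size
    activation=tf.nn.relu,
    padding='same',
    verb=2)
# seq_out: [batch, 100, 36 + 12 + 6] = [batch, 100, 54]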
Example #3
def merge_heads(x):
    x = tf.unstack(x, axis=-3)
    return tf.concat(x, axis=-1)
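
A short sketch of the shape transformation (illustrative shapes): a tensor of [batch, n_heads, seq_len, head_dim] is unstacked along the heads axis and re-concatenated along the feature axis.

import tensorflow as tf

x = tf.zeros([2, 8, 10, 64])   # [batch, heads, seq_len, head_dim]
merged = merge_heads(x)        # -> [2, 10, 8 * 64] = [2, 10, 512]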
Example #4
def enc_CNN(
        input: tf.Tensor,
        history: tf.Tensor = None,  # optional history (state) tensor with shape [bsz, n_layers, kernel-1, n_filters], >> masked cnn
        name='enc_CNN',
        # layer params
        shared_lays: bool = False,  # shared variables in enc_layers
        n_layers: int = 12,  # num of layers
        kernel: int = 3,  # layer kernel
        n_filters: int = 128,  # num of filters
        activation=tf.nn.relu,  # global enc activation func, gelu is really worth a try
        lay_drop: float or None = 0.0,
        ldrt_scale: int or None = 0,  # DRT @enc_lay - scale (*) of the first dense; for None or 0 the DRT @lay won't be built
        ldrt_drop: float or None = 0.0,  # DRT @enc_lay - dropout
        # other
        training_flag: tf.Tensor or bool = None,  # dropout training flag tensor
        initializer=None,
        seed: int = 12321,
        n_hist: int = 4,  # number of histogram layers
        verb=0):

    if verb > 0:
        print(
            f'\n *** enc_CNN *** Building {name} ({n_layers}x{n_filters})...')

    if initializer is None: initializer = my_initializer(seed)

    # manage history
    history_lays = None
    if history is not None:
        history_lays = tf.unstack(history, axis=-3)
        if verb > 1:
            print(
                f' > state_lays len {len(history_lays)} of: {history_lays[0]}')

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(f' > histogram layers of cnn encoder: {hist_layers}')

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        input_lays = []  # here we will store inputs of the following layers to extract the state (history)
        zsL = []  # zeroes

        # input projection - to match n_filters and input width
        if verb > 1: print(f' > encoder input: {input}')
        if input.shape[-1] != n_filters:
            input = lay_dense(input=input,
                              units=n_filters,
                              name='enc_input_projection',
                              initializer=initializer)
            if verb > 1: print(f' > encoder projected input: {input}')

        output = input  # for 0 layers case
        sub_output = input  # first input
        for depth in range(n_layers):

            lay_name = f'enc_CNN_lay_{depth}' if not shared_lays else 'enc_CNN_lay_shared'
            if verb > 1: print(f'<< layer {lay_name}:')

            lay_input = tf.concat([history_lays[depth], sub_output],
                                  axis=-2) if history_lays else sub_output
            if verb > 1:
                print(f' > sub_output (previous): {sub_output}')
                print(f' > lay_input (eventually padded): {lay_input}')
            input_lays.append(lay_input)

            hist_lay = depth in hist_layers

            with tf.variable_scope(lay_name):

                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('a_lay_in',
                                             lay_input,
                                             family=name))

                # LN
                lay_input = tf.keras.layers.LayerNormalization(
                    axis=-1)(lay_input)
                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('b_LN', lay_input, family=name))

                # conv no activation
                output = lay_conv1D(
                    input=lay_input,
                    name='conv1D',
                    kernels=kernel,
                    filters=n_filters,
                    activation=None,
                    initializer=initializer,
                    padding='same' if history is None else 'valid',
                    seed=seed,
                    verb=0)
                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('c_cnn', output, family=name))

                # activation
                if activation:
                    output = activation(output)
                    zsL += [zeroes(output)]  # catch zeroes
                    if hist_lay:
                        hist_summ.append(
                            tf.summary.histogram('d_activation',
                                                 output,
                                                 family=name))

                # dropout
                if lay_drop:
                    output = tf.layers.dropout(inputs=output,
                                               rate=lay_drop,
                                               training=training_flag,
                                               seed=seed)
                    if hist_lay:
                        hist_summ.append(
                            tf.summary.histogram('e_drop', output,
                                                 family=name))

                # RES, here we take sub_output, since lay_input may be padded by history
                output += sub_output
                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('f_residual', output,
                                             family=name))

                if verb > 1: print(f' > output (layer): {output}')

                if ldrt_scale:
                    lay_out = lay_DRT(input=output,
                                      name=lay_name + '_lay_DRT',
                                      hist_name=name,
                                      dns_scale=ldrt_scale,
                                      activation=activation,
                                      dropout=ldrt_drop,
                                      training_flag=training_flag,
                                      initializer=initializer,
                                      seed=seed)
                    output = lay_out['output']
                    zsL += lay_out['zeroes']
                    if hist_lay: hist_summ.append(lay_out['hist_summ'])

                sub_output = output

    output = tf.keras.layers.LayerNormalization(axis=-1)(output)  # final LN

    # prepare fin_state
    fin_state = None
    if history is not None:
        state = tf.stack(input_lays, axis=-3)
        if verb > 1: print(f' > state (stacked): {state}')
        fin_state = tf.split(state,
                             num_or_size_splits=[-1, kernel - 1],
                             axis=-2)[1]
        if verb > 1: print(f' > fin_state (split): {fin_state}')

    if verb > 1: print(f' > {name} output: {output}')
    return {
        'output': output,
        'state': fin_state,  # history for next
        'hist_summ': hist_summ,
        'zeroes': zsL
    }
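
A minimal usage sketch (hypothetical names; assumes TensorFlow 1.x and that the helpers used inside enc_CNN - my_initializer, lay_dense, lay_conv1D, lay_DRT, zeroes, list_of_layers - come from the surrounding module). Without a history tensor the convolutions use 'same' padding, so the sequence length is preserved:

import tensorflow as tf

tokens_emb = tf.placeholder(tf.float32, shape=[None, 50, 128], name='tokens_emb')  # hypothetical input
train_flag = tf.placeholder_with_default(False, shape=[], name='train_flag')

enc = enc_CNN(
    input=tokens_emb,
    n_layers=6,
    kernel=3,
    n_filters=128,   # matches the input width, so the input projection is skipped
    lay_drop=0.1,
    training_flag=train_flag,
    verb=1)
encoded = enc['output']        # [batch, 50, 128]
hist_summ = enc['hist_summ']   # histogram summaries of selected layers
zeroes_list = enc['zeroes']    # activation-zeroes stats; enc['state'] is None without history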
Example #5
def flatten_LOTens(tList):

    resh_vars = [tf.reshape(var, [-1]) for var in tList]
    return tf.concat(resh_vars, axis=-1)
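
A short sketch (illustrative shapes): each tensor in the list is reshaped to 1-D and all pieces are concatenated into a single flat tensor.

import tensorflow as tf

w = tf.zeros([3, 4])
b = tf.zeros([4])
flat = flatten_LOTens([w, b])   # shape [3*4 + 4] = [16]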
Example #6
    def __init__(
        self,
        fwd_func: GRAPH_FUNC,  # function building forward graph (from PH to loss)
        mdict: DNA,  # model (graph) parameters dictionary
        devices=-1,  # check neuralmess.dev_manager.ft_devices for details
        do_optimization: bool = True,  # add optimization part to the graph (for training)
        # values below complement mdict
        name='NEM',
        name_timestamp=False,  # adds timestamp to the name
        seed=12321,
        opt_class=tf.train.AdamOptimizer,  # default optimizer, other examples: tf.train.GradientDescentOptimizer, partial(tf.train.AdamOptimizer, beta1=0.7, beta2=0.7)
        iLR=1e-3,
        warm_up=None,
        ann_base=None,
        ann_step=1,
        n_wup_off: float = 1,
        avt_SVal=1,
        avt_window=100,
        avt_max_upd=1.5,
        do_clip=False,
        # save
        read_only=False,  # sets model to be read-only (..still may log)
        save_TFD: str = SAVE_TFD,  # top folder of model_FD
        savers_names: tuple = (None,),  # names of savers for MultiSaver # TODO: what does this default value do?
        load_saver: bool or str = True,  # for None does not load, for True loads default
        do_logfile=True,  # enables saving log file in save_TFD
        # GPU management
        sep_device=True,  # separate first device for variables, gradients_avg, optimizer (otherwise those are placed on the first FWD calculations tower)
        collocate_GWO=False,  # collocates gradient calculations with tf.OPs (gradients are calculated on every tower with its operations, but remember that vars are on one device...) (otherwise with the first FWD calculations tower)
        # other
        verb: int = 0):  # verb of NEModel (object/constructor), fwd_func has its own verb in mdict

        dict.__init__(self)  # init self as a dict

        self.verb = verb
        if self.verb > 0: print('\n*** NEModel *** initializes...')

        self_args_dict = {  # params dict from NEModel constructor
            'name': name,
            'seed': seed,
            'opt_class': opt_class,
            'iLR': iLR,
            'warm_up': warm_up,
            'ann_base': ann_base,
            'ann_step': ann_step,
            'n_wup_off': n_wup_off,
            'avt_SVal': avt_SVal,
            'avt_window': avt_window,
            'avt_max_upd': avt_max_upd,
            'do_clip': do_clip
        }

        fwdf_mdict = get_defaults(
            function=fwd_func)  # params dict with defaults of fwd_func

        # resolve model name and extend with timestamp when needed
        resolved_name = self_args_dict['name']
        if 'name' in fwdf_mdict: resolved_name = fwdf_mdict['name']
        if 'name' in mdict: resolved_name = mdict['name']
        if name_timestamp: resolved_name += '.' + stamp()
        mdict['name'] = resolved_name
        self.model_dir = f'{save_TFD}/{mdict["name"]}'  # here goes everything from the model
        if self.verb > 0:
            print(f' > NEModel name: {mdict["name"]}')
            print(f' > NEModel dir: {self.model_dir}')

        # build folder managed dna with empty dna, it gets dna FROM FOLDER
        self.__dna = ParaDict(dna_TFD=save_TFD,
                              dna_SFD=mdict['name'],
                              fn_pfx=NEMODEL_DNA_PFX,
                              verb=self.verb)

        # set logfile
        if do_logfile:
            set_logger(log_folder=self.model_dir,
                       custom_name=mdict['name'],
                       verb=self.verb)

        # resolve model dict (dna) in proper order
        md = {}
        md.update(self_args_dict)  # 1 update with params dict from NEModel constructor
        md.update(fwdf_mdict)  # 2 update with defaults of fwd_func
        md.update(self.__dna)  # 3 update with params from folder
        md.update(mdict)  # 4 update with given mdict
        self.__dna.update(md)
        self.__dna.check_params_sim(SPEC_KEYS)  # safety check
        self.readonly = read_only
        if self.model_dir and not self.readonly: self.__dna.save()

        self.update(
            self.__dna)  # finally update self with all model building params

        devices = tf_devices(devices, verb=self.verb)

        # report devices
        if self.verb > 0:
            print()
            if len(devices) == 1:
                if 'CPU' in devices[0]:
                    print(f'NEModel builds CPU device setup')
                else:
                    print(f'NEModel builds single-GPU setup')
            else:
                print(
                    f'NEModel builds multi-dev setup for {len(devices)} devices'
                )

        if len(devices) < 3:
            sep_device = False  # SEP is available for 3 or more devices

        # build FWD graph(s) >> manage variables >> build OPT graph
        self.gFWD = []  # list of dicts of all FWD graphs (from all devices)
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self['seed'])  # set graph seed
            np.random.seed(self['seed'])
            if self.verb > 0:
                print(f'\nNEModel set TF & NP seed to {self["seed"]}')

            # builds graph @SEP; this graph won't be run, it is only needed to place variables; if not sep_device >> variables will be placed with the first tower
            if sep_device:
                if self.verb > 0:
                    print(
                        f'\nNEModel places {self["name"]} VARs on {devices[0]}...'
                    )
                with tf.device(devices[0]):
                    fwd_func(**self)

            tower_devices = [] + devices
            if sep_device: tower_devices = tower_devices[1:]  # trim SEP
            for dev in tower_devices:
                if self.verb > 0:
                    print(
                        f'\nNEModel builds FWD graph of {self["name"]} model @device: {dev}'
                    )
                with tf.device(dev):
                    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                        self.gFWD.append(fwd_func(**self))

            self.update(self.gFWD[0]
                        )  # update self with dictionary returned by fwd_func

            # get FWD variables returned by fwd_func (for saver)
            train_vars = []  # variables to train
            saver_vars = {}  # dict of variables to save
            for key in self.keys():
                if 'var' in key.lower():
                    if key == 'train_vars':
                        train_vars = self[key]
                        if type(train_vars) is not list:
                            train_vars = [train_vars]
                    else:
                        if type(self[key]) is not list:
                            saver_vars[key] = [self[key]]
                        else:
                            saver_vars[key] = self[key]

            all_vars = tf.global_variables()

            # variables were returned >> assert that all graph variables are present in the returned lists
            if saver_vars:
                all_vars_returned = []
                for key in saver_vars:
                    all_vars_returned += saver_vars[key]
                there_are_all = True
                for var in all_vars:
                    if var not in all_vars_returned:
                        print(
                            f' *** variable {var.name} not returned by fwd_func'
                        )
                        there_are_all = False
                assert there_are_all, 'ERR: there are some variables not returned by fwd_func in lists!'

            else:
                saver_vars['fwd_vars'] = all_vars  # put all

            if self.verb > 0:
                print('\nNEModel variables to save from fwd_func:')
                for key in sorted(list(saver_vars.keys())):
                    varList = saver_vars[key]
                    if varList:
                        print(
                            f' ### vars @{key} - num: {len(varList)}, floats: {short_scin(num_var_floats(varList))} ({varList[0].device})'
                        )
                    else:
                        print(' ### no vars')
                    if self.verb > 1: log_vars(varList)

            if 'loss' not in self:
                do_optimization = False
                if self.verb > 0:
                    print(
                        "\nthere is no loss in the FWD graph, the OPT graph won't be built"
                    )

            if not do_optimization:
                if self.verb > 0: print("\nOPT graph won't be built")
            # build optimization graph
            else:
                if self.verb > 0:
                    print(f'\nPreparing OPT part with {self["opt_class"]}')
                # select trainable variables for OPT
                all_tvars = tf.trainable_variables()
                if train_vars:
                    # check if all train_vars are trainable:
                    for var in train_vars:
                        if var not in all_tvars:
                            if self.verb > 0:
                                print(
                                    f'variable {var.name} is not trainable but is in train_vars, please check the graph!'
                                )
                else:
                    for key in saver_vars:
                        for var in saver_vars[key]:
                            if var in all_tvars:
                                train_vars.append(var)
                    assert train_vars, 'ERR: there are no trainable variables at the graph!'
                # log train_vars
                if self.verb > 0:
                    print('\nNEModel trainable variables:')
                    print(
                        f' ### train_vars: {len(train_vars)} floats: {short_scin(num_var_floats(train_vars))}'
                    )
                    if self.verb > 1: log_vars(train_vars)

                # build gradients for towers
                for ix in range(len(self.gFWD)):
                    tower = self.gFWD[ix]
                    tower['gradients'] = tf.gradients(
                        ys=tower['loss'],
                        xs=train_vars,
                        colocate_gradients_with_ops=not collocate_GWO
                    )  # TF default is False >> calculates gradients where OPS, for True >> where train_vars

                    # log gradients
                    if self.verb > 0:
                        nGrad = len(tower['gradients'])

                        # None_as_gradient case
                        device = 'UNKNOWN'
                        for t in tower['gradients']:
                            if t is not None:
                                device = t.device
                                break

                        print(
                            f' > gradients for {ix} tower got {nGrad} tensors ({device})'
                        )
                        if self.verb > 1:
                            print('NEModel variables and their gradients:')
                            for gix in range(len(tower['gradients'])):
                                grad = tower['gradients'][gix]
                                var = train_vars[gix]
                                print(var, var.device)
                                print(
                                    f' > {grad}'
                                )  # grad as a tensor displays device when printed (unless colocated with OP!)

                self['gradients'] = self.gFWD[0]['gradients']

                # None @gradients check
                none_grads = 0
                for grad in self['gradients']:
                    if grad is None: none_grads += 1
                if none_grads and self.verb > 0:
                    print(
                        f'There are None gradients: {none_grads}/{len(self["gradients"])}, some trainVars may be unrelated to loss, please check the graph!'
                    )

                # average gradients
                if len(devices) > 1:

                    if self.verb > 0:
                        print(
                            f'\nNEModel builds gradients averaging graph with device {devices[0]} for {len(self.gFWD)} towers'
                        )
                    with tf.device(devices[0]):
                        towerGrads = [
                            tower['gradients'] for tower in self.gFWD
                        ]
                        avgGrads = []
                        for mGrads in zip(*towerGrads):
                            grads = []
                            for grad in mGrads:
                                if grad is not None:  # None for variables not used while training now...
                                    expandedG = tf.expand_dims(input=grad,
                                                               axis=-1)
                                    grads.append(expandedG)
                            if grads:
                                grad = tf.concat(values=grads, axis=-1)
                                grad = tf.reduce_mean(input_tensor=grad,
                                                      axis=-1)
                                avgGrads.append(grad)
                            else:
                                avgGrads.append(None)

                        self['gradients'] = avgGrads  # update with averaged gradients
                        if self.verb > 0:
                            print(
                                f' > NEModel averaged gradients ({self["gradients"][0].device})'
                            )

                # build OPT graph
                with tf.variable_scope('OPT', reuse=tf.AUTO_REUSE):

                    if self.verb > 0:
                        print(
                            f'\nBuilding OPT graph for {self["name"]} model @device: {devices[0]}'
                        )
                    with tf.device(devices[0]):

                        self['g_step'] = tf.get_variable(  # global step
                            name='g_step',
                            shape=[],
                            trainable=False,
                            initializer=tf.constant_initializer(0),
                            dtype=tf.int32)

                        self['iLR_var'] = tf.get_variable(  # base LR variable
                            name='iLR',
                            shape=[],
                            trainable=False,
                            initializer=tf.constant_initializer(self['iLR']),
                            dtype=tf.float32)

                        self['scaled_LR'] = lr_scaler(
                            iLR=self['iLR_var'],
                            g_step=self['g_step'],
                            warm_up=self['warm_up'],
                            ann_base=self['ann_base'],
                            ann_step=self['ann_step'],
                            n_wup_off=self['n_wup_off'],
                            verb=self.verb)['scaled_LR']

                        # updates with: optimizer, gg_norm, avt_gg_norm
                        self.update(
                            gc_loss_reductor(optimizer=self['opt_class'](
                                learning_rate=self['scaled_LR']),
                                             vars=train_vars,
                                             g_step=self['g_step'],
                                             gradients=self['gradients'],
                                             avt_SVal=self['avt_SVal'],
                                             avt_window=self['avt_window'],
                                             avt_max_upd=self['avt_max_upd'],
                                             do_clip=self['do_clip'],
                                             verb=self.verb))

                        # select OPT vars
                        saver_vars['opt_vars'] = tf.global_variables(
                            scope=tf.get_variable_scope().name)
                        if self.verb > 0:
                            print(
                                f' ### opt_vars: {len(saver_vars["opt_vars"])} floats: {short_scin(num_var_floats(saver_vars["opt_vars"]))} ({saver_vars["opt_vars"][0].device})'
                            )
                            if self.verb > 1: log_vars(saver_vars['opt_vars'])

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(graph=self.graph, config=config)

        # remove keys with no variables (corner case, for proper saver)
        sKeys = list(saver_vars.keys())
        for key in sKeys:
            if not saver_vars[key]: saver_vars.pop(key)
        # TODO: saver_vars, savers_names, load_saver - need a little refactor!!!
        # add saver and load
        self.__saver = MultiSaver(model_name=self['name'],
                                  vars=saver_vars,
                                  save_TFD=save_TFD,
                                  savers=savers_names,
                                  session=self.session,
                                  verb=self.verb)
        if load_saver:
            if type(load_saver) is bool: load_saver = None
            self.__saver.load(saver=load_saver)
            self.update_LR(self['iLR'])  # safety update of iLR

        self.__summ_writer = tf.summary.FileWriter(
            logdir=self.model_dir,
            #graph=          self.graph, # you can call add_graph() later
            flush_secs=10) if not self.readonly else None

        if self.verb > 0: print(f'{self["name"]} (NEModel) build finished!')
        if self.verb > 2: print(self)
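
A minimal usage sketch (hypothetical forward function and parameter names; assumes the NEModel class and its package helpers - my_initializer, ParaDict, MultiSaver, tf_devices, lr_scaler, gc_loss_reductor - are importable, and TensorFlow 1.x). The forward function must accept the constructor/mdict parameters passed through **kwargs and return a dict that includes a 'loss' tensor:

import tensorflow as tf

def my_fwd_graph(name='demo_fwd', seq_len=50, emb_width=128, seed=12321, verb=0, **kwargs):
    # hypothetical FWD graph: placeholders -> logits -> loss
    inp = tf.placeholder(tf.float32, shape=[None, seq_len, emb_width], name='inp')
    lbl = tf.placeholder(tf.int32, shape=[None], name='lbl')
    logits = tf.layers.dense(tf.reduce_mean(inp, axis=1), units=2)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=lbl, logits=logits))
    return {'inp': inp, 'lbl': lbl, 'loss': loss}

model = NEModel(
    fwd_func=my_fwd_graph,
    mdict={'name': 'demo_model', 'seq_len': 50},
    devices=-1,   # resolved by neuralmess.dev_manager
    verb=1)
# model['loss'], model['gradients'] etc. are now available; model.session holds the TF session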