def lay_conv2D(
        input,
        name='conv2d',
        kernels=(3, 5, 7),      # layer kernels
        filters=(36, 12, 6),    # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        useBias=True,
        gatedLU=False,          # Gated Linear Unit architecture
        initializer=None,
        seed=12321,
        verbLev=0,
        padding='valid'):       # 'same' adds padding, 'valid' does not
    """Build a multi-kernel 2D convolution layer.

    One ``tf.layers.Conv2D`` (channels_last) is built per entry of
    ``kernels``; all of them read the same ``input`` and their outputs are
    concatenated along the last axis.

    Args:
        input: input tensor fed to every sub-convolution.
        name: variable scope name of the layer.
        kernels: kernel size or tuple of kernel sizes; a non-tuple value is
            treated as a single kernel.
        filters: tuple with the number of filters per kernel, or an int that
            is split evenly (integer division) between the kernels.
        dilation: dilation rate passed to every Conv2D.
        activation: optional callable applied to the concatenated output
            (ignored when ``gatedLU`` is set).
        useBias: whether the convolutions use a bias.
        gatedLU: when True the filters are doubled and the output is gated:
            ``s1 * sigmoid(s2)`` where s1, s2 are halves of the last axis.
        initializer: kernel initializer, defaults to ``my_initializer(seed)``.
        seed: seed for the default initializer.
        verbLev: verbosity level (prints layer info when > 0).
        padding: padding mode passed to every Conv2D.

    Returns:
        Tuple ``(output, variables)`` where ``variables`` is the result of
        ``flatten_LOTens`` applied to the variables of all sub-convolutions.
    """
    if initializer is None:
        initializer = my_initializer(seed)

    with tf.variable_scope(name):
        variables = []
        subOutList = []
        if not isinstance(kernels, tuple):
            kernels = (kernels,)
        if verbLev > 0:
            print(' > %s: kernels %s, filters %s, dilation %s' % (name, kernels, filters, dilation))

        for k, subKernel in enumerate(kernels):
            with tf.variable_scope('kernel_%d' % k):
                if isinstance(filters, tuple):
                    subFilters = filters[k]
                else:
                    # integer division: Conv2D needs an int number of filters
                    subFilters = filters // len(kernels)
                if gatedLU:
                    subFilters *= 2  # second half of the channels is the gate
                convLay = tf.layers.Conv2D(
                    filters=subFilters,
                    kernel_size=subKernel,
                    dilation_rate=dilation,
                    activation=None,
                    use_bias=useBias,
                    kernel_initializer=initializer,
                    padding=padding,
                    data_format='channels_last')
                subOutput = convLay(input)
                variables.extend(convLay.variables)
                if verbLev > 1:
                    print(' >> subConv: filters %s, kernel %s' % (subFilters, subKernel))
                subOutList.append(subOutput)

        # NOTE(review): concat along the last axis needs equal spatial sizes of
        # all sub-outputs - presumably padding='same' is needed when kernel
        # sizes differ; confirm with callers
        output = tf.concat(subOutList, axis=-1)
        if gatedLU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        elif activation:
            output = activation(output)

    variables = flatten_LOTens(variables)
    return output, variables
def lay_conv1D(
        input,
        name='conv1D',
        kernels=(3, 5, 7),      # layer kernels
        filters=(36, 12, 6),    # int divisible by len(kernels) or tuple of len(kernels)
        dilation=1,
        activation=None,
        use_bias=True,
        gated_LU=False,         # Gated Linear Unit architecture
        initializer=None,
        padding='valid',        # 'same' adds padding, 'valid' does not
        seed=12321,
        verb=0):
    """Build a multi-kernel 1D convolution layer.

    A separate ``tf.layers.Conv1D`` (channels_last) is created for every
    kernel size, each applied to the same ``input``; the results are joined
    along the last axis. With ``gated_LU`` the number of filters is doubled
    and the output becomes ``s1 * sigmoid(s2)`` (halves of the last axis),
    otherwise the optional ``activation`` is applied.

    Returns the output tensor.
    """
    if initializer is None:
        initializer = my_initializer(seed)

    talkative = verb > 1
    with tf.variable_scope(name):
        if type(kernels) is not tuple:
            kernels = (kernels,)
        if talkative:
            print(' > %s: kernels %s, filters %s, dilation %s' % (name, kernels, filters, dilation))

        branches = []
        for ix, kernel_size in enumerate(kernels):
            with tf.variable_scope('kernel_%d' % ix):
                # filters given per kernel (tuple) or as a total split evenly
                if type(filters) is tuple:
                    n_filters = filters[ix]
                else:
                    n_filters = filters // len(kernels)
                if gated_LU:
                    n_filters *= 2
                conv = tf.layers.Conv1D(
                    filters=n_filters,
                    kernel_size=kernel_size,
                    dilation_rate=dilation,
                    activation=None,
                    use_bias=use_bias,
                    kernel_initializer=initializer,
                    padding=padding,
                    data_format='channels_last')
                branch = conv(input)
                if talkative:
                    print(' >> sub_conv: filters %s, kernel %s' % (n_filters, kernel_size))
                branches.append(branch)

        output = tf.concat(branches, axis=-1)
        if gated_LU:
            s1, s2 = tf.split(output, num_or_size_splits=2, axis=-1)
            output = s1 * tf.sigmoid(s2)
        elif activation:
            output = activation(output)
    return output
def lay_res(
        lay_in,             # layer input
        lay_out,            # layer output
        name='residual',
        use_RCW=False,      # use residual connection weights
        use_PWRCW=False,    # pointwise weights
        match_dims=True):   # concatenates zeros to input when thinner
    """Build a residual connection between a layer input and its output.

    When the last dimensions of ``lay_in`` and ``lay_out`` are equal (or
    ``lay_in`` is thinner and ``match_dims`` is set, in which case it is
    zero-padded on the last axis), the result is either the plain sum
    ``lay_in + lay_out`` or, with ``use_RCW``, a sigmoid-gated mix
    ``lay_in * (1 - g) + lay_out * g`` where ``g = sigmoid(rcw)`` and ``rcw``
    is a zero-initialized variable (scalar, or of shape [out_width] with
    ``use_PWRCW``).

    When the widths cannot be matched, ``lay_out`` is returned unchanged
    (no residual is added).

    Returns the output tensor.
    """
    with tf.variable_scope(name):
        output = lay_out
        iW = int(lay_in.shape[-1])
        oW = int(output.shape[-1])
        matchedDims = iW == oW

        # pad input with zeros (on the last axis only) to match width of output
        if iW < oW and match_dims:
            # paddings must have one [before, after] row per axis of lay_in,
            # so build it from the static rank instead of assuming rank 2
            rank = lay_in.shape.ndims
            paddings = [[0, 0]] * (rank - 1) + [[0, oW - iW]]
            lay_in = tf.pad(tensor=lay_in, paddings=tf.constant(paddings))
            matchedDims = True

        if matchedDims:
            if use_RCW:
                shape = [oW] if use_PWRCW else []
                convRCW = tf.get_variable(
                    name='rcw',
                    shape=shape,
                    initializer=tf.constant_initializer(0))
                gate = tf.sigmoid(convRCW)
                output = lay_in * (1 - gate) + output * gate
            else:
                output = lay_in + output
    return output
def enc_DRT(
        input,
        name='enc_DRT',
        shared_lays: bool = False,  # shared variables in enc_layers
        n_layers=12,
        lay_width: int = None,      # for None matches input width
        dns_scale=6,                # scale(*) of first dense
        activation=tf.nn.relu,      # gelu is really worth a try
        dropout=0.0,                # dropout after two denses
        training_flag=None,         # training flag tensor (for dropout)
        initializer=None,
        seed=12321,
        n_hist=4,                   # number of histogram layers (for TB)
        verb=0):
    """Build a stack of ``lay_DRT`` layers (DRT encoder).

    The input is projected with a bias-free dense layer when its width
    differs from ``lay_width``, layer-normalized, and then passed through
    ``n_layers`` of ``lay_DRT`` (all sharing one scope name when
    ``shared_lays`` is set).

    Returns a dict with:
        'output'    - output of the last layer (normalized input for 0 layers),
        'hist_summ' - list with histogram summaries of selected layers,
        'zeroes'    - concatenated 'zeroes' lists of all layers.
    """
    width_note = ''
    if lay_width is None:
        lay_width = input.shape.as_list()[-1]
        width_note = '(lay_width taken form input width)'

    if verb > 0:
        shown_drop = dropout if dropout else 0.0
        print(f'\nBuilding DRTencoder ({n_layers}x{lay_width} drop:{shown_drop:.2f}) {width_note}...')

    if initializer is None:
        initializer = my_initializer(seed)

    summaries = []
    selected_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1:
        print(' > histogram layers of DRTencoder:', selected_layers)

    zeroes_list = []
    with tf.variable_scope(name):

        # input projection (only when widths differ)
        in_width = input.shape[-1]
        if in_width != lay_width:
            input = lay_dense(
                input=input,
                units=lay_width,
                use_bias=False,
                initializer=initializer,
                seed=seed)
            if verb > 0:
                print('projected input to layWidth(%d) since it differs(%d)' % (lay_width, in_width))

        # input layer_norm
        input = tf.keras.layers.LayerNormalization(axis=-1)(input)

        output = input  # stays like this for the 0 layers case
        for lay_ix in range(n_layers):
            lay_name = 'DRLay_shared' if shared_lays else f'DRLay_{lay_ix}'
            lay_dict = lay_DRT(
                input=output,
                name=lay_name,
                hist_name=name,
                dns_scale=dns_scale,
                activation=activation,
                dropout=dropout,
                training_flag=training_flag,
                initializer=initializer,
                seed=seed)
            output = lay_dict['output']
            if lay_ix in selected_layers:
                summaries.append(lay_dict['hist_summ'])
            zeroes_list += lay_dict['zeroes']

    return {
        'output': output,
        'hist_summ': summaries,
        'zeroes': zeroes_list}
def enc_TNS(
        in_seq,                         # input sequence embeddings [batch, seq, emb], for TAT in_seq should be LNormalized
        name='enc_TNS',
        seq_out: bool = True,           # transformer seq2seq, if False seq2one (Task Attention Transformer)
        add_PE: bool = True,            # add positional embeddings
        do_LN: bool = True,             # do layer norm
        shared_lays: bool = False,      # shared variables in blocks
        n_blocks=12,
        n_heads=8,
        dense_mul: int or float = 4,    # dense (after att) scale
        activation=tf.nn.relu,
        max_seq_len=100,                # used only to set shape (axis 0) of positional embeddings
        dropout=0.0,                    # dropout of FC after attention
        dropout_att=0.0,                # dropout of attention probabilities
        training_flag: tf.Tensor or bool = None,  # dropout training flag (bool or tensor)
        initializer=None,
        seed=12321,
        n_hist=4,                       # number of histogram layers
        verb=0):
    """Build a transformer encoder (seq2seq) or a Task Attention Transformer (seq2one).

    In seq2seq mode (``seq_out=True``) every block transforms the sequence
    and feeds the next block. In task attention mode (``seq_out=False``) a
    task query, initialized as the mean of ``in_seq`` over axis -2, attends
    over the (unchanged) sequence in every block and is the thing updated
    between blocks; the final output is that query squeezed on axis -2.

    Each block: (optional LN) -> multi-head attention -> dense -> (dropout)
    -> residual -> (optional LN) -> dense*dense_mul with activation -> dense
    -> (dropout) -> residual.

    Returns a dict with:
        'output'    - encoder output, layer-normalized when ``do_LN``,
        'hist_summ' - list of histogram summaries (encoder + selected blocks),
        'att_vals'  - list with attention weights of every block,
        'zeroes'    - list with ``zeroes`` of the first dense of every block.
    """
    if initializer is None:
        initializer = tf.truncated_normal_initializer(stddev=0.01, seed=seed)

    # split feats(-1) for heads
    def split_heads(x):
        x = tf.split(x, n_heads, axis=-1)  # list of tensors
        return tf.stack(x, axis=-3)  # [batch, head, seq, feats]

    # merge heads over feats(-1)
    def merge_heads(x):
        x = tf.unstack(x, axis=-3)
        return tf.concat(x, axis=-1)

    # multi_head_attention for input
    def mh_attn(
            in_seq,             # input sequence [batch, seq, feats]
            query=None,         # None for self attention, otherwise TAT [batch, n_queries, feats]
            activation=None,    # activation of KQV dense
            dropout_att=0.0,
            drop_flag=None,
            seed=seed):
        # input projection of in_seq for KQV or KV(if query)
        width = in_seq.shape[-1].value
        proj_size = 3 if query is None else 2
        c = lay_dense(
            input=in_seq,  # [batch, seq, feats]
            units=width * proj_size,
            name='mhProj',
            activation=activation,
            initializer=initializer,
            seed=seed)
        ins_split = tf.split(c, proj_size, axis=-1)  # split projected

        if query is not None:
            q = query  # projection for Q is not needed (at least with 1 head)
            k, v = ins_split
        else:
            q, k, v = ins_split
        q, k, v = map(split_heads, [q, k, v])

        # attention
        att_out = attn(q, k, v, dropout_att, drop_flag, seed)
        a = merge_heads(att_out['attention'])
        return {
            'attention': a,
            'att_vals': att_out['att_weights']}

    # transformer block
    def tblock(in_seq, seed, task_query=None):
        hist_summ = []
        output = in_seq
        taskQueryNorm = None

        if task_query is None:
            hist_summ.append(tf.summary.histogram('a_inputSeq', output, family=name))
            # layer norm 1 on seq
            if do_LN:
                output = tf.keras.layers.LayerNormalization(axis=-1)(output)
                hist_summ.append(tf.summary.histogram('b_inputSeqLN', output, family=name))
        else:
            hist_summ.append(tf.summary.histogram('a_inTaskQuery', task_query, family=name))
            taskQueryNorm = task_query
            # layer norm 1 on taskQuery
            if do_LN:
                taskQueryNorm = tf.keras.layers.LayerNormalization(axis=-1)(task_query)
                # log the normalized query (not the raw one already logged above)
                hist_summ.append(tf.summary.histogram('b_taskQueryLN', taskQueryNorm, family=name))

        # multi head self attention
        mha_out = mh_attn(
            in_seq=output,
            query=taskQueryNorm,
            dropout_att=dropout_att,
            drop_flag=training_flag,
            seed=seed)
        output = mha_out['attention']
        att_vals = mha_out['att_vals']
        hist_summ.append(tf.summary.histogram('c_mhAttn', output, family=name))

        # dense without activation
        output = lay_dense(
            input=output,
            units=output.shape[-1].value,
            name='afterAttProj',
            initializer=initializer,
            seed=seed)
        hist_summ.append(tf.summary.histogram('d_denseAftAtt', output, family=name))

        if dropout:
            output = tf.layers.dropout(
                inputs=output,
                rate=dropout,
                training=training_flag,
                seed=seed)

        # residual 1 (on the raw, not normalized, block input)
        if task_query is None:
            res1_out = in_seq + output
            hist_summ.append(tf.summary.histogram('e_res_onInputSeq', res1_out, family=name))
        else:
            res1_out = task_query + output
            hist_summ.append(tf.summary.histogram('e_res_onTaskQuery', res1_out, family=name))

        output = res1_out
        # layer norm 2
        if do_LN:
            output = tf.keras.layers.LayerNormalization(axis=-1)(output)
            hist_summ.append(tf.summary.histogram('f_layNorm', output, family=name))

        # 2x dense
        base_width = output.shape[-1].value
        output = lay_dense(
            input=output,
            units=int(base_width * dense_mul),
            name='dense1afterAtt',
            activation=activation,
            initializer=initializer,
            seed=seed)
        zsL = [zeroes(output)]
        hist_summ.append(tf.summary.histogram('g_1denseOut', output, family=name))
        output = lay_dense(
            input=output,
            units=base_width,
            name='dense2afterAtt',
            initializer=initializer,
            seed=seed)
        hist_summ.append(tf.summary.histogram('h_2denseOut', output, family=name))

        if dropout:
            output = tf.layers.dropout(
                inputs=output,
                rate=dropout,
                training=training_flag,
                seed=seed)

        # residual2
        output += res1_out
        hist_summ.append(tf.summary.histogram('i_res', output, family=name))

        return {
            'output': output,
            'hist_summ': hist_summ,
            'att_vals': att_vals,
            'zeroes': zsL}

    width = in_seq.shape[-1]  # sequence width (feats)
    seq_len = tf.shape(in_seq)[-2]  # sequence length (time)

    if verb > 0:
        print('\nBuilding %s (transformer encoder) (%dx%d, denseMul %.1f), ' % (name, n_blocks, width, dense_mul))
        print(' > dropout: %.2f %.2f(att)' % (dropout, dropout_att))
        print(' > seq2seq mode...' if seq_out else ' > task attention mode...')

    hist_layers = list_of_layers(n_blocks, n_select=n_hist)
    if verb > 1:
        print(' > histogram layers of transformer encoder:', hist_layers)

    with tf.variable_scope(name):

        hist_summ = []  # list of histogram summaries

        if verb > 1:
            print(' > transformer input', in_seq)
        hist_summ.append(tf.summary.histogram('a_transformerInput', in_seq, family=name))

        # init task_query (for first block - input averaged over time (seq))
        task_query = None
        if not seq_out:
            task_query = tf.reduce_mean(in_seq, axis=-2, keep_dims=True)  # [batch,1,feats]
            if verb > 1:
                print(' > first task_query (reduced input) for TAT', task_query)

        # positional embedding
        if add_PE:
            pos_emb_var = tf.get_variable(
                name='tnsPosEmb',
                shape=[max_seq_len, width],
                initializer=initializer)
            in_seq += tf.nn.embedding_lookup(params=pos_emb_var, ids=tf.range(seq_len))
            if verb > 1:
                print(' > added positional embedding to the input...')
            hist_summ.append(tf.summary.histogram('b_transformerPosEmbInput', in_seq, family=name))

        if verb > 1:
            print(' > building %d blocks of transformer...' % n_blocks)
        att_vals = []  # list of block attention values
        zsL = []
        for nB in range(n_blocks):
            hist_lay = nB in hist_layers
            lay_name = f'block_{nB}' if not shared_lays else 'block_shared'
            with tf.variable_scope(lay_name, reuse=tf.AUTO_REUSE):
                bo_dict = tblock(in_seq=in_seq, seed=seed, task_query=task_query)
                block_output = bo_dict['output']
                # block output feeds the next block: as the sequence (seq2seq)
                # or as the task query (TAT)
                if task_query is None:
                    in_seq = block_output
                else:
                    task_query = block_output
                zsL += bo_dict['zeroes']
                if hist_lay:
                    hist_summ += bo_dict['hist_summ']
                att_vals.append(bo_dict['att_vals'])  # [batch,head,query_n or seq,seq]

        # in_seq / task_query hold the last block output; for 0 blocks they
        # still hold the (position-embedded) input / initial query, so the
        # output is always defined
        if task_query is None:
            output = in_seq
        else:
            output = tf.squeeze(task_query, axis=-2)

        if do_LN:
            output = tf.keras.layers.LayerNormalization(axis=-1)(output)
        hist_summ.append(tf.summary.histogram('c_transformer_out', output, family=name))

        if verb > 1:
            print(' > %s output' % name, output)

    return {
        'output': output,
        'hist_summ': hist_summ,
        'att_vals': att_vals,
        'zeroes': zsL}
def lay_DRT(
        input,
        name='lay_DRT',         # scope name, be careful when stacked since auto_reuse
        hist_name=None,         # family name of histogram
        dns_scale=4,
        activation=tf.nn.relu,  # gelu is really worth a try
        dropout=None,           # dropout (after two denses)
        training_flag=None,     # training flag tensor (for dropout)
        initializer=None,
        seed=12321):
    """Build a single DRT layer.

    Flow: dense scaling the width up by ``dns_scale`` -> ``activation`` ->
    dense back to the input width (no activation) -> optional dropout ->
    residual with the layer input (``lay_res``) -> layer normalization.
    The variable scope is opened with ``reuse=tf.AUTO_REUSE``, so two calls
    with the same ``name`` in the same outer scope share variables.

    Args:
        input: input tensor; its last dimension is the layer width.
        name: variable scope name.
        hist_name: family name for histogram summaries, defaults to ``name``.
        dns_scale: multiplier (int or float) of the width for the first dense.
        activation: callable applied after the first dense.
        dropout: dropout rate applied after the second dense (falsy = none).
        training_flag: training flag passed to ``tf.layers.dropout``.
        initializer: defaults to ``my_initializer(seed)``.
        seed: seed for initializer, denses and dropout.

    Returns:
        dict with 'output' (tensor), 'hist_summ' (list of histogram
        summaries) and 'zeroes' (one-element list with ``zeroes`` of the
        activation output).
    """
    if not hist_name:
        hist_name = name
    # plain python int (not a Dimension), so that it may be scaled by a float
    lay_width = input.shape.as_list()[-1]
    if initializer is None:
        initializer = my_initializer(seed)

    hist_summ = []
    with tf.variable_scope(name_or_scope=name, reuse=tf.AUTO_REUSE):

        hist_summ.append(tf.summary.histogram('a_denseSin', input, family=hist_name))

        # dense (scale up)
        output = lay_dense(
            input=input,
            units=int(lay_width * dns_scale),
            activation=None,
            use_bias=True,
            initializer=initializer,
            seed=seed,
            name='denseS')
        hist_summ.append(tf.summary.histogram('b_denseSout', output, family=hist_name))

        # activation
        output = activation(output)
        zsL = [zeroes(output)]  # zeroes list
        hist_summ.append(tf.summary.histogram('c_activation', output, family=hist_name))

        # dense (scale down) no activ
        output = lay_dense(
            input=output,
            units=lay_width,
            name='DRTdenseNA',
            use_bias=True,
            initializer=initializer,
            seed=seed)
        hist_summ.append(tf.summary.histogram('d_denseNAout', output, family=hist_name))

        # layer dropout
        if dropout:
            output = tf.layers.dropout(
                inputs=output,
                rate=dropout,
                training=training_flag,
                seed=seed)

        # residual
        output = lay_res(input, output)
        hist_summ.append(tf.summary.histogram('e_residual', output, family=hist_name))

        # layer_norm
        output = tf.keras.layers.LayerNormalization(axis=-1)(output)
        hist_summ.append(tf.summary.histogram('f_LAYout', output, family=hist_name))

    return {
        'output': output,
        'hist_summ': hist_summ,
        'zeroes': zsL}
def enc_CNN(
        input: tf.Tensor,
        history: tf.Tensor = None,          # optional history(state) tensor with shape [bsz, n_layers ,kernel-1, n_filters], >> masked cnn
        name='enc_CNN',
        # layer params
        shared_lays: bool = False,          # shared variables in enc_layers
        n_layers: int = 12,                 # num of layers
        kernel: int = 3,                    # layer kernel
        n_filters: int = 128,               # num of filters
        activation=tf.nn.relu,              # global enc activation func, gelu is really worth a try
        lay_drop: float or None = 0.0,
        ldrt_scale: int or None = 0,        # DRT @enc_lay - scale(*) of first dense, for None or 0 DRT @lay won't be build
        ldrt_drop: float or None = 0.0,     # DRT @enc_lay - dropout
        # other
        training_flag: tf.Tensor or bool = None,  # dropout training flag tensor
        initializer=None,
        seed: int = 12321,
        n_hist: int = 4,                    # number of histogram layers
        verb=0):
    """Build a stack of 1D-convolution layers (CNN encoder), optionally with history.

    The input is projected with a dense layer when its width differs from
    ``n_filters``. Every layer does: layer norm -> ``lay_conv1D`` ->
    activation -> dropout -> residual with the previous layer output ->
    optional ``lay_DRT``. A final layer norm is applied to the output.

    When ``history`` is given, it is unstacked on axis -3 into per-layer
    tensors, each of them is concatenated (axis -2) in front of the layer
    input and the convolution uses 'valid' padding; without history the
    convolution uses 'same' padding.

    Returns a dict with:
        'output'    - encoder output (after final layer norm),
        'state'     - last ``kernel - 1`` steps (axis -2) of every layer
                      input stacked on axis -3, or None without history,
        'hist_summ' - list of histogram summaries of selected layers
                      (``lay_DRT`` summaries are appended as nested lists),
        'zeroes'    - list of ``zeroes`` collected from the layers.
    """
    if verb > 0:
        print(f'\n *** enc_CNN *** Building {name} ({n_layers}x{n_filters})...')

    if initializer is None:
        initializer = my_initializer(seed)

    # manage history - split into list of per-layer tensors
    history_lays = None
    if history is not None:
        history_lays = tf.unstack(history, axis=-3)
        if verb > 1:
            print(f' > state_lays len {len(history_lays)} of: {history_lays[0]}')

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1:
        print(f' > histogram layers of cnn encoder: {hist_layers}')

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        input_lays = []  # here we will store inputs of the following layers to extract the state (history)
        zsL = []  # zeroes

        # input projection - to match n_filters and input width
        if verb > 1:
            print(f' > encoder input: {input}')
        if input.shape[-1] != n_filters:
            input = lay_dense(
                input=input,
                units=n_filters,
                name='enc_input_projection',
                initializer=initializer)
            if verb > 1:
                print(f' > encoder projected input: {input}')

        output = input  # for 0 layers case
        sub_output = input  # first input
        for depth in range(n_layers):

            lay_name = f'enc_CNN_lay_{depth}' if not shared_lays else 'enc_CNN_lay_shared'
            if verb > 1:
                print(f'<< layer {lay_name}:')

            # prepend history of this layer (when given) to the previous output
            lay_input = tf.concat([history_lays[depth], sub_output], axis=-2) if history_lays else sub_output
            if verb > 1:
                print(f' > sub_output (previous): {sub_output}')
                print(f' > lay_input (eventually padded): {lay_input}')
            input_lays.append(lay_input)  # kept (before LN) to build the returned state

            hist_lay = depth in hist_layers

            # NOTE(review): nesting below (what belongs to `if activation`,
            # `if lay_drop` and to this layer scope) was reconstructed from a
            # source with lost indentation - confirm against the original file
            with tf.variable_scope(lay_name):

                if hist_lay:
                    hist_summ.append(tf.summary.histogram('a_lay_in', lay_input, family=name))

                # LN
                lay_input = tf.keras.layers.LayerNormalization(axis=-1)(lay_input)
                if hist_lay:
                    hist_summ.append(tf.summary.histogram('b_LN', lay_input, family=name))

                # conv no activation
                output = lay_conv1D(
                    input=lay_input,
                    name='conv1D',
                    kernels=kernel,
                    filters=n_filters,
                    activation=None,
                    initializer=initializer,
                    padding='same' if history is None else 'valid',
                    seed=seed,
                    verb=0)
                if hist_lay:
                    hist_summ.append(tf.summary.histogram('c_cnn', output, family=name))

                # activation
                if activation:
                    output = activation(output)
                    zsL += [zeroes(output)]  # catch zeroes
                    if hist_lay:
                        hist_summ.append(tf.summary.histogram('d_activation', output, family=name))

                # dropout
                if lay_drop:
                    output = tf.layers.dropout(
                        inputs=output,
                        rate=lay_drop,
                        training=training_flag,
                        seed=seed)
                    if hist_lay:
                        hist_summ.append(tf.summary.histogram('e_drop', output, family=name))

                # RES, here we take sub_output, since lay_input may be padded by history
                output += sub_output
                if hist_lay:
                    hist_summ.append(tf.summary.histogram('f_residual', output, family=name))

                if verb > 1:
                    print(f' > output (layer): {output}')

                # optional DRT layer on top of the conv layer
                if ldrt_scale:
                    lay_out = lay_DRT(
                        input=output,
                        name=lay_name + '_lay_DRT',
                        hist_name=name,
                        dns_scale=ldrt_scale,
                        activation=activation,
                        dropout=ldrt_drop,
                        training_flag=training_flag,
                        initializer=initializer,
                        seed=seed)
                    output = lay_out['output']
                    zsL += lay_out['zeroes']
                    if hist_lay:
                        hist_summ.append(lay_out['hist_summ'])

                sub_output = output  # input of the next layer

        output = tf.keras.layers.LayerNormalization(axis=-1)(output)  # final LN

        # prepare fin_state - last (kernel-1) steps of every layer input
        fin_state = None
        if history is not None:
            state = tf.stack(input_lays, axis=-3)
            if verb > 1:
                print(f' > state (stacked): {state}')
            fin_state = tf.split(state, num_or_size_splits=[-1, kernel - 1], axis=-2)[1]
            if verb > 1:
                print(f' > fin_state (split): {fin_state}')

    if verb > 1:
        print(f' > {name} output: {output}')
    return {
        'output': output,
        'state': fin_state,  # history for next
        'hist_summ': hist_summ,
        'zeroes': zsL}
def mrg_ckpts(
        ckptA: str,                 # checkpoint A (folder name)
        ckptA_FD: str,              # root folder of cpktA (absolute or relative)
        ckptB: str or None,         # checkpoint B (folder name), for None takes 100% ckptA
        ckptB_FD: str or None,      # root folder of cpktB (absolute or relative)
        ckptM: str,                 # checkpoint merged (folder name)
        ckptM_FD: str,              # root folder of cpktM (absolute or relative)
        mrgF: float = 0.5,          # merge factor (weight)
        noiseF: float = 0.0,        # noise factor, amount of noise added to new value (0.0-1.0...)
        replace_scope: str = None,  # replaces outer scope with given string
        verb=0):
    """Merge variables of two checkpoints into a new checkpoint.

    Variables of both checkpoints are listed, sorted by name and paired BY
    POSITION (names do not have to be equal). For every float32 variable
    the merged value is
    ``mrgF * A + (1 - mrgF) * B + noiseF * noise`` where ``noise`` is
    truncated-normal with stddev equal to the stddev of A. Variables of
    other dtypes, and all variables when ``ckptB`` is not given, are copied
    from A. The merged variables are saved (without meta graph) to
    ``{ckptM_FD}{ckptM}/{ckptM}`` and the default graph is reset afterwards.

    With ``replace_scope`` the part of every variable name before the first
    '/' is replaced with the given string; the length of that part is taken
    from the first (sorted) variable name of checkpoint A.
    """
    # make sure folders end with '/'
    if ckptA_FD[-1] != '/':
        ckptA_FD += '/'
    if ckptB_FD and ckptB_FD[-1] != '/':
        ckptB_FD += '/'
    if ckptM_FD[-1] != '/':
        ckptM_FD += '/'

    var_namesA = sorted(v[0] for v in tf.train.list_variables(ckptA_FD + ckptA))
    if verb > 0:
        print(f'variables from ckptA ({len(var_namesA):4d}): {var_namesA}')
    var_namesB = sorted(v[0] for v in tf.train.list_variables(ckptB_FD + ckptB)) if ckptB else []
    if verb > 0:
        print(f'variables from ckptB ({len(var_namesB):4d}): {var_namesB}')

    # length of the outer scope (name part before first '/')
    oscope_len = 0
    if replace_scope:
        oscope_len = len(var_namesA[0].partition('/')[0])
        if verb > 0:
            print(f'oscope_len {oscope_len}')
            if oscope_len:
                print(f' > will replace {var_namesA[0][:oscope_len]} with {replace_scope}')

    # load values of both checkpoints into variables of separate scopes
    avL = []
    with tf.variable_scope('av'):
        for var_name in var_namesA:
            var = tf.train.load_variable(f'{ckptA_FD}{ckptA}', var_name)
            avL.append(tf.Variable(var, name=var_name))

    bvL = []
    if ckptB:
        with tf.variable_scope('bv'):
            for var_name in var_namesB:
                var = tf.train.load_variable(f'{ckptB_FD}{ckptB}', var_name)
                bvL.append(tf.Variable(var, name=var_name))

    # build merged variables
    cvL = []
    for ix, var_name in enumerate(var_namesA):
        if verb > 0:
            print(f'old var_name: {var_name}')
        if replace_scope:
            var_name = replace_scope + var_name[oscope_len:]
        varA = avL[ix]
        # base_dtype: a TF1 reference variable reports float32_ref, which does
        # not compare equal to float32
        if bvL and varA.dtype.base_dtype == tf.float32:
            varB = bvL[ix]
            noise = tf.random.truncated_normal(  # random values from a normal distribution truncated by 2stddev
                shape=varA.shape,
                stddev=tf.math.reduce_std(varA))  # stddev of varA
            var = tf.Variable(
                mrgF * varA + (1 - mrgF) * varB + noiseF * noise,
                name=var_name)
        else:
            var = tf.Variable(varA, name=var_name)
        cvL.append(var)

    # save
    if verb > 0:
        print('\nWriting checkpoint... ', end='')
    child_saver = tf.train.Saver(cvL)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        child_saver.save(sess, f'{ckptM_FD}{ckptM}/{ckptM}', write_meta_graph=False)
    tf.reset_default_graph()
    if verb > 0:
        print('done!')
def __init__(
        self,
        fwd_func: GRAPH_FUNC,               # function building forward graph (from PH to loss)
        mdict: DNA,                         # model(graph) parameters dictionary
        devices=-1,                         # check neuralmess.dev_manager.ft_devices for details
        do_optimization: bool = True,       # add optimization part to the graph (for training)
        # values below complement mdict
        name='NEM',
        name_timestamp=False,               # adds timestamp to name
        seed=12321,
        opt_class=tf.train.AdamOptimizer,   # default optimizer, other examples: tf.train.GradientDescentOptimizer, partial(tf.train.AdamOptimizer, beta1=0.7, beta2=0.7)
        iLR=1e-3,
        warm_up=None,
        ann_base=None,
        ann_step=1,
        n_wup_off: float = 1,
        avt_SVal=1,
        avt_window=100,
        avt_max_upd=1.5,
        do_clip=False,
        # save
        read_only=False,                    # sets model to be read only (..still may log)
        save_TFD: str = SAVE_TFD,           # top folder of model_FD
        savers_names: tuple = (None,),      # names of savers for MultiSaver # TODO: what does for << this default value?
        load_saver: bool or str = True,     # for None does not load, for True loads default
        do_logfile=True,                    # enables saving log file in save_TFD
        # GPU management
        sep_device=True,                    # separate first device for variables, gradients_avg, optimizer (otherwise those ar placed on the first FWD calculations tower)
        collocate_GWO=False,                # collocates gradient calculations with tf.OPs (gradients are calculated on every tower with its operations, but remember that vars are on one device...) (otherwise with first FWD calculations tower)
        # other
        verb: int = 0):                     # verb of NEModel (object/constructor), fwd_func has own verb in mdict
    """Build the model: resolve parameters, build FWD and OPT graphs, open a session, load.

    Steps performed (in this order):
      1. The model name is resolved (constructor arg < fwd_func default <
         mdict) and written back into ``mdict['name']`` (the given dict is
         modified); ``name_timestamp`` appends a stamp.
      2. Model parameters are merged in the order: constructor args,
         fwd_func defaults, params read from the model folder (ParaDict),
         given ``mdict``; the result is saved (unless read-only) and copied
         into ``self`` (this object is a dict).
      3. ``fwd_func(**self)`` is called once per tower device (plus once on
         a separate first device when ``sep_device`` and >= 3 devices); the
         dict returned for the first tower is merged into ``self``.
      4. Variables returned by fwd_func (keys containing 'var') are
         collected for savers; 'train_vars' selects variables to train.
      5. When 'loss' is present and ``do_optimization`` is set, gradients
         are built for every tower, averaged over towers for multi-device
         setups, and the OPT graph (g_step, LR scaling, optimizer) is built
         in the 'OPT' scope.
      6. A session is created, a MultiSaver is built and (optionally)
         loaded, and a summary writer is created unless read-only.

    Raises:
        AssertionError: when fwd_func returns some variables in lists but
            not all global variables, or when there is nothing to train.
    """

    dict.__init__(self)  # init self as a dict

    self.verb = verb
    if self.verb > 0:
        print('\n*** NEModel *** initializes...')

    self_args_dict = {  # params dict from NEModel constructor
        'name': name,
        'seed': seed,
        'opt_class': opt_class,
        'iLR': iLR,
        'warm_up': warm_up,
        'ann_base': ann_base,
        'ann_step': ann_step,
        'n_wup_off': n_wup_off,
        'avt_SVal': avt_SVal,
        'avt_window': avt_window,
        'avt_max_upd': avt_max_upd,
        'do_clip': do_clip}

    fwdf_mdict = get_defaults(function=fwd_func)  # params dict with defaults of fwd_func

    # resolve model name and extend with timestamp when needed
    # (later sources override earlier ones: constructor < fwd_func default < mdict)
    resolved_name = self_args_dict['name']
    if 'name' in fwdf_mdict:
        resolved_name = fwdf_mdict['name']
    if 'name' in mdict:
        resolved_name = mdict['name']
    if name_timestamp:
        resolved_name += '.' + stamp()
    mdict['name'] = resolved_name  # note: writes into the dict given by the caller

    self.model_dir = f'{save_TFD}/{mdict["name"]}'  # here goes everything from the model
    if self.verb > 0:
        print(f' > NEModel name: {mdict["name"]}')
        print(f' > NEModel dir: {self.model_dir}')

    # build folder managed dna with empty dna, it gets dna FROM FOLDER
    self.__dna = ParaDict(
        dna_TFD=save_TFD,
        dna_SFD=mdict['name'],
        fn_pfx=NEMODEL_DNA_PFX,
        verb=self.verb)

    # set logfile
    if do_logfile:
        set_logger(
            log_folder=self.model_dir,
            custom_name=mdict['name'],
            verb=self.verb)

    # resolve model dict (dna) in proper order
    md = {}
    md.update(self_args_dict)  # 1 update with params dict from NEModel constructor
    md.update(fwdf_mdict)  # 2 update with defaults of fwd_func
    md.update(self.__dna)  # 3 update with params from folder
    md.update(mdict)  # 4 update with given mdict
    self.__dna.update(md)
    self.__dna.check_params_sim(SPEC_KEYS)  # safety check

    self.readonly = read_only
    if self.model_dir and not self.readonly:
        self.__dna.save()

    self.update(self.__dna)  # finally update self with all model building params

    devices = tf_devices(devices, verb=self.verb)

    # report devices
    if self.verb > 0:
        print()
        if len(devices) == 1:
            if 'CPU' in devices[0]:
                print(f'NEModel builds CPU device setup')
            else:
                print(f'NEModel builds single-GPU setup')
        else:
            print(f'NEModel builds multi-dev setup for {len(devices)} devices')

    if len(devices) < 3:
        sep_device = False  # SEP is available for 3 or more devices

    # build FWD graph(s) >> manage variables >> build OPT graph
    # NOTE(review): nesting of this method was reconstructed from a source with
    # lost indentation; in particular it is assumed that the session / saver /
    # summary-writer part at the end runs inside the graph context - confirm
    self.gFWD = []  # list of dicts of all FWD graphs (from all devices)
    self.graph = tf.Graph()
    with self.graph.as_default():

        tf.set_random_seed(self['seed'])  # set graph seed
        np.random.seed(self['seed'])
        if self.verb > 0:
            print(f'\nNEModel set TF & NP seed to {self["seed"]}')

        # builds graph @SEP, this graph wont be run, it is only needed to place variables, if not vars_sep >> variables will be placed with first tower
        if sep_device:
            if self.verb > 0:
                print(f'\nNEModel places {self["name"]} VARs on {devices[0]}...')
            with tf.device(devices[0]):
                fwd_func(**self)

        tower_devices = [] + devices  # copy of devices list
        if sep_device:
            tower_devices = tower_devices[1:]  # trim SEP
        for dev in tower_devices:
            if self.verb > 0:
                print(f'\nNEModel builds FWD graph of {self["name"]} model @device: {dev}')
            with tf.device(dev):
                # empty scope with AUTO_REUSE: following towers reuse variables of the first build
                with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                    self.gFWD.append(fwd_func(**self))

        self.update(self.gFWD[0])  # update self with dictionary returned by fwd_func

        # get FWD variables returned by fwd_func (4 saver)
        # every key with 'var' in its name is treated as a variable or list of variables
        train_vars = []  # variables to train
        saver_vars = {}  # dict of variables to save
        for key in self.keys():
            if 'var' in key.lower():
                if key == 'train_vars':
                    train_vars = self[key]
                    if type(train_vars) is not list:
                        train_vars = [train_vars]
                else:
                    if type(self[key]) is not list:
                        saver_vars[key] = [self[key]]
                    else:
                        saver_vars[key] = self[key]

        all_vars = tf.global_variables()

        # there are returned variables >> assert there are all variables returned in lists
        if saver_vars:
            all_vars_returned = []
            for key in saver_vars:
                all_vars_returned += saver_vars[key]
            there_are_all = True
            for var in all_vars:
                if var not in all_vars_returned:
                    print(f' *** variable {var.name} not returned by fwd_func')
                    there_are_all = False
            assert there_are_all, 'ERR: there are some variables not returned by fwd_func in lists!'
        else:
            saver_vars['fwd_vars'] = all_vars  # put all

        if self.verb > 0:
            print('\nNEModel variables to save from fwd_func:')
            for key in sorted(list(saver_vars.keys())):
                varList = saver_vars[key]
                if varList:
                    print(f' ### vars @{key} - num: {len(varList)}, floats: {short_scin(num_var_floats(varList))} ({varList[0].device})')
                else:
                    print(' ### no vars')
                if self.verb > 1:
                    log_vars(varList)

        # without loss there is nothing to optimize
        if 'loss' not in self:
            do_optimization = False
            if self.verb > 0:
                print('\nthere is no loss in FWD graph, OPT graph wont be build')

        if not do_optimization:
            if self.verb > 0:
                print('\nOPT graph wont be build')
        # build optimization graph
        else:
            if self.verb > 0:
                print(f'\nPreparing OPT part with {self["opt_class"]}')

            # select trainable variables for OPT
            all_tvars = tf.trainable_variables()
            if train_vars:
                # check if all train_vars are trainable (only reported, not enforced):
                for var in train_vars:
                    if var not in all_tvars:
                        if self.verb > 0:
                            print(f'variable {var.name} is not trainable but is in train_vars, please check the graph!')
            else:
                # no train_vars given >> take all trainable from saver_vars
                for key in saver_vars:
                    for var in saver_vars[key]:
                        if var in all_tvars:
                            train_vars.append(var)
                assert train_vars, 'ERR: there are no trainable variables at the graph!'

            # log train_vars
            if self.verb > 0:
                print('\nNEModel trainable variables:')
                print(f' ### train_vars: {len(train_vars)} floats: {short_scin(num_var_floats(train_vars))}')
                if self.verb > 1:
                    log_vars(train_vars)

            # build gradients for towers
            for ix in range(len(self.gFWD)):
                tower = self.gFWD[ix]
                # NOTE(review): flag is passed negated (colocate_gradients_with_ops=not collocate_GWO),
                # which follows the author's note below - confirm against tf.gradients docs
                tower['gradients'] = tf.gradients(
                    ys=tower['loss'],
                    xs=train_vars,
                    colocate_gradients_with_ops=not collocate_GWO)  # TF default is False >> calculates gradients where OPS, for True >> where train_vars

                # log gradients
                if self.verb > 0:
                    nGrad = len(tower['gradients'])

                    # None_as_gradient case - report device of first not-None gradient
                    device = 'UNKNOWN'
                    for t in tower['gradients']:
                        if t is not None:
                            device = t.device
                            break

                    print(f' > gradients for {ix} tower got {nGrad} tensors ({device})')
                    if self.verb > 1:
                        print('NEModel variables and their gradients:')
                        for gix in range(len(tower['gradients'])):
                            grad = tower['gradients'][gix]
                            var = train_vars[gix]
                            print(var, var.device)
                            print(f' > {grad}')  # grad as a tensor displays device when printed (unless colocated with OP!)

            self['gradients'] = self.gFWD[0]['gradients']

            # None @gradients check
            none_grads = 0
            for grad in self['gradients']:
                if grad is None:
                    none_grads += 1
            if none_grads and self.verb > 0:
                print(f'There are None gradients: {none_grads}/{len(self["gradients"])}, some trainVars may be unrelated to loss, please check the graph!')

            # average gradients
            if len(devices) > 1:

                if self.verb > 0:
                    print(f'\nNEModel builds gradients averaging graph with device {devices[0]} for {len(self.gFWD)} towers')
                with tf.device(devices[0]):

                    towerGrads = [tower['gradients'] for tower in self.gFWD]
                    avgGrads = []
                    # mGrads: gradients of one variable from all towers
                    for mGrads in zip(*towerGrads):
                        grads = []
                        for grad in mGrads:
                            if grad is not None:  # None for variables not used while training now...
                                expandedG = tf.expand_dims(input=grad, axis=-1)
                                grads.append(expandedG)
                        if grads:
                            # stack on a new last axis and average over it
                            grad = tf.concat(values=grads, axis=-1)
                            grad = tf.reduce_mean(input_tensor=grad, axis=-1)
                            avgGrads.append(grad)
                        else:
                            avgGrads.append(None)

                    self['gradients'] = avgGrads  # update with averaged gradients
                    # NOTE(review): the print below reads .device of the first gradient,
                    # which would fail if that gradient is None
                    if self.verb > 0:
                        print(f' > NEModel averaged gradients ({self["gradients"][0].device})')

            # build OPT graph
            with tf.variable_scope('OPT', reuse=tf.AUTO_REUSE):

                if self.verb > 0:
                    print(f'\nBuilding OPT graph for {self["name"]} model @device: {devices[0]}')
                with tf.device(devices[0]):

                    self['g_step'] = tf.get_variable(  # global step
                        name='g_step',
                        shape=[],
                        trainable=False,
                        initializer=tf.constant_initializer(0),
                        dtype=tf.int32)

                    self['iLR_var'] = tf.get_variable(  # base LR variable
                        name='iLR',
                        shape=[],
                        trainable=False,
                        initializer=tf.constant_initializer(self['iLR']),
                        dtype=tf.float32)

                    self['scaled_LR'] = lr_scaler(
                        iLR=self['iLR_var'],
                        g_step=self['g_step'],
                        warm_up=self['warm_up'],
                        ann_base=self['ann_base'],
                        ann_step=self['ann_step'],
                        n_wup_off=self['n_wup_off'],
                        verb=self.verb)['scaled_LR']

                    # updates with: optimizer, gg_norm, avt_gg_norm
                    self.update(
                        gc_loss_reductor(
                            optimizer=self['opt_class'](learning_rate=self['scaled_LR']),
                            vars=train_vars,
                            g_step=self['g_step'],
                            gradients=self['gradients'],
                            avt_SVal=self['avt_SVal'],
                            avt_window=self['avt_window'],
                            avt_max_upd=self['avt_max_upd'],
                            do_clip=self['do_clip'],
                            verb=self.verb))

                    # select OPT vars (all global variables of the current ('OPT') scope)
                    saver_vars['opt_vars'] = tf.global_variables(scope=tf.get_variable_scope().name)
                    if self.verb > 0:
                        print(f' ### opt_vars: {len(saver_vars["opt_vars"])} floats: {short_scin(num_var_floats(saver_vars["opt_vars"]))} ({saver_vars["opt_vars"][0].device})')
                        if self.verb > 1:
                            log_vars(saver_vars['opt_vars'])

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(
            graph=self.graph,
            config=config)

        # remove keys with no variables (corner case, for proper saver)
        sKeys = list(saver_vars.keys())
        for key in sKeys:
            if not saver_vars[key]:
                saver_vars.pop(key)

        # TODO: saver_vars, savers_names, load_saver - need a little refactor!!!
        # add saver and load
        self.__saver = MultiSaver(
            model_name=self['name'],
            vars=saver_vars,
            save_TFD=save_TFD,
            savers=savers_names,
            session=self.session,
            verb=self.verb)
        if load_saver:
            # True means "default saver", passed to MultiSaver.load as None
            if type(load_saver) is bool:
                load_saver = None
            self.__saver.load(saver=load_saver)
            # NOTE(review): assumed to belong to the `if load_saver` branch - confirm
            self.update_LR(self['iLR'])  # safety update of iLR

        # no summary writer for a read-only model
        self.__summ_writer = tf.summary.FileWriter(
            logdir=self.model_dir,
            #graph= self.graph, # you can call add_graph() later
            flush_secs=10) if not self.readonly else None

    if self.verb > 0:
        print(f'{self["name"]} (NEModel) build finished!')
    if self.verb > 2:
        print(self)