def qnn_graph(
        name=               'qnn',
        num_actions: int=   4,
        num_states: int=    16,
        state_emb_width=    4,
        hidden_layers_size= (12,),
        gamma=              0.9,
        seed=               121,
        **kwargs):

    with tf.variable_scope(name):

        qv_target_PH = tf.placeholder(      # qv next state placeholder
            shape=  [None, num_actions],
            dtype=  tf.float32)
        reward_PH = tf.placeholder(         # reward
            shape=  [None],
            dtype=  tf.float32)
        state_PH = tf.placeholder(          # state
            shape=  [None],
            dtype=  tf.int32)
        enum_actions_PH = tf.placeholder(   # enumerated action indexes (0,1),(1,3),(2,0),..
            shape=  [None, 2],
            dtype=  tf.int32)

        state_emb = tf.get_variable(
            name=   'state_emb',
            shape=  [num_states, state_emb_width],
            dtype=  tf.float32)

        input = tf.nn.embedding_lookup(state_emb, state_PH)
        print('input:', input)

        for l in hidden_layers_size:
            input = lay_dense(
                input=      input,
                units=      l,
                activation= tf.nn.relu,
                seed=       seed)

        output = lay_dense(                 # QV for all actions (for given input(state))
            input=      input,
            units=      num_actions,
            activation= None,
            seed=       seed)

        pred_qv = tf.gather_nd(output, indices=enum_actions_PH)
        gold_qv = reward_PH + gamma * tf.reduce_max(qv_target_PH, axis=-1)  # gold is predicted by same network

        loss = tf.losses.mean_squared_error(labels=gold_qv, predictions=pred_qv)  # loss on predicted vs next, we want predicted to match next
        loss = tf.reduce_mean(loss)

    return {
        'qv_target_PH':     qv_target_PH,
        'reward_PH':        reward_PH,
        'state_PH':         state_PH,
        'enum_actions_PH':  enum_actions_PH,
        'output':           output,
        'loss':             loss}
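# --- usage sketch (illustrative, not part of the module) ---------------------
# A minimal TF1-style training step for qnn_graph(); the optimizer choice and
# the toy batch below are assumptions made for this example only. The target
# QV is bootstrapped by running 'output' for the next states and feeding the
# result back through qv_target_PH.
def _example_qnn_usage():
    import numpy as np

    g = qnn_graph(num_actions=4, num_states=16)
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(g['loss'])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # one toy batch of transitions (s, a, r, s')
        states      = np.array([3, 7])
        actions     = np.array([1, 2])
        rewards     = np.array([0.0, 1.0])
        next_states = np.array([7, 15])

        # bootstrap: QV of next states comes from the same network
        qv_next = sess.run(g['output'], feed_dict={g['state_PH']: next_states})

        _, loss = sess.run([train_op, g['loss']], feed_dict={
            g['state_PH']:          states,
            g['enum_actions_PH']:   np.stack([np.arange(len(actions)), actions], axis=-1),
            g['reward_PH']:         rewards,
            g['qv_target_PH']:      qv_next})
        return loss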
def decN(
        input,
        dictW,
        predN=          1,          # N samples for every feature
        name=           'decN',
        hLays=          None,       # tuple or list of ints
        hActiv=         tf.nn.relu,
        initializer=    None,
        seed=           12321,
        verbLev=        0):

    if verbLev > 0: print('\nBuilding decoderN ...')
    if verbLev > 1: print('decoder input:', input)

    if initializer is None: initializer = my_initializer(seed)

    with tf.variable_scope(name):

        # hidden layers
        if hLays:
            for nLay in range(len(hLays)):
                laySize = hLays[nLay]
                input = lay_dense(
                    input=          input,
                    units=          laySize,
                    activation=     hActiv,
                    use_bias=       True,
                    initializer=    initializer,
                    seed=           seed,
                    name=           'decoderN_Hlay_%s' % nLay)

        # projection to predN x dictW
        logits = lay_dense(
            input=          input,
            units=          predN * dictW,
            activation=     None,
            use_bias=       True,
            initializer=    initializer,
            seed=           seed,
            name=           'decoderNProjection')
        if verbLev > 1: print(' > projection to logits (%dx dictW):' % predN, logits)

        if predN > 1:
            logits = tf.reshape(logits, [tf.shape(logits)[0], -1, dictW])
            if verbLev > 1: print(' > reshaped logits (B,%dxS,dictW):' % predN, logits)

        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        if verbLev > 1: print(' > predictions:', predictions)

    return logits, predictions
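# --- shape sketch (illustrative) ---------------------------------------------
# For predN > 1 the projection packs predN distributions per position and the
# reshape above unfolds them along the time axis; the numbers are arbitrary:
def _example_decN_shapes():
    import numpy as np
    B, S, predN, dictW = 2, 5, 3, 100
    logits = np.zeros((B, S, predN * dictW))    # after 'decoderNProjection'
    logits = logits.reshape(B, -1, dictW)       # (B, S*predN, dictW)
    assert logits.shape == (2, 15, 100)         # predN samples for every position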
def mh_attn(
        in_seq,                 # input sequence [batch, seq, feats]
        query=          None,   # None for self-attention, otherwise TAT [batch, n_queries, feats]
        activation=     None,   # activation of KQV dense
        dropout_att=    0.0,
        drop_flag=      None,
        seed=           seed):  # note: the default seed, initializer, split_heads, merge_heads and attn are closed over from the enclosing scope

    # input projection of in_seq for KQV or KV (if query)
    width = in_seq.shape[-1].value
    proj_size = 3 if query is None else 2
    c = lay_dense(
        input=          in_seq,             # [batch, seq, feats]
        units=          width * proj_size,
        name=           'mhProj',
        activation=     activation,
        initializer=    initializer,
        seed=           seed)
    ins_split = tf.split(c, proj_size, axis=-1)  # split projected

    if query is not None:
        q = query   # projection for Q is not needed (at least with 1 head)
        k, v = ins_split
    else:
        q, k, v = ins_split
    q, k, v = map(split_heads, [q, k, v])

    # attention
    att_out = attn(q, k, v, dropout_att, drop_flag, seed)
    a = att_out['attention']
    a = merge_heads(a)

    return {
        'attention':    a,
        'att_vals':     att_out['att_weights']}
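# --- assumed helpers (sketch) -------------------------------------------------
# split_heads / merge_heads come from the enclosing scope and are not shown in
# this file; a common single-argument formulation consistent with the map()
# call above (an assumption, not the repo's exact code) could be:
def _split_heads(x, n_heads=1):
    # [batch, seq, feats] -> [batch, heads, seq, feats/heads]
    b, s = tf.shape(x)[0], tf.shape(x)[1]
    f = x.shape[-1].value
    x = tf.reshape(x, [b, s, n_heads, f // n_heads])
    return tf.transpose(x, [0, 2, 1, 3])

def _merge_heads(x):
    # [batch, heads, seq, feats/heads] -> [batch, seq, feats]
    x = tf.transpose(x, [0, 2, 1, 3])
    b, s = tf.shape(x)[0], tf.shape(x)[1]
    f = x.shape[-2].value * x.shape[-1].value
    return tf.reshape(x, [b, s, f])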
def pgnn_graph(
        name=           'pgnn',
        state_size=     4,
        num_actions=    2,
        hidden_layers=  (20,),
        seed=           121,
        **kwargs):

    with tf.variable_scope(name):

        states_PH = tf.placeholder(     # environment state representation (prepared by PolicyGradientsEnvironment.encode_state())
            shape=  (None, state_size),
            dtype=  tf.float32,
            name=   'input_states')
        acc_rew_PH = tf.placeholder(    # accumulated rewards
            shape=  None,
            dtype=  tf.float32,
            name=   'accumulated_rewards')
        actions_PH = tf.placeholder(    # actions taken
            shape=  None,
            dtype=  tf.int32,
            name=   'actions')

        layer = states_PH
        for i in range(len(hidden_layers)):
            layer = lay_dense(
                input=      layer,
                name=       f'hidden_layer_{i + 1}',
                units=      hidden_layers[i],
                activation= tf.nn.relu,
                seed=       seed)
        logits = lay_dense(
            input=      layer,
            name=       'logits',
            units=      num_actions,
            activation= None,
            seed=       seed)
        action_prob = tf.nn.softmax(logits)

        # cross-entropy here equals -log(pi(action|state))
        log_policy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits= logits,
            labels= actions_PH)
        loss = tf.reduce_mean(acc_rew_PH * log_policy)

    return {
        'states_PH':    states_PH,
        'acc_rew_PH':   acc_rew_PH,
        'actions_PH':   actions_PH,
        'action_prob':  action_prob,
        'loss':         loss}
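# --- note + helper (illustrative) ---------------------------------------------
# Since sparse_softmax_cross_entropy_with_logits returns -log(pi(a|s)), the
# loss above is mean(G_t * -log pi(a_t|s_t)): minimizing it ascends expected
# return (REINFORCE). The values fed to acc_rew_PH are typically discounted
# returns, e.g. (a helper sketch, not taken from this module):
def _discounted_returns(rewards, gamma=0.99):
    g, out = 0.0, []
    for r in reversed(rewards):
        g = r + gamma * g
        out.append(g)
    return list(reversed(out))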
def cards_enc(
        train_flag,                     # train flag (bool tensor)
        c_ids,                          # seven cards (ids tensor)
        tat_case: bool=     False,      # task attention transformer architecture
        emb_width: int=     24,         # cards embedding width
        t_drop: float=      0,
        f_drop: float=      0,
        in_proj: int=       None,
        n_layers: int=      8,
        dense_mul: int=     4,          # transformer dense multiplication
        activation=         tf.nn.relu,
        dropout: float=     0,          # transformer dropout
        seed=               12321,
        verb=               0):

    if verb > 0: print('\nBuilding card encoder...')

    with tf.variable_scope('cards_enc'):

        zsL = []
        hist_summ = []

        c_emb = tf.get_variable(        # cards embeddings
            name=           'c_emb',
            shape=          [53, emb_width],    # one extra id (52) for 'no card'
            dtype=          tf.float32,
            initializer=    my_initializer(seed=seed))
        hist_summ += [tf.summary.histogram('1.c_emb', c_emb, family='c_emb')]

        c_emb_look = tf.nn.embedding_lookup(params=c_emb, ids=c_ids)
        if verb > 1: print(' > 1.c_emb_look:', c_emb_look)

        myc_emb = tf.get_variable(      # my cards embeddings (marks 2 private cards vs 5 table cards)
            name=           'myc_emb',
            shape=          [2, c_emb.shape[-1]],
            dtype=          tf.float32,
            initializer=    my_initializer(seed=seed))
        myc_emb_look = tf.nn.embedding_lookup(params=myc_emb, ids=[0, 0, 1, 1, 1, 1, 1])
        if verb > 1: print(' > myc_emb_look:', myc_emb_look)

        input = c_emb_look + myc_emb_look

        if t_drop or f_drop:
            input = tf_drop(
                input=      input,
                time_drop=  t_drop,
                feat_drop=  f_drop,
                train_flag= train_flag,
                seed=       seed)

        # input projection (without activation)
        if in_proj:
            input = lay_dense(
                input=      input,
                units=      in_proj,
                name=       'c_proj',
                reuse=      tf.AUTO_REUSE,
                use_bias=   False,
                seed=       seed)
            if verb > 1: print(' > input projected:', input)
        elif verb > 1: print(' > input:', input)

        enc_out = enc_TNS(
            in_seq=         input,
            name=           'TAT' if tat_case else 'TNS',
            seq_out=        not tat_case,
            add_PE=         False,
            n_blocks=       n_layers,
            n_heads=        1,
            dense_mul=      dense_mul,
            activation=     activation,
            max_seq_len=    7,
            dropout=        dropout,
            dropout_att=    0,
            drop_flag=      train_flag,
            seed=           seed,
            n_hist=         3,
            verb=           verb)
        output = enc_out['output']
        zsL += enc_out['zeroes']
        hist_summ += enc_out['hist_summ']

        if not tat_case:
            output = tf.unstack(output, axis=-2)
            output = tf.concat(output, axis=-1)
            if verb > 1: print(' > encT reshaped output:', output)
        elif verb > 1: print(' > encT output:', output)

        enc_vars = tf.global_variables(scope=tf.get_variable_scope().name)

    return {
        'output':       output,
        'enc_vars':     enc_vars,
        'hist_summ':    hist_summ,
        'zeroes':       zsL}
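# --- usage sketch (illustrative) ----------------------------------------------
# Encoding a batch of 7-card hands (ids 0..51, 52 for 'no card'); placeholder
# names below are local to this example:
def _example_cards_enc_usage():
    c_PH = tf.placeholder(dtype=tf.int32, shape=[None, 7])
    train_flag = tf.placeholder_with_default(False, shape=[])
    enc = cards_enc(train_flag=train_flag, c_ids=c_PH)
    # with tat_case False the 7 positions are unstacked and concatenated, so
    # enc['output'] is [batch, 7*emb_width] (when no projection changes width)
    return enc['output']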
def card_net(
        name=               'card_net',
        tat_case: bool=     False,
        emb_width: int=     24,
        t_drop: float=      0,
        f_drop: float=      0,
        in_proj: int=       None,   # None, 0 or int
        activation=         tf.nn.relu,
        # TRNS
        n_layers: int=      8,
        dense_mul=          4,
        dropout=            0,      # dropout of encoder transformer
        # DRT & classif
        dense_proj=         None,   # None, 0 or int
        dr_layers=          2,      # None, 0 or int
        dr_scale=           6,
        dropout_DR=         0,      # DR dropout
        # train parameters
        opt_class=          partial(tf.compat.v1.train.AdamOptimizer, beta1=0.7, beta2=0.7),
        iLR=                1e-3,
        warm_up=            10000,
        ann_base=           0.999,
        ann_step=           0.04,
        n_wup_off=          1,
        avt_SVal=           0.1,
        avt_window=         500,
        avt_max_upd=        1.5,
        do_clip=            False,
        seed=               12321,
        verb=               0):

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        zsL = []
        hist_summ = []

        train_PH = tf.placeholder_with_default(  # train placeholder
            input=  False,
            name=   'train_PH',
            shape=  [])
        inA_PH = tf.placeholder(    # 7 cards of A
            name=   'inA_PH',
            dtype=  tf.int32,
            shape=  [None, 7])      # [bsz,7cards]
        inB_PH = tf.placeholder(    # 7 cards of B
            name=   'inB_PH',
            dtype=  tf.int32,
            shape=  [None, 7])      # [bsz,7cards]
        won_PH = tf.placeholder(    # winner class labels: 0 - A wins, 1 - B wins, 2 - draw
            name=   'won_PH',
            dtype=  tf.int32,
            shape=  [None])         # [bsz]
        rnkA_PH = tf.placeholder(   # rank A class (labels <0,8>)
            name=   'rnkA_PH',
            dtype=  tf.int32,
            shape=  [None])         # [bsz]
        rnkB_PH = tf.placeholder(   # rank B class (labels <0,8>)
            name=   'rnkB_PH',
            dtype=  tf.int32,
            shape=  [None])         # [bsz]
        mcA_PH = tf.placeholder(    # chances of winning for A (Monte Carlo)
            name=   'mcA_PH',
            dtype=  tf.float32,
            shape=  [None])         # [bsz]

        # cards encoders for A and B
        enc_outL = []
        for cPH in [inA_PH, inB_PH]:
            enc_outL.append(cards_enc(
                c_ids=      cPH,
                emb_width=  emb_width,
                train_flag= train_PH,
                tat_case=   tat_case,
                t_drop=     t_drop,
                f_drop=     f_drop,
                in_proj=    in_proj,
                dense_mul=  dense_mul,
                activation= activation,
                dropout=    dropout,
                n_layers=   n_layers,
                seed=       seed,
                verb=       verb))
        enc_vars = enc_outL[0]['enc_vars']      # encoder variables (with cards embeddings), shared by A and B
        zsL += enc_outL[0]['zeroes']            # get nn_zeroes from A
        hist_summ += enc_outL[0]['hist_summ']   # get histograms from A

        # where all cards of A are known (card id 52 stands for 'no card')
        where_all_ca = tf.reduce_max(inA_PH, axis=-1)
        where_all_ca = tf.where(
            condition=  where_all_ca < 52,
            x=          tf.ones_like(where_all_ca),
            y=          tf.zeros_like(where_all_ca))
        if verb > 1: print('\n > where_all_ca', where_all_ca)
        where_all_caF = tf.cast(where_all_ca, dtype=tf.float32)  # cast to float

        # rank A classifier
        logits_RA = lay_dense(
            input=      enc_outL[0]['output'],
            units=      9,
            name=       'dense_RC',
            reuse=      tf.AUTO_REUSE,
            use_bias=   False,
            seed=       seed)
        loss_RA = tf.nn.sparse_softmax_cross_entropy_with_logits(  # loss rank A
            labels= rnkA_PH,
            logits= logits_RA)
        loss_RA = tf.reduce_mean(loss_RA * where_all_caF)  # loss_RA masked (where all cards @A)

        # rank B classifier
        logits_RB = lay_dense(
            input=      enc_outL[1]['output'],
            units=      9,
            name=       'dense_RC',
            reuse=      tf.AUTO_REUSE,
            use_bias=   False,
            seed=       seed)
        loss_RB = tf.nn.sparse_softmax_cross_entropy_with_logits(  # loss rank B
            labels= rnkB_PH,
            logits= logits_RB)
        loss_RB = tf.reduce_mean(loss_RB)

        loss_R = loss_RA + loss_RB
        if verb > 1: print(' > loss_R:', loss_R)

        # winner classifier (on concatenated representations)
        out_conc = tf.concat([enc_outL[0]['output'], enc_outL[1]['output']], axis=-1)
        if verb > 1: print(' > out_conc:', out_conc)
        if dr_layers:
            enc_out = enc_DRT(
                input=          out_conc,
                name=           'drt_W',
                lay_width=      dense_proj,
                n_layers=       dr_layers,
                dns_scale=      dr_scale,
                activation=     activation,
                dropout=        dropout_DR,
                training_flag=  train_PH,
                n_hist=         0,
                seed=           seed,
                verb=           verb)
            out_conc = enc_out['output']
            zsL += enc_out['zeroes']
            hist_summ += enc_out['hist_summ']

        logits_W = lay_dense(   # projection to 3 winner logits
            input=      out_conc,
            units=      3,
            name=       'dense_W',
            reuse=      tf.AUTO_REUSE,
            use_bias=   False,
            seed=       seed)
        if verb > 1: print(' > logits_W:', logits_W)
        loss_W = tf.nn.sparse_softmax_cross_entropy_with_logits(  # loss of won
            labels= won_PH,
            logits= logits_W)
        loss_W = tf.reduce_mean(loss_W * where_all_caF)  # loss of winner classifier, masked
        if verb > 1: print(' > loss_W:', loss_W)

        # probability of A winning - regressor
        a_WP = lay_dense(
            input=      enc_outL[0]['output'],
            units=      1,
            name=       'dense_WP',
            reuse=      tf.AUTO_REUSE,
            activation= activation,
            use_bias=   False,
            seed=       seed)
        a_WP = tf.reshape(a_WP, shape=[-1])
        if verb > 1: print(' > player a win probability:', a_WP)
        loss_AWP = tf.losses.mean_squared_error(labels=mcA_PH, predictions=a_WP)
        if verb > 1: print(' > loss_AWP:', loss_AWP)

        diff_AWP = tf.sqrt(tf.square(mcA_PH - a_WP))  # == |mcA_PH - a_WP|
        diff_AWP_mn = tf.reduce_mean(diff_AWP)
        diff_AWP_mx = tf.reduce_max(diff_AWP)

        loss = loss_W + loss_R + loss_AWP  # this is how total loss is constructed

        # accuracy of winner classifier (where all cards)
        predictions_W = tf.argmax(logits_W, axis=-1, output_type=tf.int32)
        if verb > 1: print(' > predictions_W:', predictions_W)
        correct_W = tf.equal(predictions_W, won_PH)
        if verb > 1: print(' > correct_W:', correct_W)
        correct_WF = tf.cast(correct_W, dtype=tf.float32)
        correct_WF_where = correct_WF * where_all_caF
        acc_W = tf.reduce_sum(correct_WF_where) / tf.reduce_sum(where_all_caF)
        if verb > 1: print(' > acc_W:', acc_W)

        # accuracy of winner classifier per class (where all cards)
        oh_won = tf.one_hot(indices=won_PH, depth=3)                    # OH [batch,3], 1 where wins, dtype tf.float32
        oh_won_where = oh_won * tf.stack([where_all_caF] * 3, axis=1)   # masked where all cards
        won_density = tf.reduce_mean(oh_won_where, axis=0)              # [3] measures density of 1 @batch per class
        oh_correct = tf.where(
            condition=  correct_W,
            x=          oh_won_where,
            y=          tf.zeros_like(oh_won))                          # [batch,3]
        won_corr_density = tf.reduce_mean(oh_correct, axis=0)
        acc_WC = won_corr_density / won_density

        oh_notcorrect_W = tf.where(
            condition=  tf.logical_not(correct_W),
            x=          oh_won,
            y=          tf.zeros_like(oh_won))                          # OH wins where not correct
        oh_notcorrect_W *= tf.stack([where_all_caF] * 3, axis=1)        # masked with all cards

        # accuracy of rank (B)
        predictions_R = tf.argmax(logits_RB, axis=-1, output_type=tf.int32)
        correct_R = tf.equal(predictions_R, rnkB_PH)
        acc_R = tf.reduce_mean(tf.cast(correct_R, dtype=tf.float32))
        if verb > 1: print(' > acc_R:', acc_R)

        # accuracy of rank (B) per class
        oh_rnkB = tf.one_hot(indices=rnkB_PH, depth=9)
        rnkB_density = tf.reduce_mean(oh_rnkB, axis=0)
        oh_correct_R = tf.where(condition=correct_R, x=oh_rnkB, y=tf.zeros_like(oh_rnkB))
        rnkB_corr_density = tf.reduce_mean(oh_correct_R, axis=0)
        acc_RC = rnkB_corr_density / rnkB_density

        oh_notcorrect_R = tf.where(
            condition=  tf.logical_not(correct_R),
            x=          oh_rnkB,
            y=          tf.zeros_like(oh_rnkB))     # OH ranks where not correct

        cls_vars = tf.global_variables(scope=tf.get_variable_scope().name)
        cls_vars = [var for var in cls_vars if var not in enc_vars]

    return {
        'train_PH':         train_PH,
        'inA_PH':           inA_PH,
        'inB_PH':           inB_PH,
        'won_PH':           won_PH,
        'rnkA_PH':          rnkA_PH,
        'rnkB_PH':          rnkB_PH,
        'mcA_PH':           mcA_PH,
        'loss':             loss,           # total training loss (sum)
        'loss_W':           loss_W,         # loss of winner classifier
        'loss_R':           loss_R,         # loss of rank classifier
        'loss_AWP':         loss_AWP,       # loss of A win-probability regressor
        'diff_AWP_mn':      diff_AWP_mn,    # mean diff of A win probability
        'diff_AWP_mx':      diff_AWP_mx,    # max diff of A win probability
        'acc_W':            acc_W,
        'acc_WC':           acc_WC,
        'predictions_W':    predictions_W,
        'oh_notcorrect_W':  oh_notcorrect_W,
        'acc_R':            acc_R,
        'acc_RC':           acc_RC,
        'predictions_R':    predictions_R,
        'oh_notcorrect_R':  oh_notcorrect_R,
        'hist_summ':        tf.summary.merge(hist_summ),
        'zeroes':           tf.concat(zsL, axis=-1),
        'enc_vars':         enc_vars,
        'cls_vars':         cls_vars}
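# --- mask logic sketch (illustrative) -----------------------------------------
# card id 52 stands for 'no card', so a hand whose max id is < 52 is complete;
# where_all_caF turns this into a per-sample 0/1 loss mask, e.g.:
#
#   inA = np.array([[ 3, 17, 22, 40, 51,  8,  9],    # all 7 known  -> 1.0
#                   [ 3, 17, 22, 52, 52, 52, 52]])   # incomplete   -> 0.0
#   where_all_caF = (inA.max(axis=-1) < 52).astype(np.float32)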
def enc_DRT(
        input,
        name=               'enc_DRT',
        shared_lays: bool=  False,      # shared variables in enc_layers
        n_layers=           12,
        lay_width: int=     None,       # for None matches input width
        dns_scale=          6,          # scale (*) of first dense
        activation=         tf.nn.relu, # gelu is really worth a try
        dropout=            0.0,        # dropout after two denses
        training_flag=      None,       # training flag tensor (for dropout)
        initializer=        None,
        seed=               12321,
        n_hist=             4,          # number of histogram layers (for TB)
        verb=               0):

    lay_width_matched = ''
    if lay_width is None:
        lay_width = input.shape.as_list()[-1]
        lay_width_matched = '(lay_width taken from input width)'
    if verb > 0:
        drp = 0.0 if not dropout else dropout
        print(f'\nBuilding DRTencoder ({n_layers}x{lay_width} drop:{drp:.2f}) {lay_width_matched}...')

    if initializer is None: initializer = my_initializer(seed)

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(' > histogram layers of DRTencoder:', hist_layers)

    zsL = []  # zeroes list
    with tf.variable_scope(name):

        # input projection
        iW = input.shape[-1]
        if iW != lay_width:
            input = lay_dense(
                input=          input,
                units=          lay_width,
                use_bias=       False,
                initializer=    initializer,
                seed=           seed)
            if verb > 0: print('projected input to lay_width(%d) since it differs(%d)' % (lay_width, iW))

        input = tf.keras.layers.LayerNormalization(axis=-1)(input)  # input layer_norm

        output = input  # for 0 layers case
        for nL in range(n_layers):
            lay_name = f'DRLay_{nL}' if not shared_lays else 'DRLay_shared'
            lay_out = lay_DRT(
                input=          output,
                name=           lay_name,
                hist_name=      name,
                dns_scale=      dns_scale,
                activation=     activation,
                dropout=        dropout,
                training_flag=  training_flag,
                initializer=    initializer,
                seed=           seed)
            output = lay_out['output']
            if nL in hist_layers: hist_summ.append(lay_out['hist_summ'])
            zsL += lay_out['zeroes']

    return {
        'output':       output,
        'hist_summ':    hist_summ,
        'zeroes':       zsL}
def tblock(in_seq, seed, task_query=None):
    # nested helper of the enclosing transformer encoder: name, do_LN, dropout,
    # dropout_att, dense_mul, activation, initializer and training_flag are
    # closed over from the enclosing scope

    hist_summ = []

    output = in_seq
    taskQueryNorm = None
    if task_query is None:
        hist_summ.append(tf.summary.histogram('a_inputSeq', output, family=name))
        # layer norm 1 on seq
        if do_LN:
            output = tf.keras.layers.LayerNormalization(axis=-1)(output)
            hist_summ.append(tf.summary.histogram('b_inputSeqLN', output, family=name))
    else:
        hist_summ.append(tf.summary.histogram('a_inTaskQuery', task_query, family=name))
        taskQueryNorm = task_query
        # layer norm 1 on taskQuery
        if do_LN:
            taskQueryNorm = tf.keras.layers.LayerNormalization(axis=-1)(task_query)
            hist_summ.append(tf.summary.histogram('b_taskQueryLN', taskQueryNorm, family=name))

    # multi-head self-attention
    mha_out = mh_attn(
        in_seq=         output,
        query=          taskQueryNorm,
        dropout_att=    dropout_att,
        drop_flag=      training_flag,
        seed=           seed)
    output = mha_out['attention']
    att_vals = mha_out['att_vals']
    hist_summ.append(tf.summary.histogram('c_mhAttn', output, family=name))

    # dense without activation
    output = lay_dense(
        input=          output,
        units=          output.shape[-1].value,
        name=           'afterAttProj',
        initializer=    initializer,
        seed=           seed)
    hist_summ.append(tf.summary.histogram('d_denseAftAtt', output, family=name))

    if dropout:
        output = tf.layers.dropout(
            inputs=     output,
            rate=       dropout,
            training=   training_flag,
            seed=       seed)

    # residual 1
    if task_query is None:
        res1_out = in_seq + output
        hist_summ.append(tf.summary.histogram('e_res_onInputSeq', res1_out, family=name))
    else:
        res1_out = task_query + output
        hist_summ.append(tf.summary.histogram('e_res_onTaskQuery', res1_out, family=name))
    output = res1_out

    # layer norm 2
    if do_LN:
        output = tf.keras.layers.LayerNormalization(axis=-1)(output)
        hist_summ.append(tf.summary.histogram('f_layNorm', output, family=name))

    # 2x dense
    base_width = output.shape[-1].value
    output = lay_dense(
        input=          output,
        units=          int(base_width * dense_mul),
        name=           'dense1afterAtt',
        activation=     activation,
        initializer=    initializer,
        seed=           seed)
    zsL = [zeroes(output)]
    hist_summ.append(tf.summary.histogram('g_1denseOut', output, family=name))
    output = lay_dense(
        input=          output,
        units=          base_width,
        name=           'dense2afterAtt',
        initializer=    initializer,
        seed=           seed)
    hist_summ.append(tf.summary.histogram('h_2denseOut', output, family=name))

    if dropout:
        output = tf.layers.dropout(
            inputs=     output,
            rate=       dropout,
            training=   training_flag,
            seed=       seed)

    # residual 2
    output += res1_out
    hist_summ.append(tf.summary.histogram('i_res', output, family=name))

    return {
        'output':       output,
        'hist_summ':    hist_summ,
        'att_vals':     att_vals,
        'zeroes':       zsL}
def lay_DRT(
        input,
        name=           'lay_DRT',  # scope name, be careful when stacked since auto_reuse
        hist_name=      None,       # family name of histogram
        dns_scale=      4,
        activation=     tf.nn.relu, # gelu is really worth a try
        dropout=        None,       # dropout (after two denses)
        training_flag=  None,       # training flag tensor (for dropout)
        initializer=    None,
        seed=           12321):

    if not hist_name: hist_name = name
    lay_width = input.shape[-1]
    if initializer is None: initializer = my_initializer(seed)
    hist_summ = []

    with tf.variable_scope(name_or_scope=name, reuse=tf.AUTO_REUSE):

        hist_summ.append(tf.summary.histogram('a_denseSin', input, family=hist_name))

        # dense (scale up)
        output = lay_dense(
            input=          input,
            units=          int(lay_width * dns_scale),
            activation=     None,
            use_bias=       True,
            initializer=    initializer,
            seed=           seed,
            name=           'denseS')
        hist_summ.append(tf.summary.histogram('b_denseSout', output, family=hist_name))

        # activation
        output = activation(output)
        zsL = [zeroes(output)]  # zeroes list
        hist_summ.append(tf.summary.histogram('c_activation', output, family=hist_name))

        # dense (scale down), no activation
        output = lay_dense(
            input=          output,
            units=          lay_width,
            name=           'DRTdenseNA',
            use_bias=       True,
            initializer=    initializer,
            seed=           seed)
        hist_summ.append(tf.summary.histogram('d_denseNAout', output, family=hist_name))

        # layer dropout
        if dropout:
            output = tf.layers.dropout(
                inputs=     output,
                rate=       dropout,
                training=   training_flag,
                seed=       seed)

        # residual
        output = lay_res(input, output)
        hist_summ.append(tf.summary.histogram('e_residual', output, family=hist_name))

        # layer_norm
        output = tf.keras.layers.LayerNormalization(axis=-1)(output)
        hist_summ.append(tf.summary.histogram('f_LAYout', output, family=hist_name))

    return {
        'output':       output,
        'hist_summ':    hist_summ,
        'zeroes':       zsL}
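# --- assumed helper (sketch) ----------------------------------------------------
# zeroes() is defined elsewhere in the repo; its outputs are concatenated along
# the last axis, which suggests a flat per-activation tensor. A plausible
# minimal version (an assumption, not the repo's code) that flags zero
# activations after ReLU (dead-neuron monitoring):
def _zeroes(activation: tf.Tensor) -> tf.Tensor:
    flat = tf.reshape(activation, [-1])
    return tf.cast(tf.equal(flat, 0.0), dtype=tf.float32)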
def enc_CNN(
        input: tf.Tensor,
        history: tf.Tensor=             None,   # optional history (state) tensor with shape [bsz, n_layers, kernel-1, n_filters] >> masked cnn
        name=                           'enc_CNN',
        # layer params
        shared_lays: bool=              False,  # shared variables in enc_layers
        n_layers: int=                  12,     # num of layers
        kernel: int=                    3,      # layer kernel
        n_filters: int=                 128,    # num of filters
        activation=                     tf.nn.relu,  # global enc activation func, gelu is really worth a try
        lay_drop: float or None=        0.0,
        ldrt_scale: int or None=        0,      # DRT @enc_lay - scale (*) of first dense; for None or 0 DRT @lay won't be built
        ldrt_drop: float or None=       0.0,    # DRT @enc_lay - dropout
        # other
        training_flag: tf.Tensor or bool=   None,   # dropout training flag tensor
        initializer=                    None,
        seed: int=                      12321,
        n_hist: int=                    4,      # number of histogram layers
        verb=                           0):

    if verb > 0: print(f'\n *** enc_CNN *** Building {name} ({n_layers}x{n_filters})...')

    if initializer is None: initializer = my_initializer(seed)

    # manage history
    history_lays = None
    if history is not None:
        history_lays = tf.unstack(history, axis=-3)
        if verb > 1: print(f' > state_lays len {len(history_lays)} of: {history_lays[0]}')

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(f' > histogram layers of cnn encoder: {hist_layers}')

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        input_lays = []  # here we will store inputs of the following layers to extract the state (history)
        zsL = []         # zeroes

        # input projection - to match n_filters and input width
        if verb > 1: print(f' > encoder input: {input}')
        if input.shape[-1] != n_filters:
            input = lay_dense(
                input=          input,
                units=          n_filters,
                name=           'enc_input_projection',
                initializer=    initializer)
            if verb > 1: print(f' > encoder projected input: {input}')

        output = input      # for 0 layers case
        sub_output = input  # first input
        for depth in range(n_layers):

            lay_name = f'enc_CNN_lay_{depth}' if not shared_lays else 'enc_CNN_lay_shared'
            if verb > 1: print(f'<< layer {lay_name}:')

            lay_input = tf.concat([history_lays[depth], sub_output], axis=-2) if history_lays else sub_output
            if verb > 1:
                print(f' > sub_output (previous): {sub_output}')
                print(f' > lay_input (eventually padded): {lay_input}')
            input_lays.append(lay_input)

            hist_lay = depth in hist_layers

            with tf.variable_scope(lay_name):

                if hist_lay: hist_summ.append(tf.summary.histogram('a_lay_in', lay_input, family=name))

                # LN
                lay_input = tf.keras.layers.LayerNormalization(axis=-1)(lay_input)
                if hist_lay: hist_summ.append(tf.summary.histogram('b_LN', lay_input, family=name))

                # conv, no activation
                output = lay_conv1D(
                    input=          lay_input,
                    name=           'conv1D',
                    kernels=        kernel,
                    filters=        n_filters,
                    activation=     None,
                    initializer=    initializer,
                    padding=        'same' if history is None else 'valid',
                    seed=           seed,
                    verb=           0)
                if hist_lay: hist_summ.append(tf.summary.histogram('c_cnn', output, family=name))

                # activation
                if activation:
                    output = activation(output)
                    zsL += [zeroes(output)]  # catch zeroes
                    if hist_lay: hist_summ.append(tf.summary.histogram('d_activation', output, family=name))

                # dropout
                if lay_drop:
                    output = tf.layers.dropout(
                        inputs=     output,
                        rate=       lay_drop,
                        training=   training_flag,
                        seed=       seed)
                    if hist_lay: hist_summ.append(tf.summary.histogram('e_drop', output, family=name))

                # RES, here we take sub_output, since lay_input may be padded by history
                output += sub_output
                if hist_lay: hist_summ.append(tf.summary.histogram('f_residual', output, family=name))
                if verb > 1: print(f' > output (layer): {output}')

                if ldrt_scale:
                    lay_out = lay_DRT(
                        input=          output,
                        name=           lay_name + '_lay_DRT',
                        hist_name=      name,
                        dns_scale=      ldrt_scale,
                        activation=     activation,
                        dropout=        ldrt_drop,
                        training_flag=  training_flag,
                        initializer=    initializer,
                        seed=           seed)
                    output = lay_out['output']
                    zsL += lay_out['zeroes']
                    if hist_lay: hist_summ.append(lay_out['hist_summ'])

                sub_output = output

        output = tf.keras.layers.LayerNormalization(axis=-1)(output)  # final LN

        # prepare fin_state
        fin_state = None
        if history is not None:
            state = tf.stack(input_lays, axis=-3)
            if verb > 1: print(f' > state (stacked): {state}')
            fin_state = tf.split(state, num_or_size_splits=[-1, kernel - 1], axis=-2)[1]
            if verb > 1: print(f' > fin_state (split): {fin_state}')

    if verb > 1: print(f' > {name} output: {output}')
    return {
        'output':       output,
        'state':        fin_state,  # history for next
        'hist_summ':    hist_summ,
        'zeroes':       zsL}
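# --- shape walkthrough (history mode) ------------------------------------------
# with kernel=3 each layer caches its last kernel-1 == 2 input positions:
#   history:    [bsz, n_layers, 2, n_filters]   (previous call's fin_state)
#   lay_input:  concat(history[depth], sub_output, axis=-2) -> [bsz, 2+seq, n_filters]
#   conv1D:     padding='valid' shrinks it back to [bsz, seq, n_filters]
#   fin_state:  last kernel-1 positions of every lay_input -> [bsz, n_layers, 2, n_filters]
# feeding fin_state back as history gives incremental (stateful) inference.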
def dvc_model(
        seed: int,                  # seed for TF OPs
        multi_sen: int,
        train_tower: bool,
        vec_width: int,
        tok_emb,                    # tuple with embeddings shape or np.arr/LL with values of embeddings
        seq_width: int,
        max_seq_len: int,
        drt_scale: float or int,    # global DRT scale
        # (Multi-)Classif
        classes: int or list,
        vtc_drop: float,
        vtc_proj: int,
        drtC_nLay: int,
        drtC_drop: float,
        out_drop: float,
        l2lc: float,
        verb,
        **kwargs):

    #actv_func = tf.nn.relu
    actv_func = gelu

    hist_summ = []
    zsL = []

    isVec = vec_width is not None
    isTks = tok_emb is not None
    isSeq = seq_width is not None
    if verb > 0:
        print('\n*** DVCmodel *** builds graph for', end='')
        if isVec: print(' vec(%d)' % vec_width, end='')
        if isTks: print(' tks (tokens sequence)', end='')
        if isSeq: print(' seq (vectors sequence)', end='')
        print()

    if type(classes) is not list:
        classes = [classes] if classes is not None else []  # classes may be None >> no classifiers

    with tf.variable_scope(name_or_scope='FWD'):

        # ********************************* input placeholders
        vec_PHL = [tf.compat.v1.placeholder(
            name=   'vec%d_PH' % nS,
            dtype=  tf.float32,
            shape=  [None, vec_width]) for nS in range(multi_sen)] if isVec else None
        tks_PHL = [tf.compat.v1.placeholder(
            name=   'tks%d_PH' % nS,
            dtype=  tf.int32,
            shape=  [None, max_seq_len]) for nS in range(multi_sen)] if isTks else None  # batch, seqLen
        seq_PHL = [tf.compat.v1.placeholder(
            name=   'seq%d_PH' % nS,
            dtype=  tf.float32,
            shape=  [None, max_seq_len, seq_width]) for nS in range(multi_sen)] if isSeq else None  # batch, seqLen, vec
        lab_PHL = [tf.compat.v1.placeholder(
            name=   'labC%d_ID' % nC,
            dtype=  tf.int32,
            shape=  [None]) for nC in range(len(classes))]
        train_flag_PH = tf.compat.v1.placeholder(  # placeholder marking training process
            name=   'train_flag',
            dtype=  tf.bool,
            shape=  [])

        # ********************************* encTowers
        if verb > 0: print('...building %d DVC encTowers' % multi_sen)
        enc_outs = []
        for nS in range(multi_sen):
            encT_out = enc_tower(
                actv_func=      actv_func,
                vec_PH=         vec_PHL[nS] if vec_PHL is not None else None,
                tks_PH=         tks_PHL[nS] if tks_PHL is not None else None,
                seq_PH=         seq_PHL[nS] if seq_PHL is not None else None,
                train_flag_PH=  train_flag_PH,
                tok_emb=        tok_emb,
                max_seq_len=    max_seq_len,
                drt_scale=      drt_scale,
                seed=           seed,
                verb=           verb,
                **kwargs)
            enc_outs.append(encT_out)

        vec_output = tf.concat([eo['vector'] for eo in enc_outs], axis=-1)
        if len(enc_outs) > 1 and verb > 0:
            print('\n > outputs (concatenated) of %d towers:' % len(enc_outs), vec_output)

        tower_vars = enc_outs[0]['tower_vars']
        hist_summ += enc_outs[0]['hist_summ']
        for encT_out in enc_outs: zsL += encT_out['zeroes']
        hist_summ.append(tf.summary.histogram('5towersOut_concatALL', vec_output, family='C.cls'))

        # ********************************* Multi-Classifier
        with tf.variable_scope('vClassif'):

            if classes:

                # dropout on vector to classifier
                if vtc_drop:
                    vec_output = tf.layers.dropout(
                        inputs=     vec_output,
                        rate=       vtc_drop,
                        training=   train_flag_PH,
                        seed=       seed)
                    if verb > 1: print(' > dropout %.2f applied to vec_output of tower(s):' % vtc_drop, vec_output)

                # projection on vector to classifier
                if vtc_proj and vtc_proj != vec_output.shape.as_list()[-1]:
                    vec_output = lay_dense(
                        input=      vec_output,
                        units=      vtc_proj,
                        activation= None,
                        use_bias=   True,
                        seed=       seed,
                        name=       'inVProjection')
                    if verb > 1: print(' > projected vector input:', vec_output)
                    hist_summ.append(tf.summary.histogram('7vecTCProj', vec_output, family='C.cls'))

                # layerNorm (after projection)
                vec_output = tf.contrib.layers.layer_norm(
                    inputs=             vec_output,
                    begin_norm_axis=    -1,
                    begin_params_axis=  -1)
                hist_summ.append(tf.summary.histogram('8projLNorm', vec_output, family='C.cls'))

            mc_losses = []
            mc_probs = []
            if verb > 1: print('\nBuilding multi-classifier graphs...')
            for cix in range(len(classes)):
                if verb > 1: print(' > multi-classifier (%d/%d):' % (cix + 1, len(classes)))

                # DRT encoder @classifier
                if drtC_nLay:
                    eDRTout = enc_DRT(
                        input=          vec_output,
                        n_layers=       drtC_nLay,
                        dns_scale=      drt_scale,
                        activation=     actv_func,
                        dropout=        drtC_drop,
                        training_flag=  train_flag_PH,
                        seed=           seed,
                        n_hist=         2,
                        verb=           verb)
                    vec_output = eDRTout['output']
                    zsL += eDRTout['zeroes']

                if out_drop:
                    vec_output = tf.layers.dropout(
                        inputs=     vec_output,
                        rate=       out_drop,
                        training=   train_flag_PH,
                        seed=       seed)

                logits = lay_dense(
                    input=      vec_output,
                    units=      classes[cix],
                    activation= None,
                    use_bias=   True,
                    seed=       seed,
                    name=       'logits_projection_cix%d' % cix)
                if verb > 1: print(' >> logits (projected)', logits)
                hist_summ.append(tf.summary.histogram('9logits', logits, family='C.cls'))

                probs = tf.nn.softmax(logits, name=f'predict_probabilities_c{cix}')
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                if verb > 1: print(' >> predictions:', predictions)
                correct = tf.equal(predictions, lab_PHL[cix])
                if verb > 1: print(' >> correct prediction:', correct)
                accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

                # softmax loss
                cLoss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits= logits,
                    labels= lab_PHL[cix])
                if verb > 1: print(' > cLoss (softmax)', cLoss)  # shape [batch] (per sample)

                """ TODO (experimental): scaled cLoss
                # scale cLoss
                scale = tf.where(
                    condition=  correct,
                    x=          tf.ones_like(correct, dtype=tf.float32)*tf.constant(0.8),   # positive scale
                    y=          tf.ones_like(correct, dtype=tf.float32)*tf.constant(1.7))   # negative scale
                cLoss *= scale
                """

                mc_losses.append(cLoss)
                mc_probs.append(probs)

            # average all losses (multi-classifiers losses); assumes at least one classifier was built
            loss = tf.reduce_mean(tf.stack(mc_losses))  # scalar
            if verb > 1: print(' > loss (averaged all multi-classif)', loss)

            class_vars = tf.global_variables(scope=tf.get_variable_scope().name)  # vClassif variables

        train_vars = []
        if train_tower: train_vars += tower_vars
        train_vars += class_vars

        # L2 loss
        if l2lc:
            restrictedNames = [
                'bias',         # dense bias
                'beta',         # LN offset
                'gamma',        # LN scale
                'tns_pos_emb',  # position embeddings
                'tok_emb']      # token embeddings
            if verb > 1: print(' > applying L2 loss to variables (not including %s)' % restrictedNames)
            l2Vars = []
            for v in train_vars:
                vIsOk = True
                for nmp in restrictedNames:
                    if nmp in v.name: vIsOk = False
                if vIsOk: l2Vars.append(v)
            if verb > 1:
                print(' > L2 / all(--) variables of model:')
                for var in train_vars:
                    if var in l2Vars:   print(' >> L2', var)
                    else:               print(' >> --', var)
            l2loss = tf.add_n([tf.nn.l2_loss(v) for v in l2Vars]) * l2lc  # scalar
            if verb > 1: print(' > L2 loss', l2loss)
            loss += l2loss

    return {
        # placeholders
        'vec_PHL':          vec_PHL,
        'tks_PHL':          tks_PHL,
        'seq_PHL':          seq_PHL,
        'lab_PHL':          lab_PHL,
        'train_flag_PH':    train_flag_PH,
        # variables
        'train_vars':       train_vars,     # to train
        'tower_vars':       tower_vars,     # to save
        'class_vars':       class_vars,     # to save
        # tensors
        'probs':            probs,          # ...of last multi-classifier
        'mc_probs':         mc_probs,
        'predictions':      predictions,    # ...of last multi-classifier
        'accuracy':         accuracy,       # ...of last multi-classifier
        'loss':             loss,           # avg of all multi-classifiers
        'hist_summ':        tf.summary.merge(hist_summ),
        'zeroes':           zsL}
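# --- reference (sketch) ---------------------------------------------------------
# gelu is imported from elsewhere in the repo; a common tanh approximation is
# shown here only for reference (the repo's exact version may differ):
def _gelu_reference(x):
    import math
    return 0.5 * x * (1.0 + tf.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))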
def enc_tower(
        actv_func,                                  # activation function
        vec_PH: tf.compat.v1.placeholder,           # vector placeholder (vec input)
        tks_PH: tf.compat.v1.placeholder,           # tokens seq placeholder (seq input - IDs)
        seq_PH: tf.compat.v1.placeholder,           # vector seq placeholder (seq input)
        train_flag_PH: tf.compat.v1.placeholder,    # train flag placeholder
        tok_emb,
        tok_emb_train: bool,    # flag, when True tok_emb are trainable
        tok_emb_add,            # np.arr/LL with values of additional embeddings (always trainable)
        # vectors processing
        inV_drop: float,
        inV_proj: int or None,  # value equal to last dimension width turns off projection
        drt_nLay,
        drt_scale,
        drt_drop,
        # sequence params
        inS_drop: float,
        intime_drop: float,
        infeat_drop: float,
        inS_proj: int or None,  # value equal to last dimension width turns off projection
        inS_actv: bool,         # inS_proj activation
        # seq encoders params
        cnn_nLay,
        rnn_nLay,
        max_seq_len,
        tns_nBlocks,
        enc_drop,
        tnsAT_drop,
        tns_scale,
        tat_nBlocks,
        tatAT_drop,
        tat_drop,
        # other
        seed,
        verb,
        **kwargs):

    if verb > 0: print('\nenc_tower inits...')

    zsL = []
    hist_summ = []
    with tf.variable_scope('encTower', reuse=tf.AUTO_REUSE):

        vectorL = []  # list of vectors to concatenate (vec from vec_PH + reduced sequence (tks_PH & seq_PH))

        # ********************************* vector processing
        if vec_PH is not None:

            vector = vec_PH
            if verb > 1: print(' > vector input:', vector)
            hist_summ.append(tf.summary.histogram('1vecIn', vector, family='A.vec'))

            # layerNorm (on input, always)
            vector = tf.contrib.layers.layer_norm(
                inputs=             vector,
                begin_norm_axis=    -1,
                begin_params_axis=  -1)
            hist_summ.append(tf.summary.histogram('2inLNorm', vector, family='A.vec'))

            # dropout (on input, before projection)
            if inV_drop:
                vector = tf.layers.dropout(
                    inputs=     vector,
                    rate=       inV_drop,
                    training=   train_flag_PH,
                    seed=       seed)
                if verb > 1: print(' > dropout %.2f applied to vec:' % inV_drop, vector)

            # projection (rescales input, without activation)
            if inV_proj and inV_proj != vector.shape.as_list()[-1]:
                vector = lay_dense(
                    input=      vector,
                    units=      inV_proj,
                    activation= None,
                    use_bias=   True,
                    seed=       seed,
                    name=       'inVProjection')
                if verb > 1: print(' > projected vector input:', vector)
                hist_summ.append(tf.summary.histogram('3inProj', vector, family='A.vec'))

                # layerNorm (after projection)
                vector = tf.contrib.layers.layer_norm(
                    inputs=             vector,
                    begin_norm_axis=    -1,
                    begin_params_axis=  -1)
                hist_summ.append(tf.summary.histogram('4projLNorm', vector, family='A.vec'))

            # DRT encoder for vector @tower
            if drt_nLay:
                eDRTout = enc_DRT(
                    input=          vector,
                    n_layers=       drt_nLay,
                    dns_scale=      drt_scale,
                    activation=     actv_func,
                    dropout=        drt_drop,
                    training_flag=  train_flag_PH,
                    seed=           seed,
                    n_hist=         2,
                    verb=           verb)
                vector = eDRTout['output']
                zsL += eDRTout['zeroes']
                hist_summ += eDRTout['hist_summ']
                if verb > 1: print(' > drtLay output', vector)
                hist_summ.append(tf.summary.histogram('5drtLayOut', vector, family='A.vec'))

            vectorL.append(vector)

        # ********************************* sequence processing
        sequence = None
        seq_to_concat = []

        # tokens embedding for sequence
        if tks_PH is not None:
            if type(tok_emb) is tuple:
                all_emb = tf.get_variable(  # embeddings initialized from scratch
                    name=           'tok_emb',
                    shape=          tok_emb,
                    initializer=    tf.truncated_normal_initializer(stddev=0.01, seed=seed),
                    dtype=          tf.float32,
                    trainable=      True)
            else:
                all_emb = tf.get_variable(  # embeddings initialized with given variable
                    name=           'tok_emb',
                    initializer=    tok_emb,
                    dtype=          tf.float32,
                    trainable=      tok_emb_train)
            if tok_emb_add is not None:
                tokEmbAddV = tf.get_variable(  # additional embeddings initialized with given variable
                    name=           'tok_emb_add',
                    initializer=    tok_emb_add,
                    dtype=          tf.float32,
                    trainable=      True)
                all_emb = tf.concat([all_emb, tokEmbAddV], axis=0)

            sequence = tf.nn.embedding_lookup(params=all_emb, ids=tks_PH)
            if verb > 1: print('\n > sequence (tokens lookup):', sequence)
            hist_summ.append(tf.summary.histogram('1seqT', sequence, family='B.seq'))
            seq_to_concat.append(sequence)

        if seq_PH is not None:
            if verb > 1: print(' > sequence of vectors:', seq_PH)
            hist_summ.append(tf.summary.histogram('1seqV', seq_PH, family='B.seq'))
            seq_to_concat.append(seq_PH)

        # concat sequences
        if len(seq_to_concat) == 1: sequence = seq_to_concat[0]
        if len(seq_to_concat) > 1:  # it will work only when shapes match !!!
            sequence = tf.concat(seq_to_concat, axis=1)
            if verb > 1: print(' > concatenated sequence (vec+tok):', sequence)

        if sequence is not None:

            # dropout (applied to seq of tok_emb works much better than applied after projection)
            if inS_drop:
                sequence = tf.layers.dropout(
                    inputs=     sequence,
                    rate=       inS_drop,
                    training=   train_flag_PH,
                    seed=       seed)
                if verb > 1: print(' > dropout %.2f applied to seq:' % inS_drop, sequence)

            # time & feats drop
            if intime_drop or infeat_drop:
                sequence = tf_drop(
                    input=      sequence,
                    time_drop=  intime_drop,
                    feat_drop=  infeat_drop,
                    train_flag= train_flag_PH,
                    seed=       seed)

            # sequence layer_norm (on (dropped) input, always)
            sequence = tf.contrib.layers.layer_norm(
                inputs=             sequence,
                begin_norm_axis=    -2,
                begin_params_axis=  -2)
            if verb > 1: print(' > normalized seq:', sequence)
            hist_summ.append(tf.summary.histogram('2inLNorm', sequence, family='B.seq'))

            # in_projection (rescales input) without activation
            if inS_proj and inS_proj != sequence.shape.as_list()[-1]:
                sequence = lay_dense(
                    input=      sequence,
                    units=      inS_proj,
                    activation= actv_func if inS_actv else None,
                    use_bias=   True,
                    seed=       seed,
                    name=       'inSProjection')
                if verb > 1: print(' > inProjection (%d) for seq:' % inS_proj, sequence)
                hist_summ.append(tf.summary.histogram('3inProj', sequence, family='B.seq'))

                # layerNorm (after projection)
                sequence = tf.contrib.layers.layer_norm(
                    inputs=             sequence,
                    begin_norm_axis=    -2,
                    begin_params_axis=  -2)
                if verb > 1: print(' > normalized seq:', sequence)
                hist_summ.append(tf.summary.histogram('4projLNorm', sequence, family='B.seq'))

            # ********* below are 3 types of seq2seq encoders, stacked one on another
            enc_width = sequence.shape.as_list()[-1]
            if cnn_nLay:
                eCOut = enc_CNN(
                    input=          sequence,
                    n_layers=       cnn_nLay,
                    activation=     actv_func,
                    lay_drop=       enc_drop,
                    training_flag=  train_flag_PH,
                    n_filters=      enc_width,
                    n_hist=         2,
                    seed=           seed,
                    verb=           verb)
                sequence = eCOut['output']
                hist_summ += eCOut['hist_summ']
            if rnn_nLay:
                from tensorflow.contrib import rnn
                eLOut = enc_RNN(
                    input=      sequence,
                    cellFN=     rnn.LSTMCell,
                    biDir=      False,
                    cellWidth=  enc_width,
                    numLays=    rnn_nLay,
                    dropout=    enc_drop,
                    dropFlagT=  train_flag_PH,
                    seed=       seed)
                sequence = eLOut['output']
            if tns_nBlocks:
                tns_out = enc_TNS(
                    in_seq=         sequence,
                    name=           'encTRNS',
                    n_blocks=       tns_nBlocks,
                    n_heads=        1,
                    dense_mul=      tns_scale,
                    activation=     actv_func,
                    max_seq_len=    max_seq_len,
                    dropout_att=    tnsAT_drop,
                    dropout=        enc_drop,
                    training_flag=  train_flag_PH,
                    seed=           seed,
                    n_hist=         2,
                    verb=           verb)
                sequence = tns_out['output']
                hist_summ += tns_out['hist_summ']
                zsL += tns_out['zeroes']

            # ********** below sequence is reduced to a vector, with TAT or pooling
            # TAT reduction
            if tat_nBlocks:
                tat_out = enc_TNS(
                    in_seq=         sequence,
                    seq_out=        False,
                    name=           'tatTRNS',
                    n_blocks=       tat_nBlocks,
                    n_heads=        1,
                    dense_mul=      tns_scale,
                    activation=     actv_func,
                    max_seq_len=    max_seq_len,
                    dropout_att=    tatAT_drop,
                    dropout=        tat_drop,
                    training_flag=  train_flag_PH,
                    seed=           seed,
                    n_hist=         2,
                    verb=           verb)
                sequence_reduced = tat_out['output']
                hist_summ += tat_out['hist_summ']
                # attVals = tat_out['att_vals']
                zsL += tat_out['zeroes']
            # reduce sequence with concat of avg & max
            else:
                sequence_reduced = tf.concat(
                    [tf.reduce_mean(sequence, axis=-2), tf.reduce_max(sequence, axis=-2)],
                    axis=-1)
                if verb > 1: print(' > reduced sequence to one vec with mean (+) max:', sequence_reduced)
            vectorL.append(sequence_reduced)

        # ********************************* concatenate and finish
        vector = tf.concat(vectorL, axis=-1) if len(vectorL) > 1 else vectorL[0]
        if verb > 1: print(' > vector (tower output):', vector)

        tower_vars = tf.global_variables(scope=tf.get_variable_scope().name)  # encTower variables

    return {
        'vector':       vector,
        'sequence':     sequence,
        'tower_vars':   tower_vars,
        'hist_summ':    hist_summ,
        'zeroes':       zsL}
def cnn_DMG(
        name: str,
        train_ce: bool= True,       # train cards encoder
        c_embW: int=    12,         # card emb width >> makes network width (x7)
        n_lay=          12,         # number of CNNR layers >> makes network deep (>> context length)
        width=          None,       # representation width (number of filters), for None uses cards_encoded width
        activation=     tf.nn.relu,
        opt_class=      partial(tf.compat.v1.train.AdamOptimizer, beta1=0.7, beta2=0.7),
        iLR=            3e-5,
        warm_up=        100,        # num of steps has to be small (since we do rare updates)
        avt_SVal=       0.04,
        avt_window=     20,
        do_clip=        True,
        verb=           0,
        **kwargs):

    if verb > 0: print(f'\nBuilding {name} cnn_DMG (graph)...')

    with tf.variable_scope(name):

        n_hands = tf.get_variable(  # number of hands while learning
            name=           'n_hands',
            shape=          [],
            trainable=      False,
            initializer=    tf.constant_initializer(0),
            dtype=          tf.int32)

        cards_PH = tf.placeholder(  # 7 cards placeholder
            name=   'cards_PH',
            dtype=  tf.int32,
            shape=  [None, None, 7])    # [bsz,seq,7cards]
        train_PH = tf.placeholder(  # train placeholder
            name=   'train_PH',
            dtype=  tf.bool,
            shape=  [])

        ce_out = cards_enc(
            train_flag= train_PH,
            c_ids=      cards_PH,
            emb_width=  c_embW)
        cards_encoded = ce_out['output']
        enc_vars = ce_out['enc_vars']
        enc_zsL = ce_out['zeroes']
        if verb > 1: print(' ### num of enc_vars (%d) %s' % (len(enc_vars), short_scin(num_var_floats(enc_vars))))
        if verb > 1: print(' > cards encoded:', cards_encoded)

        switch_PH = tf.placeholder( # switch placeholder
            name=   'switch_PH',
            dtype=  tf.int32,       # 0 for move, 1 for cards
            shape=  [None, None, 1])    # [bsz,seq,1]
        event_PH = tf.placeholder(  # event id placeholder
            name=   'event_PH',
            dtype=  tf.int32,
            shape=  [None, None])   # [bsz,seq]

        n_events = 1 + N_TABLE_PLAYERS + len(TBL_MOV) * (N_TABLE_PLAYERS - 1)
        event_emb = tf.get_variable(    # event type embeddings
            name=           'event_emb',
            shape=          [n_events, cards_encoded.shape[-1]],
            dtype=          tf.float32,
            initializer=    my_initializer())
        event_in = tf.nn.embedding_lookup(params=event_emb, ids=event_PH)
        if verb > 1: print(' > event_in:', event_in)

        # tried with tf.where and switching inputs, but speed was the same...
        switch = tf.cast(switch_PH, dtype=tf.float32)
        input = switch * cards_encoded + (1 - switch) * event_in
        if verb > 1: print(' > input (merged):', input)

        # projection without activation and bias
        if width:
            input = lay_dense(
                input=      input,
                units=      width,
                use_bias=   False)
            if verb > 1: print(' > projected input:', input)
        else: width = cards_encoded.shape[-1]

        # layer_norm
        sub_output = tf.contrib.layers.layer_norm(
            inputs=             input,
            begin_norm_axis=    -1,
            begin_params_axis=  -1)

        state_shape = [n_lay, 2, width]
        single_zero_state = tf.zeros(shape=state_shape)  # [n_lay,2,width]
        state_PH = tf.placeholder(
            name=   'state_PH',
            dtype=  tf.float32,
            shape=  [None] + state_shape)   # [bsz,n_lay,2,width]

        cnn_enc_out = enc_CNN(
            input=      sub_output,
            history=    state_PH,
            n_layers=   n_lay,
            n_filters=  width,
            activation= activation,
            n_hist=     0)
        out = cnn_enc_out['output']
        fin_state = cnn_enc_out['state']
        cnn_zsL = cnn_enc_out['zeroes']
        if verb > 1:
            print(' > out:', out)
            print(' > fin_state (split):', fin_state)

        # projection to logits
        logits = lay_dense(
            input=      out,
            units=      len(TBL_MOV),
            use_bias=   False)
        if verb > 1: print(' > logits:', logits)
        probs = tf.nn.softmax(logits)

        cnn_vars = tf.trainable_variables(scope=tf.get_variable_scope().name) + [n_hands]
        cnn_vars = [var for var in cnn_vars if var not in enc_vars]
        if verb > 1: print(' ### num of cnn_vars (%d) %s' % (len(cnn_vars), short_scin(num_var_floats(cnn_vars))))

        move_PH = tf.placeholder(   # move made (label)
            name=   'move_PH',
            dtype=  tf.int32,
            shape=  [None, None])   # [bsz,seq]
        rew_PH = tf.placeholder(    # reward for move made
            name=   'rew_PH',
            dtype=  tf.float32,
            shape=  [None, None])   # [bsz,seq]

        # this loss is auto-averaged with the reduction parameter
        # loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
        # loss = loss(y_true=move, y_pred=logits, sample_weight=rew)
        loss = tf.losses.sparse_softmax_cross_entropy(
            labels=     move_PH,
            logits=     logits,
            weights=    rew_PH)

        train_vars = [] + cnn_vars
        if train_ce: train_vars += enc_vars

    return {
        'name':                 name,
        'cards_PH':             cards_PH,
        'train_PH':             train_PH,
        'switch_PH':            switch_PH,
        'event_PH':             event_PH,
        'move_PH':              move_PH,
        'rew_PH':               rew_PH,
        'state_PH':             state_PH,
        'single_zero_state':    single_zero_state,
        'probs':                probs,
        'fin_state':            fin_state,
        'enc_zeroes':           tf.concat(enc_zsL, axis=-1),
        'cnn_zeroes':           tf.concat(cnn_zsL, axis=-1),
        'loss':                 loss,
        'n_hands':              n_hands,
        'enc_vars':             enc_vars,
        'cnn_vars':             cnn_vars,
        'train_vars':           train_vars}
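# --- usage sketch (illustrative) -------------------------------------------------
# A single stateful decision step: the zero state is tiled to the batch, then
# fin_state is fed back as state_PH for the next event (names and the session
# handling are local to this example):
def _example_cnn_DMG_step(sess, g, cards, event, switch, state):
    import numpy as np
    if state is None:  # before the first step
        zero = sess.run(g['single_zero_state'])         # [n_lay,2,width]
        state = np.repeat(zero[np.newaxis], len(cards), axis=0)
    probs, state = sess.run(
        [g['probs'], g['fin_state']],
        feed_dict={
            g['cards_PH']:  cards,      # [bsz,1,7]
            g['event_PH']:  event,      # [bsz,1]
            g['switch_PH']: switch,     # [bsz,1,1]
            g['train_PH']:  False,
            g['state_PH']:  state})     # [bsz,n_lay,2,width]
    return probs, state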