Example #1
def qnn_graph(
        name=               'qnn',
        num_actions: int=   4,
        num_states: int=    16,
        state_emb_width=    4,
        hidden_layers_size= (12,),
        gamma=              0.9,
        seed=               121,
        **kwargs):

    with tf.variable_scope(name):

        qv_target_PH = tf.placeholder(      # qv next state placeholder
            shape=  [None,num_actions],
            dtype=  tf.float32)
        reward_PH = tf.placeholder(         # reward
            shape=  [None],
            dtype=  tf.float32)
        state_PH = tf.placeholder(          # state
            shape=  [None],
            dtype=  tf.int32)
        enum_actions_PH = tf.placeholder(   # enumerated action indexes (0,1),(1,3),(2,0),..
            shape=  [None,2],
            dtype=  tf.int32)

        state_emb = tf.get_variable(
            name=   'state_emb',
            shape=  [num_states,state_emb_width],
            dtype=  tf.float32)

        input = tf.nn.embedding_lookup(state_emb, state_PH)
        print('input:', input)

        for l in hidden_layers_size:
            input = lay_dense(
                input=      input,
                units=      l,
                activation= tf.nn.relu,
                seed=       seed)
        output = lay_dense( # QV for all actions (for given input(state))
            input=      input,
            units=      num_actions,
            activation= None,
            seed=       seed)

        pred_qv = tf.gather_nd(output, indices=enum_actions_PH)
        gold_qv = reward_PH + gamma * tf.reduce_max(qv_target_PH, axis=-1) # gold is predicted by same network

        loss = tf.losses.mean_squared_error(labels=gold_qv, predictions=pred_qv) # loss on predicted vs next, we want predicted to match next
        loss = tf.reduce_mean(loss)

    return {
        'qv_target_PH':     qv_target_PH,
        'reward_PH':        reward_PH,
        'state_PH':         state_PH,
        'enum_actions_PH':  enum_actions_PH,
        'output':           output,
        'loss':             loss}
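
For reference, a minimal self-contained sketch of the Q-learning target and loss that qnn_graph builds, written with plain tf.compat.v1 ops; q_out_PH below is an illustrative stand-in for the network output (the original uses the lay_dense helper, which is not shown in this excerpt):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

num_actions, gamma = 4, 0.9
q_out_PH = tf.compat.v1.placeholder(tf.float32, [None, num_actions])      # stands in for the network output Q(s, .)
qv_target_PH = tf.compat.v1.placeholder(tf.float32, [None, num_actions])  # Q(s', .) from the same network
reward_PH = tf.compat.v1.placeholder(tf.float32, [None])
enum_actions_PH = tf.compat.v1.placeholder(tf.int32, [None, 2])           # (sample_ix, action_ix) pairs

pred_qv = tf.gather_nd(q_out_PH, enum_actions_PH)                   # Q(s, a) of the taken actions
gold_qv = reward_PH + gamma * tf.reduce_max(qv_target_PH, axis=-1)  # r + gamma * max_a' Q(s', a')
loss = tf.compat.v1.losses.mean_squared_error(labels=gold_qv, predictions=pred_qv)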
Example #2
def decN(
        input,
        dictW,
        predN=1,  # N samples for every feature
        name='decN',
        hLays=None,  # tuple or list of ints
        hActiv=tf.nn.relu,
        initializer=None,
        seed=12321,
        verbLev=0):

    if verbLev > 0: print('\nBuilding decoderN ...')
    if verbLev > 1: print('decoder input:', input)

    if initializer is None: initializer = my_initializer(seed)

    with tf.variable_scope(name):

        # hidden layers
        if hLays:
            for nLay in range(len(hLays)):
                laySize = hLays[nLay]
                input = lay_dense(input=input,
                                  units=laySize,
                                  activation=hActiv,
                                  use_bias=True,
                                  initializer=initializer,
                                  seed=seed,
                                  name='decoderN_Hlay_%s' % nLay)

        # projection to predN x dictW
        logits = lay_dense(input=input,
                           units=predN * dictW,
                           activation=None,
                           use_bias=True,
                           initializer=initializer,
                           seed=seed,
                           name='decoderNProjection')
        if verbLev > 1:
            print(' > projection to logits (%dx dictW):' % predN, logits)

        if predN > 1:
            logits = tf.reshape(logits, [tf.shape(logits)[0], -1, dictW])
            if verbLev > 1:
                print(' > reshaped logits (B,%dxS,dictW):' % predN, logits)

        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        if verbLev > 1: print(' > predictions:', predictions)

    return logits, predictions
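
The only non-obvious step in decN is the reshape for predN > 1: the projection packs predN distributions per position into one axis, which is then unpacked into a separate dictW axis before the argmax. A standalone sketch of just that step (the sizes below are made up for illustration):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

predN, dictW = 2, 5
logits = tf.compat.v1.placeholder(tf.float32, [None, 7, predN * dictW])  # [B, S, predN*dictW]
# unpack predN predictions per position: [B, S, predN*dictW] -> [B, S*predN, dictW]
logits_r = tf.reshape(logits, [tf.shape(logits)[0], -1, dictW])
predictions = tf.argmax(logits_r, axis=-1, output_type=tf.int32)          # [B, S*predN]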
Example #3
    def mh_attn(
            in_seq,  # input sequence [batch, seq, feats]
            query=None,  # None for self attention, otherwise TAT [batch, n_queries, feats]
            activation=None,  # activation of KQV dense
            dropout_att=0.0,
            drop_flag=None,
            seed=seed):

        # input projection of in_seq for KQV or KV(if query)
        width = in_seq.shape[-1].value
        proj_size = 3 if query is None else 2
        c = lay_dense(
            input=in_seq,  # [batch, seq, feats]
            units=width * proj_size,
            name='mhProj',
            activation=activation,
            initializer=initializer,
            seed=seed)
        ins_split = tf.split(c, proj_size, axis=-1)  # split projected

        if query is not None:
            q = query  # projection for Q is not needed (at least with 1 head)
            k, v = ins_split
        else:
            q, k, v = ins_split
        q, k, v = map(split_heads, [q, k, v])

        # attention
        att_out = attn(q, k, v, dropout_att, drop_flag, seed)
        a = att_out['attention']
        a = merge_heads(a)
        return {'attention': a, 'att_vals': att_out['att_weights']}
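
The helpers split_heads, merge_heads and attn come from the enclosing scope and are not shown in this excerpt. As a hedged reference, here is a minimal single-head scaled dot-product attention of the kind attn presumably wraps (no dropout; shapes follow the comments above):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

feats = 24
q = tf.compat.v1.placeholder(tf.float32, [None, None, feats])  # [batch, n_queries, feats]
k = tf.compat.v1.placeholder(tf.float32, [None, None, feats])  # [batch, seq, feats]
v = tf.compat.v1.placeholder(tf.float32, [None, None, feats])  # [batch, seq, feats]

scores = tf.matmul(q, k, transpose_b=True) / float(feats) ** 0.5  # [batch, n_queries, seq]
weights = tf.nn.softmax(scores, axis=-1)                          # attention weights
attention = tf.matmul(weights, v)                                 # [batch, n_queries, feats]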
Example #4
def pgnn_graph(name='pgnn',
               state_size=4,
               num_actions=2,
               hidden_layers=(20, ),
               seed=121,
               **kwargs):

    with tf.variable_scope(name):

        states_PH = tf.placeholder(  # environment state representation (prepared by PolicyGradientsEnvironment.encode_state())
            shape=(None, state_size),
            dtype=tf.float32,
            name='input_states')
        acc_rew_PH = tf.placeholder(shape=None,
                                    dtype=tf.float32,
                                    name='accumulated_rewards')
        actions_PH = tf.placeholder(shape=None, dtype=tf.int32, name='actions')

        layer = states_PH
        for i in range(len(hidden_layers)):
            layer = lay_dense(input=layer,
                              name=f'hidden_layer_{i + 1}',
                              units=hidden_layers[i],
                              activation=tf.nn.relu,
                              seed=seed)
        logits = lay_dense(input=layer,
                           name='logits',
                           units=num_actions,
                           activation=None,
                           seed=seed)

        action_prob = tf.nn.softmax(logits)
        log_policy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=actions_PH)
        loss = tf.reduce_mean(acc_rew_PH * log_policy)

    return {
        'states_PH': states_PH,
        'acc_rew_PH': acc_rew_PH,
        'actions_PH': actions_PH,
        'action_prob': action_prob,
        'loss': loss
    }
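
The returned loss is the standard REINFORCE objective: the sparse softmax cross-entropy equals -log pi(a|s), which is weighted by the accumulated rewards. A self-contained training sketch under that reading, using tf.compat.v1.layers.dense in place of the repository's lay_dense (illustrative only, not the original training loop):

import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

state_size, num_actions = 4, 2
states_PH = tf.compat.v1.placeholder(tf.float32, [None, state_size])
actions_PH = tf.compat.v1.placeholder(tf.int32, [None])
acc_rew_PH = tf.compat.v1.placeholder(tf.float32, [None])

logits = tf.compat.v1.layers.dense(states_PH, num_actions)   # stands in for the lay_dense stack
log_policy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=actions_PH)                         # -log pi(a|s)
loss = tf.reduce_mean(acc_rew_PH * log_policy)                # REINFORCE objective
train_op = tf.compat.v1.train.AdamOptimizer(1e-3).minimize(loss)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(train_op, feed_dict={
        states_PH: np.random.rand(8, state_size),
        actions_PH: np.random.randint(num_actions, size=8),
        acc_rew_PH: np.random.rand(8)})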
Example #5
def cards_enc(
        train_flag,  # train flag (bool tensor)
        c_ids,  # seven cards (ids tensor)
        tat_case: bool = False,  # task attention transformer architecture
        emb_width: int = 24,  # cards embedding width
        t_drop: float = 0,
        f_drop: float = 0,
        in_proj: int = None,
        n_layers: int = 8,
        dense_mul: int = 4,  # transformer dense multiplication
        activation=tf.nn.relu,
        dropout: float = 0,  # transformer dropout
        seed=12321,
        verb=0):

    if verb > 0: print('\nBuilding card encoder...')

    with tf.variable_scope('cards_enc'):

        zsL = []
        hist_summ = []

        c_emb = tf.get_variable(  # cards embeddings
            name='c_emb',
            shape=[53, emb_width],  # 52 cards + one 'no_card' id
            dtype=tf.float32,
            initializer=my_initializer(seed=seed))
        hist_summ += [tf.summary.histogram('1.c_emb', c_emb, family='c_emb')]

        c_emb_look = tf.nn.embedding_lookup(params=c_emb, ids=c_ids)
        if verb > 1: print(' > 1.c_emb_look:', c_emb_look)

        myc_emb = tf.get_variable(  # my cards embeddings
            name='myc_emb',
            shape=[2, c_emb.shape[-1]],
            dtype=tf.float32,
            initializer=my_initializer(seed=seed))

        myc_emb_look = tf.nn.embedding_lookup(params=myc_emb,
                                              ids=[0, 0, 1, 1, 1, 1, 1])
        if verb > 1: print(' > myc_emb_look:', myc_emb_look)

        input = c_emb_look + myc_emb_look

        if t_drop or f_drop:
            input = tf_drop(input=input,
                            time_drop=t_drop,
                            feat_drop=f_drop,
                            train_flag=train_flag,
                            seed=seed)

        # input projection (without activation)
        if in_proj:
            input = lay_dense(input=input,
                              units=in_proj,
                              name='c_proj',
                              reuse=tf.AUTO_REUSE,
                              use_bias=False,
                              seed=seed)
            if verb > 1: print(' > input projected:', input)
        elif verb > 1: print(' > input:', input)

        enc_out = enc_TNS(in_seq=input,
                          name='TAT' if tat_case else 'TNS',
                          seq_out=not tat_case,
                          add_PE=False,
                          n_blocks=n_layers,
                          n_heads=1,
                          dense_mul=dense_mul,
                          activation=activation,
                          max_seq_len=7,
                          dropout=dropout,
                          dropout_att=0,
                          drop_flag=train_flag,
                          seed=seed,
                          n_hist=3,
                          verb=verb)
        output = enc_out['output']
        zsL += enc_out['zeroes']
        hist_summ += enc_out['hist_summ']
        if not tat_case:
            output = tf.unstack(output, axis=-2)
            output = tf.concat(output, axis=-1)
            if verb > 1: print(' > encT reshaped output:', output)
        elif verb > 1: print(' > encT output:', output)

        enc_vars = tf.global_variables(scope=tf.get_variable_scope().name)

    return {
        'output': output,
        'enc_vars': enc_vars,
        'hist_summ': hist_summ,
        'zeroes': zsL
    }
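
The myc_emb_look lookup with ids=[0, 0, 1, 1, 1, 1, 1] adds a learned ownership marker to each of the seven card embeddings, presumably distinguishing the two hole cards from the five board cards. A standalone sketch of that broadcast add (sizes follow the defaults above):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

emb_width = 24
c_ids = tf.compat.v1.placeholder(tf.int32, [None, 7])                  # seven card ids per sample
c_emb = tf.compat.v1.get_variable('c_emb', [53, emb_width])            # 52 cards + 'no_card'
myc_emb = tf.compat.v1.get_variable('myc_emb', [2, emb_width])         # two marker rows (presumably mine vs. board)

c_emb_look = tf.nn.embedding_lookup(c_emb, c_ids)                      # [batch, 7, emb_width]
myc_emb_look = tf.nn.embedding_lookup(myc_emb, [0, 0, 1, 1, 1, 1, 1])  # [7, emb_width]
enc_input = c_emb_look + myc_emb_look                                  # marker broadcast over the batch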
Example #6
def card_net(
        name='card_net',
        tat_case: bool = False,
        emb_width: int = 24,
        t_drop: float = 0,
        f_drop: float = 0,
        in_proj: int = None,  # None, 0 or int
        activation=tf.nn.relu,
        # TRNS
        n_layers: int = 8,
        dense_mul=4,
        dropout=0,  # dropout of encoder transformer
        # DRT & classif
        dense_proj=None,  # None, 0 or int
        dr_layers=2,  # None, 0 or int
        dr_scale=6,
        dropout_DR=0,  # DR dropout
        # train parameters
        opt_class=partial(tf.compat.v1.train.AdamOptimizer, beta1=0.7, beta2=0.7),
        iLR=1e-3,
        warm_up=10000,
        ann_base=0.999,
        ann_step=0.04,
        n_wup_off=1,
        avt_SVal=0.1,
        avt_window=500,
        avt_max_upd=1.5,
        do_clip=False,
        seed=12321,
        verb=0):

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        zsL = []
        hist_summ = []

        train_PH = tf.placeholder_with_default(  # train placeholder
            input=False, name='train_PH', shape=[])

        inA_PH = tf.placeholder(  # 7 cards of A
            name='inA_PH', dtype=tf.int32, shape=[None, 7])  # [bsz,7cards]

        inB_PH = tf.placeholder(  # 7 cards of B
            name='inB_PH', dtype=tf.int32, shape=[None, 7])  # [bsz,7cards]

        won_PH = tf.placeholder(  # winner class labels: 0 - A wins, 1 - B wins, 2 - draw
            name='won_PH',
            dtype=tf.int32,
            shape=[None])  # [bsz]

        rnkA_PH = tf.placeholder(  # rank A class (labels <0,8>)
            name='rnkA_PH',
            dtype=tf.int32,
            shape=[None])  # [bsz]

        rnkB_PH = tf.placeholder(  # rank B class (labels <0,8>)
            name='rnkB_PH',
            dtype=tf.int32,
            shape=[None])  # [bsz]

        mcA_PH = tf.placeholder(  # chances of winning for A (montecarlo)
            name='mcA_PH',
            dtype=tf.float32,
            shape=[None])  # [bsz]

        # cards encoders for A and B
        enc_outL = []
        for cPH in [inA_PH, inB_PH]:
            enc_outL.append(
                cards_enc(c_ids=cPH,
                          emb_width=emb_width,
                          train_flag=train_PH,
                          tat_case=tat_case,
                          t_drop=t_drop,
                          f_drop=f_drop,
                          in_proj=in_proj,
                          dense_mul=dense_mul,
                          activation=activation,
                          dropout=dropout,
                          n_layers=n_layers,
                          seed=seed,
                          verb=verb))

        enc_vars = enc_outL[0]['enc_vars']  # encoder variables (with cards embeddings)
        zsL += enc_outL[0]['zeroes']  # get nn_zeros from A
        hist_summ += enc_outL[0]['hist_summ']  # get histograms from A

        # where all cards of A are known
        where_all_ca = tf.reduce_max(inA_PH, axis=-1)
        where_all_ca = tf.where(condition=where_all_ca < 52,
                                x=tf.ones_like(where_all_ca),
                                y=tf.zeros_like(where_all_ca))
        if verb > 1: print('\n > where_all_ca', where_all_ca)
        where_all_caF = tf.cast(where_all_ca,
                                dtype=tf.float32)  # cast to float

        # rank A classifier
        logits_RA = lay_dense(input=enc_outL[0]['output'],
                              units=9,
                              name='dense_RC',
                              reuse=tf.AUTO_REUSE,
                              use_bias=False,
                              seed=seed)
        loss_RA = tf.nn.sparse_softmax_cross_entropy_with_logits(  # loss rank A
            labels=rnkA_PH, logits=logits_RA)
        loss_RA = tf.reduce_mean(
            loss_RA * where_all_caF)  # lossRA masked (where all cards @A)

        # rank B classifier
        logits_RB = lay_dense(input=enc_outL[1]['output'],
                              units=9,
                              name='dense_RC',
                              reuse=tf.AUTO_REUSE,
                              use_bias=False,
                              seed=seed)
        loss_RB = tf.nn.sparse_softmax_cross_entropy_with_logits(  # loss rank B
            labels=rnkB_PH, logits=logits_RB)
        loss_RB = tf.reduce_mean(loss_RB)

        loss_R = loss_RA + loss_RB
        if verb > 1: print(' > loss_R:', loss_R)

        # winner classifier (on concatenated representations)
        out_conc = tf.concat([enc_outL[0]['output'], enc_outL[1]['output']],
                             axis=-1)
        if verb > 1: print(' > out_conc:', out_conc)
        if dr_layers:
            enc_out = enc_DRT(input=out_conc,
                              name='drt_W',
                              lay_width=dense_proj,
                              n_layers=dr_layers,
                              dns_scale=dr_scale,
                              activation=activation,
                              dropout=dropout_DR,
                              training_flag=train_PH,
                              n_hist=0,
                              seed=seed,
                              verb=verb)
            out_conc = enc_out['output']
            zsL += enc_out['zeroes']
            hist_summ += enc_out['hist_summ']
        logits_W = lay_dense(  # projection to 3 winner logits
            input=out_conc,
            units=3,
            name='dense_W',
            reuse=tf.AUTO_REUSE,
            use_bias=False,
            seed=seed)
        if verb > 1: print(' > logits_W:', logits_W)
        loss_W = tf.nn.sparse_softmax_cross_entropy_with_logits(  # loss wonPH
            labels=won_PH, logits=logits_W)
        loss_W = tf.reduce_mean(
            loss_W * where_all_caF)  # loss winner classifier, masked
        if verb > 1: print(' > loss_W:', loss_W)

        # probability of A winning regressor
        a_WP = lay_dense(input=enc_outL[0]['output'],
                         units=1,
                         name='dense_WP',
                         reuse=tf.AUTO_REUSE,
                         activation=activation,
                         use_bias=False,
                         seed=seed)
        a_WP = tf.reshape(a_WP, shape=[-1])
        if verb > 1: print(' > player a win probability:', a_WP)
        loss_AWP = tf.losses.mean_squared_error(labels=mcA_PH,
                                                predictions=a_WP)
        if verb > 1: print(' > loss_AWP:', loss_AWP)

        diff_AWP = tf.sqrt(tf.square(mcA_PH - a_WP))
        diff_AWP_mn = tf.reduce_mean(diff_AWP)
        diff_AWP_mx = tf.reduce_max(diff_AWP)

        loss = loss_W + loss_R + loss_AWP  # this is how total loss is constructed

        # accuracy of winner classifier (where all cards)
        predictions_W = tf.argmax(logits_W, axis=-1, output_type=tf.int32)
        if verb > 1: print(' > predictionsW:', predictions_W)
        correct_W = tf.equal(predictions_W, won_PH)
        if verb > 1: print(' > correct_W:', correct_W)
        correct_WF = tf.cast(correct_W, dtype=tf.float32)
        correct_WF_where = correct_WF * where_all_caF
        acc_W = tf.reduce_sum(correct_WF_where) / tf.reduce_sum(where_all_caF)
        if verb > 1: print(' > acc_W:', acc_W)

        # accuracy of winner classifier per class (where all cards)
        oh_won = tf.one_hot(
            indices=won_PH,
            depth=3)  # OH [batch,3], 1 where wins, dtype tf.float32
        oh_won_where = oh_won * tf.stack([where_all_caF] * 3,
                                         axis=1)  # masked where all cards
        won_density = tf.reduce_mean(
            oh_won_where, axis=0)  # [3] measures density of 1 @batch per class
        oh_correct = tf.where(condition=correct_W,
                              x=oh_won_where,
                              y=tf.zeros_like(oh_won))  # [batch,3]
        won_corr_density = tf.reduce_mean(oh_correct, axis=0)
        acc_WC = won_corr_density / won_density

        oh_notcorrect_W = tf.where(
            condition=tf.logical_not(correct_W),
            x=oh_won,
            y=tf.zeros_like(oh_won))  # OH wins where not correct
        oh_notcorrect_W *= tf.stack([where_all_caF] * 3,
                                    axis=1)  # masked with all cards

        # acc of rank(B)
        predictions_R = tf.argmax(logits_RB, axis=-1, output_type=tf.int32)
        correct_R = tf.equal(predictions_R, rnkB_PH)
        acc_R = tf.reduce_mean(tf.cast(correct_R, dtype=tf.float32))
        if verb > 1: print(' > acc_R:', acc_R)

        # acc of rank(B) per class
        oh_rnkB = tf.one_hot(indices=rnkB_PH, depth=9)
        rnkB_density = tf.reduce_mean(oh_rnkB, axis=0)
        oh_correct_R = tf.where(condition=correct_R,
                                x=oh_rnkB,
                                y=tf.zeros_like(oh_rnkB))
        rnkB_corr_density = tf.reduce_mean(oh_correct_R, axis=0)
        acc_RC = rnkB_corr_density / rnkB_density

        oh_notcorrect_R = tf.where(
            condition=tf.logical_not(correct_R),
            x=oh_rnkB,
            y=tf.zeros_like(oh_rnkB))  # OH ranks where not correct

        cls_vars = tf.global_variables(scope=tf.get_variable_scope().name)
        cls_vars = [var for var in cls_vars if var not in enc_vars]

    return {
        'train_PH': train_PH,
        'inA_PH': inA_PH,
        'inB_PH': inB_PH,
        'won_PH': won_PH,
        'rnkA_PH': rnkA_PH,
        'rnkB_PH': rnkB_PH,
        'mcA_PH': mcA_PH,
        'loss': loss,  # total training loss (sum)
        'loss_W': loss_W,  # loss of winner classifier
        'loss_R': loss_R,  # loss of rank classifier
        'loss_AWP': loss_AWP,  # loss of A prob win
        'diff_AWP_mn': diff_AWP_mn,  # min diff of A prob win
        'diff_AWP_mx': diff_AWP_mx,  # max diff of A prob win
        'acc_W': acc_W,
        'acc_WC': acc_WC,
        'predictions_W': predictions_W,
        'oh_notcorrect_W': oh_notcorrect_W,
        'acc_R': acc_R,
        'acc_RC': acc_RC,
        'predictions_R': predictions_R,
        'oh_notcorrect_R': oh_notcorrect_R,
        'hist_summ': tf.summary.merge(hist_summ),
        'zeroes': tf.concat(zsL, axis=-1),
        'enc_vars': enc_vars,
        'cls_vars': cls_vars
    }
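
The where_all_ca mask relies on id 52 being the 'no_card' padding (the embedding table in cards_enc has 53 rows): if the maximum card id of hand A is below 52, all seven cards are known, and the winner and rank-A losses/accuracies are weighted by that mask. A standalone sketch of the mask:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

inA_PH = tf.compat.v1.placeholder(tf.int32, [None, 7])  # 7 card ids, 52 = 'no_card'
max_id = tf.reduce_max(inA_PH, axis=-1)                 # [batch]
where_all_ca = tf.where(max_id < 52,
                        tf.ones_like(max_id),
                        tf.zeros_like(max_id))          # 1 where all 7 cards are known
where_all_caF = tf.cast(where_all_ca, tf.float32)       # float mask for loss / accuracy weighting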
Example #7
def enc_DRT(
        input,
        name='enc_DRT',
        shared_lays: bool = False,  # shared variables in enc_layers
        n_layers=12,
        lay_width: int = None,  # for None matches input width
        dns_scale=6,  # scale(*) of first dense
        activation=tf.nn.relu,  # gelu is really worth a try
        dropout=0.0,  # dropout after two denses
        training_flag=None,  # training flag tensor (for dropout)
        initializer=None,
        seed=12321,
        n_hist=4,  # number of histogram layers (for TB)
        verb=0):

    lay_width_matched = ''
    if lay_width is None:
        lay_width = input.shape.as_list()[-1]
        lay_width_matched = '(lay_width taken from input width)'
    if verb > 0:
        drp = 0.0 if not dropout else dropout
        print(
            f'\nBuilding DRTencoder ({n_layers}x{lay_width} drop:{drp:.2f}) {lay_width_matched}...'
        )

    if initializer is None: initializer = my_initializer(seed)

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(' > histogram layers of DRTencoder:', hist_layers)

    zsL = []  # zeroes list
    with tf.variable_scope(name):

        # input projection
        iW = input.shape[-1]
        if iW != lay_width:
            input = lay_dense(input=input,
                              units=lay_width,
                              use_bias=False,
                              initializer=initializer,
                              seed=seed)
            if verb > 0:
                print('projected input to layWidth(%d) since it differs(%d)' %
                      (lay_width, iW))

        input = tf.keras.layers.LayerNormalization(axis=-1)(
            input)  # input layer_norm

        output = input  # for 0 layers case
        for nL in range(n_layers):

            lay_name = f'DRLay_{nL}' if not shared_lays else 'DRLay_shared'
            lay_out = lay_DRT(input=output,
                              name=lay_name,
                              hist_name=name,
                              dns_scale=dns_scale,
                              activation=activation,
                              dropout=dropout,
                              training_flag=training_flag,
                              initializer=initializer,
                              seed=seed)

            output = lay_out['output']
            if nL in hist_layers: hist_summ.append(lay_out['hist_summ'])
            zsL += lay_out['zeroes']

    return {'output': output, 'hist_summ': hist_summ, 'zeroes': zsL}
Example #8
    def tblock(in_seq, seed, task_query=None):

        hist_summ = []

        output = in_seq
        taskQueryNorm = None
        if task_query is None:
            hist_summ.append(
                tf.summary.histogram('a_inputSeq', output, family=name))
            # layer norm 1 on seq
            if do_LN:
                output = tf.keras.layers.LayerNormalization(axis=-1)(output)
                hist_summ.append(
                    tf.summary.histogram('b_inputSeqLN', output, family=name))
        else:
            hist_summ.append(
                tf.summary.histogram('a_inTaskQuery', task_query, family=name))
            taskQueryNorm = task_query
            # layer norm 1 on taskQuery
            if do_LN:
                taskQueryNorm = tf.keras.layers.LayerNormalization(
                    axis=-1)(task_query)
                hist_summ.append(
                    tf.summary.histogram('b_taskQueryLN',
                                         taskQueryNorm,
                                         family=name))

        # multi head self attention
        mha_out = mh_attn(in_seq=output,
                          query=taskQueryNorm,
                          dropout_att=dropout_att,
                          drop_flag=training_flag,
                          seed=seed)
        output = mha_out['attention']
        att_vals = mha_out['att_vals']
        hist_summ.append(tf.summary.histogram('c_mhAttn', output, family=name))

        # dense without activation
        output = lay_dense(input=output,
                           units=output.shape[-1].value,
                           name='afterAttProj',
                           initializer=initializer,
                           seed=seed)
        hist_summ.append(
            tf.summary.histogram('d_denseAftAtt', output, family=name))

        if dropout:
            output = tf.layers.dropout(inputs=output,
                                       rate=dropout,
                                       training=training_flag,
                                       seed=seed)

        # residual 1
        if task_query is None:
            res1_out = in_seq + output
            hist_summ.append(
                tf.summary.histogram('e_res_onInputSeq', res1_out,
                                     family=name))
        else:
            res1_out = task_query + output
            hist_summ.append(
                tf.summary.histogram('e_res_onTaskQuery',
                                     res1_out,
                                     family=name))

        output = res1_out
        # layer norm 2
        if do_LN:
            output = tf.keras.layers.LayerNormalization(axis=-1)(output)
            hist_summ.append(
                tf.summary.histogram('f_layNorm', output, family=name))

        # 2x dense
        base_width = output.shape[-1].value
        output = lay_dense(input=output,
                           units=int(base_width * dense_mul),
                           name='dense1afterAtt',
                           activation=activation,
                           initializer=initializer,
                           seed=seed)
        zsL = [zeroes(output)]
        hist_summ.append(
            tf.summary.histogram('g_1denseOut', output, family=name))
        output = lay_dense(input=output,
                           units=base_width,
                           name='dense2afterAtt',
                           initializer=initializer,
                           seed=seed)
        hist_summ.append(
            tf.summary.histogram('h_2denseOut', output, family=name))

        if dropout:
            output = tf.layers.dropout(inputs=output,
                                       rate=dropout,
                                       training=training_flag,
                                       seed=seed)

        # residual2
        output += res1_out
        hist_summ.append(tf.summary.histogram('i_res', output, family=name))

        return {
            'output': output,
            'hist_summ': hist_summ,
            'att_vals': att_vals,
            'zeroes': zsL
        }
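
Stripped of histogram summaries and dropout, tblock is a pre-LN transformer layer: layer norm, (self-)attention, projection, residual, layer norm, two-layer feed-forward, residual. A compact self-contained sketch of the same ordering with inline single-head attention (illustrative only, not the repository's helpers):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

width, dense_mul = 24, 4
in_seq = tf.compat.v1.placeholder(tf.float32, [None, None, width])

x = tf.keras.layers.LayerNormalization(axis=-1)(in_seq)            # layer norm 1
qkv = tf.compat.v1.layers.dense(x, width * 3)                      # KQV projection
q, k, v = tf.split(qkv, 3, axis=-1)
w = tf.nn.softmax(tf.matmul(q, k, transpose_b=True) / width ** 0.5, axis=-1)
att = tf.matmul(w, v)                                              # single-head attention
att = tf.compat.v1.layers.dense(att, width)                        # after-attention projection
res1 = in_seq + att                                                # residual 1
x = tf.keras.layers.LayerNormalization(axis=-1)(res1)              # layer norm 2
x = tf.compat.v1.layers.dense(x, width * dense_mul, tf.nn.relu)    # dense 1 (with activation)
x = tf.compat.v1.layers.dense(x, width)                            # dense 2 (no activation)
out = x + res1                                                     # residual 2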
Example #9
def lay_DRT(
        input,
        name='lay_DRT',  # scope name, be careful when stacked since auto_reuse
        hist_name=None,  # family name of histogram
        dns_scale=4,
        activation=tf.nn.relu,  # gelu is really worth a try
        dropout=None,  # dropout (after two denses)
        training_flag=None,  # training flag tensor (for dropout)
        initializer=None,
        seed=12321):

    if not hist_name: hist_name = name
    lay_width = input.shape[-1]
    if initializer is None: initializer = my_initializer(seed)
    hist_summ = []

    with tf.variable_scope(name_or_scope=name, reuse=tf.AUTO_REUSE):

        hist_summ.append(
            tf.summary.histogram('a_denseSin', input, family=hist_name))

        # dense (scale up)
        output = lay_dense(input=input,
                           units=int(lay_width * dns_scale),
                           activation=None,
                           use_bias=True,
                           initializer=initializer,
                           seed=seed,
                           name='denseS')
        hist_summ.append(
            tf.summary.histogram('b_denseSout', output, family=hist_name))

        # activation
        output = activation(output)
        zsL = [zeroes(output)]  # zeroes list
        hist_summ.append(
            tf.summary.histogram('c_activation', output, family=hist_name))

        # dense (scale down) no activ
        output = lay_dense(input=output,
                           units=lay_width,
                           name='DRTdenseNA',
                           use_bias=True,
                           initializer=initializer,
                           seed=seed)
        hist_summ.append(
            tf.summary.histogram('d_denseNAout', output, family=hist_name))

        # layer dropout
        if dropout:
            output = tf.layers.dropout(inputs=output,
                                       rate=dropout,
                                       training=training_flag,
                                       seed=seed)

        # residual
        output = lay_res(input, output)
        hist_summ.append(
            tf.summary.histogram('e_residual', output, family=hist_name))

        # layer_norm
        output = tf.keras.layers.LayerNormalization(axis=-1)(output)
        hist_summ.append(
            tf.summary.histogram('f_LAYout', output, family=hist_name))

    return {'output': output, 'hist_summ': hist_summ, 'zeroes': zsL}
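
A minimal self-contained rendering of the same dense-residual pattern (scale up, activate, scale down, residual add, layer norm; dropout omitted), using tf.compat.v1.layers.dense instead of the repository's lay_dense / lay_res helpers:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

lay_width, dns_scale = 32, 4
x = tf.compat.v1.placeholder(tf.float32, [None, lay_width])

h = tf.compat.v1.layers.dense(x, lay_width * dns_scale)   # dense, scale up
h = tf.nn.relu(h)                                          # activation
h = tf.compat.v1.layers.dense(h, lay_width)                # dense, scale down, no activation
out = tf.keras.layers.LayerNormalization(axis=-1)(x + h)   # residual + layer norm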
Example #10
def enc_CNN(
        input: tf.Tensor,
        history: tf.Tensor = None,  # optional history (state) tensor with shape [bsz, n_layers, kernel-1, n_filters] >> masked cnn
        name='enc_CNN',
        # layer params
        shared_lays: bool = False,  # shared variables in enc_layers
        n_layers: int = 12,  # num of layers
        kernel: int = 3,  # layer kernel
        n_filters: int = 128,  # num of filters
        activation=tf.nn.relu,  # global enc activation func, gelu is really worth a try
        lay_drop: float or None = 0.0,
        ldrt_scale: int or None = 0,  # DRT @enc_lay - scale(*) of first dense; for None or 0 the DRT @lay won't be built
        ldrt_drop: float or None = 0.0,  # DRT @enc_lay - dropout
        # other
        training_flag: tf.Tensor or bool = None,  # dropout training flag tensor
        initializer=None,
        seed: int = 12321,
        n_hist: int = 4,  # number of histogram layers
        verb=0):

    if verb > 0:
        print(
            f'\n *** enc_CNN *** Building {name} ({n_layers}x{n_filters})...')

    if initializer is None: initializer = my_initializer(seed)

    # manage history
    history_lays = None
    if history is not None:
        history_lays = tf.unstack(history, axis=-3)
        if verb > 1:
            print(
                f' > state_lays len {len(history_lays)} of: {history_lays[0]}')

    hist_summ = []
    hist_layers = list_of_layers(n_layers, n_select=n_hist)
    if verb > 1: print(f' > histogram layers of cnn encoder: {hist_layers}')

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        # inputs of the following layers are stored here to extract the state (history)
        input_lays = []
        zsL = []  # zeroes

        # input projection - to match n_filters and input width
        if verb > 1: print(f' > encoder input: {input}')
        if input.shape[-1] != n_filters:
            input = lay_dense(input=input,
                              units=n_filters,
                              name='enc_input_projection',
                              initializer=initializer)
            if verb > 1: print(f' > encoder projected input: {input}')

        output = input  # for 0 layers case
        sub_output = input  # first input
        for depth in range(n_layers):

            lay_name = f'enc_CNN_lay_{depth}' if not shared_lays else 'enc_CNN_lay_shared'
            if verb > 1: print(f'<< layer {lay_name}:')

            lay_input = tf.concat([history_lays[depth], sub_output],
                                  axis=-2) if history_lays else sub_output
            if verb > 1:
                print(f' > sub_output (previous): {sub_output}')
                print(f' > lay_input (eventually padded): {lay_input}')
            input_lays.append(lay_input)

            hist_lay = depth in hist_layers

            with tf.variable_scope(lay_name):

                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('a_lay_in',
                                             lay_input,
                                             family=name))

                # LN
                lay_input = tf.keras.layers.LayerNormalization(
                    axis=-1)(lay_input)
                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('b_LN', lay_input, family=name))

                # conv no activation
                output = lay_conv1D(
                    input=lay_input,
                    name='conv1D',
                    kernels=kernel,
                    filters=n_filters,
                    activation=None,
                    initializer=initializer,
                    padding='same' if history is None else 'valid',
                    seed=seed,
                    verb=0)
                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('c_cnn', output, family=name))

                # activation
                if activation:
                    output = activation(output)
                    zsL += [zeroes(output)]  # catch zeroes
                    if hist_lay:
                        hist_summ.append(
                            tf.summary.histogram('d_activation',
                                                 output,
                                                 family=name))

                # dropout
                if lay_drop:
                    output = tf.layers.dropout(inputs=output,
                                               rate=lay_drop,
                                               training=training_flag,
                                               seed=seed)
                    if hist_lay:
                        hist_summ.append(
                            tf.summary.histogram('e_drop', output,
                                                 family=name))

                # RES, here we take sub_output, since lay_input may be padded by history
                output += sub_output
                if hist_lay:
                    hist_summ.append(
                        tf.summary.histogram('f_residual', output,
                                             family=name))

                if verb > 1: print(f' > output (layer): {output}')

                if ldrt_scale:
                    lay_out = lay_DRT(input=output,
                                      name=lay_name + '_lay_DRT',
                                      hist_name=name,
                                      dns_scale=ldrt_scale,
                                      activation=activation,
                                      dropout=ldrt_drop,
                                      training_flag=training_flag,
                                      initializer=initializer,
                                      seed=seed)
                    output = lay_out['output']
                    zsL += lay_out['zeroes']
                    if hist_lay: hist_summ.append(lay_out['hist_summ'])

                sub_output = output

    output = tf.keras.layers.LayerNormalization(axis=-1)(output)  # final LN

    # prepare fin_state
    fin_state = None
    if history is not None:
        state = tf.stack(input_lays, axis=-3)
        if verb > 1: print(f' > state (stacked): {state}')
        fin_state = tf.split(state,
                             num_or_size_splits=[-1, kernel - 1],
                             axis=-2)[1]
        if verb > 1: print(f' > fin_state (split): {fin_state}')

    if verb > 1: print(f' > {name} output: {output}')
    return {
        'output': output,
        'state': fin_state,  # history for next
        'hist_summ': hist_summ,
        'zeroes': zsL
    }
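
When history is given, each layer input is the stored state (the trailing kernel-1 timesteps of that layer from the previous call) concatenated in front of the new sub-sequence, and the convolution switches to 'valid' padding so the output length matches the new input. The returned state keeps only those trailing kernel-1 positions per layer. A standalone sketch of that split (shapes are illustrative):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

kernel, n_layers, n_filters, seq = 3, 2, 8, 5
# stacked layer inputs: [batch, n_layers, (kernel-1) + seq, n_filters]
state = tf.compat.v1.placeholder(tf.float32, [None, n_layers, (kernel - 1) + seq, n_filters])
# keep only the trailing kernel-1 timesteps of every layer as history for the next call
fin_state = tf.split(state, num_or_size_splits=[-1, kernel - 1], axis=-2)[1]
# fin_state shape: [batch, n_layers, kernel-1, n_filters]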
Example #11
def dvc_model(
        seed: int,  # seed for TF OPs
        multi_sen: int,
        train_tower: bool,
        vec_width: int,
        tok_emb,  # tuple with embeddings shape or np.arr/LL with values of embeddings
        seq_width: int,
        max_seq_len: int,
        drt_scale: float or int,  # global DRT scale
        classes: int or list,  # (Multi-Classif)
        vtc_drop: float,
        vtc_proj: int,
        drtC_nLay: int,
        drtC_drop: float,
        out_drop: float,
        l2lc: float,
        verb,
        **kwargs):

    #actv_func = tf.nn.relu
    actv_func = gelu

    hist_summ = []
    zsL = []
    isVec = vec_width is not None
    isTks = tok_emb is not None
    isSeq = seq_width is not None

    if verb > 0:
        print('\n*** DVCmodel *** builds graph for', end='')
        if isVec: print(' vec(%d)' % vec_width, end='')
        if isTks: print(' tks (tokens sequence)', end='')
        if isSeq: print(' seq (vectors sequence)', end='')
        print()

    if type(classes) is not list:
        classes = [classes] if classes is not None else []  # classes may be None >> no classifiers

    with tf.variable_scope(name_or_scope='FWD'):

        # ********************************* input placeholders
        vec_PHL = [
            tf.compat.v1.placeholder(name='vec%d_PH' % nS,
                                     dtype=tf.float32,
                                     shape=[None, vec_width])
            for nS in range(multi_sen)
        ] if isVec else None

        tks_PHL = [
            tf.compat.v1.placeholder(name='tks%d_PH' % nS,
                                     dtype=tf.int32,
                                     shape=[None, max_seq_len])
            for nS in range(multi_sen)
        ] if isTks else None  # batch, seqLen

        seq_PHL = [
            tf.compat.v1.placeholder(name='seq%d_PH' % nS,
                                     dtype=tf.float32,
                                     shape=[None, max_seq_len, seq_width])
            for nS in range(multi_sen)
        ] if isSeq else None  # batch, seqLen, vec

        lab_PHL = [
            tf.compat.v1.placeholder(name='labC%d_ID' % nC,
                                     dtype=tf.int32,
                                     shape=[None])
            for nC in range(len(classes))
        ]

        train_flag_PH = tf.compat.v1.placeholder(
            name='train_flag', dtype=tf.bool,
            shape=[])  # placeholder marking training process

        # ********************************* encTowers
        if verb > 0: print('...building %d DVC encTowers' % multi_sen)
        enc_outs = []
        for nS in range(multi_sen):
            encT_out = enc_tower(
                actv_func=actv_func,
                vec_PH=vec_PHL[nS] if vec_PHL is not None else None,
                tks_PH=tks_PHL[nS] if tks_PHL is not None else None,
                seq_PH=seq_PHL[nS] if seq_PHL is not None else None,
                train_flag_PH=train_flag_PH,
                tok_emb=tok_emb,
                max_seq_len=max_seq_len,
                drt_scale=drt_scale,
                seed=seed,
                verb=verb,
                **kwargs)
            enc_outs.append(encT_out)

        vec_output = tf.concat([eo['vector'] for eo in enc_outs], axis=-1)
        if len(enc_outs) > 1 and verb > 0:
            print('\n > outputs (concatenated) of %d towers:' % len(enc_outs),
                  vec_output)

        tower_vars = enc_outs[0]['tower_vars']
        hist_summ += enc_outs[0]['hist_summ']
        for encT_out in enc_outs:
            zsL += encT_out['zeroes']

        hist_summ.append(
            tf.summary.histogram('5towersOut_concatALL',
                                 vec_output,
                                 family='C.cls'))

    # ********************************* Multi-Classifier
    with tf.variable_scope('vClassif'):

        if classes:

            # dropout on vector to classifier
            if vtc_drop:
                vec_output = tf.layers.dropout(inputs=vec_output,
                                               rate=vtc_drop,
                                               training=train_flag_PH,
                                               seed=seed)
                if verb > 1:
                    print(
                        ' > dropout %.2f applied to vec_output of tower(s):' %
                        vtc_drop, vec_output)

            # projection on vector to classifier
            if vtc_proj and vtc_proj != vec_output.shape.as_list()[-1]:

                vec_output = lay_dense(input=vec_output,
                                       units=vtc_proj,
                                       activation=None,
                                       use_bias=True,
                                       seed=seed,
                                       name='inVProjection')
                if verb > 1: print(' > projected vector input:', vec_output)
                hist_summ.append(
                    tf.summary.histogram('7vecTCProj',
                                         vec_output,
                                         family='C.cls'))

                # layerNorm (after projection)
                vec_output = tf.contrib.layers.layer_norm(inputs=vec_output,
                                                          begin_norm_axis=-1,
                                                          begin_params_axis=-1)
                hist_summ.append(
                    tf.summary.histogram('8projLNorm',
                                         vec_output,
                                         family='C.cls'))

            mc_losses = []
            mc_probs = []
            if verb > 1: print('\nBuilding multi-classifier graphs...')
            for cix in range(len(classes)):
                if verb > 1:
                    print(' > multi-classifier (%d/%d):' %
                          (cix + 1, len(classes)))

                # DRT encoder @classifier
                if drtC_nLay:
                    eDRTout = enc_DRT(input=vec_output,
                                      n_layers=drtC_nLay,
                                      dns_scale=drt_scale,
                                      activation=actv_func,
                                      dropout=drtC_drop,
                                      training_flag=train_flag_PH,
                                      seed=seed,
                                      n_hist=2,
                                      verb=verb)
                    vec_output = eDRTout['output']
                    zsL += eDRTout['zeroes']

                if out_drop:
                    vec_output = tf.layers.dropout(inputs=vec_output,
                                                   rate=out_drop,
                                                   training=train_flag_PH,
                                                   seed=seed)

                logits = lay_dense(input=vec_output,
                                   units=classes[cix],
                                   activation=None,
                                   use_bias=True,
                                   seed=seed,
                                   name='logits_projection_cix%d' % cix)
                if verb > 1: print(' >> logits (projected)', logits)
                hist_summ.append(
                    tf.summary.histogram('9logits', logits, family='C.cls'))

                probs = tf.nn.softmax(logits,
                                      name=f'predict_probabilities_c{cix}')
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                if verb > 1: print(' >> predictions:', predictions)
                correct = tf.equal(predictions, lab_PHL[cix])
                if verb > 1: print(' >> correct prediction:', correct)
                accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

                # softmax loss
                cLoss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=lab_PHL[cix])
                if verb > 1:
                    print(' > cLoss (softmax)',
                          cLoss)  # shape [batch] (per sample)
                """ TODO (experimental): scaled cLoss
                # scale cLoss
                scale = tf.where(
                    condition=  correct,
                    x=          tf.ones_like(correct, dtype=tf.float32)*tf.constant(0.8), # positive scale
                    y=          tf.ones_like(correct, dtype=tf.float32)*tf.constant(1.7)) # negative scale
                cLoss *= scale
                """

                mc_losses.append(cLoss)
                mc_probs.append(probs)

        # average all losses (multi-classifiers losses)
        loss = tf.reduce_mean(tf.stack(mc_losses))  # scalar
        if verb > 1: print(' > loss (averaged all multi-classif)', loss)

        class_vars = tf.global_variables(
            scope=tf.get_variable_scope().name)  # vClass variables

        train_vars = []
        if train_tower: train_vars += tower_vars
        train_vars += class_vars

        # L2 cLoss
        if l2lc:
            restrictedNames = [
                'bias',         # dense bias
                'beta',         # LN offset
                'gamma',        # LN scale
                'tns_pos_emb',  # position embeddings
                'tok_emb',      # token embeddings
            ]
            if verb > 1:
                print(' > applying L2 cLoss to variables (not including %s)' %
                      restrictedNames)
            l2Vars = []
            for v in train_vars:
                vIsOk = True
                for nmp in restrictedNames:
                    if nmp in v.name: vIsOk = False
                if vIsOk: l2Vars.append(v)
            if verb > 1:
                print(' > L2 / all(--) variables of model:')
                for var in train_vars:
                    if var in l2Vars: print(' >> L2', var)
                    else: print(' >> --', var)
            l2loss = tf.add_n([tf.nn.l2_loss(v)
                               for v in l2Vars]) * l2lc  # shape [1]
            if verb > 1: print(' > L2 cLoss', l2loss)
            loss += l2loss

    return {
        # placeholders
        'vec_PHL': vec_PHL,
        'tks_PHL': tks_PHL,
        'seq_PHL': seq_PHL,
        'lab_PHL': lab_PHL,
        'train_flag_PH': train_flag_PH,
        # variables
        'train_vars': train_vars,  # to train
        'tower_vars': tower_vars,  # to save
        'class_vars': class_vars,  # to save
        # tensors
        'probs': probs,  # ...of last multi-classifier
        'mc_probs': mc_probs,
        'predictions': predictions,  # ...of last multi-classifier
        'accuracy': accuracy,  # ...of last multi-classifier
        'loss': loss,  # avg of all multi-classifiers
        'hist_summ': tf.summary.merge(hist_summ),
        'zeroes': zsL
    }
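
The L2 term is applied only to weight matrices: any trainable variable whose name contains one of the restricted substrings (dense biases, layer-norm offset/scale, positional and token embeddings) is excluded. A standalone sketch of that filter; proj_kernel and proj_bias are hypothetical variable names used only for illustration:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

l2lc = 1e-4
restrictedNames = ['bias', 'beta', 'gamma', 'tns_pos_emb', 'tok_emb']
proj_kernel = tf.compat.v1.get_variable('proj_kernel', [8, 4])   # hypothetical weight matrix
proj_bias = tf.compat.v1.get_variable('proj_bias', [4])          # hypothetical bias

train_vars = [proj_kernel, proj_bias]
l2Vars = [v for v in train_vars
          if not any(nmp in v.name for nmp in restrictedNames)]  # keeps only proj_kernel
l2loss = tf.add_n([tf.nn.l2_loss(v) for v in l2Vars]) * l2lc     # scalar L2 penalty added to the loss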
Example #12
def enc_tower(
        actv_func,  # activation function
        vec_PH: tf.compat.v1.placeholder,  # vector placeholder     (vec input)
        tks_PH: tf.compat.v1.placeholder,  # tokens seq placeholder (seq input - IDs)
        seq_PH: tf.compat.v1.placeholder,  # vector seq placeholder (seq input)
        train_flag_PH: tf.compat.v1.placeholder,  # train flag placeholder
        tok_emb,
        tok_emb_train: bool,  # flag, when True tok_emb are trainable
        tok_emb_add,  # np.arr/LL with values of additional embeddings (always trainable)
        # vectors processing
        inV_drop: float,
        inV_proj: int or None,  # value equal to last dimension width turns-off projection
        drt_nLay,
        drt_scale,
        drt_drop,
        # sequence params
        inS_drop: float,
        intime_drop: float,
        infeat_drop: float,
        inS_proj: int or None,  # value equal to last dimension width turns-off projection
        inS_actv: bool,  # inS_proj activation
        # seq encoders params
        cnn_nLay,
        rnn_nLay,
        max_seq_len,
        tns_nBlocks,
        enc_drop,
        tnsAT_drop,
        tns_scale,
        tat_nBlocks,
        tatAT_drop,
        tat_drop,
        # other
        seed,
        verb,
        **kwargs):

    if verb > 0: print('\nenc_tower inits...')
    zsL = []
    hist_summ = []

    with tf.variable_scope('encTower', reuse=tf.AUTO_REUSE):
        # list of vectors to concatenate (vec from vec_PH + reduced sequence from tks_PH & seq_PH)
        vectorL = []

        # ********************************* vector processing
        if vec_PH is not None:
            vector = vec_PH
            if verb > 1: print(' > vector input:', vector)
            hist_summ.append(
                tf.summary.histogram('1vecIn', vector, family='A.vec'))

            # layerNorm (on input, always)
            vector = tf.contrib.layers.layer_norm(inputs=vector,
                                                  begin_norm_axis=-1,
                                                  begin_params_axis=-1)
            hist_summ.append(
                tf.summary.histogram('2inLNorm', vector, family='A.vec'))

            # dropout (on input, before projection)
            if inV_drop:
                vector = tf.layers.dropout(inputs=vector,
                                           rate=inV_drop,
                                           training=train_flag_PH,
                                           seed=seed)
                if verb > 1:
                    print(' > dropout %.2f applied to vec:' % inV_drop, vector)

            # projection (rescales input, without activation)
            if inV_proj and inV_proj != vector.shape.as_list()[-1]:
                vector = lay_dense(input=vector,
                                   units=inV_proj,
                                   activation=None,
                                   use_bias=True,
                                   seed=seed,
                                   name='inVProjection')
                if verb > 1: print(' > projected vector input:', vector)
                hist_summ.append(
                    tf.summary.histogram('3inProj', vector, family='A.vec'))

                # layerNorm (after projection)
                vector = tf.contrib.layers.layer_norm(inputs=vector,
                                                      begin_norm_axis=-1,
                                                      begin_params_axis=-1)
                hist_summ.append(
                    tf.summary.histogram('4projLNorm', vector, family='A.vec'))

            # DRT encoder for vector @tower
            if drt_nLay:
                eDRTout = enc_DRT(input=vector,
                                  n_layers=drt_nLay,
                                  dns_scale=drt_scale,
                                  activation=actv_func,
                                  dropout=drt_drop,
                                  training_flag=train_flag_PH,
                                  seed=seed,
                                  n_hist=2,
                                  verb=verb)
                vector = eDRTout['output']
                zsL += eDRTout['zeroes']
                hist_summ += eDRTout['hist_summ']
                if verb > 1: print(' > drtLay output', vector)
                hist_summ.append(
                    tf.summary.histogram('5drtLayOut', vector, family='A.vec'))

            vectorL.append(vector)

        # ********************************* sequence processing
        sequence = None
        seq_to_concat = []

        # tokens embedding for sequence
        if tks_PH is not None:
            if type(tok_emb) is tuple:
                all_emb = tf.get_variable(  # embeddings initialized from scratch
                    name='tok_emb',
                    shape=tok_emb,
                    initializer=tf.truncated_normal_initializer(stddev=0.01,
                                                                seed=seed),
                    dtype=tf.float32,
                    trainable=True)
            else:
                all_emb = tf.get_variable(  # embeddings initialized with given variable
                    name='tok_emb',
                    initializer=tok_emb,
                    dtype=tf.float32,
                    trainable=tok_emb_train)
            if tok_emb_add is not None:
                tokEmbAddV = tf.get_variable(  # add embeddings initialized with given variable
                    name='tok_emb_add',
                    initializer=tok_emb_add,
                    dtype=tf.float32,
                    trainable=True)
                all_emb = tf.concat([all_emb, tokEmbAddV], axis=0)

            sequence = tf.nn.embedding_lookup(params=all_emb, ids=tks_PH)
            if verb > 1: print('\n > sequence (tokens lookup):', sequence)
            hist_summ.append(
                tf.summary.histogram('1seqT', sequence, family='B.seq'))
            seq_to_concat.append(sequence)

        if seq_PH is not None:
            if verb > 1: print(' > sequence of vectors:', seq_PH)
            hist_summ.append(
                tf.summary.histogram('1seqV', seq_PH, family='B.seq'))
            seq_to_concat.append(seq_PH)

        # concat sequences
        if len(seq_to_concat) == 1:
            sequence = seq_to_concat[0]
        if len(seq_to_concat) > 1:
            # it will work only when shapes match !!!
            sequence = tf.concat(seq_to_concat, axis=1)
            if verb > 1: print(' > concatenated sequence (vec+tok):', sequence)

        if sequence is not None:
            # dropout (applied to seq of tok_emb works much better than applied after projection)
            if inS_drop:
                sequence = tf.layers.dropout(inputs=sequence,
                                             rate=inS_drop,
                                             training=train_flag_PH,
                                             seed=seed)
                if verb > 1:
                    print(' > dropout %.2f applied to seq:' % inS_drop,
                          sequence)

            # time & feats drop
            if intime_drop or infeat_drop:
                sequence = tf_drop(input=sequence,
                                   time_drop=intime_drop,
                                   feat_drop=infeat_drop,
                                   train_flag=train_flag_PH,
                                   seed=seed)

            # sequence layer_norm (on (dropped)input, always)
            sequence = tf.contrib.layers.layer_norm(inputs=sequence,
                                                    begin_norm_axis=-2,
                                                    begin_params_axis=-2)
            if verb > 1: print(' > normalized seq:', sequence)
            hist_summ.append(
                tf.summary.histogram('2inLNorm', sequence, family='B.seq'))

            # in_projection (rescales input) without activation
            if inS_proj and inS_proj != sequence.shape.as_list()[-1]:
                sequence = lay_dense(
                    input=sequence,
                    units=inS_proj,
                    activation=actv_func if inS_actv else None,
                    use_bias=True,
                    seed=seed,
                    name='inSProjection')
                if verb > 1:
                    print(' > inProjection (%d) for seq:' % inS_proj, sequence)
                hist_summ.append(
                    tf.summary.histogram('3inProj', sequence, family='B.seq'))

                # layerNorm (after projection)
                sequence = tf.contrib.layers.layer_norm(inputs=sequence,
                                                        begin_norm_axis=-2,
                                                        begin_params_axis=-2)
                if verb > 1: print(' > normalized seq:', sequence)
                hist_summ.append(
                    tf.summary.histogram('4projLNorm',
                                         sequence,
                                         family='B.seq'))

            # ********* below are 3 types of seq2seq encoders stacked each on another
            enc_width = sequence.shape.as_list()[-1]
            if cnn_nLay:
                eCOut = enc_CNN(input=sequence,
                                n_layers=cnn_nLay,
                                activation=actv_func,
                                lay_drop=enc_drop,
                                training_flag=train_flag_PH,
                                n_filters=enc_width,
                                n_hist=2,
                                seed=seed,
                                verb=verb)
                sequence = eCOut['output']
                hist_summ += eCOut['hist_summ']

            if rnn_nLay:
                from tensorflow.contrib import rnn
                eLOut = enc_RNN(input=sequence,
                                cellFN=rnn.LSTMCell,
                                biDir=False,
                                cellWidth=enc_width,
                                numLays=rnn_nLay,
                                dropout=enc_drop,
                                dropFlagT=train_flag_PH,
                                seed=seed)
                sequence = eLOut['output']

            if tns_nBlocks:
                tns_out = enc_TNS(in_seq=sequence,
                                  name='encTRNS',
                                  n_blocks=tns_nBlocks,
                                  n_heads=1,
                                  dense_mul=tns_scale,
                                  activation=actv_func,
                                  max_seq_len=max_seq_len,
                                  dropout_att=tnsAT_drop,
                                  dropout=enc_drop,
                                  training_flag=train_flag_PH,
                                  seed=seed,
                                  n_hist=2,
                                  verb=verb)
                sequence = tns_out['output']
                hist_summ += tns_out['hist_summ']
                zsL += tns_out['zeroes']

            # ********** below the sequence is reduced to a single vector, with TAT or pooling
            # TAT reduction
            if tat_nBlocks:
                tat_out = enc_TNS(in_seq=sequence,
                                  seq_out=False,
                                  name='tatTRNS',
                                  n_blocks=tat_nBlocks,
                                  n_heads=1,
                                  dense_mul=tns_scale,
                                  activation=actv_func,
                                  max_seq_len=max_seq_len,
                                  dropout_att=tatAT_drop,
                                  dropout=tat_drop,
                                  training_flag=train_flag_PH,
                                  seed=seed,
                                  n_hist=2,
                                  verb=verb)
                sequence_reduced = tat_out['output']
                hist_summ += tat_out['hist_summ']
                # attVals =           tat_out['att_vals']
                zsL += tat_out['zeroes']

            # reduce sequence with a concat of mean & max pooling
            else:
                sequence_reduced = tf.concat(
                    [tf.reduce_mean(sequence, axis=-2),
                     tf.reduce_max(sequence, axis=-2)],
                    axis=-1)
                if verb > 1:
                    print(' > reduced sequence to one vec with mean (+) max:',
                          sequence_reduced)

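            # either way sequence_reduced holds one vector per sample
            # (the mean (+) max concat doubles the feature width to 2*enc_width)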
            vectorL.append(sequence_reduced)

        # ********************************* concatenate and finish
        vector = tf.concat(vectorL, axis=-1) if len(vectorL) > 1 else vectorL[0]
        if verb > 1: print(' > vector (tower output):', vector)

        tower_vars = tf.global_variables(
            scope=tf.get_variable_scope().name)  # eTower variables

    return {
        'vector': vector,
        'sequence': sequence,
        'tower_vars': tower_vars,
        'hist_summ': hist_summ,
        'zeroes': zsL
    }
Example No. 13
def cnn_DMG(
        name :str,
        train_ce :bool= True,           # train cards encoder
        c_embW :int=    12,             # card embedding width, determines network width (x7 cards)
        n_lay=          12,             # number of CNNR layers, determines network depth (and context length)
        width=          None,           # representation width (number of filters); None uses the cards-encoded width
        activation=     tf.nn.relu,
        opt_class=      partial(tf.compat.v1.train.AdamOptimizer, beta1=0.7, beta2=0.7),
        iLR=            3e-5,
        warm_up=        100,            # number of warm-up steps, has to be small (since updates are rare)
        avt_SVal=       0.04,
        avt_window=     20,
        do_clip=        True,
        verb=           0,
        **kwargs):

    if verb>0: print(f'\nBuilding {name} cnn_DMG (graph)...')

    with tf.variable_scope(name):

        n_hands = tf.get_variable( # number of hands while learning
            name=           'n_hands',
            shape=          [],
            trainable=      False,
            initializer=    tf.constant_initializer(0),
            dtype=          tf.int32)

        cards_PH = tf.placeholder(  # 7 cards placeholder
            name=           'cards_PH',
            dtype=          tf.int32,
            shape=          [None, None, 7])  # [bsz,seq,7cards]

        train_PH = tf.placeholder(  # train placeholder
            name=           'train_PH',
            dtype=          tf.bool,
            shape=          [])

        ce_out = cards_enc(
            train_flag= train_PH,
            c_ids=      cards_PH,
            emb_width=  c_embW)
        cards_encoded = ce_out['output']
        enc_vars =      ce_out['enc_vars']
        enc_zsL =       ce_out['zeroes']
        if verb>1: print(' ### num of enc_vars (%d) %s'%(len(enc_vars),short_scin(num_var_floats(enc_vars))))
        if verb>1: print(' > cards encoded:', cards_encoded)

        switch_PH = tf.placeholder( # switch placeholder
            name=           'switch_PH',
            dtype=          tf.int32, # 0 for move, 1 for cards
            shape=          [None, None, 1])  # [bsz,seq,1]

        event_PH = tf.placeholder(  # event id placeholder
            name=           'event_PH',
            dtype=          tf.int32,
            shape=          [None, None])  # [bsz,seq]

        n_events = 1 + N_TABLE_PLAYERS + len(TBL_MOV)*(N_TABLE_PLAYERS-1)
        event_emb = tf.get_variable(  # event type embeddings
            name=           'event_emb',
            shape=          [n_events, cards_encoded.shape[-1]],
            dtype=          tf.float32,
            initializer=    my_initializer())

        event_in = tf.nn.embedding_lookup(params=event_emb, ids=event_PH)
        if verb>1: print(' > event_in:', event_in)

        # tried with tf.where and switching inputs, but speed was the same...
        switch = tf.cast(switch_PH, dtype=tf.float32)
        input = switch*cards_encoded + (1-switch)*event_in
        if verb>1: print(' > input (merged):', input)
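        # switch broadcasts over the feature axis: per position, 1 selects the
        # cards encoding and 0 selects the event embedding, i.e.
        #   input[b,s,:] = cards_encoded[b,s,:] if switch_PH[b,s,0] == 1 else event_in[b,s,:]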

        # projection without activation or bias
        if width:
            input = lay_dense(
                input=          input,
                units=          width,
                use_bias=       False)
            if verb>1: print(' > projected input:', input)
        else: width = cards_encoded.shape.as_list()[-1]

        # layer_norm
        sub_output = tf.contrib.layers.layer_norm(
            inputs=             input,
            begin_norm_axis=    -1,
            begin_params_axis=  -1)

        state_shape = [n_lay, 2, width]
        single_zero_state = tf.zeros(shape=state_shape)  # [n_lay,2,width]
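        # presumably tiled per batch by the caller (e.g. feed [single_zero_state]*bsz
        # to state_PH) and carried between calls via fin_state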

        state_PH = tf.placeholder(
            name=           'state_PH',
            dtype=          tf.float32,
            shape=          [None] + state_shape) # [bsz,n_lay,2,width]

        cnn_enc_out = enc_CNN(
            input=          sub_output,
            history=        state_PH,
            n_layers=       n_lay,
            n_filters=      width,
            activation=     activation,
            n_hist=         0)
        out =       cnn_enc_out['output']
        fin_state = cnn_enc_out['state']
        cnn_zsL =   cnn_enc_out['zeroes']

        if verb > 1:
            print(' > out:', out)
            print(' > fin_state (split):', fin_state)

        # projection to logits
        logits = lay_dense(
            input=          out,
            units=          len(TBL_MOV),
            use_bias=       False)
        if verb>1: print(' > logits:', logits)

        probs = tf.nn.softmax(logits)

        cnn_vars = tf.trainable_variables(scope=tf.get_variable_scope().name) + [n_hands]
        cnn_vars = [var for var in cnn_vars if var not in enc_vars]
        if verb>1: print(' ### num of cnn_vars (%d) %s'%(len(cnn_vars),short_scin(num_var_floats(cnn_vars))))

        move_PH = tf.placeholder(  # move made (label)
            name=           'move_PH',
            dtype=          tf.int32,
            shape=          [None, None])  # [bsz,seq]

        rew_PH = tf.placeholder(  # reward for move made
            name=           'rew_PH',
            dtype=          tf.float32,
            shape=          [None, None])  # [bsz,seq]

        # this loss is auto-averaged via its reduction parameter
        # loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
        # loss = loss(y_true=move, y_pred=logits, sample_weight=rew)
        loss = tf.losses.sparse_softmax_cross_entropy(
            labels=     move_PH,
            logits=     logits,
            weights=    rew_PH)
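        # each position's cross-entropy is scaled by its reward (weights=rew_PH),
        # so moves that earned more reward contribute proportionally more to the
        # gradient (a policy-gradient-style weighting)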

    train_vars = [] + cnn_vars
    if train_ce: train_vars += enc_vars

    return{
        'name':                 name,
        'cards_PH':             cards_PH,
        'train_PH':             train_PH,
        'switch_PH':            switch_PH,
        'event_PH':             event_PH,
        'move_PH':              move_PH,
        'rew_PH':               rew_PH,
        'state_PH':             state_PH,
        'single_zero_state':    single_zero_state,
        'probs':                probs,
        'fin_state':            fin_state,
        'enc_zeroes':           tf.concat(enc_zsL, axis=-1),
        'cnn_zeroes':           tf.concat(cnn_zsL, axis=-1),
        'loss':                 loss,
        'n_hands':              n_hands,
        'enc_vars':             enc_vars,
        'cnn_vars':             cnn_vars,
        'train_vars':           train_vars}
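
A minimal usage sketch for the graph above (an assumption, not part of the original example): it builds cnn_DMG and runs one forward pass on a dummy batch, assuming TF 1.x and that the helpers used inside (cards_enc, enc_CNN, TBL_MOV, ...) are importable; the instance name, dummy card/event ids and session handling are illustrative only.

import tensorflow as tf

g = cnn_DMG(name='cnn_dmg_demo', verb=1)            # hypothetical instance name

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    zero_state = sess.run(g['single_zero_state'])   # numpy array [n_lay,2,width]

    feed = {
        g['cards_PH']:  [[[0, 1, 2, 3, 4, 5, 6]]],  # [bsz=1, seq=1, 7 dummy card ids]
        g['event_PH']:  [[0]],                      # [bsz, seq]
        g['switch_PH']: [[[1]]],                    # 1 -> use the cards encoding at this step
        g['train_PH']:  False,
        g['state_PH']:  [zero_state]}               # batch of one zero history

    probs, fin_state = sess.run([g['probs'], g['fin_state']], feed_dict=feed)
    print(probs.shape)                              # (1, 1, len(TBL_MOV))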