Ejemplo n.º 1
0
def _top_k_sample(logits, ignore_ids=None, num_samples=1, k=10):
    """
    Does top-k sampling. If ignore_ids is given, those logits are pushed far down so that the
    corresponding ids are never sampled.
    :param logits: [batch_size, vocab_size] tensor
    :param ignore_ids: [vocab_size] one-hot representation of the indices we'd like to ignore and never predict,
                        like padding maybe
    :param num_samples: number of ids to draw per batch row
    :param k: how many of the most likely ids to keep; a python int, a scalar tensor or a [batch_size] vector
    :return: dict with
             'probs': [batch_size, vocab_size] softmax over the (ignore-masked) logits
             'sample': [batch_size, num_samples] sampled vocabulary ids

    # TODO FIGURE OUT HOW TO DO THIS ON TPUS. IT'S HELLA SLOW RIGHT NOW, DUE TO ARGSORT I THINK
    """
    # NOTE: the scope name is left unchanged so that op names in existing graphs stay the same.
    with tf.variable_scope('top_p_sample'):
        batch_size, vocab_size = get_shape_list(logits, expected_rank=2)

        # Mask once and reuse: sampling from the raw logits would let ignored ids
        # be drawn whenever k is large enough to reach them in the sorted order.
        if ignore_ids is None:
            masked_logits = logits
        else:
            masked_logits = logits - tf.cast(ignore_ids[None], tf.float32) * 1e10
        probs = tf.nn.softmax(masked_logits, axis=-1)

        # [batch_size, vocab_perm]: vocabulary ids ordered from most to least likely
        indices = tf.argsort(probs, direction='DESCENDING')

        # Everything at sorted position >= k is excluded. reshape (rather than
        # k[:, None]) also accepts a scalar tensor, which then broadcasts over the batch.
        k_expanded = k if isinstance(k, int) else tf.reshape(k, [-1, 1])
        exclude_mask = tf.range(vocab_size)[None] >= k_expanded

        # Sample in the sorted space, then map the sampled positions back to vocabulary ids.
        logits_to_use = tf.batch_gather(masked_logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10
        sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples)
        sample = tf.batch_gather(indices, sample_perm)

    return {
        'probs': probs,
        'sample': sample,
    }
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, top_k_indices, truncation_factor):
  """Return the student's masked-LM log-probs and probs at the given top-k vocabulary ids.

  The hidden states selected by `gather_indexes(input_tensor, positions)` go
  through a dense + layer-norm transform, are projected onto `output_weights`
  (plus an output-only bias) and normalised over the vocabulary. Only the
  entries listed in `top_k_indices` are returned.

  Args:
    bert_config: config object; `hidden_size`, `hidden_act`,
      `initializer_range` and `vocab_size` are read from it.
    input_tensor: encoder output handed to `gather_indexes`.
    output_weights: matrix multiplied (transposed) against the transformed
      hidden states; presumably the input embedding table.
    positions: positions handed to `gather_indexes`.
    top_k_indices: vocabulary ids to keep; reshaped to
      [rows, truncation_factor], where rows is the first dimension of the
      log-prob matrix.
    truncation_factor: number of ids kept per row.

  Returns:
    Tuple `(top_k_log_probs, top_k_probs)`, both gathered with the reshaped
    `top_k_indices`.
  """
  masked_hidden = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # One extra non-linear transformation before the output layer.
    with tf.variable_scope("transform"):
      activation_fn = modeling.get_activation(bert_config.hidden_act)
      kernel_init = modeling.create_initializer(bert_config.initializer_range)
      masked_hidden = tf.layers.dense(
          masked_hidden,
          units=bert_config.hidden_size,
          activation=activation_fn,
          kernel_initializer=kernel_init)
      masked_hidden = modeling.layer_norm(masked_hidden)

    # Output-only bias, one entry per vocabulary token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    projected = tf.matmul(masked_hidden, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(projected, output_bias)
    log_probs_student = tf.nn.log_softmax(logits, axis=-1)
    probs_student = tf.nn.softmax(logits, axis=-1)

    # One row of `truncation_factor` ids per row of the log-prob matrix.
    num_rows = tf.shape(log_probs_student)[0]
    row_indices = tf.reshape(top_k_indices, [num_rows, truncation_factor])

    selected_log_probs = tf.batch_gather(log_probs_student, row_indices)
    selected_probs = tf.batch_gather(probs_student, row_indices)
    return selected_log_probs, selected_probs
Ejemplo n.º 3
0
def keypoints_flip_left_right(keypoints,swap_index=None):
    """Mirror keypoints horizontally and optionally swap paired keypoints.

    :param keypoints: rank-3 tensor whose last axis unstacks into (x, y);
        x is replaced by ``1.0 - x``, so coordinates are presumably normalised
        to [0, 1] (TODO confirm). Points with a negative x or y are left as-is.
    :param swap_index: optional iterable of ``(a, b)`` keypoint index pairs
        whose positions are exchanged after the flip (e.g. left/right parts).
    :return: tensor stacked from the flipped x and y along the last axis.
    """
    x, y = tf.unstack(keypoints, axis=-1)

    # Only keypoints with non-negative coordinates are flipped.
    flippable = tf.logical_and(x >= 0, y >= 0)
    x = tf.where(flippable, 1.0 - x, x)

    if swap_index is None:
        return tf.stack([x, y], axis=-1)

    # Symmetric lookup: each member of a pair maps to its partner.
    partner = {}
    for first, second in swap_index:
        partner[first] = second
        partner[second] = first

    num_rows, num_points, _ = btf.combined_static_and_dynamic_shape(keypoints)
    order = [partner.get(i, i) for i in range(num_points)]
    order = tf.convert_to_tensor(order, dtype=tf.int32)
    gather_idx = tf.tile(tf.reshape(order, [1, num_points]), [num_rows, 1])

    x = tf.batch_gather(x, gather_idx)
    y = tf.batch_gather(y, gather_idx)
    return tf.stack([x, y], axis=-1)
Ejemplo n.º 4
0
    def _head(self, torso_output):
        """Build the per-game value head and the policy head.

        :param torso_output: pair ``(features, level_name)``; ``level_name``
            is used as an index into the per-game value outputs.
        :return: ``PopArtAgentOutput(new_action, policy_logits,
            un_normalized_vf, normalized_vf)``, where both value tensors are
            flattened to the leading dimension of ``features``.
        """
        torso_output, level_name = torso_output

        # One normalised value per game, plus its de-normalised counterpart.
        per_game_normalized = snt.Linear(self._number_of_games, name='baseline')(torso_output)
        per_game_unnormalized = self._std * per_game_normalized + self._mean

        # Give the level index a time axis and a trailing index axis.
        level_name = tf.reshape(level_name, [-1, 1, 1])

        def split_batch_and_time(values):
            # The time length is inferred (-1) because it can differ between
            # setups, e.g. the learner and the actors use different sizes.
            return tf.reshape(values, [tf.shape(level_name)[0], -1, self._number_of_games])

        normalized_vf = split_batch_and_time(per_game_normalized)
        un_normalized_vf = split_batch_and_time(per_game_unnormalized)

        # Repeat the level index along the time axis.
        level_name = tf.tile(level_name, [1, tf.shape(normalized_vf)[1], 1])

        def select_level(values):
            # Pick the value of the current level, then flatten back to
            # batch * time because Sonnet's BatchApply expects that layout.
            picked = tf.batch_gather(values, level_name)
            return tf.reshape(picked, [tf.shape(torso_output)[0]])

        normalized_vf = select_level(normalized_vf)
        un_normalized_vf = select_level(un_normalized_vf)

        # Sample an action from the policy.
        policy_logits = snt.Linear(self._num_actions, name='policy_logits')(torso_output)
        sampled = tf.random.categorical(policy_logits, num_samples=1,
                                        dtype=tf.int32)
        new_action = tf.squeeze(sampled, 1, name='new_action')
        return PopArtAgentOutput(new_action, policy_logits, un_normalized_vf, normalized_vf)
def mask_completeness_loss(logit_1, logit_2, logit_3, relation_12, relation_23):
  """Mean squared deviation of the three aligned logits from summing to one.

  `logit_2` is aligned to `logit_1` through `relation_12`; `logit_3` is aligned
  through the composition of `relation_23` with `relation_12`.

  Returns:
    Scalar tensor: mean of `(logit_1 + aligned_2 + aligned_3 - 1) ** 2`.
  """
  with tf.name_scope('mask_completeness_loss'):
    aligned_2 = tf.batch_gather(logit_2, relation_12)
    relation_13 = tf.batch_gather(relation_23, relation_12)
    aligned_3 = tf.batch_gather(logit_3, relation_13)
    residual = logit_1 + aligned_2 + aligned_3 - 1
    return tf.reduce_mean(residual**2)
Ejemplo n.º 6
0
    def kl_divergence(self,
                      alpha,
                      alpha_prior,
                      i_perm=None,
                      wrt='Dirichlet-Marginals'):
        """
        KL divergence between the Kumaraswamy q distributions and the Beta distributions derived from the prior.
        :param alpha: posterior approximation Dirichlet parameters
        :param alpha_prior: prior Dirichlet parameters
        :param i_perm: optional permutation indices (from the sampling procedure); when given, both parameter
                       sets are re-ordered with it before the divergence is computed
        :param wrt: 'Dirichlet-Marginals' to compare against the Dirichlet's marginal Betas, or 'Beta-Sticks'
                    to compare against the stick-breaking Betas
        :return: KL divergence summed over axis 1 (one value per batch row)
        """
        assert wrt in {'Dirichlet-Marginals', 'Beta-Sticks'}

        # Re-order both parameter sets when a permutation was supplied.
        if i_perm is not None:
            prior_2d = self.__parameter_rank_check(alpha_prior)
            batch_multiples = tf.stack((tf.shape(alpha)[0], 1))
            prior_tiled = tf.tile(prior_2d, batch_multiples)
            alpha = tf.batch_gather(alpha, i_perm)
            alpha_prior = tf.batch_gather(prior_tiled, i_perm)

        if wrt != 'Dirichlet-Marginals':
            # Stick-breaking Betas for the approximation q and for the prior p.
            a_q, b_q = self.__stick_break_parameters(alpha)
            a_p, b_p = self.__stick_break_parameters(alpha_prior)
        else:
            # Dirichlet marginals: b is the sum of all the other concentrations.
            a_q = self.__parameter_rank_check(alpha)
            b_q = tf.reduce_sum(a_q, axis=1, keepdims=True) - a_q
            a_p = self.__parameter_rank_check(alpha_prior)
            b_p = tf.reduce_sum(a_p, axis=1, keepdims=True) - a_p

        # Closed-form part of the divergence.
        kl = (a_q - a_p) / a_q * (-np.euler_gamma - tf.digamma(b_q) - 1 / b_q)
        kl = kl + tf.log(a_q * b_q)
        kl = kl + tf.lbeta(tf.stack((a_p, b_p), axis=-1))
        kl = kl - (b_q - 1) / b_q

        # Series part, truncated after self.M terms.
        for m in range(1, self.M + 1):
            beta_args = tf.concat((tf.expand_dims(m / a_q, axis=-1),
                                   tf.expand_dims(b_q, axis=-1)),
                                  axis=-1)
            beta_m = tf.exp(tf.lbeta(beta_args))
            kl = kl + (b_p - 1) * b_q / (m + a_q * b_q) * beta_m

        # Sum over the component axis.
        return tf.reduce_sum(kl, axis=1)
Ejemplo n.º 7
0
    def sample_is(self, x, n=1):
        """Draw samples using a modified component distribution and return log weights.

        Components are picked from a categorical whose logits are
        ``self._is_function`` applied to the gate's logits, not from the gate
        itself. The returned weights are the gate logits minus the
        (gradient-stopped) proposal logits at the picked components.

        :param x: conditioning input passed to the gate and to the experts.
        :param n: number of draws.
        :return: ``(samples, weights)``; for ``n == 1`` the leading sample axis
            is dropped from both.
        """
        mixture_distribution = self._gate.conditional_mixture_distribution(x)
        mixture_components = self._experts.conditional_components_distribution(x)

        # One draw from every component; the picked one is selected below.
        y = mixture_components.sample(n)

        proposal_logits = self._is_function(mixture_distribution.logits)
        picked = ds.Categorical(logits=proposal_logits).sample(n)

        # TODO check if we should not renormalize mixture.logits - tf.stop_...
        logit_gap = mixture_distribution.logits - tf.stop_gradient(proposal_logits)
        weights = tf.batch_gather(logit_gap, tf.transpose(picked))
        # TODO check axis

        chosen = tf.batch_gather(y, picked[:, :, None])
        if n == 1:
            return chosen[0, :, 0], tf.transpose(weights)[0]
        return chosen[:, :, 0], tf.transpose(weights)
Ejemplo n.º 8
0
    def vote_reg_loss(seed_xyz, vote_xyz, seed_inds, vote_label, vote_label_mask):
        """
        Vote regression loss: masked mean of the smallest Huber distance between
        each seed's predicted votes and its ground-truth votes.

        seed_xyz: seed points (B, num_seed, 3)
        vote_xyz: predicted votes (B, num_seed*vote_factor, 3)
        seed_inds: (B, num_seed) indices of the seeds inside the N input points
        vote_label: (B, N, 3*GT_VOTE_FACTOR) ground-truth vote offsets
        vote_label_mask: (B, N) marks the points that have a ground-truth vote
        Shapes are taken from the original notes; `vote_factor` and
        `GT_VOTE_FACTOR` come from the enclosing scope.

        Returns a scalar tensor named 'vote_loss'.
        """
        batch_size = tf.shape(seed_xyz)[0]
        num_seed = tf.shape(seed_xyz)[1]
        num_rows = batch_size * num_seed

        # Mask and ground-truth votes restricted to the seed points.
        seed_mask = tf.cast(tf.batch_gather(vote_label_mask, seed_inds), dtype=tf.float32)
        # Offsets become absolute positions by adding the seed position to each xyz triple.
        seed_gt_votes = tf.batch_gather(vote_label, seed_inds) + tf.tile(seed_xyz, [1, 1, 3])

        predicted = tf.reshape(vote_xyz, [num_rows, vote_factor, 3])
        target = tf.reshape(seed_gt_votes, [num_rows, GT_VOTE_FACTOR, 3])

        # Every predicted vote against every ground-truth vote of the same seed.
        pairwise = tf.expand_dims(predicted, 2) - tf.expand_dims(target, 1)
        huber = tf.losses.huber_loss(labels=tf.zeros_like(pairwise),
                                     predictions=pairwise,
                                     reduction=tf.losses.Reduction.NONE)
        pair_dist = tf.reduce_sum(huber, axis=-1)

        # Minimum over predicted votes, then over ground-truth votes.
        per_gt_min = tf.reduce_min(pair_dist, axis=1)
        per_seed_min = tf.reduce_min(per_gt_min, axis=1)
        vote_dist = tf.reshape(per_seed_min, [batch_size, num_seed])

        # NOTE: the epsilon is added to every mask entry before the sum, not once to the sum.
        vote_loss = tf.reduce_sum(vote_dist * seed_mask) / tf.reduce_sum(seed_mask + 1e-6)
        return tf.identity(vote_loss, 'vote_loss')
Ejemplo n.º 9
0
def get_ecdf(
        sample: tf.Tensor,
        weights: Optional[tf.Tensor] = None) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Empirical CDF of a (weighted) sample, sorted along axis 0.

    :param sample: sample values; sorted along axis 0
    :param weights: per-value weights with the same shape as `sample`; defaults to all ones
    :return: `(x, cdf)`: the sorted values and the cumulative weights divided by their total
    """
    weights = tf.ones_like(sample) if weights is None else weights

    shapes_match = tf.assert_equal(tf.shape(sample), tf.shape(weights))
    with tf.control_dependencies([shapes_match]):
        order = tf.contrib.framework.argsort(sample, axis=0)
        order_t = _T(order)

        # batch_gather indexes the last axis, hence the round trip through _T.
        x = _T(tf.batch_gather(_T(sample), order_t))
        w = _T(tf.batch_gather(_T(weights), order_t))

        w_cumsum = tf.cumsum(w, axis=0)

        # Every total weight must be strictly positive before normalising.
        smallest_total = tf.reduce_min(w_cumsum[-1])
        totals_positive = tf.assert_greater(smallest_total, tf.zeros_like(smallest_total))
        with tf.control_dependencies([totals_positive]):
            w_cumsum = w_cumsum / w_cumsum[-1]

    return x, w_cumsum
Ejemplo n.º 10
0
    def __init__(self, sess, n_features, lr=0.01):
        """Build the critic graph: placeholders, two Q-networks, loss and train op.

        :param sess: TensorFlow session stored on the instance.
        :param n_features: width of the state placeholder.
        :param lr: learning rate for the Adam optimizer.
        """
        self.sess = sess

        # Inputs fed at run time.
        self.s = tf.placeholder(tf.float32, shape=[None, n_features], name="state")
        self.q_a_ = tf.placeholder(tf.float32, shape=[None, 1], name="q_a_")
        self.r = tf.placeholder(tf.float32, shape=[None, 1], name='r')
        self.a = tf.placeholder(tf.int32, shape=[None, 1], name='act')

        # Online network, target network, and the op copying online -> target.
        self.q = self.build_net("Critic")
        self.q_target = self.build_net("Target")
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Target')
        self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic')
        self.replace_target_op = [
            tf.assign(target_var, online_var)
            for target_var, online_var in zip(self.t_params, self.params)
        ]

        # Q-value of the action that was taken, from each network.
        self.q_a = tf.batch_gather(self.q, self.a)
        self.q_a_target = tf.batch_gather(self.q_target, self.a)

        discount = 0.8
        with tf.variable_scope('squared_TD_error'):
            # TD error = (r + discount * next Q) - current Q of the taken action
            self.td_error = self.r + discount * self.q_a_ - self.q_a
            self.loss = tf.reduce_mean(0.5 * tf.square(self.td_error))
        with tf.variable_scope('train'):
            optimizer = tf.train.AdamOptimizer(lr)
            self.train_op = optimizer.minimize(self.loss)
Ejemplo n.º 11
0
    def beam_search_image(self, sentence, beam_width, num_classes):
        """Unroll a beam-search decoder over the image features.

        Builds ``hp.maxlen - 1`` decoding steps. Each step attends over the
        image features, runs the LSTM, scores every (beam, class) continuation
        and keeps the ``beam_width`` best per batch row.

        Side effects: overwrites ``self.feature_beam``, ``self.last_state`` and
        ``self.last_output`` with their beam-tiled / per-step versions.

        :param sentence: embedding of the first input token; tiled along a new
            beam axis (the original note gives "batch x hidden" -> "batch x
            beam x hidden").
        :param beam_width: number of hypotheses kept per batch row.
        :param num_classes: size of the decoder output; used to split the
            flattened top-k index into a beam index and a class id.
        :return: ``self.select(total_sentence, value)``.
        """
        # Replicate the image features once per beam.
        self.feature_beam = tf.tile(tf.expand_dims(self.feature, axis=1),
                                    [1, beam_width, 1, 1])
        sentence = tf.tile(tf.expand_dims(sentence, axis=1),
                           [1, beam_width, 1])  # ba x beam x hidden
        # Every hypothesis starts with token id 2 (presumably the start token - TODO confirm).
        total_sentence = tf.ones(
            (self.batch_size, beam_width, 1), dtype=tf.int32) * 2
        self.last_state = [
            tf.reshape(
                tf.tile(tf.expand_dims(state, axis=1), [1, beam_width, 1]),
                [-1, hp.lstm_units]) for state in self.last_state
        ]  # (ba x beam)x lstm_dim
        self.last_output = tf.tile(tf.expand_dims(self.last_output, axis=1),
                                   [1, beam_width, 1])  # tiled the same way as above
        # Initial scores: log(1) = 0 for the first beam and log(0) = -inf for
        # the rest, so the first step only expands the first beam.
        value = tf.log([[1.] + [0.] * (beam_width - 1)])
        # 1 while a hypothesis is still running, 0 once it has ended.
        mask = tf.ones((self.batch_size, beam_width))
        for i in range(hp.maxlen - 1):
            alpha = self.attention(self.last_output,
                                   self.feature_beam)  # ba x beam x 196
            image_attention = tf.reduce_sum(
                self.feature_beam * tf.expand_dims(alpha, -1),
                axis=-2)  # batch_size x beam x 1024
            if self._selector:
                image_attention = self.selector(image_attention,
                                                self.last_output)

            inputs = tf.reshape(
                tf.concat((image_attention, sentence), axis=-1),
                [-1, hp.hidden_units_cap + 1024])
            output, state = self.lstm(inputs, self.last_state)
            output = tf.reshape(output,
                                [self.batch_size, beam_width, hp.lstm_units])

            expanded_output = tf.concat([output, sentence, image_attention],
                                        axis=-1)  # ba x beam x sth
            logits = self.decode(expanded_output)
            logits = tf.nn.log_softmax(logits)
            # Ended hypotheses (mask == 0) keep their score unchanged.
            sum_logprob = tf.expand_dims(
                value, axis=2) + logits * tf.expand_dims(mask, axis=2)
            # Flatten (beam, class) so top_k ranks all continuations of a row together.
            t = tf.reshape(sum_logprob, [-1, beam_width * num_classes])
            value, index = tf.nn.top_k(t, k=beam_width)  # batch x beam
            ids = index % num_classes  # batch x beam
            pre_ids = index // num_classes  # batch x beam

            sentence = tf.nn.embedding_lookup(self.lookup_table, ids)
            pre_sentence = tf.batch_gather(total_sentence,
                                           pre_ids)  # batch x beam x len

            new_word = tf.expand_dims(ids, axis=2)  # batch x beam x 1
            total_sentence = tf.concat([pre_sentence, new_word],
                                       axis=2)  # batch x beam x (len+1)
            mask = tf.batch_gather(mask, pre_ids) * tf.to_float(
                tf.not_equal(ids, 3))  # first factor: had the parent already ended; second: does it end now (0 means ended)
            # Carried over to the next iteration.
            self.last_output = output
            self.last_state = state
        preds = self.select(total_sentence, value)
        return preds
    def extrac_subject_two(self, output, subject_ids):
        """Gather the subject's start and end vector representations from ``output``.

        :param output: tensor to gather from along its second axis.
        :param subject_ids: tensor whose first column is the start index and
            whose remaining column(s) hold the end index.
        :return: ``(start, end)``, the gathered vectors for the two index sets
            (the original note gives shape (batch_size, 1, 768) for ``start``).
        """
        start_index, end_index = subject_ids[:, :1], subject_ids[:, 1:]
        return (tf.batch_gather(output, start_index),
                tf.batch_gather(output, end_index))
Ejemplo n.º 13
0
def _top_p_sample(logits, ignore_ids=None, num_samples=1, p=0.9):
    """
    Does top-p sampling. If ignore_ids is given, those logits are pushed far down so that the
    corresponding ids are never sampled.
    :param logits: [batch_size, vocab_size] tensor
    :param ignore_ids: [vocab_size] one-hot representation of the indices we'd like to ignore and never predict,
                        like padding maybe
    :param num_samples: number of ids to draw per batch row
    :param p: topp threshold to use, either a float, a scalar tensor or a [batch_size] vector
    :return: dict with
             'probs': [batch_size, vocab_size] softmax over the (ignore-masked) logits
             'sample': [batch_size, num_samples] sampled vocabulary ids

    # TODO FIGURE OUT HOW TO DO THIS ON TPUS. IT'S HELLA SLOW RIGHT NOW, DUE TO ARGSORT I THINK
    """
    with tf.variable_scope('top_p_sample'):
        batch_size, vocab_size = get_shape_list(logits, expected_rank=2)

        # Mask once and reuse for the probabilities and for both sampling paths,
        # so an ignored id can never be drawn (the sorted path used raw logits before).
        if ignore_ids is None:
            masked_logits = logits
        else:
            masked_logits = logits - tf.cast(ignore_ids[None], tf.float32) * 1e10
        probs = tf.nn.softmax(masked_logits, axis=-1)

        if isinstance(p, float) and p > 0.999999:
            # Don't do top-p sampling in this case
            print("Top-p sampling DISABLED", flush=True)
            return {
                'probs': probs,
                'sample': tf.random.categorical(
                    logits=masked_logits, num_samples=num_samples, dtype=tf.int32),
            }

        # [batch_size, vocab_perm]: vocabulary ids ordered from most to least likely
        indices = tf.argsort(probs, direction='DESCENDING')
        cumulative_probabilities = tf.math.cumsum(tf.batch_gather(probs, indices), axis=-1, exclusive=False)

        # Find the top-p cut-off. The most likely id is always kept so that we never
        # cut off everything. reshape (rather than p[:, None]) also accepts a scalar tensor.
        p_expanded = p if isinstance(p, float) else tf.reshape(p, [-1, 1])
        exclude_mask = tf.logical_not(
            tf.logical_or(cumulative_probabilities < p_expanded, tf.range(vocab_size)[None] < 1))

        # Sample in the sorted space, then map the sampled positions back to vocabulary ids.
        logits_to_use = tf.batch_gather(masked_logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10
        sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples)
        sample = tf.batch_gather(indices, sample_perm)

    return {
        'probs': probs,
        'sample': sample,
    }
Ejemplo n.º 14
0
def myBeamSearch(batch_states, sequence_length, k, begin_id, end_id):
    '''
    Beam search: given initial states and a sequence length, return the k
    highest-scoring id sequences and their scores.

    input:
        batch_states: (batch,dim) tensor with a statically known batch size
        sequence_length: int, number of decoding steps
        k: beam width
        begin_id: id fed as the first input of every beam
        end_id: id that ends a sequence
    output:
        sequence_ids:(batch,k,sequence_length)
        sequence_score:(batch,k)

    * A (batch,k) mask tracks finished beams: once a sequence ends with end_id
      its score is no longer updated.
    * tf.batch_gather is used instead of tf.gather because every sample gathers
      different positions (tf.gather indexes the first axis, tf.batch_gather
      the second).
    * tf.nn.top_k takes (batch,n) and returns the k largest values and their
      indices, both (batch,k).
    * update_one_step(states, inputs) is expected to return the new states
      (batch,k,states) and the per-class scores (batch,k,num_class).
    '''
    batch_size = batch_states.shape.as_list()[0]

    # Initial states and inputs, replicated once per beam.
    states = tf.tile(tf.expand_dims(batch_states, axis=1), (1, k, 1))
    inputs = tf.tile([[begin_id]], (batch_size, k))

    # Empty id sequences; same dtype as the ids that get appended to them.
    sequence_ids = tf.zeros(shape=(batch_size, k, 0), dtype=inputs.dtype)

    # All beams start out identical, so only the first one may be expanded at
    # step 0; otherwise top_k returns k copies of the same hypothesis.
    sequence_score = tf.concat(
        [tf.zeros((batch_size, 1), dtype=tf.float32),
         tf.fill((batch_size, k - 1), -1e9)],
        axis=1)

    mask = tf.ones((batch_size, k), dtype=tf.float32)

    for _ in range(sequence_length):
        # Feed the k states and inputs to the RNN.
        new_states, now_score = update_one_step(
            states, inputs)  #(batch,k,states),(batch,k,num_class)
        num_class = now_score.shape.as_list()[-1]

        # Finished beams contribute nothing new; then add the running scores
        # to get the score of all k*num_class continuations.
        now_score = tf.multiply(now_score, tf.expand_dims(mask, axis=-1))
        all_score = now_score + tf.expand_dims(sequence_score, axis=-1)

        # Best k continuations over the flattened (beam, class) axis.
        sequence_score, indexs = tf.nn.top_k(
            tf.reshape(all_score, shape=(batch_size, -1)), k=k)

        # The flat index is beam * num_class + class.
        beam_ids = indexs // num_class
        inputs = indexs % num_class

        # Re-order states and sequences by surviving beam, then append the new ids.
        states = tf.batch_gather(new_states, beam_ids)
        sequence_ids = tf.concat(
            [tf.batch_gather(sequence_ids, beam_ids),
             tf.expand_dims(inputs, axis=-1)],
            axis=-1)

        # A beam stays alive only if its parent was alive and it did not just
        # emit end_id (tf.not_equal is elementwise; `!=` on a tensor is not in TF1).
        mask = tf.multiply(
            tf.cast(tf.not_equal(inputs, end_id), dtype=tf.float32),
            tf.batch_gather(mask, beam_ids))
    return sequence_ids, sequence_score
Ejemplo n.º 15
0
    def call(self, inputs, state=None):
        """Run one step of the cell on the k best-scoring positions of each sentence.

        :param inputs: not used by this step.
        :param state: 5-tuple ``(s1, s2, s1_mask, s2_mask, rh)``: the two
            flattened sentence matrices (B, L * dim), their masks and the
            previous recurrent output.
        :return: ``(rh, DoubleStateTuple(s1, s2, s1_mask, s2_mask, rh))`` with
            the sentences flattened again and the masks returned by
            ``self.get_phrase``.
        """
        s1_flat, s2_flat, s1_mask, s2_mask, rh_prev = state
        top_k = self.k

        s1_seq = tf.reshape(s1_flat, [-1, self.sent1_length, self.dim])  # (B, L1, dim)
        s2_seq = tf.reshape(s2_flat, [-1, self.sent2_length, self.dim])  # (B, L2, dim)
        s1_mask = tf.expand_dims(s1_mask, axis=2)
        s2_mask = tf.expand_dims(s2_mask, axis=2)

        # Per-position scores (B, L, 1) and updated masks.
        s1_score, s1_mask = self.get_phrase(s1_seq, self.sent1_length, s1_mask, rh_prev)
        s2_score, s2_mask = self.get_phrase(s2_seq, self.sent2_length, s2_mask, rh_prev)

        # k highest-scoring positions of each sentence.
        s1_top_values, s1_top_index = tf.nn.top_k(tf.squeeze(s1_score, axis=2), k=top_k)
        s2_top_values, s2_top_index = tf.nn.top_k(tf.squeeze(s2_score, axis=2), k=top_k)

        # Rescale the kept scores so they sum to one per row.
        s1_top_values = s1_top_values / tf.reduce_sum(s1_top_values, axis=1, keepdims=True)
        s2_top_values = s2_top_values / tf.reduce_sum(s2_top_values, axis=1, keepdims=True)

        s1_top = tf.batch_gather(s1_seq, s1_top_index)
        s2_top = tf.batch_gather(s2_seq, s2_top_index)

        # Pairwise products of the kept scores (presumably (B, k, k)).
        pair_scores = tf.keras.backend.batch_dot(tf.expand_dims(s1_top_values, axis=2),
                                                 tf.expand_dims(s2_top_values, axis=2),
                                                 [2, 2])

        # Pairs scoring at or below the threshold are zeroed out.
        threshold = 0.08
        too_small = tf.less_equal(pair_scores, threshold)
        zeros = tf.zeros_like(pair_scores)
        pair_scores = tf.keras.backend.switch(too_small, zeros, pair_scores)

        pair_vectors = self.get_vec_matrix(s1_top, top_k, s2_top, top_k)
        pair_scores = tf.expand_dims(pair_scores, axis=3)
        phrase_vec = self.get_cnn_feature(pair_scores * pair_vectors)

        rh, _ = self.r_cell(phrase_vec, rh_prev)

        # Flatten the sentences back to the layout the state uses.
        next_state = [
            tf.reshape(s1_seq, [-1, self.sent1_length * self.dim]),  # (B, L1 * dim)
            tf.reshape(s2_seq, [-1, self.sent2_length * self.dim]),  # (B, L2 * dim)
            tf.squeeze(s1_mask, axis=2),
            tf.squeeze(s2_mask, axis=2),
            rh,
        ]
        return rh, DoubleStateTuple(*next_state)
def compute_topk_scores_and_seq(sequences,
                                scores,
                                scores_to_gather,
                                flags,
                                beam_size,
                                prefix="default"):
  """Gather the top `beam_size` sequences, scores and flags, ranked by `scores`.

  Used to grow both the alive and the finished set. The ranking comes from
  `scores`, while the returned scores are taken from `scores_to_gather`; the
  two differ because grow_alive needs log probs back, whereas grow_finished
  needs the length-penalized scores.

  To ease introspection with tfdbg, the three gather ops get fixed names:
    - `prefix + "_topk_seq"`: the returned topk_seq.
    - `prefix + "_topk_flags"`: the returned topk_finished_flags.
    - `prefix + "_topk_scores"`: the returned topk_gathered_scores.

  Args:
    sequences: Tensor of sequences to gather from,
      [batch_size, beam_size, seq_length].
    scores: Tensor of scores used for ranking, [batch_size, beam_size].
    scores_to_gather: Tensor of scores that are returned,
      [batch_size, beam_size].
    flags: Tensor of bools saying whether each sequence has reached EOS.
    beam_size: int, number of entries to keep.
    prefix: string prepended to the names of the gather ops.

  Returns:
    Tuple of
    (topk_seq [batch_size, beam_size, decode_length],
     topk_gathered_scores [batch_size, beam_size],
     topk_finished_flags [batch_size, beam_size],
     topk_indexes)
  """
  _, topk_indexes = top_k_with_unique(scores, k=beam_size)

  def gather_topk(values, suffix):
    # Named so that clients can watch these nodes from tfdbg.
    return tf.batch_gather(values, topk_indexes, name=prefix + suffix)

  topk_seq = gather_topk(sequences, "_topk_seq")
  topk_flags = gather_topk(flags, "_topk_flags")
  topk_gathered_scores = gather_topk(scores_to_gather, "_topk_scores")

  return topk_seq, topk_gathered_scores, topk_flags, topk_indexes
Ejemplo n.º 17
0
def sample_step(tokens, ignore_ids, news_config, batch_size=1, p_for_topp=0.95, cache=None, do_topk=False):
    """
    Helper function that samples from grover for a single step
    :param tokens: [batch_size, n_ctx_b] tokens that we will predict from
    :param ignore_ids: [n_vocab] mask of the tokens we don't want to predict
    :param news_config: config for the GroverModel
    :param batch_size: batch size to use
    :param p_for_topp: top-p threshold, or (when do_topk is set) the top-k value, which is cast to int32
    :param cache: [batch_size, news_config.num_hidden_layers, 2,
                   news_config.num_attention_heads, n_ctx_a,
                   news_config.hidden_size // news_config.num_attention_heads] OR, None
    :param do_topk: sample with top-k instead of top-p
    :return: dict with
             'new_tokens': size [batch_size]
             'new_probs': also size [batch_size], probability of each sampled token
             'new_probs_all': softmax over the raw next-token logits
             'prev_probs': probability the model assigned to each given token from its predecessors
             'new_cache': size [batch_size, news_config.num_hidden_layers, 2, n_ctx_b,
                   news_config.num_attention_heads, news_config.hidden_size // news_config.num_attention_heads]
    """
    model = GroverModel(
        config=news_config,
        is_training=False,
        input_ids=tokens,
        reuse=tf.AUTO_REUSE,
        scope='newslm',
        chop_off_last_token=False,
        do_cache=True,
        cache=cache,
    )

    # Extract the FINAL SEQ LENGTH
    _, vocab_size = get_shape_list(model.logits_flat, expected_rank=2)
    prev_probs = tf.exp(tf.squeeze(tf.batch_gather(model.log_probs[:, :-1], tokens[:, 1:, None]), axis=2))

    logits = tf.reshape(model.logits_flat, [batch_size, -1, vocab_size])
    next_logits = logits[:, -1]

    if do_topk:
        # ignore_ids is forwarded here as well, so both strategies honour the mask.
        sample_info = _top_k_sample(next_logits, ignore_ids=ignore_ids, num_samples=1,
                                    k=tf.cast(p_for_topp, dtype=tf.int32))
    else:
        sample_info = _top_p_sample(next_logits, ignore_ids=ignore_ids, num_samples=1, p=p_for_topp)

    new_tokens = tf.squeeze(sample_info['sample'], 1)
    new_probs = tf.squeeze(tf.batch_gather(sample_info['probs'], sample_info['sample']), 1)
    return {
        'new_tokens': new_tokens,
        'new_probs': new_probs,
        # `axis` replaces the deprecated `dim` argument.
        'new_probs_all': tf.nn.softmax(next_logits, axis=-1),
        'prev_probs': prev_probs,
        'new_cache': model.new_kvs,
    }
Ejemplo n.º 18
0
def _transpose_and_gather(feat, ind):
    """Move axis 1 of ``feat`` last, flatten axes 1-2 and gather rows by ``ind``.

    :param feat: rank-4 tensor, permuted with (0, 2, 3, 1); presumably
        channels-first (B, C, H, W) - TODO confirm.
    :param ind: per-batch indices into the flattened middle axis; cast to int32.
    :return: gathered tensor with one row of the last axis per index.
    """
    moved = tf.transpose(feat, perm=(0, 2, 3, 1))
    moved_shape = tf.shape(moved)
    flattened = tf.reshape(moved, (moved_shape[0], -1, moved_shape[-1]))
    positions = tf.cast(ind, tf.int32)
    return tf.batch_gather(flattened, positions)
Ejemplo n.º 19
0
 def __init__(self, model, lr, train_config, num_classes=9):
     """Build the policy-gradient training graph around ``model`` and open a session.

     :param model: object exposing ``input`` and ``output``; ``output`` is
         treated as the action probabilities.
     :param lr: initial learning rate.
     :param train_config: mapping providing 'decay_step' and 'decay_factor'
         for the staircase exponential decay.
     :param num_classes: stored on the instance.
     """
     self.num_classes = num_classes
     self.model = model
     self.tf_input_states = model.input
     self.tf_action_probs = model.output

     # Fed at training time: the executed action and its return, one per row.
     self.tf_executed_actions = tf.placeholder(dtype=tf.int32,
                                               shape=[None, 1])
     self.tf_returns = tf.placeholder(dtype=tf.float32, shape=[None, 1])

     # Objective: mean of return * log-probability of the executed action.
     executed_probs = tf.batch_gather(self.tf_action_probs,
                                      self.tf_executed_actions)
     executed_log_probs = tf.log(executed_probs)
     self.tf_L = tf.reduce_mean(self.tf_returns * executed_log_probs)

     self.global_step = tf.train.get_or_create_global_step()
     self.learning_rate = tf.train.exponential_decay(
         lr,
         self.global_step,
         decay_steps=train_config['decay_step'],
         decay_rate=train_config['decay_factor'],
         staircase=True)

     # The objective is maximised by minimising its negative; pending update
     # ops run before each training step.
     pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
     with tf.control_dependencies(pending_updates):
         optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
         self.train_op = optimizer.minimize(-self.tf_L,
                                            global_step=self.global_step)

     session_config = tf.ConfigProto(
         gpu_options=tf.GPUOptions(allow_growth=True))
     self.sess = tf.Session(config=session_config)
     # initialization
     self.sess.run(tf.initializers.global_variables())
Ejemplo n.º 20
0
def gaussian_mixture_approximate_mode(gm):
    """Approximate the mixture's mode by its most probable component's mean.

    Args:
      gm: object exposing `mixture_distribution` (with `mode()`) and
        `components_distribution` (with `mean()`).

    Returns:
      The component means gathered at the mode index, with the gathered
      axis (second to last) squeezed away.
    """
    # Index of the most likely component; the trailing axis makes it usable
    # as `batch_gather` indices.
    top_component = gm.mixture_distribution.mode()[..., None]
    component_means = gm.components_distribution.mean()
    selected_mean = tf.batch_gather(component_means, top_component)
    return tf.squeeze(selected_mean, axis=-2)
Ejemplo n.º 21
0
def _batch_gather_with_broadcast(params, indices, axis):
    """Like batch_gather, but broadcasts to the left of axis."""
    # batch_gather requires matching leading dims:
    #   params.shape  = [A1,...,AN, B1,...,BM]
    #   indices.shape = [A1,...,AN, C]
    #   output.shape  = [A1,...,AN, C, B1,...,BM]
    # This wrapper relaxes that: the dims of `params` left of `axis` and the
    # dims of `indices` left of its last dim only need to broadcast, e.g.
    #   params.shape  = [A1,...,AN, B1,...,BM]
    #   indices.shape = [a1,...,aN, C],
    # where ai broadcasts with Ai.
    params_shape = tf.shape(params)
    indices_shape = tf.shape(indices)

    # Broadcast of [A1,...,AN] and [a1,...,aN].
    leading_bcast_shape = tf.broadcast_dynamic_shape(
        params_shape[:axis], indices_shape[:-1])

    # Adding zeros of the target shape expands each operand to the shared
    # leading shape without changing its values.
    params_full_shape = tf.concat(
        (leading_bcast_shape, params_shape[axis:]), axis=0)
    params = params + tf.zeros(params_full_shape, dtype=params.dtype)

    indices_full_shape = tf.concat(
        (leading_bcast_shape, indices_shape[-1:]), axis=0)
    indices = indices + tf.zeros(indices_full_shape, dtype=indices.dtype)

    return tf.batch_gather(params, indices)
Ejemplo n.º 22
0
def _tf_sample_neg(batch_size: "tf.Tensor", all_bs: "tf.Tensor",
                   neg_ids: "tf.Tensor") -> "tf.Tensor":
    """Gather the negative examples selected by `neg_ids` from `all_bs`.

    `all_bs` gets a new leading axis and is repeated `batch_size` times along
    it (the tile multiples `(batch_size, 1, 1)` imply `all_bs` is rank 2).
    `neg_ids` then selects rows separately for every batch element.
    """
    candidates = tf.expand_dims(all_bs, 0)
    candidates_per_example = tf.tile(candidates, (batch_size, 1, 1))
    return tf.batch_gather(candidates_per_example, neg_ids)
Ejemplo n.º 23
0
    def __call__(self, features, detection_priors, classes, is_training):
        """Generate instance masks from FPN features and detection priors.

        This corresponds to the Fig. 5-6 of the ShapeMask paper at
        https://arxiv.org/pdf/1904.03239.pdf

        Args:
          features: a float Tensor holding the instance feature crops, with
            `num_downsample_channels` as its last dimension. Assumed to be
            [batch_size, num_instances, mask_crop_size, mask_crop_size,
            num_downsample_channels], because the 5-element transpose below
            needs a rank-5 decoder output; TODO confirm against callers.
          detection_priors: a float Tensor with the detection prior for each
            instance. A trailing axis is appended here and projected to
            `num_downsample_channels` before it is added to `features`.
          classes: an int Tensor of shape [batch_size, num_instances] of
            instance classes.
          is_training: a bool indicating whether in training mode.

        Returns:
          mask_outputs: the decoder logits for each instance's own class,
            with the class axis gathered and squeezed away.
        """
        with tf.variable_scope('coarse_mask', reuse=tf.AUTO_REUSE):
            # Transform detection priors to have the same dimension as
            # features.
            prior_features = tf.layers.dense(
                tf.expand_dims(detection_priors, axis=-1),
                self._num_downsample_channels)
            fused_features = features + prior_features

            all_class_logits = self.decoder_net(fused_features, is_training)

            # Move the class axis to position 2 so batch_gather can select
            # one class per (batch, instance) pair.
            class_major_logits = tf.transpose(
                all_class_logits, [0, 1, 4, 2, 3])
            class_indices = tf.expand_dims(classes, -1)
            selected_logits = tf.batch_gather(
                class_major_logits, class_indices)
            return tf.squeeze(selected_logits, axis=2)
Ejemplo n.º 24
0
def nearest_neighbor_interpolation(
    train_points,
    train_values,
    query_points,
):
    """Performs nearest neighbor interpolation.

    For every query point, returns the value attached to the closest
    training point (squared Euclidean distance) in the same batch element.

    Args:
      train_points: Tensor indexed as `[batch, train_count, dim]`.
      train_values: Tensor whose first two axes are `[batch, train_count]`;
        any trailing axes are carried through unchanged.
      query_points: Tensor indexed as `[batch, query_count, dim]`.

    Returns:
      Tensor with leading axes `[batch, query_count]` followed by the
      trailing axes of `train_values`.
    """
    # Pairwise differences, shape `[batch, query_count, train_count, dim]`.
    displacement_vectors = (query_points[:, :, tf.newaxis, :] -
                            train_points[:, tf.newaxis, :, :])

    # Squared distance between all `train_points` and `query_points`, shape
    # `[batch, query_count, train_count]`. The square root is skipped because
    # it does not change which training point is closest.
    squared_distance = tf.reduce_sum(displacement_vectors**2, -1)

    # Index of the closest training point for each query, shape
    # `[batch, query_count]`.
    query_indices = tf.math.argmin(squared_distance, axis=-1)

    return tf.batch_gather(train_values, tf.cast(query_indices, tf.int32))
Ejemplo n.º 25
0
    def reverse(self):
        """Return a graph whose edges are this graph's edges reversed.

        For each graph in the batch, the valid (unpadded) edges have their
        two endpoints swapped and are sorted with `tf_lexsort`. The sorted
        edges and the sort order are padded back to `max_num_edges`. Edge
        attributes, if present, are reordered with the same sort order and
        passed through `mask_edge_info`.

        NOTE(review): an existing `self._reversed` is returned as is, but
        this method never stores its result there; confirm whether the cache
        is populated elsewhere.
        """
        if self._reversed is not None:
            return self._reversed

        pad_length = self.max_num_edges

        def _reverse_single_graph(graph_fields):
            # num_nodes travels with the mapped tuple but is not needed here.
            edges, _, num_edges = graph_fields
            valid_edges = edges[:num_edges, :]
            swapped = tf.reverse(valid_edges, axis=[-1])
            # (E, 2) and (E)
            sorted_edges, sort_order = tf_lexsort(swapped)
            return (tf_pad_axis_to(sorted_edges, -2, pad_length),
                    tf_pad_axis_to(sort_order, -1, pad_length))

        reverse_edges, reverse_indices = tf.map_fn(
            _reverse_single_graph,
            (self.edges, self.num_nodes, self.num_edges),
            dtype=(tf.int32, tf.int32),
        )

        if self.edge_attrs is None:
            reverse_edge_attrs = None
        else:
            reordered_attrs = tf.batch_gather(self.edge_attrs, reverse_indices)
            reverse_edge_attrs = self.mask_edge_info(reordered_attrs, ndims=1)

        return BaseRuntimeGraph(
            edges=reverse_edges,
            node_mask=tf.cast(self.node_mask, tf.int32),
            center_mask=tf.cast(self.center_mask, tf.int32),
            edge_mask=tf.cast(self.edge_mask, tf.int32),
            dense=(self.dense_adjacency is not None),
            node_attrs=self.node_attrs,
            edge_attrs=reverse_edge_attrs,
            reversed=self,
        )
Ejemplo n.º 26
0
 def call(self,
          inputs,
          length,
          dropout=None,
          attention_dropout=None,
          use_2d=False):
     """Embed `inputs`, run the transformer and aggregate per sequence.

     Aggregation depends on `self.mode`:
       * "last_token": gathers the hidden state at position `length` along
         axis 1 for every sequence.
       * "attention": runs `self.aggregation` with a sequence mask built
         from `length`.
       * anything else: returns None.

     NOTE(review): "last_token" indexes with `length` itself, so callers
     presumably pass the index of the last token rather than the token
     count; confirm.

     Args:
       inputs: token tensor; its first two dims are read as (batch, time).
       length: per-sequence length/index tensor.
       dropout: forwarded to the transformer.
       attention_dropout: forwarded to the transformer and the aggregation.
       use_2d: when True, activations are flattened to
         [batch * time, embedding_size] before the transformer.
     """
     shape = gpt2.get_tensor_shape(inputs)
     hidden = self.embedding(inputs)
     if use_2d:
         hidden = tf.reshape(
             hidden, [shape[0] * shape[1], self.embedding.embedding_size])
     hidden = self.transformer(inputs=hidden,
                               dropout=dropout,
                               attention_dropout=attention_dropout,
                               use_2d=use_2d,
                               shape=shape)

     if self.mode == "last_token":
         if use_2d:
             # Restore the (batch, time, embedding) layout for the gather.
             hidden = tf.reshape(
                 hidden,
                 [shape[0], shape[1], self.embedding.embedding_size])
         picked = tf.batch_gather(hidden, tf.expand_dims(length, 1))
         if use_2d:
             picked = tf.squeeze(picked, 1)
         return picked

     if self.mode == "attention":
         mask = tf.sequence_mask(length, shape[1])
         return self.aggregation(inputs=hidden,
                                 mask=mask,
                                 attention_dropout=attention_dropout,
                                 use_2d=use_2d,
                                 shape=shape)

     return None
Ejemplo n.º 27
0
    def build_rs_graph(self):
        """Build the random-shooting graph and set `self.optimal_action`.

        Uniformly samples `n_candidates` action sequences of length
        `horizon` per observation, rolls each through the dynamics model
        while accumulating discounted rewards, and keeps the first action of
        the highest-return candidate for every observation.
        """
        # FIXME: not sure if it works for batch_size > 1 (num_rollouts > 1)
        num_rollouts = tf.shape(self.obs_ph)[0] * self.n_candidates
        act = tf.random.uniform(
            shape=[self.horizon, num_rollouts, self.action_space_dims],
            minval=self.env.action_space.low,
            maxval=self.env.action_space.high)

        # Equivalent to np.repeat: each observation ends up on
        # `n_candidates` consecutive rows.
        tiled_obs = tf.tile(
            tf.expand_dims(self.obs_ph, -1), [1, self.n_candidates, 1])
        observation = tf.reshape(tiled_obs, [-1, self.obs_space_dims])

        returns = 0  # becomes (batch_size * n_candidates,)
        for step in range(self.horizon):
            step_actions = act[step]
            next_observation = self.dynamics_model.predict_sym(
                observation, step_actions)
            assert self.reward_model is None
            rewards = self.unwrapped_env.tf_reward(
                observation, step_actions, next_observation)
            returns = returns + self.discount ** step * rewards
            observation = next_observation

        # (batch_size, n_candidates)
        returns = tf.reshape(returns, (-1, self.n_candidates))
        # (batch_size, n_candidates, act_dims)
        cand_a = tf.reshape(
            act[0], [-1, self.n_candidates, self.action_space_dims])
        # (batch_size, 1)
        idx = tf.reshape(tf.argmax(returns, axis=1), [-1, 1])
        best_first_action = tf.batch_gather(cand_a, idx)
        self.optimal_action = tf.squeeze(best_first_action, axis=1)
Ejemplo n.º 28
0
    def get_params(self, c, b, m):
        """Compute the conditioned, reordered weight matrix and bias.

        `c`, `b` and `m` are concatenated on axis 1 and fed to `self.wnn` and
        `self.bnn`. The `wnn` output is split into two factors that are
        multiplied into a [B, d, d] matrix and added to `self.w`; the `bnn`
        output is added to `self.b`. Both results are then transformed by a
        matrix built from the rows of diag(m * (1 - b)), ordered by
        descending value with a stable sort.

        Returns:
          (weight, bias) after the reordering transform.
        """
        batch = tf.shape(c)[0]
        dim = self.hps.dimension
        rank = self.hps.linear_rank
        if rank <= 0:
            # A non-positive rank selects the full dimension.
            rank = dim

        conditioning = tf.concat([c, b, m], axis=1)

        # Low-rank weight correction: (B, d, r) @ (B, r, d).
        factors = self.wnn(conditioning)
        left_factor, right_factor = tf.split(factors, 2, axis=1)
        left_factor = tf.reshape(left_factor, [batch, dim, rank])
        right_factor = tf.reshape(right_factor, [batch, rank, dim])
        weight_delta = tf.matmul(left_factor, right_factor)
        bias_delta = self.bnn(conditioning)

        weight = weight_delta + self.w
        bias = bias_delta + self.b

        # reorder
        query = m * (1 - b)
        order = tf.contrib.framework.argsort(
            query, direction='DESCENDING', stable=True)
        reorder = tf.batch_gather(tf.matrix_diag(query), order)
        reorder_t = tf.transpose(reorder, perm=[0, 2, 1])

        weight = tf.matmul(tf.matmul(reorder, weight), reorder_t)
        bias_column = tf.expand_dims(bias, axis=-1)
        bias = tf.squeeze(tf.matmul(reorder, bias_column), axis=-1)

        return weight, bias
Ejemplo n.º 29
0
  def sample(self, num_samples=1):
    """Draw samples from the truncated rejection sampling distribution.

    `self.T` proposals are drawn for every requested sample up front. Each
    proposal except the last is accepted by a Bernoulli draw on its
    acceptance logit; the last is always accepted, so every row has at
    least one acceptance. The first accepted proposal in each row is
    returned.

    Args:
      num_samples: integer, number of samples to draw.

    Returns:
      samples: Tensor of samples from the distribution, [num_samples] + data_dim
    """
    flat_proposals = self.proposal.sample(num_samples * self.T)
    proposals = tf.reshape(flat_proposals,
                           [num_samples, self.T] + self.data_dim)
    logit_accept = tf.reshape(self.logit_accept_fn(flat_proposals),
                              [num_samples, self.T])

    # Bernoulli accept/reject for all but the final proposal of each row.
    sampled_accepts = tfd.Bernoulli(logits=logit_accept[:, :-1]).sample()
    # Add forced accept to last sample to ensure truncation
    forced_accept = tf.ones([num_samples, 1], dtype=sampled_accepts.dtype)
    accepts = tf.concat([sampled_accepts, forced_accept], axis=-1)

    # `accepts` is binary, so top-1 per row selects an accepted proposal;
    # this relies on top_k reporting the lowest index among tied values to
    # get the first acceptance.
    _, first_accept = tf.math.top_k(accepts, k=1, sorted=False)

    chosen = tf.batch_gather(proposals, first_accept)
    return tf.squeeze(chosen, axis=1)  # Squeeze the selected dim
Ejemplo n.º 30
0
    def _interpolate(self, xy1, xy2, points2):
        """Interpolate `points2` features onto the locations in `xy1`.

        For each location in `xy1`, the three nearest locations in `xy2` are
        found and their `points2` features are averaged with weights
        proportional to the inverse of their distance.

        Args:
          xy1: target coordinates, indexed as [batch, n1, coord_dim].
          xy2: source coordinates, indexed as [batch, n2, coord_dim]; n2 must
            be at least 3 for the top-3 selection.
          points2: source features, [batch, n2, channels]; the channel
            dimension must be statically known.

        Returns:
          Interpolated features of shape [batch, n1, channels].
        """
        num_neighbors = 3
        batch_size = tf.shape(xy1)[0]
        ndataset1 = tf.shape(xy1)[1]

        eps = 1e-6
        # Pairwise Euclidean distance via ||a||^2 - 2 a.b + ||b||^2; eps
        # keeps the sqrt argument away from zero.
        inner = tf.matmul(xy1, xy2, transpose_b=True)
        norm1 = tf.reduce_sum(xy1 * xy1, axis=-1, keepdims=True)
        norm2 = tf.reduce_sum(xy2 * xy2, axis=-1, keepdims=True)
        dist_mat = tf.sqrt(norm1 - 2 * inner +
                           tf.linalg.matrix_transpose(norm2) + eps)

        # top_k returns the largest entries, so it is run on the negated
        # distances to select the nearest neighbours.
        neg_dist, idx = tf.math.top_k(tf.negative(dist_mat), k=num_neighbors)

        # Negate back to true distances before clamping. Clamping the
        # negated values would turn every distance into 1e-10 and make the
        # weights uniform.
        dist = tf.maximum(tf.negative(neg_dist), 1e-10)
        inv_dist = 1.0 / dist
        # Normalize so the weights of each target location sum to one.
        weight = inv_dist / tf.reduce_sum(inv_dist, axis=2, keepdims=True)

        # Gather neighbour features with a flat index per batch element, then
        # restore the (n1, neighbours) layout.
        idx = tf.reshape(idx, (batch_size, -1))
        nn_points = tf.batch_gather(points2, idx)
        nn_points = tf.reshape(
            nn_points,
            (batch_size, ndataset1, num_neighbors,
             points2.get_shape()[-1].value))

        interpolated_points = tf.reduce_sum(
            weight[..., tf.newaxis] * nn_points, axis=-2)

        return interpolated_points