Example #1
 def get_bert_embeddings(self, flattened_input_ids, flattened_input_mask,
                         is_training: bool):
     """
     applying BERT to each sliding window, and get token embeddings corresponding to the right tokens
     :param flattened_input_ids: [-1]
     :param flattened_input_mask: [-1]
     :param is_training:
     :return: (num_tokens, embed_size)
     """
     input_ids = tf.reshape(flattened_input_ids,
                            [-1, self.config.sliding_window_size])
     input_mask = tf.reshape(flattened_input_mask,
                             [-1, self.config.sliding_window_size])
     actual_mask = tf.cast(tf.not_equal(input_mask, self.config.pad_idx),
                           tf.int32)
     with tf.variable_scope('bert', reuse=tf.AUTO_REUSE):
         bert_model = BertModel(self.bert_config,
                                is_training,
                                input_ids,
                                actual_mask,
                                scope='bert')
     bert_embeddings = bert_model.get_sequence_output(
     )  # (num_windows, window_size, embed_size)
     flattened_embeddings = tf.reshape(bert_embeddings,
                                       [-1, self.bert_config.hidden_size])
     flattened_mask = tf.greater_equal(flattened_input_mask, 0)
     output_embeddings = tf.boolean_mask(flattened_embeddings,
                                         flattened_mask)
     # Debug print of the static shapes of the window-level, flattened and
     # gathered embeddings.
     print('xixi', bert_embeddings.get_shape(),
           output_embeddings.get_shape(), flattened_embeddings.get_shape(),
           flattened_mask.get_shape())
     return output_embeddings
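The method above expects the document to be pre-split into fixed-size windows whose ids and masks are then flattened into single vectors. Below is a minimal NumPy sketch of one way to build such inputs; it assumes non-overlapping windows and the convention (suggested by the tf.greater_equal(..., 0) filter above, but not confirmed by the original project) that the mask stores each real token's document index and a negative pad value for padding positions. All names here are illustrative.

import numpy as np

def build_sliding_windows(token_ids, window_size, pad_token_id=0, pad_idx=-1):
    # Split token ids into fixed-size windows, padding the last window.
    # ids holds wordpiece ids; mask holds the document index of each real
    # token and pad_idx for padding positions.
    ids, mask = [], []
    for start in range(0, len(token_ids), window_size):
        window = token_ids[start:start + window_size]
        pad = window_size - len(window)
        ids.extend(window + [pad_token_id] * pad)
        mask.extend(list(range(start, start + len(window))) + [pad_idx] * pad)
    return np.array(ids, dtype=np.int32), np.array(mask, dtype=np.int32)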
Example #2
class CheckpointSmallBERT(AbstractBase):
    def __init__(self, path, training=False, max_seq_length=512):
        self.max_seq_length = max_seq_length
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.input_ids = tf.compat.v1.placeholder(
                tf.int32, shape=(None, self.max_seq_length))
            self.input_mask = tf.compat.v1.placeholder(
                tf.int32, shape=(None, self.max_seq_length))
            self.segment_ids = tf.compat.v1.placeholder(
                tf.int32, shape=(None, self.max_seq_length))
            self.bert_config = BertConfig.from_json_file(path +
                                                         '/bert_config.json')
            self.bert_module = BertModel(config=self.bert_config,
                                         is_training=training,
                                         input_ids=self.input_ids,
                                         input_mask=self.input_mask,
                                         token_type_ids=self.segment_ids,
                                         use_one_hot_embeddings=False)
            assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
                tf.trainable_variables(), path + '/bert_model.ckpt')
            tf.train.init_from_checkpoint(path + '/bert_model.ckpt',
                                          assignment_map)
            self.sess = tf.compat.v1.Session()
            self.sess.run(
                tf.group(tf.compat.v1.global_variables_initializer(),
                         tf.compat.v1.tables_initializer()))
            self.bert_outputs = {
                'sequence_output': self.bert_module.get_sequence_output(),
                'pooled_output': self.bert_module.get_pooled_output(),
            }
            self.tok = tokenization.FullTokenizer(vocab_file=path +
                                                  '/vocab.txt',
                                                  do_lower_case=True)
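A hedged usage sketch for the class above: tokenize one sentence, pad it to max_seq_length, and fetch the cached pooled output. The checkpoint directory is a placeholder, and the snippet assumes AbstractBase imposes no additional constructor requirements.

# Illustrative usage (not from the original file).
bert = CheckpointSmallBERT(path='./small_bert')  # hypothetical checkpoint dir
tokens = ['[CLS]'] + bert.tok.tokenize('hello world')[:bert.max_seq_length - 2] + ['[SEP]']
ids = bert.tok.convert_tokens_to_ids(tokens)
pad = bert.max_seq_length - len(ids)
pooled = bert.sess.run(
    bert.bert_outputs['pooled_output'],
    feed_dict={
        bert.input_ids: [ids + [0] * pad],
        bert.input_mask: [[1] * len(ids) + [0] * pad],
        bert.segment_ids: [[0] * bert.max_seq_length],
    })  # (1, hidden_size)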
Example #3
    def __init__(self):
        bert_pretrained_dir = args.pretrain_models_path + args.bert_model_name
        self.do_lower_case = args.bert_model_name.startswith('uncased')
        self.vocab_file = os.path.join(bert_pretrained_dir, 'vocab.txt')
        self.config_file = os.path.join(bert_pretrained_dir,
                                        'bert_config.json')
        self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                       do_lower_case=self.do_lower_case)

        self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
        self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
        self.segment_ids = tf.placeholder(tf.int64, [None, None],
                                          'segment_ids')

        bert_config = BertConfig.from_json_file(self.config_file)
        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=self.input_id,
                          input_mask=self.input_mask,
                          token_type_ids=self.segment_ids,
                          use_one_hot_embeddings=True,
                          scope='bert')
        self.output_layer = model.get_sequence_output()
        self.embedding_layer = model.get_embedding_output()

        saver = tf.train.Saver()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        saver.restore(self.session, bert_pretrained_dir + '/bert_model.ckpt')
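A hedged sketch of a companion method for the initializer above; the extract name and the assumption that the inputs are already padded int64 arrays of shape (batch, seq_len) are illustrative, not from the original project.

    # Illustrative companion method (not from the original file).
    def extract(self, input_ids, input_mask, segment_ids):
        # input_ids / input_mask / segment_ids: int64 arrays, shape (batch, seq_len)
        return self.session.run(
            self.output_layer,
            feed_dict={self.input_id: input_ids,
                       self.input_mask: input_mask,
                       self.segment_ids: segment_ids})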
Example #4
 def model_fn(features, labels, mode, params):
     input_ids = features["input_ids"]
     input_mask = features["input_mask"]
     segment_ids = features["segment_ids"]
     model = BertModel(config, True, input_ids, input_mask, segment_ids)
     final_hidden = model.get_sequence_output()
     return final_hidden
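This fragment only extracts the final hidden states; a hedged sketch of exercising it directly, assuming model_fn and a BertConfig named config are visible at module scope and the sequence length is 128 (both assumptions are mine):

import tensorflow as tf

# Illustrative driver (not from the original file).
features = {
    name: tf.placeholder(tf.int32, [None, 128], name=name)
    for name in ('input_ids', 'input_mask', 'segment_ids')
}
final_hidden = model_fn(features, labels=None,
                        mode=tf.estimator.ModeKeys.TRAIN, params={})
# final_hidden: (batch_size, 128, hidden_size)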
Example #5
        def qa_loop_body(i, starts, ends, labels, scores):
            input_ids = tf.reshape(flattened_input_ids,
                                   [-1, self.config.sliding_window_size
                                    ])  # (num_windows, window_size)
            input_mask = tf.reshape(flattened_input_mask,
                                    [-1, self.config.sliding_window_size])
            actual_mask = tf.cast(tf.not_equal(input_mask,
                                               self.config.pad_idx),
                                  tf.int32)  # (num_windows, window_size)

            num_windows = tf.shape(actual_mask)[0]
            question_tokens = self.get_question_token_ids(
                sentence_map, flattened_input_ids, flattened_input_mask,
                top_span_starts[i], top_span_ends[i])  # (num_question_tokens)
            tiled_question = tf.tile(
                tf.expand_dims(question_tokens, 0),
                [num_windows, 1])  # (num_windows, num_ques_tokens)
            question_ones = tf.ones_like(tiled_question, dtype=tf.int32)
            question_zeros = tf.zeros_like(tiled_question, dtype=tf.int32)
            qa_input_ids = tf.concat(
                [tiled_question, input_ids],
                1)  # (num_windows, num_ques_tokens + window_size)
            qa_input_mask = tf.concat(
                [question_ones, actual_mask],
                1)  # (num_windows, num_ques_tokens + window_size)
            token_type_ids = tf.concat([question_zeros, actual_mask], 1)
            with tf.variable_scope('bert', reuse=tf.AUTO_REUSE):
                bert_model = BertModel(self.bert_config,
                                       is_training,
                                       qa_input_ids,
                                       qa_input_mask,
                                       token_type_ids,
                                       scope='bert')
            bert_embeddings = bert_model.get_sequence_output(
            )  # num_windows, num_ques_tokens + window_size, embed_size
            flattened_embeddings = tf.reshape(
                bert_embeddings, [-1, self.bert_config.hidden_size])
            output_mask = tf.concat(
                [-1 * question_ones, input_mask],
                1)  # (num_windows, num_ques_tokens + window_size)
            flattened_mask = tf.reshape(tf.greater_equal(output_mask, 0), [-1])
            qa_embeddings = tf.boolean_mask(
                flattened_embeddings,
                flattened_mask)  # (num_tokens, embed_size)
            qa_scores, qa_indices, qa_starts, qa_ends, qa_embs = self.filter_by_mention_scores(
                qa_embeddings, candidate_starts, candidate_ends, dropout, c)
            qa_cluster_ids = self.get_top_span_cluster_ids(
                candidate_starts, candidate_ends, span_starts, span_ends,
                cluster_ids, qa_indices)
            return (i + 1,
                    tf.concat(
                        [starts, tf.expand_dims(qa_starts, axis=0)], axis=0),
                    tf.concat([ends, tf.expand_dims(qa_ends, axis=0)], axis=0),
                    tf.concat([labels,
                               tf.expand_dims(qa_cluster_ids, axis=0)],
                              axis=0),
                    tf.concat(
                        [scores, tf.expand_dims(qa_scores, axis=0)], axis=0))
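The loop body above is shaped for tf.while_loop over the top mention spans: each iteration appends one row to the accumulated starts/ends/labels/scores. A hedged driver sketch follows; k (number of top spans), c (spans kept per question) and the int32/float32 dtypes are assumptions, not taken from the original file.

        # Illustrative driver (not from the original file).
        init_vars = (tf.constant(0),
                     tf.zeros([0, c], tf.int32),    # starts
                     tf.zeros([0, c], tf.int32),    # ends
                     tf.zeros([0, c], tf.int32),    # labels (cluster ids)
                     tf.zeros([0, c], tf.float32))  # scores
        shape_inv = (tf.TensorShape([]),
                     tf.TensorShape([None, None]), tf.TensorShape([None, None]),
                     tf.TensorShape([None, None]), tf.TensorShape([None, None]))
        _, starts, ends, labels, scores = tf.while_loop(
            cond=lambda i, *_: i < k,
            body=qa_loop_body,
            loop_vars=init_vars,
            shape_invariants=shape_inv)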
Example #6
    def body(self, features, mode):
        """Body of the model, aka Bert

        Arguments:
            features {dict} -- feature dict,
                keys: input_ids, input_mask, segment_ids
            mode {mode} -- mode

        Returns:
            dict -- features extracted from bert.
                keys: 'seq', 'pooled', 'all', 'embed', 'embed_table'

        seq:
            tensor, [batch_size, seq_length, hidden_size]
        pooled:
            tensor, [batch_size, hidden_size]
        all:
            list of tensor, num_hidden_layers * [batch_size, seq_length, hidden_size]
        embed:
            tensor, [batch_size, seq_length, hidden_size]
        embed_table:
            tensor, [vocab_size, hidden_size]
        """

        config = self.config
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        model = BertModel(config=config.bert_config,
                          is_training=is_training,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=config.use_one_hot_embeddings)

        feature_dict = {}
        for logit_type in ['seq', 'pooled', 'all', 'embed', 'embed_table']:
            if logit_type == 'seq':
                # tensor, [batch_size, seq_length, hidden_size]
                feature_dict[logit_type] = model.get_sequence_output()
            elif logit_type == 'pooled':
                # tensor, [batch_size, hidden_size]
                feature_dict[logit_type] = model.get_pooled_output()
            elif logit_type == 'all':
                # list, num_hidden_layers * [batch_size, seq_length, hidden_size]
                feature_dict[logit_type] = model.get_all_encoder_layers()
            elif logit_type == 'embed':
                # for res connection
                feature_dict[logit_type] = model.get_embedding_output()
            elif logit_type == 'embed_table':
                feature_dict[logit_type] = model.get_embedding_table()

        return feature_dict
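A hedged sketch of consuming the returned feature dict, e.g. putting a classification head on the pooled [CLS] feature; the method name and num_labels argument are illustrative additions.

    # Illustrative follow-up (not part of the original class).
    def classification_logits(self, features, mode, num_labels):
        feature_dict = self.body(features, mode)
        return tf.layers.dense(
            feature_dict['pooled'], num_labels,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))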
Example #7
    def get_model(self):
        logging.info("get bert model")
        graph = tf.Graph()
        with graph.as_default():
            ph_input_ids = tf.placeholder(dtype=tf.int32, shape=[None, self._seq_length + 2], name="ph_input_ids")
            con = BertConfig.from_json_file(config.PROJECT_ROOT + "/bert_config.json")
            bert_model = BertModel(config=con, is_training=False, input_ids=ph_input_ids,
                                   use_one_hot_embeddings=True)
            output = bert_model.get_sequence_output()
            init = tf.global_variables_initializer()

        sess = tf.Session(graph=graph)
        sess.run(init)

        return sess, ph_input_ids, output
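A hedged usage sketch for get_model; note that no checkpoint is restored above, so the weights are randomly initialized, and the +2 in the placeholder width is presumably room for [CLS]/[SEP]. The embed name below is illustrative.

    # Illustrative usage (not from the original file).
    def embed(self, token_ids_batch):
        # token_ids_batch: int32 array, shape (batch, self._seq_length + 2)
        sess, ph_input_ids, output = self.get_model()
        return sess.run(output, feed_dict={ph_input_ids: token_ids_batch})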
Example #8
class BertEncoder(object):
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None):
        self.model = BertModel(config=config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=token_type_ids)

        self.embeddings_table = self.model.get_embedding_table()

    def encode(self):
        # encoded: sequence_output, shape = [batch_size, seq_length, hidden_size].
        output = self.model.get_sequence_output()
        states = ()
        for layer in self.model.get_all_encoder_layers():
            states += (tf.reduce_mean(layer, axis=1), )
        return output, states,
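A hedged construction-and-encode sketch for BertEncoder, assuming a config json on disk and a sequence length of 128 (both illustrative):

# Illustrative usage (not from the original file).
config = BertConfig.from_json_file('bert_config.json')
input_ids = tf.placeholder(tf.int32, [None, 128])
encoder = BertEncoder(config, is_training=False, input_ids=input_ids)
sequence_output, layer_means = encoder.encode()
# sequence_output: (batch, 128, hidden); layer_means: one (batch, hidden) tensor per layer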
Example #9
def main(_):
    logging.set_verbosity(logging.INFO)

    for i in range(_NUM_PARTITIONS):
        tf.io.gfile.makedirs(
            os.path.join(FLAGS.output_bert_feature_dir, '%02d' % i))

    # Create Bert model.
    bert_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.bert_vocab_file, do_lower_case=FLAGS.do_lower_case)

    # Bert prediction.
    input_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
    token_to_id_layer = token_to_id.TokenToIdLayer(FLAGS.bert_vocab_file,
                                                   unk_token_id=UNK)

    bert_config = BertConfig.from_json_file(FLAGS.bert_config_file)
    bert_model = BertModel(bert_config,
                           is_training=False,
                           input_ids=token_to_id_layer(
                               tf.expand_dims(input_placeholder, 0)))
    sequence_output = bert_model.get_sequence_output()[0]
    pooled_output = bert_model.get_pooled_output()[0]
    saver = tf.compat.v1.train.Saver()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    sess.run(tf.compat.v1.tables_initializer())
    saver.restore(sess, FLAGS.bert_checkpoint_file)

    for name in sess.run(tf.compat.v1.report_uninitialized_variables()):
        logging.warning('%s is uninitialized!', name)

    def _bert_fn(sequence):
        return sess.run([sequence_output, pooled_output],
                        feed_dict={input_placeholder: sequence})

    # Load annotations.
    annots = _load_annotations(FLAGS.annotations_jsonl_file)
    logging.info('Loaded %i annotations.', len(annots))

    shard_id, num_shards = FLAGS.shard_id, FLAGS.num_shards
    assert 0 <= shard_id < num_shards

    for idx, annot in enumerate(annots):
        if (idx + 1) % 1000 == 0:
            logging.info('On example %i/%i.', idx + 1, len(annots))

        annot_id = int(annot['annot_id'].split('-')[-1])
        if annot_id % num_shards != shard_id:
            continue

        # Check npy file.
        part_id = get_partition_id(annot['annot_id'])
        output_file = os.path.join(FLAGS.output_bert_feature_dir,
                                   '%02d' % part_id,
                                   annot['annot_id'] + '.npy')
        if os.path.isfile(output_file):
            logging.info('%s is there.', output_file)
            continue

        # Create TF example.
        bert_outputs = _create_bert_embeddings(annot, bert_tokenizer,
                                               FLAGS.do_lower_case, _bert_fn)
        with open(output_file, 'wb') as f:
            np.save(f, bert_outputs)

    logging.info('Done')
Example #10
    def build(self, data_iter, bert_config_file):
        # get the inputs
        with tf.variable_scope('inputs'):
            input_map = data_iter.get_next()
            usrid, prdid, input_x, input_y, doc_len = \
                (input_map['usr'], input_map['prd'],
                 input_map['content'], input_map['rating'],
                 input_map['doc_len'])

            input_x = tf.cast(input_x, tf.int32)
            self.usr = lookup(self.embeddings['usr_emb'],
                              usrid,
                              name='cur_usr_embedding')
            self.prd = lookup(self.embeddings['prd_emb'],
                              prdid,
                              name='cur_prd_embedding')
            # input_x = lookup(self.embeddings['wrd_emb'], input_x, name='cur_wrd_embedding')
            input_x = tf.reshape(input_x, [-1, self.max_doc_len])
            input_mask = tf.sequence_mask(doc_len, self.max_doc_len)
            input_mask = tf.cast(input_mask, tf.int32)

        bert_config = BertConfig.from_json_file(bert_config_file)
        bert = BertModel(bert_config,
                         is_training=True,
                         input_ids=input_x,
                         input_mask=input_mask,
                         token_type_ids=None,
                         use_one_hot_embeddings=False)

        pooled_output = bert.get_pooled_output()
        sequence_output = bert.get_sequence_output()
        alphas = attention(sequence_output, None, self.max_doc_len,
                           self.max_doc_len)
        sequence_output = tf.matmul(alphas, sequence_output)
        sequence_output = tf.squeeze(sequence_output, axis=1)
        bert_output = tf.concat([pooled_output, sequence_output], axis=1)

        logits = tf.layers.dense(
            bert_output,
            self.cls_cnt,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
        self.bert_output = bert_output
        self.logits = logits

        # build the process of model
        prediction = tf.argmax(logits, 1, name='prediction')
        self.prediction = prediction

        with tf.variable_scope("loss"):
            sce = tf.nn.softmax_cross_entropy_with_logits_v2
            log_probs = tf.nn.log_softmax(logits)
            self.probs = tf.nn.softmax(logits)
            loss = -tf.reduce_sum(tf.one_hot(
                input_y, self.cls_cnt, dtype=tf.float32) * log_probs,
                                  axis=-1)
            self.loss = tf.reduce_mean(loss)
            # self.loss = sce(logits=logits, labels=tf.one_hot(input_y, self.cls_cnt))
            # self.loss = tf.reduce_mean(self.loss)
            self.total_loss = tf.reduce_sum(loss)

        with tf.variable_scope("metrics"):
            correct_prediction = tf.equal(prediction, input_y)
            self.correct = correct_prediction
            mse = tf.reduce_sum(tf.square(prediction - input_y), name="mse")
            correct_num = tf.reduce_sum(tf.cast(correct_prediction,
                                                dtype=tf.int32),
                                        name="correct_num")
            accuracy = tf.reduce_sum(tf.cast(correct_prediction, "float"),
                                     name="accuracy")

        return self.total_loss, mse, correct_num, accuracy
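A hedged one-step driver for build(); model, data_iter, the config path and the Adam optimizer are placeholders for whatever the original training script wires up.

# Illustrative driver (not from the original file).
total_loss, mse, correct_num, accuracy = model.build(data_iter, 'bert_config.json')
train_op = tf.train.AdamOptimizer(learning_rate=2e-5).minimize(total_loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(data_iter.initializer)  # assuming an initializable tf.data iterator
    _, step_loss = sess.run([train_op, total_loss])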
Example #11
File: model.py Project: xurenlu/chatbot
class ChatModel:
    def __init__(self, chatmodel_config):
        self.chatmodel_config = chatmodel_config
        self.max_x_len = chatmodel_config.max_x_len
        self.max_y_len = chatmodel_config.max_y_len
        self.decode_max_len = chatmodel_config.max_decode_len
        self.vocab = chatmodel_config.vocab
        self.config_file = chatmodel_config.config_file
        self.ckpt_file = chatmodel_config.ckpt_file
        self.beam_width = chatmodel_config.beam_width
        self.dropout_rate = chatmodel_config.dropout_rate
        self.coverage_penalty_weight = chatmodel_config.coverage_penalty_weight
        self.length_penalty_weight = chatmodel_config.length_penalty_weight
        self.x = tf.placeholder(tf.int32,
                                shape=[None, self.max_x_len],
                                name='x')
        self.x_mask = tf.placeholder(tf.int32,
                                     shape=[None, self.max_x_len],
                                     name='x_mask')
        self.x_seg = tf.placeholder(tf.int32,
                                    shape=[None, self.max_x_len],
                                    name='x_seg')
        self.x_len = tf.placeholder(tf.int32, shape=[None], name='x_len')
        self.y = tf.placeholder(tf.int32,
                                shape=[None, self.max_y_len],
                                name='y')
        self.y_len = tf.placeholder(tf.int32, shape=[None], name='y_len')

    def create_model(self):
        self.bert_config = BertConfig.from_json_file(self.config_file)
        self.vocab_size = self.bert_config.vocab_size
        self.hidden_size = self.bert_config.hidden_size
        self.bert_model = BertModel(config=self.bert_config,
                                    input_ids=self.x,
                                    input_mask=self.x_mask,
                                    token_type_ids=self.x_seg,
                                    is_training=True,
                                    use_one_hot_embeddings=False)
        if self.ckpt_file is not None:
            tvars = tf.trainable_variables()
            self.assignment_map, self.initialized_variable_map = modeling.get_assignment_map_from_checkpoint(
                tvars, self.ckpt_file)
        X = self.bert_model.get_sequence_output()
        self.embeddings = self.bert_model.get_embedding_table()
        encoder_output = X[:, 1:, :]
        encoder_state = X[:, 0, :]
        batch_size = tf.shape(self.x)[0]
        start_token = tf.ones([batch_size], dtype=tf.int32) * self.vocab['<S>']
        train_output = tf.concat([tf.expand_dims(start_token, 1), self.y], 1)
        output_emb = tf.nn.embedding_lookup(self.embeddings, train_output)
        output_len = self.y_len
        train_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            output_emb, output_len, self.embeddings, 0.1)
        input_len = self.x_len - 2
        cell = tf.contrib.rnn.GRUCell(num_units=self.hidden_size)

        def decode(scope):
            with tf.variable_scope(scope):
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=self.hidden_size,
                    memory=encoder_output,
                    memory_sequence_length=input_len)
                attention_cell = tf.contrib.seq2seq.AttentionWrapper(
                    cell=cell,
                    attention_mechanism=attention_mechanism,
                    attention_layer_size=self.hidden_size)
                out_cell = MyOutputProjectionWrapper(attention_cell,
                                                     self.vocab_size,
                                                     self.embeddings,
                                                     reuse=False)
                initial_state = out_cell.zero_state(dtype=tf.float32,
                                                    batch_size=batch_size)
                initial_state = initial_state.clone(cell_state=encoder_state)
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=out_cell,
                    helper=train_helper,
                    initial_state=initial_state)
                t_final_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.decode_max_len)
            with tf.variable_scope(scope, reuse=True):
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    encoder_output, multiplier=self.beam_width)
                tiled_encoder_state = tf.contrib.seq2seq.tile_batch(
                    encoder_state, multiplier=self.beam_width)
                tiled_input_len = tf.contrib.seq2seq.tile_batch(
                    input_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=self.hidden_size,
                    memory=tiled_encoder_output,
                    memory_sequence_length=tiled_input_len)
                attention_cell = tf.contrib.seq2seq.AttentionWrapper(
                    cell=cell,
                    attention_mechanism=attention_mechanism,
                    attention_layer_size=self.hidden_size)
                out_cell = MyOutputProjectionWrapper(attention_cell,
                                                     self.vocab_size,
                                                     self.embeddings,
                                                     reuse=True)
                initial_state = out_cell.zero_state(dtype=tf.float32,
                                                    batch_size=batch_size *
                                                    self.beam_width)
                initial_state = initial_state.clone(
                    cell_state=tiled_encoder_state)
                self.end_token = self.vocab['<T>']
                beamDecoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=out_cell,
                    embedding=self.embeddings,
                    start_tokens=start_token,
                    end_token=self.end_token,
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    coverage_penalty_weight=self.coverage_penalty_weight,
                    length_penalty_weight=self.length_penalty_weight)
                p_final_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=beamDecoder,
                    output_time_major=False,
                    maximum_iterations=self.decode_max_len)
            return t_final_output, p_final_output

        t_output, p_output = decode('decode')

        p_output = tf.identity(p_output.predicted_ids[:, :, 0],
                               name='predictions')
        return t_output, p_output

    def loss(self):
        t_output, p_output = self.create_model()
        decode_len = tf.shape(t_output.sample_id)[-1]
        y_target = self.y[:, :decode_len]
        mask_len = tf.maximum(decode_len, self.y_len)
        y_mask = tf.sequence_mask(mask_len, self.max_y_len, dtype=tf.float32)
        y_mask = y_mask[:, :decode_len]
        loss = tf.contrib.seq2seq.sequence_loss(t_output.rnn_output,
                                                y_target,
                                                weights=y_mask)
        p_output_sparse = self._convert_tensor_to_sparse(
            p_output, self.end_token)
        y_output_sparse = self._convert_tensor_to_sparse(
            self.y, self.end_token)
        distance = tf.reduce_sum(
            tf.edit_distance(p_output_sparse, y_output_sparse,
                             normalize=False))
        return loss, distance, p_output, t_output.sample_id

    def _convert_tensor_to_sparse(self, a, end_token):
        indices = tf.where(tf.not_equal(a, 0) & tf.not_equal(a, end_token))
        values = tf.gather_nd(a, indices)
        sparse_a = tf.SparseTensor(indices, values,
                                   tf.shape(a, out_type=tf.int64))
        return sparse_a
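A hedged training hook on top of loss(); the plain Adam optimizer and learning rate are illustrative stand-ins for whatever optimizer the original project uses.

    # Illustrative training hook (not part of the original class).
    def train_op(self, learning_rate=1e-5):
        loss, distance, p_output, sample_id = self.loss()
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.minimize(loss), loss, distance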