Example #1
0
def main():
    # 加载配置文件
    with open('./config.yml') as file_config:
        config = yaml.load(file_config)

    feature_names = config['model_params']['feature_names']

    # 初始化embedding shape, dropouts, 预训练的embedding也在这里初始化)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name]['path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # 加载数据

    # 加载vocs
    path_vocs = []
    for feature_name in feature_names:
        path_vocs.append(config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # 加载训练数据
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    data_dict = init_data(
        path=config['data_params']['path_train'], feature_names=feature_names, sep=sep,
        vocs=vocs, max_len=config['model_params']['sequence_length'], model='train')

    # 训练模型
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'], feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        clip=config['model_params']['clip'],
        path_model=config['model_params']['path_model'])

    model.fit(
        data_dict=data_dict, dev_size=config['model_params']['dev_size'])
Example #2
0
    def __init__(self):
        self.vocab_path = FLAGS.vocab_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.freeze_graph_path = FLAGS.freeze_graph_path
        self.saved_model_path = FLAGS.saved_model_path

        self.use_crf = FLAGS.use_crf
        self.num_steps = FLAGS.num_steps

        self.default_label = FLAGS.default_label
        self.default_score = FLAGS.default_predict_score

        self.data_utils = DataUtils()
        self.tensorflow_utils = TensorflowUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.sequence_labeling_model = SequenceLabelingModel()
        self.init_predict_graph()
Example #3
0
def train():
    train_iter, val_iter, test_iter = get_data_iter()
    model = SequenceLabelingModel(args, logger).cuda()
    optimizer, scheduler = get_optimizer_scheduler(model)
    early_stopping_criterion = EarlyStoppingCriterion(patience=args.early_stopping_patience)

    logger.info('Start training')

    for epoch in range(args.max_epochs):
        cur_lr = get_lr(optimizer)
        logger.info('Epoch %d, lr %.6f' % (epoch, cur_lr))

        model.train()
        train_score = []
        batch_num = len(train_iter)
        cur_num = 0
        train_iter.init_epoch()
        progress = tqdm(train_iter, mininterval=2, leave=False, file=sys.stdout)
        for i, batch in enumerate(progress):
            optimizer.zero_grad()

            batch_score = model.forward(batch)
            train_score.append(batch_score.item())
            cur_num += batch.batch_size
            progress.set_description(desc='%d/%d, train loss %.4f' % (i, batch_num, sum(train_score) / cur_num))
            batch_score.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()

        val_score = evaluate(model, val_iter, 'val')
        test_score = evaluate(model, test_iter, 'test')
        if not early_stopping_criterion.step(val_score):
            break
        scheduler.step(val_score)
Example #4
0
    def __init__(self):
        self.tfrecords_path = FLAGS.tfrecords_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.tensorboard_path = FLAGS.tensorboard_path

        self.use_crf = FLAGS.use_crf
        self.learning_rate = FLAGS.learning_rate
        self.learning_rate_decay_factor = FLAGS.learning_rate_decay_factor
        self.decay_steps = FLAGS.decay_steps
        self.clip_norm = FLAGS.clip_norm
        self.max_training_step = FLAGS.max_training_step

        self.train_tfrecords_filename = os.path.join(self.tfrecords_path,
                                                     'train.tfrecords')
        self.test_tfrecords_filename = os.path.join(self.tfrecords_path,
                                                    'test.tfrecords')

        self.data_utils = DataUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(
            os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.tensorflow_utils = TensorflowUtils()
        self.sequence_labeling_model = SequenceLabelingModel()
Example #5
0
def main():
    # 加载配置文件
    with open('./config.yml') as file_config:
        config = yaml.load(file_config)

    feature_names = config['model_params']['feature_names']
    use_char_feature = config['model_params']['use_char_feature']

    # 初始化embedding shape, dropouts, 预训练的embedding也在这里初始化)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name]['path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None
    # 加载数据

    # 加载vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']['path'])
    for feature_name in feature_names:
        path_vocs.append(config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # 加载数据
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']
    data_dict = init_data(
        path=config['data_params']['path_test'], feature_names=feature_names, sep=sep,
        vocs=vocs, max_len=max_len, model='test', use_char_feature=use_char_feature,
        word_len=word_len)

    # 加载模型
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'], feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=config['model_params']['path_model'])
    saver = tf.train.Saver()
    saver.restore(model.sess, config['model_params']['path_model'])

    # 标记
    viterbi_sequences = model.predict(data_dict)

    # 写入文件
    label_voc = dict()
    for key in vocs[-1]:
        label_voc[vocs[-1][key]] = key
    with codecs.open(config['data_params']['path_test'], 'r', encoding='utf-8') as file_r:
        sentences = file_r.read().strip().split('\n\n')
    file_result = codecs.open(
        config['data_params']['path_result'], 'w', encoding='utf-8')
    for i, sentence in enumerate(sentences):
        for j, item in enumerate(sentence.split('\n')):
            if j < len(viterbi_sequences[i]):
                file_result.write('%s\t%s\n' % (item, label_voc[viterbi_sequences[i][j]]))
            else:
                file_result.write('%s\tO\n' % item)
        file_result.write('\n')

    file_result.close()
Example #6
0
class Predict(object):
    def __init__(self):
        self.vocab_path = FLAGS.vocab_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.freeze_graph_path = FLAGS.freeze_graph_path
        self.saved_model_path = FLAGS.saved_model_path

        self.use_crf = FLAGS.use_crf
        self.num_steps = FLAGS.num_steps

        self.default_label = FLAGS.default_label
        self.default_score = FLAGS.default_predict_score

        self.data_utils = DataUtils()
        self.tensorflow_utils = TensorflowUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(
            os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.sequence_labeling_model = SequenceLabelingModel()
        self.init_predict_graph()

    def init_predict_graph(self):
        """
        init predict model graph
        :return:
        """
        # split 1-D String dense Tensor to words SparseTensor
        self.input_sentences = tf.placeholder(dtype=tf.string,
                                              shape=[None],
                                              name='input_sentences')
        sparse_words = tf.string_split(self.input_sentences, delimiter=' ')

        # slice SparseTensor
        valid_indices = tf.less(sparse_words.indices,
                                tf.constant([self.num_steps], dtype=tf.int64))
        valid_indices = tf.reshape(
            tf.split(valid_indices, [1, 1], axis=1)[1], [-1])
        valid_sparse_words = tf.sparse_retain(sparse_words, valid_indices)

        excess_indices = tf.greater_equal(
            sparse_words.indices, tf.constant([self.num_steps],
                                              dtype=tf.int64))
        excess_indices = tf.reshape(
            tf.split(excess_indices, [1, 1], axis=1)[1], [-1])
        excess_sparse_words = tf.sparse_retain(sparse_words, excess_indices)

        # compute sentences lengths
        int_values = tf.ones(shape=tf.shape(valid_sparse_words.values),
                             dtype=tf.int64)
        int_valid_sparse_words = tf.SparseTensor(
            indices=valid_sparse_words.indices,
            values=int_values,
            dense_shape=valid_sparse_words.dense_shape)
        input_sentences_lengths = tf.sparse_reduce_sum(int_valid_sparse_words,
                                                       axis=1)

        # sparse to dense
        default_padding_word = self.data_utils._START_VOCAB[0]
        words = tf.sparse_to_dense(
            sparse_indices=valid_sparse_words.indices,
            output_shape=[valid_sparse_words.dense_shape[0], self.num_steps],
            sparse_values=valid_sparse_words.values,
            default_value=default_padding_word)

        # dict words to ids
        with open(os.path.join(self.vocab_path, 'words_vocab.txt'),
                  encoding='utf-8',
                  mode='rt') as data_file:
            words_table_list = [
                line.strip() for line in data_file if line.strip()
            ]
        words_table_tensor = tf.constant(words_table_list, dtype=tf.string)
        words_table = lookup.index_table_from_tensor(
            mapping=words_table_tensor,
            default_value=self.data_utils._START_VOCAB_ID[3])
        # words_table = lookup.index_table_from_file(os.path.join(vocab_path, 'words_vocab.txt'), default_value=3)
        words_ids = words_table.lookup(words)

        # blstm model predict
        with tf.variable_scope('model', reuse=None):
            logits = self.sequence_labeling_model.inference(
                words_ids,
                input_sentences_lengths,
                self.num_classes,
                is_training=False)

        if self.use_crf:
            logits = tf.reshape(logits,
                                shape=[-1, self.num_steps, self.num_classes])
            transition_params = tf.get_variable(
                "transitions", [self.num_classes, self.num_classes])
            input_sentences_lengths = tf.to_int32(input_sentences_lengths)
            predict_labels_ids, sequence_scores = crf.crf_decode(
                logits, transition_params, input_sentences_lengths)
            predict_labels_ids = tf.to_int64(predict_labels_ids)
            sequence_scores = tf.reshape(sequence_scores, shape=[-1, 1])
            normalized_sequence_scores = self.tensorflow_utils.score_normalize(
                sequence_scores)
            predict_scores = tf.matmul(
                normalized_sequence_scores,
                tf.ones(shape=[1, self.num_steps], dtype=tf.float32))
        else:
            props = tf.nn.softmax(logits)
            max_prop_values, max_prop_indices = tf.nn.top_k(props, k=1)
            predict_labels_ids = tf.reshape(max_prop_indices,
                                            shape=[-1, self.num_steps])
            predict_labels_ids = tf.to_int64(predict_labels_ids)
            predict_scores = tf.reshape(max_prop_values,
                                        shape=[-1, self.num_steps])
        predict_scores = tf.as_string(predict_scores, precision=3)

        # dict ids to labels
        with open(os.path.join(self.vocab_path, 'labels_vocab.txt'),
                  encoding='utf-8',
                  mode='rt') as data_file:
            labels_table_list = [
                line.strip() for line in data_file if line.strip()
            ]
        labels_table_tensor = tf.constant(labels_table_list, dtype=tf.string)
        labels_table = lookup.index_to_string_table_from_tensor(
            mapping=labels_table_tensor, default_value=self.default_label)
        # labels_table = lookup.index_to_string_table_from_file(os.path.join(vocab_path, 'labels_vocab.txt'), default_value='O')
        predict_labels = labels_table.lookup(predict_labels_ids)

        sparse_predict_labels = self.tensorflow_utils.sparse_concat(
            predict_labels, valid_sparse_words, excess_sparse_words,
            self.default_label)
        sparse_predict_scores = self.tensorflow_utils.sparse_concat(
            predict_scores, valid_sparse_words, excess_sparse_words, '0.0')

        self.format_predict_labels = self.tensorflow_utils.sparse_string_join(
            sparse_predict_labels, 'predict_labels')
        self.format_predict_scores = self.tensorflow_utils.sparse_string_join(
            sparse_predict_scores, 'predict_scores')

        saver = tf.train.Saver()
        tables_init_op = tf.tables_initializer()

        self.sess = tf.Session()
        self.sess.run(tables_init_op)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            print('read model from {}'.format(ckpt.model_checkpoint_path))
            saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found at %s' % self.checkpoint_path)
            return

    def predict(self, words_list):
        """
        Predict labels, the operation of transfer words to ids is processed by tensorflow tensor
        Input words list
        :param words_list:
        :return:
        """
        split_words_list = []
        map_split_indexes = []
        for index in range(len(words_list)):
            temp_words_list = self.data_utils.split_long_sentence(
                words_list[index], self.num_steps)
            map_split_indexes.append(
                list(
                    range(len(split_words_list),
                          len(split_words_list) + len(temp_words_list))))
            split_words_list.extend(temp_words_list)

        predict_labels, predict_scores = self.sess.run(
            [self.format_predict_labels, self.format_predict_scores],
            feed_dict={self.input_sentences: split_words_list})
        predict_labels_str = [
            predict_label.decode('utf-8') for predict_label in predict_labels
        ]
        predict_scores_str = [
            predict_score.decode('utf-8') for predict_score in predict_scores
        ]

        merge_predict_labels_str = []
        merge_predict_scores_str = []
        for indexes in map_split_indexes:
            merge_predict_label_str = ' '.join(
                [predict_labels_str[index] for index in indexes])
            merge_predict_labels_str.append(merge_predict_label_str)
            merge_predict_score_str = ' '.join(
                [predict_scores_str[index] for index in indexes])
            merge_predict_scores_str.append(merge_predict_score_str)

        return merge_predict_labels_str, merge_predict_scores_str

    def file_predict(self, data_filename, predict_filename):
        """
        Predict data_filename, save the predict result into predict_filename
        The label is split into single word, -B -M -E -S
        :param data_filename:
        :param predict_filename:
        :return:
        """
        print('Predict file ' + data_filename)
        sentence_list = []
        words_list = []
        labels_list = []
        predict_labels_list = []
        with open(data_filename, encoding='utf-8', mode='rt') as data_file:
            for line in data_file:
                words, labels = self.data_utils.split(line)
                if words and labels:
                    sentence_list.append(''.join(words))
                    words_list.append(' '.join(words))
                    labels_list.append(' '.join(labels))
                    predict_labels, _ = self.predict([' '.join(words)])
                    predict_labels_list.append(predict_labels[0])
        word_predict_label_list = []
        word_category_list = []
        word_predict_category_list = []
        for (words, labels, predict_labels) in zip(words_list, labels_list,
                                                   predict_labels_list):
            word_list = words.split()
            label_list = labels.split()
            predict_label_list = predict_labels.split()
            word_predict_label = ' '.join([
                word + '/' + predict_label
                for (word, predict_label) in zip(word_list, predict_label_list)
            ])
            word_predict_label_list.append(word_predict_label)
            # merge label
            merge_word_list, merge_label_list = self.data_utils.merge_label(
                word_list, label_list)
            word_category = ' '.join([
                word + '/' + label
                for (word, label) in zip(merge_word_list, merge_label_list)
                if label != self.default_label
            ])
            word_category_list.append(word_category)
            # merge predict label
            merge_predict_word_list, merge_predict_label_list = self.data_utils.merge_label(
                word_list, predict_label_list)
            word_predict_category = ' '.join([
                predict_word + '/' + predict_label
                for (predict_word, predict_label) in zip(
                    merge_predict_word_list, merge_predict_label_list)
                if predict_label != 'O'
            ])
            word_predict_category_list.append(word_predict_category)
        with open(predict_filename, encoding='utf-8',
                  mode='wt') as predict_file:
            for (sentence, word_predict_label, word_category, word_predict_category) in \
                    zip(sentence_list, word_predict_label_list, word_category_list, word_predict_category_list):
                predict_file.write('Passage: ' + sentence + '\n')
                predict_file.write('SinglePredict: ' + word_predict_label +
                                   '\n')
                predict_file.write('Merge: ' + word_category + '\n')
                predict_file.write('MergePredict: ' + word_predict_category +
                                   '\n\n')

    def freeze_graph(self):
        """
        Save graph into .pb file
        :return:
        """
        graph = tf.graph_util.convert_variables_to_constants(
            self.sess, self.sess.graph_def,
            ['init_all_tables', 'predict_labels', 'predict_scores'])
        tf.train.write_graph(graph,
                             self.freeze_graph_path,
                             'frozen_graph.pb',
                             as_text=False)
        print('Successfully freeze model to %s' % self.freeze_graph_path)

    def saved_model_pb(self):
        """
        Saved model into .ph and variables files, loading it by tensorflow serving,
        :return:
        """
        saved_model_path = os.path.join(self.saved_model_path, '1')
        if os.path.exists(saved_model_path):
            shutil.rmtree(saved_model_path)
        builder = tf.saved_model.builder.SavedModelBuilder(saved_model_path)
        input_tensor_info = tf.saved_model.utils.build_tensor_info(
            self.input_sentences)
        output_labels_tensor_info = tf.saved_model.utils.build_tensor_info(
            self.format_predict_labels)
        output_scores_tensor_info = tf.saved_model.utils.build_tensor_info(
            self.format_predict_scores)
        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'input_sentences': input_tensor_info},
            outputs={
                'predict_labels': output_labels_tensor_info,
                'predict_scores': output_scores_tensor_info
            },
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        legacy_init_op = tf.group(tf.tables_initializer(),
                                  name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            self.sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_segment': prediction_signature},
            legacy_init_op=legacy_init_op)
        builder.save()
        print('Successfully exported model to %s' % saved_model_path)
Example #7
0
def main():
    # 加载配置文件
    print("config5")
    with open('./train_config/config_b2b_tag_5_only_jieba.yml') as file_config:
        config = yaml.load(file_config)

    feature_names = config['model_params']['feature_names']
    use_char_feature = config['model_params']['use_char_feature']

    # 初始化embedding shape, dropouts, 预训练的embedding也在这里初始化)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
        feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # 加载数据

    # 加载vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']['path'])
    for feature_name in feature_names:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # 加载训练数据
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']
    data_dict = init_data(path=config['data_params']['path_train'],
                          feature_names=feature_names,
                          sep=sep,
                          vocs=vocs,
                          max_len=max_len,
                          model='train',
                          use_char_feature=use_char_feature,
                          word_len=word_len)

    # 训练模型
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        rnn_dropout=config['model_params']['bilstm_params']['rnn_dropout'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'],
        feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        clip=config['model_params']['clip'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        cnn_dropout_rate=config['model_params']['conv_dropout'],
        word_length=word_len,
        path_model=config['model_params']['path_model'])

    model.fit(data_dict=data_dict, dev_size=config['model_params']['dev_size'])
Example #8
0
for feature_name in feature_names:
    path_vocs.append(config['data_params']['voc_params'][feature_name]['path'])
path_vocs.append(config['data_params']['voc_params']['label']['path'])
vocs = load_vocs(path_vocs)
print(vocs[-1])
print(len(vocs))
# 加载模型
model = SequenceLabelingModel(
    sequence_length=config['model_params']['sequence_length'],
    nb_classes=config['model_params']['nb_classes'],
    nb_hidden=config['model_params']['bilstm_params']['num_units'],
    feature_weight_shape_dict=feature_weight_shape_dict,
    feature_init_weight_dict=feature_init_weight_dict,
    feature_weight_dropout_dict=feature_weight_dropout_dict,
    dropout_rate=config['model_params']['dropout_rate'],
    nb_epoch=config['model_params']['nb_epoch'],
    feature_names=feature_names,
    batch_size=config['model_params']['batch_size'],
    train_max_patience=config['model_params']['max_patience'],
    use_crf=config['model_params']['use_crf'],
    l2_rate=config['model_params']['l2_rate'],
    rnn_unit=config['model_params']['rnn_unit'],
    learning_rate=config['model_params']['learning_rate'],
    path_model=config['model_params']['path_model'])


def predict(string):
    choiceAction = []
    choiceTarget = []
    choiceData = []
    lab = writetxt(string)
Example #9
0
class Train(object):
    def __init__(self):
        self.tfrecords_path = FLAGS.tfrecords_path
        self.checkpoint_path = FLAGS.checkpoint_path
        self.tensorboard_path = FLAGS.tensorboard_path

        self.use_crf = FLAGS.use_crf
        self.learning_rate = FLAGS.learning_rate
        self.learning_rate_decay_factor = FLAGS.learning_rate_decay_factor
        self.decay_steps = FLAGS.decay_steps
        self.clip_norm = FLAGS.clip_norm
        self.max_training_step = FLAGS.max_training_step

        self.train_tfrecords_filename = os.path.join(self.tfrecords_path,
                                                     'train.tfrecords')
        self.test_tfrecords_filename = os.path.join(self.tfrecords_path,
                                                    'test.tfrecords')

        self.data_utils = DataUtils()
        self.num_classes = self.data_utils.get_vocabulary_size(
            os.path.join(FLAGS.vocab_path, 'labels_vocab.txt'))
        self.tensorflow_utils = TensorflowUtils()
        self.sequence_labeling_model = SequenceLabelingModel()

    def train(self):
        """
        train bilstm + crf model
        :return:
        """
        train_data = self.tensorflow_utils.read_and_decode(
            self.train_tfrecords_filename)
        train_batch_features, train_batch_labels, train_batch_features_lengths = train_data
        test_data = self.tensorflow_utils.read_and_decode(
            self.test_tfrecords_filename)
        test_batch_features, test_batch_labels, test_batch_features_lengths = test_data

        with tf.device('/cpu:0'):
            global_step = tf.Variable(0, name='global_step', trainable=False)
        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(self.learning_rate,
                                        global_step,
                                        self.decay_steps,
                                        self.learning_rate_decay_factor,
                                        staircase=True)
        optimizer = tf.train.RMSPropOptimizer(lr)

        with tf.variable_scope('model'):
            logits = self.sequence_labeling_model.inference(
                train_batch_features,
                train_batch_features_lengths,
                self.num_classes,
                is_training=True)
        train_batch_labels = tf.to_int64(train_batch_labels)

        if self.use_crf:
            loss, transition_params = self.sequence_labeling_model.crf_loss(
                logits, train_batch_labels, train_batch_features_lengths,
                self.num_classes)
        else:
            slice_logits, slice_train_batch_labels = self.sequence_labeling_model.slice_seq(
                logits, train_batch_labels, train_batch_features_lengths)
            loss = self.sequence_labeling_model.loss(slice_logits,
                                                     slice_train_batch_labels)

        with tf.variable_scope('model', reuse=True):
            accuracy_logits = self.sequence_labeling_model.inference(
                test_batch_features,
                test_batch_features_lengths,
                self.num_classes,
                is_training=False)
        test_batch_labels = tf.to_int64(test_batch_labels)
        if self.use_crf:
            accuracy = self.sequence_labeling_model.crf_accuracy(
                accuracy_logits, test_batch_labels,
                test_batch_features_lengths, transition_params,
                self.num_classes)
        else:
            slice_accuracy_logits, slice_test_batch_labels = self.sequence_labeling_model.slice_seq(
                accuracy_logits, test_batch_labels,
                test_batch_features_lengths)
            accuracy = self.sequence_labeling_model.accuracy(
                slice_accuracy_logits, slice_test_batch_labels)

        # summary
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('lr', lr)

        # compute and update gradient
        # train_op = optimizer.minimize(loss, global_step=global_step)

        # computer, clip and update gradient
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        clip_gradients, _ = tf.clip_by_global_norm(gradients, self.clip_norm)
        train_op = optimizer.apply_gradients(zip(clip_gradients, variables),
                                             global_step=global_step)

        init_op = tf.global_variables_initializer()
        saver = tf.train.Saver(max_to_keep=None)
        checkpoint_filename = os.path.join(self.checkpoint_path, 'model.ckpt')

        with tf.Session() as sess:
            summary_op = tf.summary.merge_all()
            writer = tf.summary.FileWriter(self.tensorboard_path, sess.graph)
            sess.run(init_op)

            ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
            if ckpt and ckpt.model_checkpoint_path:
                print('Continue training from the model {}'.format(
                    ckpt.model_checkpoint_path))
                saver.restore(sess, ckpt.model_checkpoint_path)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)

            max_accuracy = 0.0
            min_loss = 100000000.0
            try:
                while not coord.should_stop():
                    _, loss_value, step = sess.run(
                        [train_op, loss, global_step])
                    if step % 100 == 0:
                        accuracy_value, summary_value, lr_value = sess.run(
                            [accuracy, summary_op, lr])
                        china_tz = pytz.timezone('Asia/Shanghai')
                        current_time = datetime.datetime.now(china_tz)
                        print('[{}] Step: {}, loss: {}, accuracy: {}, lr: {}'.
                              format(current_time, step, loss_value,
                                     accuracy_value, lr_value))
                        if accuracy_value > max_accuracy and loss_value < min_loss:
                            writer.add_summary(summary_value, step)
                            data_clean.clean_checkpoint(self.checkpoint_path)
                            saver.save(sess,
                                       checkpoint_filename,
                                       global_step=step)
                            print('save model to %s-%d' %
                                  (checkpoint_filename, step))
                            max_accuracy = accuracy_value
                            min_loss = loss_value
                    if step >= self.max_training_step:
                        print('Done training after %d step' % step)
                        break
            except tf.errors.OutOfRangeError:
                print('Done training after reading all data')
            finally:
                coord.request_stop()
            coord.join(threads)
Example #10
0
def main():
    # 加载配置文件
    with open('./config.yml') as file_config:
        config = yaml.load(file_config)

    feature_names = config['model_params']['feature_names']
    logger.info(feature_names)
    use_char_feature = config['model_params']['use_char_feature']
    logger.info(use_char_feature)
    # 初始化embedding shape, dropouts, 预训练的embedding也在这里初始化)
    feature_weight_shape_dict = dict()
    feature_weight_dropout_dict = dict()
    feature_init_weight_dict = dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = config['model_params'][
            'embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = config['model_params'][
            'embed_params'][feature_name]['dropout_rate']
        # embeding mat, 比voc多了两行, 因为voc从2开始编序, 0, 1行用0填充
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path']  # 词嵌矩阵位置
        # logger.info("%s init mat path: %s" % (feature_name, path_pre_train))
        with open(path_pre_train, 'rb') as file_r:
            feature_init_weight_dict[feature_name] = pickle.load(file_r)
    logger.info(feature_weight_dropout_dict)
    logger.info(feature_weight_shape_dict)
    logger.info(feature_init_weight_dict)

    # char embedding shape
    if use_char_feature:
        # 暂时不考虑
        feature_weight_shape_dict['char'] = config['model_params'][
            'embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        # 利用卷集层来提取char的信息
        conv_filter_len_list = None
        conv_filter_size_list = None

    # 加载vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']
                         ['path'])  # vocs用于将文本数字序列化
    for feature_name in feature_names:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # 加载训练数据
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']  # 数据的分隔方式
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']

    # 通过voc 将input f1 和输出 label 数字序列化 得到训练的输入和输出
    # data_dict = None
    data_dict = init_data(path=config['data_params']['path_train'],
                          feature_names=feature_names,
                          sep=sep,
                          vocs=vocs,
                          max_len=max_len,
                          model='train',
                          use_char_feature=use_char_feature,
                          word_len=word_len)
    logger.info(data_dict)  # 每个特征序列化后的数据
    # 训练模型
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],  # 句子被固定长度
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        rnn_dropout=config['model_params']['bilstm_params']['rnn_dropout'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'],
        feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        clip=config['model_params']['clip'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        cnn_dropout_rate=config['model_params']['conv_dropout'],
        word_length=word_len,
        path_model=config['model_params']['path_model'],
        last_train_sess_path=None,  # 为了加快训练的速度我们继续载入前面训练的参数
        transfer=False)  # 是否对前面载入的参数进行迁移学习,True的话就重置LSTM的输出层

    model.fit(data_dict=data_dict, dev_size=config['model_params']['dev_size'])
    """
def predict(testlist):
    # 加载配置文件
    with open('./config.yml') as file_config:
        config = yaml.load(file_config)

    feature_names = config['model_params']['feature_names']
    use_char_feature = config['model_params']['use_char_feature']

    # 初始化embedding shape, dropouts, 预训练的embedding也在这里初始化)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
    feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # 加载vocs
    path_vocs = []
    if use_char_feature:
        path_vocs.append(config['data_params']['voc_params']['char']['path'])
    for feature_name in feature_names:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # 加载数据
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']
    data_dict = init_data(path=config['data_params']['path_test'],
                          feature_names=feature_names,
                          sep=sep,
                          test_sens=testlist,
                          vocs=vocs,
                          max_len=max_len,
                          model='test',
                          use_char_feature=use_char_feature,
                          word_len=word_len)

    # 加载模型
    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'],
        feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=config['model_params']['path_model'])
    saver = tf.train.Saver()
    saver.restore(model.sess, config['model_params']['path_model'])

    # print('data_dict', data_dict)
    # 标记
    result_sequences = model.predict(data_dict)

    #print('result_sequences', result_sequences)

    # 输出结果
    label_voc = dict()
    for key in vocs[-1]:
        label_voc[vocs[-1][key]] = key

    outlist = []
    for i, sentence in enumerate(testlist):
        templist = []
        for j, item in enumerate(sentence):
            #char = recheck_char(item[0])
            char = item[0]
            if j < len(result_sequences[i]):
                out = [char, label_voc[result_sequences[i][j]]]
            else:
                out = [char, 'O']
            templist.append(out)
        outlist.append(templist)
    return outlist
def export_serving_model():
    """输出tensorserving model"""
    with open('./config.yml') as file_config:
        config = yaml.load(file_config)

    feature_names = config['model_params']['feature_names']
    use_char_feature = config['model_params']['use_char_feature']

    # 初始化embedding shape, dropouts, 预训练的embedding也在这里初始化)
    feature_weight_shape_dict, feature_weight_dropout_dict, \
    feature_init_weight_dict = dict(), dict(), dict()
    for feature_name in feature_names:
        feature_weight_shape_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['shape']
        feature_weight_dropout_dict[feature_name] = \
            config['model_params']['embed_params'][feature_name]['dropout_rate']
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path']
        if path_pre_train:
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)
    # char embedding shape
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            config['model_params']['embed_params']['char']['shape']
        conv_filter_len_list = config['model_params']['conv_filter_len_list']
        conv_filter_size_list = config['model_params']['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # 加载vocs
    path_vocs = []

    for feature_name in feature_names:
        path_vocs.append(
            config['data_params']['voc_params'][feature_name]['path'])
    path_vocs.append(config['data_params']['voc_params']['label']['path'])
    vocs = load_vocs(path_vocs)

    # 加载数据
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = config['model_params']['sequence_length']
    word_len = config['model_params']['word_length']

    model = SequenceLabelingModel(
        sequence_length=config['model_params']['sequence_length'],
        nb_classes=config['model_params']['nb_classes'],
        nb_hidden=config['model_params']['bilstm_params']['num_units'],
        num_layers=config['model_params']['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=config['model_params']['dropout_rate'],
        nb_epoch=config['model_params']['nb_epoch'],
        feature_names=feature_names,
        batch_size=config['model_params']['batch_size'],
        train_max_patience=config['model_params']['max_patience'],
        use_crf=config['model_params']['use_crf'],
        l2_rate=config['model_params']['l2_rate'],
        rnn_unit=config['model_params']['rnn_unit'],
        learning_rate=config['model_params']['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=config['model_params']['path_model'])

    session = model.sess
    saver = tf.train.Saver()

    saver.restore(session, config['model_params']['path_model'])

    # 输出tensorserving model 过程
    model_version = 1
    work_dir = './Model/ner_model'

    export_path_base = work_dir
    export_path = os.path.join(tf.compat.as_bytes(export_path_base),
                               tf.compat.as_bytes(str(model_version)))

    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    # 定义输入变量
    tensor_info_input_x_f1 = tf.saved_model.utils.build_tensor_info(
        model.input_feature_ph_dict['f1'])
    tensor_info_weight_dropout_ph_dict_f1 = tf.saved_model.utils.build_tensor_info(
        model.weight_dropout_ph_dict['f1'])
    tensor_info_dropout_rate_ph = tf.saved_model.utils.build_tensor_info(
        model.dropout_rate_ph)
    tensor_info_rnn_dropout_rate_ph = tf.saved_model.utils.build_tensor_info(
        model.rnn_dropout_rate_ph)

    tensor_info_logits = tf.saved_model.utils.build_tensor_info(model.logits)
    tensor_info_actual_length = tf.saved_model.utils.build_tensor_info(
        model.sequence_actual_length)
    tensor_info_transition_params = tf.saved_model.utils.build_tensor_info(
        model.transition_params)

    # 构建过程
    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                'input_x_f1': tensor_info_input_x_f1,
                'weight_dropout_ph_dict_f1':
                tensor_info_weight_dropout_ph_dict_f1,
                'dropout_rate_ph': tensor_info_dropout_rate_ph,
                'rnn_dropout_rate_ph': tensor_info_rnn_dropout_rate_ph
            },
            outputs={
                'transition_params': tensor_info_transition_params,
                'logits': tensor_info_logits,
                'sequence_actual_length': tensor_info_actual_length,
            },
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    )

    builder.add_meta_graph_and_variables(
        session, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={'ner_predict': prediction_signature},
        main_op=tf.tables_initializer(),
        strip_default_attrs=True)

    builder.save()

    print('Done exporting!')