Code Example #1
File: bert.py Project: qlyu001/nlp_label_prediction
 def load_model(self,
                model_dir: str,
                model_config: str = "model_config.json"):
     model_config = os.path.join(model_dir, model_config)
     model_config = json.load(open(model_config))
     bert_config = json.load(
         open(os.path.join(model_dir, "bert_config.json")))
     model = BertNer(bert_config, tf.float32, model_config['num_labels'],
                     model_config['max_seq_length'])
     ids = tf.ones((1, 128), dtype=tf.int64)
     _ = model(ids, ids, ids, ids, training=False)
     model.load_weights(os.path.join(model_dir, "model.h5"))
     vocab = os.path.join(model_dir, "vocab.txt")
     tokenizer = FullTokenizer(vocab_file=vocab,
                               do_lower_case=model_config["do_lower"])
     return model, tokenizer, model_config
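
A minimal usage sketch for the loader above. The wrapper class name (`Ner`), the directory "output_dir", and the input sentence are assumptions for illustration only; the call signature `model(input_ids, input_mask, segment_ids, valid_ids)` follows the training/evaluation code in Code Example #3.

import tensorflow as tf

# Hypothetical usage of load_model -- `Ner` and "output_dir" are placeholders.
model, tokenizer, model_config = Ner().load_model("output_dir")

tokens = ["[CLS]"] + tokenizer.tokenize("Steve went to Paris") + ["[SEP]"]
max_len = model_config["max_seq_length"]
ids = tokenizer.convert_tokens_to_ids(tokens)
pad = max_len - len(ids)

input_ids = tf.constant([ids + [0] * pad], dtype=tf.int64)
input_mask = tf.constant([[1] * len(ids) + [0] * pad], dtype=tf.int64)
segment_ids = tf.zeros((1, max_len), dtype=tf.int64)
valid_ids = input_mask  # treat every WordPiece position as valid in this sketch

logits = model(input_ids, input_mask, segment_ids, valid_ids, training=False)
predictions = tf.argmax(logits, axis=2)  # map ids back via model_config["label_map"]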
Code Example #2
 def create_model(self, num_train_step, num_warmup_step):
     """
     根据config文件选择对应的模型,并初始化
     :return:
     """
     model = BertNer(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
     return model
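
A one-line usage sketch; `trainer` (an instance of the class that defines create_model) and the step counts are hypothetical.

# Hypothetical usage -- `trainer` and the step counts are placeholders.
model = trainer.create_model(num_train_step=10000, num_warmup_step=1000)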
Code Example #3
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help=
        "Bert pre-trained model selected in the list: bert-base-cased,bert-large-cased"
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    # Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev/test set.")
    parser.add_argument("--eval_on",
                        default="dev",
                        type=str,
                        help="Evaluation set, dev: Development, test: Test")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    # training strategy arguments
    parser.add_argument(
        "--multi_gpu",
        action='store_true',
        help="Set this flag to enable multi-gpu training using MirroredStrategy."
        "Single gpu training")
    parser.add_argument(
        "--gpus",
        default='0',
        type=str,
        help="Comma separated list of gpus devices."
        "For Single gpu pass the gpu id.Default '0' GPU"
        "For Multi gpu,if gpus not specified all the available gpus will be used"
    )

    args = parser.parse_args()

    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.do_train:
        tokenizer = FullTokenizer(os.path.join(args.bert_model, "vocab.txt"),
                                  args.do_lower_case)

    if args.multi_gpu:
        if len(args.gpus.split(',')) == 1:
            strategy = tf.distribute.MirroredStrategy()
        else:
            gpus = [f"/gpu:{gpu}" for gpu in args.gpus.split(',')]
            strategy = tf.distribute.MirroredStrategy(devices=gpus)
    else:
        gpu = args.gpus.split(',')[0]
        strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{gpu}")

    train_examples = None
    optimizer = None
    num_train_optimization_steps = 0
    ner = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) /
            args.train_batch_size) * args.num_train_epochs
        warmup_steps = int(args.warmup_proportion *
                           num_train_optimization_steps)
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=args.learning_rate,
            decay_steps=num_train_optimization_steps,
            end_learning_rate=0.0)
        if warmup_steps:
            learning_rate_fn = WarmUp(initial_learning_rate=args.learning_rate,
                                      decay_schedule_fn=learning_rate_fn,
                                      warmup_steps=warmup_steps)
        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=args.weight_decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=args.adam_epsilon,
            exclude_from_weight_decay=['layer_norm', 'bias'])

        with strategy.scope():
            ner = BertNer(args.bert_model, tf.float32, num_labels,
                          args.max_seq_length)
            loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
                reduction=tf.keras.losses.Reduction.NONE)

    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_ids for f in train_features], dtype=np.int32))
        all_input_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_mask for f in train_features], dtype=np.int32))
        all_segment_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.segment_ids for f in train_features],
                       dtype=np.int32))
        all_valid_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.valid_ids for f in train_features], dtype=np.int32))
        all_label_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_mask for f in train_features]))

        all_label_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_id for f in train_features], dtype=np.int32))

        # Dataset using tf.data
        train_data = tf.data.Dataset.zip(
            (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids,
             all_label_ids, all_label_mask))
        shuffled_train_data = train_data.shuffle(buffer_size=int(
            len(train_features) * 0.1),
                                                 seed=args.seed,
                                                 reshuffle_each_iteration=True)
        batched_train_data = shuffled_train_data.batch(args.train_batch_size)
        # Distributed dataset
        dist_dataset = strategy.experimental_distribute_dataset(
            batched_train_data)

        loss_metric = tf.keras.metrics.Mean()

        epoch_bar = master_bar(range(args.num_train_epochs))
        pb_max_len = math.ceil(
            float(len(train_features)) / float(args.train_batch_size))

        def train_step(input_ids, input_mask, segment_ids, valid_ids,
                       label_ids, label_mask):
            def step_fn(input_ids, input_mask, segment_ids, valid_ids,
                        label_ids, label_mask):

                with tf.GradientTape() as tape:
                    logits = ner(input_ids,
                                 input_mask,
                                 segment_ids,
                                 valid_ids,
                                 training=True)
                    label_mask = tf.reshape(label_mask, (-1, ))
                    logits = tf.reshape(logits, (-1, num_labels))
                    logits_masked = tf.boolean_mask(logits, label_mask)
                    label_ids = tf.reshape(label_ids, (-1, ))
                    label_ids_masked = tf.boolean_mask(label_ids, label_mask)
                    cross_entropy = loss_fct(label_ids_masked, logits_masked)
                    loss = tf.reduce_sum(cross_entropy) * (
                        1.0 / args.train_batch_size)
                grads = tape.gradient(loss, ner.trainable_variables)
                optimizer.apply_gradients(
                    list(zip(grads, ner.trainable_variables)))
                return cross_entropy

            per_example_losses = strategy.experimental_run_v2(
                step_fn,
                args=(input_ids, input_mask, segment_ids, valid_ids, label_ids,
                      label_mask))
            mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                        per_example_losses,
                                        axis=0)
            return mean_loss

        for epoch in epoch_bar:
            with strategy.scope():
                for (input_ids, input_mask, segment_ids, valid_ids, label_ids,
                     label_mask) in progress_bar(dist_dataset,
                                                 total=pb_max_len,
                                                 parent=epoch_bar):
                    loss = train_step(input_ids, input_mask, segment_ids,
                                      valid_ids, label_ids, label_mask)
                    loss_metric(loss)
                    epoch_bar.child.comment = f'loss : {loss_metric.result()}'
            loss_metric.reset_states()

        # model weight save
        ner.save_weights(os.path.join(args.output_dir, "model.h5"))
        # copy vocab to output_dir
        shutil.copyfile(os.path.join(args.bert_model, "vocab.txt"),
                        os.path.join(args.output_dir, "vocab.txt"))
        # copy bert config to output_dir
        shutil.copyfile(os.path.join(args.bert_model, "bert_config.json"),
                        os.path.join(args.output_dir, "bert_config.json"))
        # save label_map and max_seq_length of trained model
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": num_labels,
            "label_map": label_map
        }
        json.dump(model_config,
                  open(os.path.join(args.output_dir, "model_config.json"),
                       "w"),
                  indent=4)

    if args.do_eval:
        # load tokenizer
        tokenizer = FullTokenizer(os.path.join(args.output_dir, "vocab.txt"),
                                  args.do_lower_case)
        # model build hack : fix
        config = json.load(
            open(os.path.join(args.output_dir, "bert_config.json")))
        ner = BertNer(config, tf.float32, num_labels, args.max_seq_length)
        ids = tf.ones((1, 128), dtype=tf.int32)
        _ = ner(ids, ids, ids, ids, training=False)
        ner.load_weights(os.path.join(args.output_dir, "model.h5"))

        # load test or development set based on args
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)

        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evalution *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_ids for f in eval_features], dtype=np.int32))
        all_input_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_mask for f in eval_features], dtype=np.int32))
        all_segment_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.segment_ids for f in eval_features], dtype=np.int32))
        all_valid_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.valid_ids for f in eval_features], dtype=np.int32))

        all_label_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_id for f in eval_features], dtype=np.int32))

        eval_data = tf.data.Dataset.zip(
            (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids,
             all_label_ids))
        batched_eval_data = eval_data.batch(args.eval_batch_size)

        loss_metric = tf.keras.metrics.Mean()
        epoch_bar = master_bar(range(1))
        pb_max_len = math.ceil(
            float(len(eval_features)) / float(args.eval_batch_size))

        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for epoch in epoch_bar:
            for (input_ids, input_mask, segment_ids, valid_ids,
                 label_ids) in progress_bar(batched_eval_data,
                                            total=pb_max_len,
                                            parent=epoch_bar):
                logits = ner(input_ids,
                             input_mask,
                             segment_ids,
                             valid_ids,
                             training=False)
                logits = tf.argmax(logits, axis=2)
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, m in enumerate(label):
                        if j == 0:
                            continue
                        elif label_ids[i][j].numpy() == len(label_map):
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map[label_ids[i][j].numpy()])
                            temp_2.append(label_map[logits[i][j].numpy()])
        report = classification_report(y_true, y_pred, digits=4)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
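
The core of train_step above is a masked token-level loss: the model output and labels are flattened, padding and other invalid positions are dropped with tf.boolean_mask via label_mask, and the remaining per-token cross-entropy is summed and scaled by the global batch size. Below is a standalone sketch of that computation with made-up shapes and values, independent of the distribution strategy.

import tensorflow as tf

# Standalone sketch of the masked loss from train_step; shapes/values are made up.
num_labels, batch_size, seq_len = 5, 2, 4
probs = tf.nn.softmax(tf.random.normal((batch_size, seq_len, num_labels)), axis=-1)
label_ids = tf.constant([[1, 2, 0, 0], [3, 1, 4, 0]])    # 0 = padding
label_mask = tf.constant([[True, True, False, False],
                          [True, True, True, False]])

loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE)

flat_probs = tf.reshape(probs, (-1, num_labels))
flat_labels = tf.reshape(label_ids, (-1,))
flat_mask = tf.reshape(label_mask, (-1,))

cross_entropy = loss_fct(tf.boolean_mask(flat_labels, flat_mask),
                         tf.boolean_mask(flat_probs, flat_mask))
loss = tf.reduce_sum(cross_entropy) / batch_size  # same scaling as train_step
print(float(loss))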
Code Example #4
File: predict.py Project: zoeyhub/bert-for-task
 def create_model(self):
     """
             根据config文件选择对应的模型,并初始化
             :return:
             """
     self.model = BertNer(config=self.config, is_training=False)
Code Example #5
File: predict.py Project: zoeyhub/bert-for-task
class Predictor(object):
    def __init__(self, config):
        self.model = None
        self.config = config

        self.output_path = config["output_path"]
        self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
        self.label_to_index = self.load_vocab()
        self.word_vectors = None
        self.sequence_length = self.config["sequence_length"]

        # create the model
        self.create_model()
        # load the computation graph
        self.load_graph()

    def load_vocab(self):
        # load the label-to-index mapping

        with open(os.path.join(self.output_path, "label_to_index.json"),
                  "r") as f:
            label_to_index = json.load(f)

        return label_to_index

    def padding(self, input_id, input_mask, segment_id):
        """
        Pad (or truncate) the sequences to the fixed sequence length
        :param input_id:
        :param input_mask:
        :param segment_id:
        :return:
        """

        if len(input_id) < self.sequence_length:
            pad_input_id = input_id + [0] * (self.sequence_length -
                                             len(input_id))
            pad_input_mask = input_mask + [0] * (self.sequence_length -
                                                 len(input_mask))
            pad_segment_id = segment_id + [0] * (self.sequence_length -
                                                 len(segment_id))
            sequence_len = len(input_id)
        else:
            pad_input_id = input_id[:self.sequence_length]
            pad_input_mask = input_mask[:self.sequence_length]
            pad_segment_id = segment_id[:self.sequence_length]
            sequence_len = self.sequence_length

        return pad_input_id, pad_input_mask, pad_segment_id, sequence_len

    def sentence_to_idx(self, text):
        """
        Convert the tokenized sentence into its id representation
        :return:
        """
        tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path,
                                               do_lower_case=True)

        tokens = []
        for token in text:
            token = tokenizer.tokenize(token)
            tokens.extend(token)

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_id)
        segment_id = [0] * len(input_id)

        input_id, input_mask, segment_id, sequence_len = self.padding(
            input_id, input_mask, segment_id)

        return [input_id], [input_mask], [segment_id], [sequence_len]

    def load_graph(self):
        """
        Load the computation graph
        :return:
        """
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(
                self.config["ckpt_model_path"]))

    def create_model(self):
        """
        Select the model specified in the config file and initialize it.
        :return:
        """
        self.model = BertNer(config=self.config, is_training=False)

    def predict(self, text):
        """
        Given a tokenized sentence, predict its classification result
        :param text:
        :return:
        """
        input_ids, input_masks, segment_ids, sequence_len = self.sentence_to_idx(
            text)

        prediction = self.model.infer(
            self.sess,
            dict(input_ids=input_ids,
                 input_masks=input_masks,
                 segment_ids=segment_ids,
                 sequence_len=sequence_len)).tolist()
        print(prediction)
        chunks = get_chunk(prediction, self.label_to_index)
        return chunks
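
A hedged usage sketch for Predictor. Every path and value below is a placeholder; the config keys shown are only those the class itself reads ("output_path", "bert_model_path", "ckpt_model_path", "sequence_length"), and the BertNer constructor in this project will expect additional keys.

# Hypothetical usage of Predictor -- every path and value is a placeholder.
config = {
    "output_path": "output",          # must contain label_to_index.json
    "bert_model_path": "bert_model",  # must contain vocab.txt
    "ckpt_model_path": "ckpt_model",  # directory with the TF checkpoint
    "sequence_length": 128,
    # ... plus whatever keys BertNer(config=...) needs in this project
}

predictor = Predictor(config)
chunks = predictor.predict(["Steve", "went", "to", "Paris"])  # predict() iterates token by token
print(chunks)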