def __init__(self,
              x_y_list,
              vocab_path,
              max_seq_length=256,
              vocab='base-cased',
              transform=None):
     self.max_seq_length = max_seq_length
     self.x_y_list = x_y_list
     self.vocab = vocab
     if self.vocab == 'base-cased':
         self.tokenizer = BertTokenizer.from_pretrained(
             'bert-base-cased', do_lower_case=False, do_basic_tokenize=True)
     elif self.vocab == 'finance-cased':
         self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                        do_lower_case=False,
                                        do_basic_tokenize=True)
     elif self.vocab == 'base-uncased':
         self.tokenizer = BertTokenizer.from_pretrained(
             'bert-base-uncased',
             do_lower_case=True,
             do_basic_tokenize=True)
     elif self.vocab == 'finance-uncased':
         self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                        do_lower_case=True,
                                        do_basic_tokenize=True)
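The constructor above only chooses a tokenizer; encoding text into fixed-length inputs happens elsewhere in the class. Below is a minimal sketch of that step, assuming a tokenizer built as above; the helper name encode_text and the padding id 0 are illustrative and not taken from the original code.

import torch

def encode_text(tokenizer, text, max_seq_length):
    # Tokenize, truncate to leave room for the special tokens, and wrap with [CLS]/[SEP].
    tokens = tokenizer.tokenize(text)[:max_seq_length - 2]
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    # Pad with 0 so every example has the same length and can be batched.
    ids += [0] * (max_seq_length - len(ids))
    return torch.tensor(ids)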
Example #2
    def load_electra_model(self):
        parser = argparse.ArgumentParser()
        args = parser.parse_args()
        args.output_encoded_layers = True
        args.output_attention_layers = True
        args.output_att_score = True
        args.output_att_sum = True
        self.args = args
        # Parse the config file; the teacher's and student's vocab are the same
        # Here we use the teacher's config and the fine-tuned teacher model; they can be swapped for the student's config and the distilled student model
        # student config:  config/chinese_bert_config_L4t.json
        # distil student model:  distil_model/gs8316.pkl
        bert_config_file_S = self.model_conf
        tuned_checkpoint_S = self.model_file
        # Load the student config; verify the max sequence length is no larger than the one in our configuration
        bert_config_S = ElectraConfig.from_json_file(bert_config_file_S)
        bert_config_S.num_labels = self.num_labels

        # Load the tokenizer
        self.predict_tokenizer = BertTokenizer(vocab_file=self.vocab_file)

        # Load the model
        self.predict_model = ElectraSPC(bert_config_S)
        assert os.path.exists(tuned_checkpoint_S), "Model file does not exist, please check"
        state_dict_S = torch.load(tuned_checkpoint_S, map_location=self.device)
        self.predict_model.load_state_dict(state_dict_S)
        if self.verbose:
            print("模型已加载")
        logger.info(f"预测模型{tuned_checkpoint_S}加载完成")
Example #3
    def __init__(self):
        self.tokenizer = BertTokenizer(
            vocab_file=os.path.join(main_dir, 'pretrained_bert',
                                    'uncased_L-12_H-768_A-12', 'vocab.txt'))

        # generate w2i, t2i, and train data
        self.get_vocab()
    def __init__(self, vocab_type):
        self.VOCAB_DICT = {
            'bc5cdr': ('<PAD>', 'B-Chemical', 'O', 'B-Disease', 'I-Disease',
                       'I-Chemical'),
            'bionlp3g':
            ('<PAD>', 'B-Amino_acid', 'B-Anatomical_system', 'B-Cancer',
             'B-Cell', 'B-Cellular_component',
             'B-Developing_anatomical_structure', 'B-Gene_or_gene_product',
             'B-Immaterial_anatomical_entity', 'B-Multi-tissue_structure',
             'B-Organ', 'B-Organism', 'B-Organism_subdivision',
             'B-Organism_substance', 'B-Pathological_formation',
             'B-Simple_chemical', 'B-Tissue', 'I-Amino_acid',
             'I-Anatomical_system', 'I-Cancer', 'I-Cell',
             'I-Cellular_component', 'I-Developing_anatomical_structure',
             'I-Gene_or_gene_product', 'I-Immaterial_anatomical_entity',
             'I-Multi-tissue_structure', 'I-Organ', 'I-Organism',
             'I-Organism_subdivision', 'I-Organism_substance',
             'I-Pathological_formation', 'I-Simple_chemical', 'I-Tissue', 'O')
        }
        self.VOCAB = self.VOCAB_DICT[vocab_type]
        self.tag2idx = {v: k for k, v in enumerate(self.VOCAB)}
        self.idx2tag = {k: v for k, v in enumerate(self.VOCAB)}

        self.batch_size = 128
        self.lr = 0.0001
        self.n_epochs = 30

        self.tokenizer = BertTokenizer(vocab_file=parameters.VOCAB_FILE,
                                       do_lower_case=False)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
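A short sketch of how the tag maps above are typically used: tags become ids for training and are mapped back to strings when decoding predictions. The class name HParams is hypothetical (the original snippet does not show it), and constructing it requires parameters.VOCAB_FILE to exist.

hp = HParams('bc5cdr')  # hypothetical name for the class defined above
tags = ['O', 'B-Chemical', 'I-Chemical', 'O']
tag_ids = [hp.tag2idx[t] for t in tags]     # [2, 1, 5, 2] for the bc5cdr tag set
decoded = [hp.idx2tag[i] for i in tag_ids]
assert decoded == tags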
    def load_model(self):
        parser = argparse.ArgumentParser()
        args = parser.parse_args()
        args.output_encoded_layers = True
        args.output_attention_layers = True
        args.output_att_score = True
        args.output_att_sum = True
        self.args = args
        # Parse the config file; the teacher's and student's vocab are the same
        self.vocab_file = "bert_model/vocab.txt"
        # Here we use the teacher's config and the fine-tuned teacher model; they can be swapped for the student's config and the distilled student model
        # student config:  config/chinese_bert_config_L4t.json
        # distil student model:  distil_model/gs8316.pkl
        self.bert_config_file_S = "bert_model/config.json"
        self.tuned_checkpoint_S = "trained_teacher_model/gs3024.pkl"
        self.max_seq_length = 70
        # Batch size used for prediction
        self.predict_batch_size = 64
        # Load the student config; verify the max sequence length is no larger than the one in our configuration
        bert_config_S = BertConfig.from_json_file(self.bert_config_file_S)

        # Load the tokenizer
        tokenizer = BertTokenizer(vocab_file=self.vocab_file)

        # Load the model
        model_S = BertSPCSimple(bert_config_S,
                                num_labels=self.num_labels,
                                args=self.args)
        state_dict_S = torch.load(self.tuned_checkpoint_S,
                                  map_location=self.device)
        model_S.load_state_dict(state_dict_S)
        if self.verbose:
            print("模型已加载")

        return tokenizer, model_S
    def __init__(self, mode='training'):
        self.mode = mode

        with open(dir_path + '/data/tag2idx.json', 'r') as f:
            self.tag2idx = json.load(f)

        self.idx2tag = dict(zip(self.tag2idx.values(), self.tag2idx.keys()))

        # load pretrained BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=False)

        # load BERT tokenizer with untokenizing frames
        never_split_tuple = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
        added_never_split = []
        added_never_split.append('<tgt>')
        added_never_split.append('</tgt>')
        added_never_split_tuple = tuple(added_never_split)
        never_split_tuple += added_never_split_tuple
        vocab_file_path = dir_path + '/data/bert-multilingual-cased-dict-add-frames'
        self.tokenizer_with_frame = BertTokenizer(
            vocab_file_path,
            do_lower_case=False,
            max_len=256,
            never_split=never_split_tuple)
 def __init__(self, config):
     self.juman_tokenizer = JumanTokenizer(config)
     self.rouge_calculator = RougeNCalc()
     self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                         do_lower_case=False, do_basic_tokenize=False)
     self.trim_input = 0
     self.trim_clss = 0
Example #8
 def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
     self.juman_tokenizer = JumanTokenizer()
     self.model = BertModel.from_pretrained(bert_path)
     self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                         do_lower_case=False,
                                         do_basic_tokenize=False)
     self.use_cuda = use_cuda
Example #9
    def load_macbert_model(self):
        parser = argparse.ArgumentParser()
        args = parser.parse_args()
        args.output_encoded_layers = True
        args.output_attention_layers = True
        args.output_att_score = True
        args.output_att_sum = True
        self.args = args
        # Parse the config file; the teacher's and student's vocab are the same
        self.vocab_file = "mac_bert_model/vocab.txt"
        # Here we use the teacher's config and the fine-tuned teacher model; they can be swapped for the student's config and the distilled student model
        # student config:  config/chinese_bert_config_L4t.json
        # distil student model:  distil_model/gs8316.pkl
        self.bert_config_file_S = "mac_bert_model/config.json"
        self.tuned_checkpoint_S = "trained_teacher_model/macbert_2290_cosmetics_weibo.pkl"
        # self.tuned_checkpoint_S = "trained_teacher_model/macbert_894_cosmetics.pkl"
        # self.tuned_checkpoint_S = "trained_teacher_model/macbert_teacher_max75len_5000.pkl"
        # Load the student config; verify the max sequence length is no larger than the one in our configuration
        bert_config_S = BertConfig.from_json_file(self.bert_config_file_S)

        # Load the tokenizer
        tokenizer = BertTokenizer(vocab_file=self.vocab_file)

        # Load the model
        model_S = BertSPCSimple(bert_config_S,
                                num_labels=self.num_labels,
                                args=self.args)
        state_dict_S = torch.load(self.tuned_checkpoint_S,
                                  map_location=self.device)
        model_S.load_state_dict(state_dict_S)
        if self.verbose:
            print("模型已加载")
        self.predict_tokenizer = tokenizer
        self.predict_model = model_S
        logger.info(f"macbert预测模型加载完成")
def get_bert(bert_type='bert'):
    tokenizer, model = None, None
    if (bert_type == 'bert'):
        ######## bert ###########

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        ########################

    if (bert_type == 'biobert'):
        #### Bio BERT #########

        model = bm.from_pretrained('biobert_v1.1_pubmed')
        tokenizer = BertTokenizer(vocab_file="biobert_v1.1_pubmed/vocab.txt",
                                  do_lower_case=True)

        #### Bio BERT #########

    if (bert_type == 'scibert'):
        #### sci bert #########

        config = AutoConfig.from_pretrained('allenai/scibert_scivocab_uncased',
                                            output_hidden_states=False)
        tokenizer = AutoTokenizer.from_pretrained(
            'allenai/scibert_scivocab_uncased')
        model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased',
                                          config=config)

        #######################

    return tokenizer, model
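A minimal usage sketch for get_bert, taking the plain 'bert' branch; it assumes the bert-base-uncased weights can be downloaded and that the model accepts a [batch, seq_len] tensor of token ids, while the exact structure of the outputs depends on which BERT library version the snippet targets.

import torch

tokenizer, model = get_bert('bert')
tokens = ['[CLS]'] + tokenizer.tokenize('aspirin inhibits platelet aggregation') + ['[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])  # shape [1, seq_len]
model.eval()
with torch.no_grad():
    outputs = model(input_ids)  # tuple/object layout depends on the library version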
Example #11
    def __init__(self, bert_path):
        vocab_file_name = 'vocab.txt'
        # Load Juman so that Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pretrained BERT model
        self.model = BertModel.from_pretrained(bert_path)
        # Load the pretrained BERT model's tokenizer
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False, do_basic_tokenize=False)
        self.vocab_size = len(self.bert_tokenizer.vocab)

        # Load the pretrained BERT MaskedLM model
        self.model = BertForMaskedLM.from_pretrained(bert_path)

        # Tokens to exclude, e.g. headers and punctuation-like symbols
        except_tokens = ["[MASK]",
                         # "[PAD]",
                         "[UNK]", "[CLS]", "[SEP]",
                         "(", ")", "・", "/", "、", "。", "!", "?", "「", "」", "…", "’", "』", "『", ":", "※"]
        self.except_ids = [self.bert_tokenizer.vocab[token] for token in except_tokens]

        # Use every id in vocab_size except those in except_ids
        self.candidate_ids = [i for i in range(self.vocab_size)
                              if i not in self.except_ids]
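A hedged sketch of how except_ids is usually applied when filling a [MASK] position: excluded vocabulary ids are masked out of the per-token scores before taking the argmax. The function name pick_replacement and the assumption that the MaskedLM forward returns scores of shape [batch, seq_len, vocab_size] are illustrative.

import torch

def pick_replacement(prediction_scores, mask_position, except_ids):
    # prediction_scores: [batch, seq_len, vocab_size] scores from a MaskedLM forward pass
    scores = prediction_scores[0, mask_position].clone()
    scores[except_ids] = float('-inf')  # never propose excluded tokens
    return int(scores.argmax())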
Example #12
 def __init__(self, data_path, config, add_cls=False, add_sep=False):
     self.config = config
     self.sents, self.tags = load_tsv(data_path,
                                      add_cls=add_cls,
                                      add_sep=add_sep)
     self.tokenizer = BertTokenizer(vocab_file=config.vocab_path,
                                    do_lower_case=False)
     self.tokenize()
Example #13
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']
        self.bert_model = 'PATH_TO_BERTJPN'

        self.cp = 'checkpoint/jp/cp_step_710000.pt'
        self.opt = 'checkpoint/jp/opt_step_710000.pt'
Example #14
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']
        self.bert_model = '/model/Japanese/bert/Japanese_L-12_H-768_A-12_E-30_BPE'

        self.cp = '/checkpoint/jp/cp_step_710000.pt'
        self.opt = '/checkpoint/jp/opt_step_710000.pt'
Example #15
    def load_train_model(self):
        """
        Initialize the model used for training
        :return:
        """
        parser = argparse.ArgumentParser()
        args = parser.parse_args()
        args.output_encoded_layers = True
        args.output_attention_layers = True
        args.output_att_score = True
        args.output_att_sum = True
        self.learning_rate = 2e-05
        # Warmup proportion for the learning rate
        self.warmup_proportion = 0.1
        self.num_train_epochs = 1
        # Learning rate scheduler to use
        self.schedule = 'slanted_triangular'
        self.s_opt1 = 30.0
        self.s_opt2 = 0.0
        self.s_opt3 = 1.0
        self.weight_decay_rate = 0.01
        # Save a checkpoint every this many epochs
        self.ckpt_frequency = 1
        # Where the model and logs are saved
        self.output_dir = "output_root_dir/train_api"
        # Gradient accumulation steps
        self.gradient_accumulation_steps = 1
        self.args = args
        # Parse the config file; the teacher's and student's vocab are the same
        self.vocab_file = "mac_bert_model/vocab.txt"
        self.bert_config_file_S = "mac_bert_model/config.json"
        self.tuned_checkpoint_S = "mac_bert_model/pytorch_model.bin"
        # Load the student config; verify the max sequence length is no larger than the one in our configuration
        bert_config_S = BertConfig.from_json_file(self.bert_config_file_S)

        # Load the tokenizer
        tokenizer = BertTokenizer(vocab_file=self.vocab_file)

        # Load the model
        model_S = BertSPCSimple(bert_config_S,
                                num_labels=self.num_labels,
                                args=self.args)
        state_dict_S = torch.load(self.tuned_checkpoint_S,
                                  map_location=self.device)
        state_weight = {
            k[5:]: v
            for k, v in state_dict_S.items() if k.startswith('bert.')
        }
        missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                       strict=False)
        # Verify that no parameters are missing
        assert len(missing_keys) == 0
        self.train_tokenizer = tokenizer
        self.train_model = model_S
        logger.info(f"训练模型{self.tuned_checkpoint_S}加载完成")
Example #16
 def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
     # Load Juman so that Japanese text can be fed to BERT
     self.juman_tokenizer = JumanTokenizer()
     # Load the pretrained BERT model
     self.model = BertModel.from_pretrained(bert_path)
     # Load the pretrained BERT model's tokenizer
     self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                         do_lower_case=False,
                                         do_basic_tokenize=False)
     # Flag for whether to use a CUDA GPU
     self.use_cuda = use_cuda
Example #17
    def __init__(self,
                 texts,
                 vocab_path,
                 max_seq_length=512,
                 vocab='finance-uncased'):
        self.texts = texts
        self.dict_labels = {'lower': 0, 'maintain': 1, 'raise': 2}

        self.max_seq_length = max_seq_length
        self.vocab = vocab
        if self.vocab == 'finance-uncased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=True,
                                           do_basic_tokenize=True)
Example #18
class Param:
    batch_size = 32
    lr = 1e-4
    n_epochs = 64
    p = 0.3
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    pad_token = 0
    cls_token = '[CLS]'
    sep_token = '[SEP]'
    vocab_path = '../biobert_v1.0_pubmed_pmc/vocab.txt'
    bert_config = '../biobert_v1.0_pubmed_pmc/bert_config.json'
    bert_weight = '../biobert_v1.0_pubmed_pmc/weight/pytorch_weight'
    sep_sent = True
    num_fold = 5
    tokenizer = BertTokenizer(vocab_file=vocab_path, do_lower_case=True)
def get_bert(bert_type='bert'):
    tokenizer, model = None, None
    if (bert_type == 'bert'):
        ######## bert ###########

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        ########################

    if (bert_type == 'biobert'):
        #### Bio BERT #########
        '''
        config = BertConfig(vocab_size_or_config_json_file="biobert_model/bert_config.json")

        model = BertModel(config)
        tmp_d = torch.load("biobert_model/pytorch_model.bin")
        state_dict = OrderedDict()
        for i in list(tmp_d.keys())[:199]:
            x = i
            if i.find('bert') > -1:
                x = '.'.join(i.split('.')[1:])
            state_dict[x] = tmp_d[i]
            '
        #print(state_dict)
        model.load_state_dict(tmp_d)
        tokenizer=BertTokenizer(vocab_file="biobert_model/vocab.txt", do_lower_case=True)
        '''

        model = bm.from_pretrained('biobert_v1.1_pubmed')
        tokenizer = BertTokenizer(vocab_file="biobert_v1.1_pubmed/vocab.txt",
                                  do_lower_case=True)

        #### Bio BERT #########

    if (bert_type == 'scibert'):
        #### sci bert #########

        config = AutoConfig.from_pretrained('allenai/scibert_scivocab_uncased',
                                            output_hidden_states=True)
        tokenizer = AutoTokenizer.from_pretrained(
            'allenai/scibert_scivocab_uncased')
        model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased',
                                          config=config)

        #######################

    return tokenizer, model
Example #20
def main():
    # Parse arguments
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")

    # Check the arguments and determine which device to use
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size

    # Load the student config; verify the max sequence length is no larger than the one in our configuration
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_S.max_position_embeddings

    # Prepare the task
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    # All labels
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Read the data
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)

    eval_dataset = load_and_cache_examples(args,
                                           args.task_name,
                                           tokenizer,
                                           evaluate=True)
    logger.info("评估数据集已加载")

    model_S = BertSPCSimple(bert_config_S, num_labels=num_labels, args=args)
    # Load the student model
    assert args.tuned_checkpoint_S is not None
    state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
    model_S.load_state_dict(state_dict_S)
    logger.info("Student模型已加载")

    # Run prediction
    res = predict(model_S, eval_dataset, args=args)
    print(res)
Example #21
    def load_predict_model(
            self,
            type,
            model_file="trained_teacher_model/components_albert.pkl"):
        """
        :param type: which kind of model to load, e.g. the component model or another aspect model
        :type type: one of "component", "effect", "fragrance", "pack", "skin", "promotion", "service", "price"; each loads a different model
        :param model_file:
        :type model_file:
        :return:
        :rtype:
        """
        parser = argparse.ArgumentParser()
        args = parser.parse_args()
        args.output_encoded_layers = True
        args.output_attention_layers = True
        args.output_att_score = True
        args.output_att_sum = True
        self.args = args
        # Parse the config file; the teacher's and student's vocab are the same
        self.vocab_file = "albert_model/vocab.txt"
        # Here we use the teacher's config and the fine-tuned teacher model; they can be swapped for the student's config and the distilled student model
        # student config:  config/chinese_bert_config_L4t.json
        # distil student model:  distil_model/gs8316.pkl
        self.bert_config_file_S = "albert_model/config.json"
        self.tuned_checkpoint_S = model_file
        # Load the student config; verify the max sequence length is no larger than the one in our configuration
        bert_config_S = AlbertConfig.from_json_file(self.bert_config_file_S)
        bert_config_S.num_labels = self.num_labels

        # Load the tokenizer
        tokenizer = BertTokenizer(vocab_file=self.vocab_file)

        # Load the model
        model_S = AlbertSPC(bert_config_S)
        state_dict_S = torch.load(self.tuned_checkpoint_S,
                                  map_location=self.device)
        model_S.load_state_dict(state_dict_S)
        if self.verbose:
            print("模型已加载")
        self.predict_tokenizer[type] = tokenizer
        self.predict_model[type] = model_S
        logger.info(f"预测模型{model_file}加载完成")
Example #22
    def __init__(self, vocab_type):
        
        self.VOCAB_DICT = {
            'bc5cdr': ('<PAD>', 'B-Chemical', 'O', 'B-Disease' , 'I-Disease', 'I-Chemical'),
            'i2b2' : ('<PAD>', 'B-treatment', 'B-test', 'B-problem', 'I-treatment', 'I-test', 'I-problem', 'O'),
            'relations' : ('<PAD>','TrCP', 'TeCP', 'TrWP', 'TeRP', 'PIP', 'TrAP', 'TrIP', 'TrNAP', 'None')
        }
        self.VOCAB = self.VOCAB_DICT[vocab_type]
        self.tag2idx = {v:k for k,v in enumerate(self.VOCAB)}
        self.idx2tag = {k:v for k,v in enumerate(self.VOCAB)}

        self.batch_size = 4
        self.relations_batch_size = 4
        self.lr = 0.0001
        self.n_epochs = 30 
        self.hidden_size = 384

        self.tokenizer = BertTokenizer(vocab_file=parameters.VOCAB_FILE, do_lower_case=False)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
Example #23
 def setupModel(self):
     """ Load the classification model.
     """
     num_labels = len(self.labels)
     vocab_type = "finance-uncased"
     self.max_seq_length = 256
     if torch.cuda.is_available():
         self.device = torch.device("cuda")
     else:
         self.device = torch.device("cpu")
     self.model = BertClassification(weight_path=self.model_weights_path,
                                     num_labels=num_labels,
                                     vocab=vocab_type)
     self.model.load_state_dict(
         torch.load(self.pretuned_modelfile, map_location=self.device))
     self.model.to(self.device)
     self.tokenizer = BertTokenizer(vocab_file=self.vocab_path,
                                    do_lower_case=True,
                                    do_basic_tokenize=True)
def convert_data2(path1, path2, max_length, number, seq1, seq2):
    """转ID,进行padding,再加上CLP、SEP之后"""
    tokenizer = BertTokenizer('./model/bert-base-chinese/vocab.txt')
    input_id = []
    input_mask = []
    segment_id = []
    # number = 0
    print(len(seq1))

    for i in range(number):
        tokens_a = tokenizer.tokenize(seq1[i])
        tokens_b = tokenizer.tokenize(seq2[i])
        # print(seq2[i])
        # print(tokens_b)
        while True:
            if (len(tokens_a) + len(tokens_b)) <= max_length - 3:
                break
            else:
                # print(tokens_b)
                # tokens_b.pop()
                tokens_a = tokens_a[: int((max_length - 3) * len(tokens_a)/(len(tokens_a) + len(tokens_b)))]
                tokens_b = tokens_b[: int((max_length - 3) * len(tokens_b)/(len(tokens_a) + len(tokens_b)))]
        # Add [CLS] at the start and [SEP] at the end
        tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
        tokens = tokens_a + tokens_b + ['[SEP]']
        input_id_ = tokenizer.convert_tokens_to_ids(tokens)
        segment_id_ = [0] * len(tokens_a) + [1] * (len(tokens_b) + 1)
        input_mask_ = [1] * len(tokens)
        # segment_id distinguishes tokens_a from tokens_b
        # input_mask distinguishes real tokens from padding
        padding_ = [0] * (max_length - len(tokens))
        # Every input fed to BERT must be padded to max_length
        input_id_ += padding_
        segment_id_ += padding_
        input_mask_ += padding_
        # Append each sentence to the lists, shape [sentence_num, max_length]
        input_id.append(input_id_)
        input_mask.append(input_mask_)
        segment_id.append(segment_id_)

    return input_id, input_mask, segment_id
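A hedged usage sketch for convert_data2; it assumes ./model/bert-base-chinese/vocab.txt is available locally, and since path1 and path2 are not used in the body shown, placeholders are passed. The example sentences are purely illustrative.

import torch

seq1 = ['今天天气很好', '他喜欢打篮球']
seq2 = ['天气不错', '他热爱运动']
input_id, input_mask, segment_id = convert_data2(
    None, None, max_length=32, number=len(seq1), seq1=seq1, seq2=seq2)
input_ids = torch.tensor(input_id)         # [sentence_num, max_length]
attention_mask = torch.tensor(input_mask)  # 1 for real tokens, 0 for padding
token_type_ids = torch.tensor(segment_id)  # 0 for the first sentence, 1 for the second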
    def __init__(self, **kwargs):
        """Loads a BertTokenizer using bert_pretrained_pytorch

        :param kwargs:
        """
        super(WordPieceVectorizer1D, self).__init__(kwargs.get('transform_fn'))
        from pytorch_pretrained_bert import BertTokenizer
        self.max_seen = 128
        handle = kwargs.get('embed_file')
        custom_vocab = kwargs.get('vocab_file')
        if custom_vocab is None:
            self.tokenizer = BertTokenizer.from_pretrained(handle,
                                                           do_lower_case=True)
        else:
            special_tokens = kwargs.get('special_tokens')
            never_split = ('[UNK]', '[SEP]', '[PAD]', '[CLS]',
                           '[MASK]') + special_tokens
            self.tokenizer = BertTokenizer(custom_vocab,
                                           do_basic_tokenize=True,
                                           never_split=never_split)
        self.mxlen = kwargs.get('mxlen', -1)
Example #26
def bert_pipeline(train_test, num_train_data, do_lower_case=False):
    # base & large share the same vocab
    if do_lower_case:
        vocab_file = find_data("uncased_L-12_H-768_A-12/vocab.txt")
    else:
        vocab_file = find_data("cased_L-12_H-768_A-12/vocab.txt")
    print("Tokenization ...")
    start_time = time.time()
    # vocab_file relative path
    tokenizer = BertTokenizer(vocab_file=vocab_file,
                              do_lower_case=do_lower_case,
                              max_len=MAX_LEN, do_basic_tokenize=False)
    ''' w2v_tokenize '''
    if do_lower_case:
        word_sequences = convert_lines(train_test['comment_text'].str.lower(), MAX_LEN, tokenizer)
    else:
        word_sequences = convert_lines(train_test['comment_text'], MAX_LEN, tokenizer)
    assert isinstance(word_sequences, np.ndarray)
    train_word_sequences = word_sequences[:num_train_data]
    test_word_sequences = word_sequences[num_train_data:]
    print("--- %s seconds ---" % (time.time() - start_time))
    print("=" * 50)

    ''' save_preprocess '''
    print("Saving Preprocessed data ...")
    start_time = time.time()
    if do_lower_case:
        np.save(find_data("bert_train_seq_uncased"), train_word_sequences)
        np.save(find_data("bert_test_seq_uncased"), test_word_sequences)
        save_bert_vocabulary(tokenizer, find_data("bert_vocab_uncased.txt"))
    else:
        np.save(find_data("bert_train_seq_cased"), train_word_sequences)
        np.save(find_data("bert_test_seq_cased"), test_word_sequences)
        save_bert_vocabulary(tokenizer, find_data("bert_vocab_cased.txt"))
    print("--- %s seconds ---" % (time.time() - start_time))
    print("=" * 50)
def main():
    #parse arguments
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    #set seeds
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    #arguments check
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size

    #load bert config
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_S.max_position_embeddings

    #Prepare GLUE task
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    # e.g. MNLI: ['contradiction', 'entailment', 'neutral']
    label_list = processor.get_labels()
    num_labels = len(label_list)

    #read data
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    # Load the datasets and compute the number of training steps
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        if args.aux_task_name:
            aux_train_dataset = load_and_cache_examples(args,
                                                        args.aux_task_name,
                                                        tokenizer,
                                                        evaluate=False,
                                                        is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset(
                [train_dataset, aux_train_dataset])
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    if args.do_predict:
        eval_datasets = []
        eval_task_names = ("mnli",
                           "mnli-mm") if args.task_name == "mnli" else (
                               args.task_name, )
        for eval_task in eval_task_names:
            eval_datasets.append(
                load_and_cache_examples(args,
                                        eval_task,
                                        tokenizer,
                                        evaluate=True))
    logger.info("数据集已加载")

    # Load and initialize the model; only the student model is used here, which effectively trains a teacher model on the MNLI data (a single model)
    model_S = BertForGLUESimple(bert_config_S,
                                num_labels=num_labels,
                                args=args)
    # Initialize the student model
    if args.load_model_type == 'bert':
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        if args.only_load_embedding:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items()
                if k.startswith('bert.embeddings')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            logger.info(f"Missing keys {list(missing_keys)}")
        else:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items() if k.startswith('bert.')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            assert len(missing_keys) == 0
        logger.info("Model loaded")
    elif args.load_model_type == 'all':
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
        logger.info("Model loaded")
    else:
        logger.info("Model is randomly initialized.")
    model_S.to(device)

    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            model_S = torch.nn.DataParallel(model_S)  #,output_device=n_gpu-1)

    if args.do_train:
        #parameters
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d",
                    len(all_trainable_params))
        # Optimizer setup
        optimizer = BERTAdam(all_trainable_params,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1,
                             s_opt2=args.s_opt2,
                             s_opt3=args.s_opt3)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Forward batch size = %d", forward_batch_size)
        logger.info("  Num backward steps = %d", num_train_steps)

        ########### DISTILLATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)

        # Run supervised training rather than distillation; this can be used to train the teacher model. Initialize the trainer
        distiller = BasicTrainer(train_config=train_config,
                                 model=model_S,
                                 adaptor=BertForGLUESimpleAdaptorTraining)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.forward_batch_size,
                                      drop_last=True)
        callback_func = partial(predict,
                                eval_datasets=eval_datasets,
                                args=args)
        with distiller:
            distiller.train(optimizer,
                            scheduler=None,
                            dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs,
                            callback=callback_func)

    if not args.do_train and args.do_predict:
        res = predict(model_S, eval_datasets, step=0, args=args)
        print(res)
    def __init__(self, mode='training', language='ko', version=1.0):
        version = str(version)
        self.mode = mode
        if language == 'en':
            data_path = dir_path + '/koreanframenet/resource/info/fn' + version + '_'
        else:
            data_path = dir_path + '/koreanframenet/resource/info/kfn' + version + '_'
        with open(data_path + 'lu2idx.json', 'r') as f:
            self.lu2idx = json.load(f)
        if version == '1.5':
            fname = dir_path + '/koreanframenet/resource/info/fn1.5_frame2idx.json'
        else:
            fname = dir_path + '/koreanframenet/resource/info/fn1.7_frame2idx.json'
        with open(fname, 'r') as f:
            #self.sense2idx = json.load(f)
            self.frame2idx = json.load(f)
        with open(data_path + 'lufrmap.json', 'r') as f:
            #self.lusensemap = json.load(f)
            self.lufrmap = json.load(f)
        with open(dir_path + '/koreanframenet/resource/info/fn1.7_fe2idx.json',
                  'r') as f:
            self.arg2idx = json.load(f)
        with open(
                dir_path + '/koreanframenet/resource/info/fn1.7_frargmap.json',
                'r') as f:
            self.frargmap = json.load(f)
        with open(
                dir_path +
                '/koreanframenet/resource/info/fn1.7_bio_fe2idx.json',
                'r') as f:
            self.bio_arg2idx = json.load(f)
        with open(
                dir_path +
                '/koreanframenet/resource/info/fn1.7_bio_frargmap.json',
                'r') as f:
            self.bio_frargmap = json.load(f)

        self.idx2frame = dict(
            zip(self.frame2idx.values(), self.frame2idx.keys()))
        self.idx2lu = dict(zip(self.lu2idx.values(), self.lu2idx.keys()))
        self.idx2arg = dict(zip(self.arg2idx.values(), self.arg2idx.keys()))
        self.idx2bio_arg = dict(
            zip(self.bio_arg2idx.values(), self.bio_arg2idx.keys()))

        # load pretrained BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=False)

        # load BERT tokenizer with untokenizing frames
        never_split_tuple = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
        added_never_split = []
        added_never_split.append('<tgt>')
        added_never_split.append('</tgt>')
        #         for frame in self.frame2idx:
        #             added_never_split.append('['+frame+']')
        added_never_split_tuple = tuple(added_never_split)
        never_split_tuple += added_never_split_tuple
        vocab_file_path = dir_path + '/data/bert-multilingual-cased-dict-add-frames'
        self.tokenizer_with_frame = BertTokenizer(
            vocab_file_path,
            do_lower_case=False,
            max_len=512,
            never_split=never_split_tuple)
Example #29
def main():
    #parse arguments
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    #set seeds
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    #arguments check
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size

    #load bert config
    bert_config_T = BertConfig.from_json_file(args.bert_config_file_T)
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_T.max_position_embeddings
    assert args.max_seq_length <= bert_config_S.max_position_embeddings

    #Prepare GLUE task
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    #read data
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    # Load the datasets
    if args.do_train:
        train_dataset, examples = load_and_cache_examples(args,
                                                          args.task_name,
                                                          tokenizer,
                                                          evaluate=False)
        if args.aux_task_name:
            aux_train_dataset, examples = load_and_cache_examples(
                args,
                args.aux_task_name,
                tokenizer,
                evaluate=False,
                is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset(
                [train_dataset, aux_train_dataset])
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    if args.do_predict:
        eval_datasets = []
        eval_task_names = ("mnli",
                           "mnli-mm") if args.task_name == "mnli" else (
                               args.task_name, )
        for eval_task in eval_task_names:
            eval_dataset, examples = load_and_cache_examples(args,
                                                             eval_task,
                                                             tokenizer,
                                                             evaluate=True)
            eval_datasets.append(eval_dataset)
    logger.info("数据集加载成功")

    # Load the models: both the teacher and the student
    model_T = BertForGLUESimple(bert_config_T,
                                num_labels=num_labels,
                                args=args)
    model_S = BertForGLUESimple(bert_config_S,
                                num_labels=num_labels,
                                args=args)
    # Load the teacher model parameters
    if args.tuned_checkpoint_T is not None:
        state_dict_T = torch.load(args.tuned_checkpoint_T, map_location='cpu')
        model_T.load_state_dict(state_dict_T)
        model_T.eval()
    else:
        assert args.do_predict is True
    #Load student
    if args.load_model_type == 'bert':
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        if args.only_load_embedding:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items()
                if k.startswith('bert.embeddings')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            logger.info(f"Missing keys {list(missing_keys)}")
        else:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items() if k.startswith('bert.')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            assert len(missing_keys) == 0
        logger.info("Model loaded")
    elif args.load_model_type == 'all':
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
        logger.info("Model loaded")
    else:
        logger.info("Student模型没有可加载参数,随机初始化参数 randomly initialized.")
    model_T.to(device)
    model_S.to(device)

    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            model_T = torch.nn.DataParallel(model_T)  #,output_device=n_gpu-1)
            model_S = torch.nn.DataParallel(model_S)  #,output_device=n_gpu-1)

    if args.do_train:
        #parameters
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d",
                    len(all_trainable_params))
        # Optimizer configuration
        optimizer = BERTAdam(all_trainable_params,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1,
                             s_opt2=args.s_opt2,
                             s_opt3=args.s_opt3)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Forward batch size = %d", forward_batch_size)
        logger.info("  Num backward steps = %d", num_train_steps)

        ########### DISTILLATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)
        # A set of fixed match configurations is defined in matches.py
        from matches import matches
        intermediate_matches = None
        if isinstance(args.matches, (list, tuple)):
            intermediate_matches = []
            for match in args.matches:
                intermediate_matches += matches[match]
        logger.info(f"中间层match信息: {intermediate_matches}")
        distill_config = DistillationConfig(
            temperature=args.temperature,
            intermediate_matches=intermediate_matches)

        logger.info(f"训练配置: {train_config}")
        logger.info(f"蒸馏配置: {distill_config}")
        adaptor_T = partial(BertForGLUESimpleAdaptor,
                            no_logits=args.no_logits,
                            no_mask=args.no_inputs_mask)
        adaptor_S = partial(BertForGLUESimpleAdaptor,
                            no_logits=args.no_logits,
                            no_mask=args.no_inputs_mask)
        # General distiller that supports matching intermediate states
        distiller = GeneralDistiller(train_config=train_config,
                                     distill_config=distill_config,
                                     model_T=model_T,
                                     model_S=model_S,
                                     adaptor_T=adaptor_T,
                                     adaptor_S=adaptor_S)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.forward_batch_size,
                                      drop_last=True)
        callback_func = partial(predict,
                                eval_datasets=eval_datasets,
                                args=args,
                                examples=examples)
        with distiller:
            distiller.train(optimizer,
                            scheduler=None,
                            dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs,
                            callback=callback_func)

    if not args.do_train and args.do_predict:
        res = predict(model_S,
                      eval_datasets,
                      step=0,
                      args=args,
                      examples=examples,
                      label_list=label_list)
        print(res)
Example #30
from pytorch_pretrained_bert import BertTokenizer, BertForQuestionAnswering, BertConfig

config_file = "../config/bert_base_config.json"
vocab_file = "../config/vocab.txt"
config = BertConfig(config_file)
model = BertForQuestionAnswering(config)

tokenizer = BertTokenizer(vocab_file)
print(tokenizer.vocab["i"])

for k, v in model.state_dict().items():
    print(k)
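The loop above only prints the parameter names of a randomly initialized model. A quick hedged extension, assuming ../config/vocab.txt is a standard WordPiece vocab, is to sanity-check the tokenizer and count the parameters:

tokens = tokenizer.tokenize("who wrote this paper ?")
input_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
print(tokens)
print(input_ids)

num_params = sum(p.numel() for p in model.parameters())
print(f"randomly initialized parameters: {num_params}")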