Example #1
    def __init__(self, charWindowSize, maxSentenceLenth, lr, epoch, batchSize,
                 emb_dim, hidden_dim, dropout, datatype, embedding_model_type,
                 old_or_new, is_noised, which_epoch_to_test):
        self.charWindowSize = charWindowSize  # character co-occurrence window size, i.e. how many characters before and after the current one are observed
        self.maxSentenceLenth = maxSentenceLenth  # maximum document length
        self.lr = lr  # learning rate
        self.epoch = epoch  # number of training epochs
        self.batchSize = batchSize  # number of samples fed in per batch
        self.emb_dir = emb_dim  # embedding dimension of each token
        self.hidden_dim = hidden_dim  # hidden-layer dimension, i.e. the number of memory cells in the first LSTM layer
        self.dropout = dropout
        self.bert_token = BertTokenizer.from_pretrained('bert-base-chinese')
        self.model_config = BertConfig.from_pretrained('bert-base-chinese')
        self.datatype = datatype  # dataset type
        self.embedding_model_type = embedding_model_type  # whether the pretrained embedding model is BERT or word2vec
        self.old_or_new = old_or_new
        self.is_noised = is_noised
        self.which_epoch_to_test = which_epoch_to_test

        # GPU information
        os.environ["CUDA_VISIBLE_DEVICES"] = '1'  # use GPU with ID=1

        if self.datatype == 'ccf':  # CCF dataset
            self.ctg_dic = [
                'O', 'B-position', 'M-position', 'E-position', 'B-name',
                'M-name', 'E-name', 'B-organization', 'M-organization',
                'E-organization', 'B-movie', 'M-movie', 'E-movie', 'B-email',
                'M-email', 'E-email', 'B-mobile', 'M-mobile', 'E-mobile',
                'B-company', 'M-company', 'E-company', 'B-book', 'M-book',
                'E-book', 'B-QQ', 'M-QQ', 'E-QQ', 'B-scene', 'M-scene',
                'E-scene', 'B-address', 'M-address', 'E-address', 'B-game',
                'M-game', 'E-game', 'B-government', 'M-government',
                'E-government', 'B-vx', 'M-vx', 'E-vx', 'H'
            ]
            self.category_num = 44
            if self.is_noised == 'noised':
                self.trainfilepath = r'./resource/' + self.old_or_new + '/ccf_14_noised_train.txt'  # path to the raw text
                self.testfilepath = r'./resource/' + self.old_or_new + '/ccf_14_noised_test.txt'  # path to the raw text
            else:
                self.trainfilepath = r'./resource/' + self.old_or_new + '/ccf_14_train.txt'  # path to the raw text
                self.testfilepath = r'./resource/' + self.old_or_new + '/ccf_14_test.txt'  # path to the raw text
            self.datapath = r'./cache/data/ccf/'  # path to the data fed into the model
            self.model_variable = r'../NER/cache/variable/ccf/'  # path where the model's trainable parameters are saved
        elif self.datatype == 'cluener':  # 10-class dataset
            self.ctg_dic = [
                'O', 'B-company', 'M-company', 'E-company', 'B-name', 'M-name',
                'E-name', 'B-email', 'M-email', 'E-email', 'B-mobile',
                'M-mobile', 'E-mobile', 'B-game', 'M-game', 'E-game', 'B-QQ',
                'M-QQ', 'E-QQ', 'B-organization', 'M-organization',
                'E-organization', 'B-movie', 'M-movie', 'E-movie',
                'B-position', 'M-position', 'E-position', 'B-address',
                'M-address', 'E-address', 'B-government', 'M-government',
                'E-government', 'B-scene', 'M-scene', 'E-scene', 'B-book',
                'M-book', 'E-book', 'H'
            ]
            self.category_num = 41
            if self.is_noised == 'noised':
                self.trainfilepath = r'./resource/' + self.old_or_new + '/cluener_10_noised_train.txt'  # path to the raw text
                self.testfilepath = r'./resource/' + self.old_or_new + '/cluener_10_noised_test.txt'  # path to the raw text
            else:
                self.trainfilepath = r'./resource/' + self.old_or_new + '/cluener_10_train.txt'  # path to the raw text
                self.testfilepath = r'./resource/' + self.old_or_new + '/cluener_10_test.txt'  # path to the raw text
            self.datapath = r'./cache/data/cluener/'  # path to the data fed into the model
            self.model_variable = r'../NER/cache/variable/cluener/'  # path where the model's trainable parameters are saved
        elif self.datatype == 'weibo':  # Weibo dataset
            self.ctg_dic = [
                'O', 'B-email', 'M-email', 'E-email', 'B-mobile', 'M-mobile',
                'E-mobile', 'B-QQ', 'M-QQ', 'E-QQ', 'B-GPE', 'M-GPE', 'E-GPE',
                'B-PER', 'M-PER', 'E-PER', 'B-ORG', 'M-ORG', 'E-ORG', 'B-LOC',
                'M-LOC', 'E-LOC', 'S-PER', 'S-GPE', 'S-LOC', 'H'
            ]
            self.category_num = 26
            if self.is_noised == 'noised':
                self.trainfilepath = r'./resource/' + self.old_or_new + '/weibo_4_noised_train.txt'  # path to the raw text
                self.testfilepath = r'./resource/' + self.old_or_new + '/weibo_4_noised_test.txt'  # path to the raw text
            else:
                self.trainfilepath = r'./resource/' + self.old_or_new + '/weibo_4_train.txt'  # path to the raw text
                self.testfilepath = r'./resource/' + self.old_or_new + '/weibo_4_test.txt'  # path to the raw text
            self.datapath = r'./cache/data/weibo/'  # path to the data fed into the model
            self.model_variable = r'./cache/variable/weibo/'  # path where the model's trainable parameters are saved
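        # In every branch above, category_num equals len(self.ctg_dic)
        # (44, 41, and 26 respectively), so, assuming nothing depends on the
        # hard-coded literal, it could be derived instead:
        # self.category_num = len(self.ctg_dic)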
Example #2
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn.functional as F
from utils import *
from tqdm import tqdm
from trainDataloader import BertSimDataset, BertEvalSimDataset, BertEvalSimWithLabelDataset, EnsembleEvalSimWithLabelDataset
from transformers import BertModel, BertConfig, BertTokenizer, BertForSequenceClassification

# %%
tokenizer = BertTokenizer.from_pretrained('./dataset/vocab')

eval_list = load_sim_dev('./dataset/101/c_dev_with_label')
myData_eval = EnsembleEvalSimWithLabelDataset(tokenizer, './dataset/std_data',
                                              50)

# %%
config = BertConfig.from_json_file('./dataset/bert_config.json')
config.num_labels = 2
model = BertForSequenceClassification.from_pretrained(
    './model/bert_pre58_3/pytorch_model.bin', config=config)


# %%
class SelfAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.projection = nn.Sequential(nn.Linear(hidden_dim, hidden_dim),
                                        nn.ReLU(True),
                                        nn.Linear(hidden_dim, hidden_dim))

    def forward(self, encoder_outputs):
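        # The snippet is truncated at this point. A plausible completion of an
        # additive self-attention pooling (a hedged sketch, not the original code):
        energy = self.projection(encoder_outputs)          # (batch, seq_len, hidden_dim)
        weights = F.softmax(energy.sum(dim=-1), dim=1)     # (batch, seq_len)
        outputs = (encoder_outputs * weights.unsqueeze(-1)).sum(dim=1)  # (batch, hidden_dim)
        return outputs, weights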
Example #3
 def __init__(self):
     super().__init__()
     config = BertConfig.from_pretrained("bert-base-uncased")
     self.model = BertModel(config)
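For contrast with the `from_pretrained` calls in the neighbouring examples: `BertModel(config)` above builds the architecture with randomly initialized weights, while `from_pretrained` loads trained weights. A minimal sketch of the difference (an illustration, not part of the original snippet):

from transformers import BertConfig, BertModel

config = BertConfig.from_pretrained("bert-base-uncased")
scratch_model = BertModel(config)                                  # random weights, config only
pretrained_model = BertModel.from_pretrained("bert-base-uncased")  # loads trained weights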
Example #4
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1,
                                             -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = (head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
                             )  # We can specify head_mask for each layer
            head_mask = head_mask.to(dtype=next(self.parameters(
            )).dtype)  # switch to float if needed + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers

        return input_ids, position_ids, token_type_ids, inputs_embeds, \
               extended_attention_mask, head_mask, encoder_hidden_states, encoder_extended_attention_mask


if __name__ == "__main__":
    config = BertConfig.from_pretrained('../data/bert_model/bert_config.json')
    tokenizer = BertTokenizer.from_pretrained('../data/bert_model/vocab.txt')
    bert = BertModel.from_pretrained(
        '../data/bert_model/chinese_wwm_pytorch.bin', config=config)
    model = SoftMaskedBert(bert, tokenizer, 2, 1, 'cpu')
    text = '中国的'
    token = tokenizer.tokenize(text)
    ids = tokenizer.convert_tokens_to_ids(token)
    ids = torch.Tensor([ids]).long()
    print(ids)
    input_mask = torch.tensor([[1, 1, 0]])
    segment_ids = torch.tensor([[0, 0, 0]])
    out = model(ids, input_mask, segment_ids)
    # out = bert(ids)
    print(out)
Example #5
def main(args, _=None):
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)

    checkpoint = utils.load_checkpoint(args.in_model)
    checkpoint = {"model_state_dict": checkpoint}
    utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            utils.tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            bert_output = model(**batch)
            mask = batch["attention_mask"].unsqueeze(-1) \
                if args.mask_for_max_length \
                else None
            features_ = utils.process_bert_output(
                bert_output=bert_output,
                hidden_size=model.config.hidden_size,
                output_hidden_states=model.config.output_hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for key, value in features_.items():
                    name_ = key if isinstance(key, str) else f"{key:02d}"
                    _, embedding_size = value.shape
                    features[name_] = np.memmap(
                        f"{args.out_prefix}.{name_}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            for key, value in features_.items():
                name_ = key if isinstance(key, str) else f"{key:02d}"
                features[name_][indices] = _detach(value)
Example #6
 def init_encoder(cls, args, dropout: float = 0.1):
     cfg = BertConfig.from_pretrained("bert-base-uncased")
     if dropout != 0:
         cfg.attention_probs_dropout_prob = dropout
         cfg.hidden_dropout_prob = dropout
     return cls.from_pretrained("bert-base-uncased", config=cfg)
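A hedged note on the pattern above: `BertConfig.from_pretrained` also accepts config attributes as keyword overrides, so when `dropout` is known to be non-zero the two explicit assignments can be folded into the loading call, for example:

cfg = BertConfig.from_pretrained(
    "bert-base-uncased",
    attention_probs_dropout_prob=dropout,  # overrides applied while the config is loaded
    hidden_dropout_prob=dropout,
)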
Example #7
def train(args):

    # device setting
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load model and tokenizer
    if args['model_name'] == "xlm-roberta-large":
        MODEL_NAME = "xlm-roberta-large"
        tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
        config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
        config.num_labels = args['num_labels']

        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=config)

    elif args['model_name'] == "roberta-base":
        MODEL_NAME = "roberta-base"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        config = RobertaConfig.from_pretrained(MODEL_NAME,
                                               output_hidden_states=True)
        config.num_labels = args['num_labels']

        model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME,
                                                                 config=config)

    elif args['model_name'] == "bert-base-multilingual-cased":
        MODEL_NAME = "bert-base-multilingual-cased"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        config = BertConfig.from_pretrained(MODEL_NAME)
        config.num_labels = args['num_labels']

        model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                              config=config)

    else:
        MODEL_NAME = args['model_name']
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        model = AutoModel.from_pretrained(MODEL_NAME)

    # if you use entity_token
    if args['entity_token']:
        special_tokens_dict = {
            'additional_special_tokens': ["#", "@", '₩', '^']
        }
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        model.resize_token_embeddings(len(tokenizer))

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    train_dataset, valid_dataset = train_test_split(
        dataset, test_size=0.1, random_state=args['random_seed'])
    train_label = train_dataset['label'].values
    valid_label = valid_dataset['label'].values

    # pororo ner
    ner = Pororo(task="ner", lang="ko")

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer, ner, args)
    tokenized_valid = tokenized_dataset(valid_dataset, tokenizer, ner, args)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_valid_dataset = RE_Dataset(tokenized_valid, valid_label)

    # update model setting

    model.to(device)

    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.

    print("use_trainer : ", args['use_trainer'])

    if args['use_trainer']:
        training_args = TrainingArguments(
            output_dir='./results',  # output directory
            save_total_limit=5,  # number of total save model.
            save_steps=500,  # model saving step.
            num_train_epochs=args['epochs'],  # total number of training epochs
            learning_rate=args['lr'],  # learning_rate
            per_device_train_batch_size=args[
                'train_batch_size'],  # batch size per device during training
            per_device_eval_batch_size=args[
                'eval_batch_size'],  # batch size for evaluation
            warmup_steps=args[
                'warmup_steps'],  # number of warmup steps for learning rate scheduler
            weight_decay=args['weight_decay'],  # strength of weight decay
            logging_dir='./logs',  # directory for storing logs
            logging_steps=args['logging_steps'],  # log saving step.
            label_smoothing_factor=args['label_smoothing_factor'],
            evaluation_strategy=
            'steps',  # evaluation strategy to adopt during training
            # `no`: No evaluation during training.
            # `steps`: Evaluate every `eval_steps`.
            # `epoch`: Evaluate every end of epoch.
            eval_steps=100,  # evaluation step.
        )
        trainer = Trainer(
            model=model,  # the instantiated 🤗 Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=RE_train_dataset,  # training dataset
            eval_dataset=RE_valid_dataset,  # evaluation dataset
            compute_metrics=compute_metrics  # define metrics function
        )

        # train model
        trainer.train()

    else:
        custom_trainer(model, device, RE_train_dataset, RE_valid_dataset, args)
Example #8
# 4
parser.add_argument('--resultpath', type=str, help='where to save the LM model')
args = parser.parse_args()

import pandas as pd
import regex as re



if args.LM == 'Bert':
    from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM

    config = BertConfig(vocab_size=28996,
                        max_position_embeddings=512,
                        num_attention_heads=12,
                        num_hidden_layers=12,
                        #type_vocab_size=2, default is 2
                        )
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16', config=config)
    #model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.

elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM

    config = RobertaConfig(vocab_size=50265,
                           max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,
Example #9
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
# from tensorflow.keras import models, layers, preprocessing as kprocessing
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
import streamlit as st
# from argparse import ArgumentParser
import lime
from lime.lime_text import LimeTextExplainer

MODELS = {
    "BERT": "model_noprocess.h5"
}
model_name = 'bert-base-uncased'

# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
repo_root = os.path.dirname(os.path.abspath(__file__))[:os.path.dirname(os.path.abspath(__file__)).find("Assignment_1")+13]
import_model = load_model(repo_root+"/models/model_noprocess.h5")
class_names = ['1', '2', '3', '4', '5']
explainer = LimeTextExplainer(class_names=class_names)
print(repo_root)
# Obtain the CSS for Buttons to be displayed
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def get_button_css(button_id):
    custom_css = f"""
        <style>
            #{button_id} {{
Example #10
     if line == '': break
     line = line.rstrip()
     label_dict[line] = line_id
     line_id += 1
     
 if 'albert' in args.model:
   model_type = 'albert'
   tokenizer = AlbertTokenizer(vocab_file = args.tokenizer)
   config = AlbertConfig.from_json_file(args.config)
   model = AlbertModel.from_pretrained(pretrained_model_name_or_path = None,
     config = config,
     state_dict = torch.load(args.model))
 elif 'bert' in args.model:
   model_type = 'bert'
   tokenizer = BertTokenizer(vocab_file = args.tokenizer)
   config = BertConfig.from_json_file(args.config)
   model = BertModel.from_pretrained(pretrained_model_name_or_path = None,
     config = config,
     state_dict = torch.load(args.model))
 elif 'electra' in args.model:
   model_type = 'electra'
   tokenizer = ElectraTokenizer(vocab_file = args.tokenizer)
   config = ElectraConfig.from_json_file(args.config)
   model = ElectraModel.from_pretrained(pretrained_model_name_or_path = None,
     config = config,
     state_dict = torch.load(args.model))
 else:
   raise NotImplementedError("The model is currently not supported")
 
 def process_line(line):
   data = json.loads(line)
Example #11
print('there are {} kinds of relation in train.'.format(len(set(train_label))))
print('there are {} kinds of relation in dev.'.format(len(set(val_label))))
print('there are {} kinds of relation in test.'.format(len(set(test_label))))
print('number of union of train and dev: {}'.format(len(set(train_label) & set(val_label))))
print('number of union of dev and test: {}'.format(len(set(val_label) & set(test_label))))
print('number of union of train and test: {}'.format(len(set(train_label) & set(test_label))))

property2idx, idx2property, pid2vec = data_helper.generate_attribute(train_label, val_label, test_label)

print(len(training_data))
print(len(dev_data))
print(len(test_data))

bertconfig = BertConfig.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad',
                                        num_labels=len(set(train_label)),
                                        finetuning_task='wiki-zero-shot')
bertconfig.relation_emb_dim = 1024
bertconfig.margin = args.gamma
bertconfig.alpha = args.alpha
bertconfig.dist_func = args.dist_func

model = ZSBert.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', config=bertconfig)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

trainset = data_helper.WikiDataset('train', training_data, pid2vec, property2idx)
trainloader = DataLoader(trainset, batch_size=args.batch_size, collate_fn=data_helper.create_mini_batch, shuffle=True)
Example #12
id_candidate_len_list = np.array(id_candidate_len_list)
sorted_index = np.argsort(id_candidate_len_list)
id_candidate_list_sorted = []
for i in range(len(id_candidate_list)):
    id_candidate_list_sorted.append(id_candidate_list[sorted_index[i]])

# hyperparameters
max_seq_len = 384
max_question_len = 64
learning_rate = 0.00005
batch_size = 960
num_epoch = 1

# build model
model_path = '../../huggingface_pretrained/bert-base-uncased/'
config = BertConfig.from_pretrained(model_path)
config.num_labels = 5
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
model = BertForQuestionAnswering.from_pretrained('weights/epoch1/',
                                                 config=config)

model.cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-5)
model, optimizer = amp.initialize(model,
                                  optimizer,
                                  opt_level="O1",
                                  verbosity=0)
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

# testing
Example #13
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        help="The name of the task for training.")
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--bert_model",
                        default="bert-base-uncased",
                        type=str,
                        help="student bert model configuration folder")
    parser.add_argument("--encoder_checkpoint",
                        default=None,
                        type=str,
                        help="check point for student encoder")
    parser.add_argument("--cls_checkpoint",
                        default=None,
                        type=str,
                        help="check point for student classifier")
    parser.add_argument("--alpha",
                        default=0.95,
                        type=float,
                        help="alpha for distillation")
    parser.add_argument("--T",
                        default=10.,
                        type=float,
                        help="temperature for distillation")
    parser.add_argument("--beta",
                        default=0.0,
                        type=float,
                        help="weight for AT loss")
    parser.add_argument("--fc_layer_idx",
                        default=None,
                        type=str,
                        help="layer ids we will put FC layers on")
    parser.add_argument("--normalize_patience",
                        default=False,
                        help="normalize patience or not")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="do training or not")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="do evaluation during training or not")

    parser.add_argument("--train_type", default="finetune_teacher",
                        choices=["finetune_teacher","train_student"],
                        help="choose which to train")
    parser.add_argument("--log_every_step",
                        default=50,
                        type=int,
                        help="output to log every X global training steps")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=1000,
                        help="Log every X updates steps.")
    parser.add_argument('--student_hidden_layers',
                        type=int,
                        default=12,
                        help="number of transformer layers for the student")
    parser.add_argument('--teacher_prediction',
                        type=str,
                        default=None,
                        help="teacher prediction file to guide the student's output")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    args = parser.parse_args()

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    logger.info('actual batch size on all GPU = %d' % args.train_batch_size)

    if args.train_type == 'finetune_teacher':
        args.student_hidden_layers = 12 if 'base' in args.bert_model else 24
        args.alpha = 0.0   # alpha = 0 is equivalent to fine-tuning for KD
    elif args.train_type == "train_student":
        args.student_hidden_layers = 6
        args.kd_model = "kd.cls"
        args.alpha = 0.7
        args.beta = 500
        args.T = 10
        args.fc_layer_idx = "1,3,5,7,9"   # this for pkd-skip
        args.normalize_patience = True
    else:
        raise ValueError("please pick train_type from finetune_teacher,train_student")

    if args.encoder_checkpoint is None:
        args.encoder_checkpoint = os.path.join(args.bert_model, 'pytorch_model.bin')
        logger.info('encoder checkpoint not provided, use pre-trained at %s instead' % args.encoder_checkpoint)

    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))


    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    #args.n_gpu = 1
    logger.info("device: {} n_gpu: {}".format(args.device, args.n_gpu))

    # set seed
    set_seed(args)

    # prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    args.num_labels = len(label_list)

    # prepare tokenizer and model
    config = BertConfig()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)

    config.output_hidden_states = True

    encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers)
    classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0)

    n_student_layer = len(encoder.bert.encoder.layer)
    encoder = load_model(encoder, args.encoder_checkpoint, args, 'student', verbose=True)
    logger.info('*' * 77)
    classifier = load_model(classifier, args.cls_checkpoint, args, 'classifier', verbose=True)


    n_param_student = count_parameters(encoder) + count_parameters(classifier)
    logger.info('number of layers in student model = %d' % n_student_layer)
    logger.info('num parameters in student model are %d' % n_param_student)

    # Training
    if args.do_train:
        read_set = 'train'
        if args.train_type == "train_student":
            assert args.teacher_prediction is not None
            assert args.alpha > 0
            logger.info('loading teacher\'s prediction')
            teacher_predictions = pickle.load(open(args.teacher_prediction, 'rb'))['train'] if args.teacher_prediction is not None else None
            logger.info('teacher acc = %.2f, teacher loss = %.5f' % (
            teacher_predictions['acc'] * 100, teacher_predictions['loss']))
            train_examples, train_dataloader, _ = get_task_dataloader(args, read_set, tokenizer,
                                                                      SequentialSampler,
                                                                      batch_size=args.train_batch_size,
                                                                      knowledge=teacher_predictions['pred_logit'],
                                                                      extra_knowledge=teacher_predictions[
                                                                          'feature_maps'])
        else:
            assert args.alpha == 0
            logger.info("running teacher fine-tuning")
            train_examples, train_dataloader, _ = get_task_dataloader(args, read_set, tokenizer,
                                                                      SequentialSampler,
                                                                      batch_size=args.train_batch_size)

        global_step, tr_loss = train(args, train_dataloader, encoder, classifier, tokenizer)
        #################
        # information of teacher model (like [CLS])
        #################
        if args.train_type == "finetune_teacher":
            all_res = {'train': None}

            encoder_file = os.path.join(args.output_dir,f'{args.train_type}_epoch{args.num_train_epochs-1}.encoder.pkl')
            cls_file = os.path.join(args.output_dir,f'{args.train_type}_epoch{args.num_train_epochs-1}.cls.pkl')
            print("encoder_file")

            encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers)
            classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0)

            encoder = load_model(encoder, encoder_file, args, 'exact', verbose=True)
            classifier = load_model(classifier, cls_file, args, 'exact', verbose=True)
            
            train_res = eval_model_dataloader(encoder, classifier, train_dataloader, args.device, detailed=True,
                                              verbose=False)
            all_res['train'] = train_res

            logger.info('saving teacher results')

            fname = os.path.join(args.output_dir,
                                 args.task_name + f'_teacher_{args.student_hidden_layers}layer_information.pkl')
            with open(fname, 'wb') as fp:
                pickle.dump(all_res, fp)

        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Evaluation
    if args.do_eval:


        test_examples, test_dataloader, test_label_ids = get_task_dataloader(args, 'dev', tokenizer,
                                                                             SequentialSampler,
                                                                             batch_size=args.eval_batch_size)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        result = evaluate(args, test_label_ids, encoder,classifier,test_dataloader)

        output_test_file = os.path.join(args.output_dir, "test_results_" + '.txt')
        with open(output_test_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    return
Example #14
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Process device: %s, n_gpu: %s, 16-bits training: %s",
        device,
        args.n_gpu,
        args.fp16,
    )

    config = BertConfig.from_pretrained('bert-base-uncased' if args.pretrained_model is None else args.pretrained_model,
        num_labels=args.num_labels,
    )
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased' if args.pretrained_model is None else args.pretrained_model,)
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased' if args.pretrained_model is None else args.pretrained_model,
        config=config,
    )

    model.to(args.device)

    writer = ResultWriter(args.experiments_dir)
    results = {}

    if args.do_train:
        train_dataset = IMDBDataset(args.train_file, tokenizer, args.train_max_len)
        valid_dataset = IMDBDataset(args.valid_file, tokenizer, args.train_max_len)
        unlabeled_dataset = None
Example #15
    use_edu = (unit == 'edu')
    unit_length_limit = args.unit_length_limit
    word_length_limit = args.word_length_limit
    batch_size = args.batch_size
    if not os.path.exists(hyp_path):
        os.makedirs(hyp_path)
    if not os.path.exists(ref_path):
        os.makedirs(ref_path)

    if torch.cuda.is_available():
        device = torch.device("cuda:%d" % (args.device))
        torch.cuda.set_device(device)
    else:
        device = torch.device("cpu")

    config = BertConfig.from_json_file(bert_config)

    print('load model')
    model = DiscoExtSumm(args, load_pretrained_bert=False,
                         bert_config=config).to(device)
    if torch.cuda.is_available():
        model = model.to(device)
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    model.eval()

    # train = SummarizationDataset(inputs_dir,is_test=False)
    # train_dataloader = SummarizationDataLoader(train,is_test=False,device=1,batch_size=8)
    # pos_weight = get_posweight(inputs_dir).to(device)
    pos_weight = torch.FloatTensor([10.11]).to(device)
    print('load data')
    # attention_folder = 'nyt/attn_map_nuc_norm'
Example #16
def get_kobert_config():
    return BertConfig.from_dict(kobert_config)
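`kobert_config` is defined elsewhere in that module. Purely for illustration (hypothetical values, not the actual KoBERT configuration), `BertConfig.from_dict` expects the usual BERT hyperparameter keys:

kobert_config = {
    "vocab_size": 8002,               # hypothetical values for illustration only
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
    "max_position_embeddings": 512,
}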
Example #17
    def __init__(self,
                 args,
                 device,
                 checkpoint=None,
                 bert_from_extractive=None):
        super(AbsSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

        if bert_from_extractive is not None:
            self.bert.model.load_state_dict(dict([
                (n[11:], p) for n, p in bert_from_extractive.items()
                if n.startswith('bert.model')
            ]),
                                            strict=True)

        if (args.encoder == 'baseline'):
            bert_config = BertConfig(
                self.bert.model.config.vocab_size,
                hidden_size=args.enc_hidden_size,
                num_hidden_layers=args.enc_layers,
                num_attention_heads=8,
                intermediate_size=args.enc_ff_size,
                hidden_dropout_prob=args.enc_dropout,
                attention_probs_dropout_prob=args.enc_dropout)
            self.bert.model = BertModel(bert_config)

        if (args.max_pos > 512):
            my_pos_embeddings = nn.Embedding(
                args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:
                                          512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[
                512:] = self.bert.model.embeddings.position_embeddings.weight.data[
                    -1][None, :].repeat(args.max_pos - 512, 1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
        self.vocab_size = self.bert.model.config.vocab_size
        tgt_embeddings = nn.Embedding(self.vocab_size,
                                      self.bert.model.config.hidden_size,
                                      padding_idx=0)
        if (self.args.share_emb):
            tgt_embeddings.weight = copy.deepcopy(
                self.bert.model.embeddings.word_embeddings.weight)

        self.decoder = TransformerDecoder(self.args.dec_layers,
                                          self.args.dec_hidden_size,
                                          heads=self.args.dec_heads,
                                          d_ff=self.args.dec_ff_size,
                                          dropout=self.args.dec_dropout,
                                          embeddings=tgt_embeddings)

        self.generator = get_generator(self.vocab_size,
                                       self.args.dec_hidden_size, device)
        self.generator[0].weight = self.decoder.embeddings.weight

        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            for module in self.decoder.modules():
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    module.weight.data.normal_(mean=0.0, std=0.02)
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            for p in self.generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
                else:
                    p.data.zero_()
            if (args.use_bert_emb):
                tgt_embeddings = nn.Embedding(
                    self.vocab_size,
                    self.bert.model.config.hidden_size,
                    padding_idx=0)
                tgt_embeddings.weight = copy.deepcopy(
                    self.bert.model.embeddings.word_embeddings.weight)
                self.decoder.embeddings = tgt_embeddings
                self.generator[0].weight = self.decoder.embeddings.weight

        self.to(device)
Example #18
import glob
import logging
import pickle
import re

from time import time
from torch import nn
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader, Dataset

from transformers import (BertConfig, BertForMaskedLM, BertTokenizer)

bert_layer = BertForMaskedLM.from_pretrained(
    'allenai/scibert_scivocab_uncased', output_hidden_states=True)
bert_config = BertConfig()


class LoadDataSet(Dataset):
    def __init__(self, filename, maxlen=64):
        self.df = pd.read_csv(filename, encoding='utf-8')
        self.tokenizer = BertTokenizer.from_pretrained(
            'allenai/scibert_scivocab_uncased')
        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        sentence = self.df.loc[index, 'Processed Text']
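        # The snippet is truncated here. A typical completion that tokenizes the
        # sentence to a fixed length (a hedged sketch, not the original code):
        encoded = self.tokenizer(
            sentence,
            padding="max_length",
            truncation=True,
            max_length=self.maxlen,
            return_tensors="pt",
        )
        return encoded["input_ids"].squeeze(0), encoded["attention_mask"].squeeze(0)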
Example #19
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

hvd.init()
if args.cuda:
    # Horovod: pin GPU to local rank.
    #print('local rank: ', hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())

cudnn.benchmark = True

DATAPATH='/datasets/shshi'
pretrained_path='%s/pretrained'%DATAPATH

if args.model == 'bert_base':
    config = BertConfig.from_json_file('bert_base_config.json')
else:
    config = BertConfig.from_json_file('bert_config.json')
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
    config.vocab_size += 8 - (config.vocab_size % 8)

vocab_size=config.vocab_size
#tokenizer = BertTokenizer.from_pretrained(pretrained_path)
#model = BertForPreTraining.from_pretrained(pretrained_path)
model = BertForPreTraining(config)

if args.cuda:
    model.cuda()

max_len = args.sentence_len
Example #20
def train(fold_all):
    # config = BertConfig.from_pretrained('../../model_lib/robert/pytorch/chinese_roberta_wwm_large_ext_pytorch/bert_config.json')
    # config = BertConfig.from_pretrained('../../model_lib/bert/pytorch/xs/bert_config.json')
    # bert-wwm-ext
    config = BertConfig.from_pretrained(
        '../../model_lib/bert/pytorch/bert-wwm-ext/bert_config.json')

    print('Starting training...')
    for fold_index in range(FOLD):
        # set fold parameter
        BEST_F1 = 0
        BEST_EPOCH = 0
        loss_list = []
        f1_list = []
        flag = 0

        print('Loading model...')
        if USE_GPU:
            model = BertForSequenceClassification.from_pretrained(
                '../../model_lib/bert/pytorch/bert-wwm-ext/',
                config=config).cuda()
            # model = BertForSequenceClassification.from_pretrained('../../model_lib/bert/pytorch/xs/', config=config).cuda()
        else:
            model = BertForSequenceClassification.from_pretrained(
                '../../model_lib/bert/pytorch/bert-wwm-ext/', config=config)
            # model = BertForSequenceClassification.from_pretrained('../../model_lib/bert/pytorch/xs/', config=config)
        optimizer = AdamW(model.parameters(), lr=LR, correct_bias=False)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=WARMUP_STEPS,
                                         t_total=T_TOTAL)  # T_TOTAL?

        # build the cross-validation datasets
        train_list = []
        for _ in range(5):
            if _ != fold_index:
                train_list = train_list + fold_all[_]
        dev_list = fold_all[fold_index]

        train_bert_list = utils.bert_input(train_list)
        dev_bert_list = utils.bert_input(dev_list)
        train_dataset = layers.Train_Dataset(train_bert_list)
        dev_dataset = layers.Train_Dataset(dev_bert_list)
        train_dataloader = DataLoader(dataset=train_dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=True)
        dev_dataloader = DataLoader(dataset=dev_dataset,
                                    batch_size=BATCH_SIZE,
                                    shuffle=False)

        for epoch in range(EPOCH):
            model.train()
            for text, label in train_dataloader:
                # convert text and label to tensors
                text = [sub_text.tolist() for sub_text in text]
                label = [int(sub_label) for sub_label in label]
                if USE_GPU:
                    text = torch.tensor(text).t().cuda()  # why the transpose?
                    label = torch.tensor(label).cuda()
                else:
                    text = torch.tensor(text).t()
                    label = torch.tensor(label)

                # feed into the model
                outputs = model(text, labels=label)
                loss, logits = outputs[:2]

                # optimization step
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()

                # store the per-batch f1 and loss
                f1 = utils.batch_f1(logits, label)
                f1_list.append(f1)
                loss_list.append(loss.item())
                flag += 1

                # report f1 and loss
                if flag % 200 == 0:
                    f1_mean = np.mean(f1_list)
                    loss_mean = np.mean(loss_list)
                    f1_list = []
                    loss_list = []
                    print('fold: {} | epoch: {} | f1: {} | loss: {}'.format(
                        fold_index, epoch, f1_mean, loss_mean))

            # validation set, evaluated once per epoch
            f1_val = val(model, dev_dataloader)

            print(
                '***********************************************************************'
            )
            print('fold: {} | epoch: {} | validation F1: {}'.format(
                fold_index, epoch, f1_val))
            if f1_val > BEST_F1:
                BEST_F1 = f1_val
                BEST_EPOCH = epoch
                # v1:0
                torch.save(
                    model, 'bert_wwm_ext_f5k_epoch2_lr1_ml84_bs24_' +
                    str(fold_index) + 'k_' + 'best_model.m')
                # torch.cuda.empty_cache()
            print('fold: {} | best validation F1: {}'.format(fold_index, BEST_F1))
            print('fold: {} | best validation epoch: {}'.format(fold_index, BEST_EPOCH))
            print(
                '***********************************************************************'
            )
Example #21
import pandas as pd
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, BertConfig
import datetime

device = torch.device('cuda')
test = pd.read_csv('../data/Dataset/test.csv')

model_path = '../user_data/pre-trained/chinese_roberta_wwm_large_ext_pytorch/'

test_category = test['category'].values
test_query1 = test['query1'].values
test_query2 = test['query2'].values

bert_config = BertConfig.from_pretrained(model_path + 'bert_config.json', output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(model_path + 'vocab.txt', config=bert_config)


class BertForClass(nn.Module):
    def __init__(self, n_classes=2):
        super(BertForClass, self).__init__()
        self.model_name = 'BertForClass'
        self.bert_model = BertModel.from_pretrained(model_path, config=bert_config)
        self.classifier = nn.Linear(bert_config.hidden_size * 2, n_classes)

    def forward(self, input_ids, input_masks, segment_ids):
        sequence_output, pooler_output, hidden_states = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids,
                                                                        attention_mask=input_masks)

        seq_avg = torch.mean(sequence_output, dim=1)
Example #22
def init_train_env(args, tbert_type):
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    # Set seed
    set_seed(args.seed, args.n_gpu)
    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    if tbert_type == 'twin' or tbert_type == "T":
        model = TBertT(BertConfig(), args.code_bert)
    elif tbert_type == 'siamese' or tbert_type == "I":
        model = TBertI(BertConfig(), args.code_bert)
    elif tbert_type == 'siamese2' or tbert_type == "I2":
        model = TBertI2(BertConfig(), args.code_bert)
    elif tbert_type == 'single' or tbert_type == "S":
        model = TBertS(BertConfig(), args.code_bert)
    else:
        raise Exception("TBERT type not found")
    args.tbert_type = tbert_type
    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    return model
Example #23
                'tok_to_orig_index': [s['tok_to_orig_index'] for s in samples],
                'para_offset': [s['para_offset'] for s in samples],
                "true_answers": [s['true_answers'] for s in samples],
                'net_input': net_input,
            }


if __name__ == "__main__":
    index_path = "retrieval/index_data/para_embed_3_28_c10000.npy"
    raw_data = "../data/nq-train.txt"

    from transformers import BertConfig, BertTokenizer
    from retrieval.retriever import BertForRetriever
    from config import get_args
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_config = BertConfig.from_pretrained('bert-base-uncased')
    args = get_args()
    retriever = BertForRetriever(bert_config, args)

    from utils import load_saved
    retriever_path = "retrieval/logs/splits_3_28_c10000-seed42-bsz640-fp16True-retrieve-from94_c1000_continue_from_failed-lr1e-05-bert-base-uncased-filterTrue/checkpoint_best.pt"
    retriever = load_saved(retriever, retriever_path)
    retriever.cuda()

    sampler = OnlineSampler(index_path, raw_data, tokenizer,
                            args.max_query_length, args.max_seq_length)

    sampler.shuffle()
    retriever.eval()
    for batch in sampler.load(retriever):
        if batch != {}:
Example #24
    downstream_model_dir="nlpbook/checkpoint-paircls",
    max_seq_length=64,
)


# %% load model
import torch
from transformers import BertConfig, BertForSequenceClassification

fine_tuned_model_ckpt = torch.load(
    args.downstream_model_checkpoint_fpath,
    map_location=torch.device("cuda")
)

pt_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels = fine_tuned_model_ckpt['state_dict']['model.classifier.bias'].shape.numel(),
)

model = BertForSequenceClassification(pt_model_config)
model.load_state_dict({k.replace("model.", ""):v for k,v in fine_tuned_model_ckpt['state_dict'].items()})
model.eval()

# %%
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name,
    do_lower_case=False,
)
# %%

def inference_fn(premise, hypothesis):
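    # The snippet is truncated here. A plausible rest of the inference function
    # (a hedged sketch, not the original code):
    inputs = tokenizer(
        premise,
        hypothesis,
        max_length=args.max_seq_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model(**inputs)
        probs = outputs[0].softmax(dim=-1)  # logits -> class probabilities
    return probs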
Example #25
    print('intent num:', len(intent_vocab))
    print('tag num:', len(tag_vocab))
    for data_key in ['val', 'test']:
        dataloader.load_data(
            json.load(
                open(os.path.join(data_dir, '{}_data.json'.format(data_key)))),
            data_key)
        print('{} set size: {}'.format(data_key,
                                       len(dataloader.data[data_key])))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    bert_config = BertConfig.from_pretrained(
        config['model']['pretrained_weights'])

    model = JointBERT(bert_config,
                      DEVICE,
                      dataloader.tag_dim,
                      dataloader.intent_dim,
                      context=config['model']['context'])
    # model.from_pretrained(os.path.join(output_dir, 'pytorch_model.bin'))
    model.load_state_dict(
        torch.load(os.path.join(output_dir, 'pytorch_model.bin'), DEVICE))
    model.to(DEVICE)
    model.eval()

    batch_size = config['model']['batch_size']

    for data_key in ['val', 'test']:
Example #26
def dual_bert():
    set_seed(33)

    opt = Adam(learning_rate=2e-5)

    id1 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    id2 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    mask1 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    mask2 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    atn1 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    atn2 = Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False  # Set to True to obtain hidden states
    bert_model1 = TFBertModel.from_pretrained('bert-base-uncased',
                                              config=config)
    bert_model2 = TFBertModel.from_pretrained('bert-base-uncased',
                                              config=config)

    embedding1 = bert_model1(id1, attention_mask=mask1, token_type_ids=atn1)[0]
    embedding2 = bert_model2(id2, attention_mask=mask2, token_type_ids=atn2)[0]

    embedding1 = keras.layers.Bidirectional(  # wrapping in Bidirectional makes this a bi-LSTM
        keras.layers.LSTM(  # this by itself is a unidirectional LSTM
            64,
            # weight initialization
            kernel_initializer='he_normal',
            # return the output for every token; if False, only the last one is returned
            return_sequences=True))(embedding1)
    embedding2 = keras.layers.Bidirectional(  # wrapping in Bidirectional makes this a bi-LSTM
        keras.layers.LSTM(  # this by itself is a unidirectional LSTM
            64,
            # weight initialization
            kernel_initializer='he_normal',
            # return the output for every token; if False, only the last one is returned
            return_sequences=True))(embedding2)
    #embedding1=list(embedding1)
    #embedding2 = list(embedding2)
    #embedding1 = GlobalAveragePooling1D()(embedding1)
    #embedding2 = GlobalAveragePooling1D()(embedding2)
    print("after pooling:", type(embedding1), type(embedding2), embedding1.shape,
          embedding2.shape)
    #embedding1= Attention(128)(embedding1)
    #embedding2 = Attention(128)(embedding2)
    x = Concatenate()([embedding1, embedding2])
    #print(x.shape, "shape:")
    x = keras.layers.Bidirectional(  # wrapping in Bidirectional makes this a bi-LSTM
        keras.layers.LSTM(  # this by itself is a unidirectional LSTM
            64,
            # weight initialization
            #kernel_initializer='he_normal',
            # return the output for every token; if False, only the last one is returned
            return_sequences=True))(x)
    print(x.shape)
    print(type(x))
    #print(x[:0:0].shape)
    #x = Lambda(lambda x: x[:,-1, :], name='CLS-token')(x)  # reduce dimensionality
    #x = Attention(128)(x)  # add attention

    x = attention(return_sequences=False)(x)

    print('succeeded')

    print(type(x))
    #print(x.shape)
    #x = Lambda(lambda x: x[:, 0], name='CLS-token')(x)  # reduce dimensionality
    #x1 = GlobalAveragePooling1D()(embedding1)
    #x2 = GlobalAveragePooling1D()(embedding2)

    #x = Concatenate()([x1, x2])

    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    #out = Dense(len(map_label), activation='softmax')(x)
    #x=tf.Tensor(x)
    #out=Dense(5, activation='sigmoid')(x)
    out = Dense(5, activation='softmax')(x)

    model = Model(inputs=[id1, mask1, atn1, id2, mask2, atn2], outputs=out)
    model.compile(loss='mean_squared_error',
                  optimizer=opt,
                  metrics=['accuracy'])  # add an evaluation metric

    return model
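

# A minimal usage sketch (an assumption, not part of the original example): build the dual-BERT
# model and run it on dummy inputs. The helper name `_demo_dual_bert` is just for illustration,
# and it assumes MAX_SEQUENCE_LENGTH, set_seed and the custom `attention` layer used above are
# defined elsewhere in this script.
def _demo_dual_bert():
    import numpy as np

    model = dual_bert()
    ids = np.random.randint(0, 30000, size=(2, MAX_SEQUENCE_LENGTH))  # dummy token ids
    mask = np.ones((2, MAX_SEQUENCE_LENGTH), dtype=np.int32)          # attention mask of all ones
    types = np.zeros((2, MAX_SEQUENCE_LENGTH), dtype=np.int32)        # single-segment token type ids
    # input order matches the Model definition: [id1, mask1, atn1, id2, mask2, atn2]
    preds = model.predict([ids, mask, types, ids, mask, types])
    print(preds.shape)  # (2, 5): one softmax distribution over 5 classes per pair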
Ejemplo n.º 27
0
        return param_group['lr']


def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)
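
# Usage example: metrics_to_string({'ACC': 0.912, 'F1': 0.881}) returns 'ACC:0.91 F1:0.88'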


# Set random seed
set_seed(26092020)

# Load tokenizer and config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = max(AspectBasedSentimentAnalysisProsaDataset.NUM_LABELS)
config.num_labels_list = AspectBasedSentimentAnalysisProsaDataset.NUM_LABELS
# Instantiate model
model = BertForMultiLabelClassification.from_pretrained(
    'indobenchmark/indobert-base-p1', config=config)
# model = fasttext.load_model("/Users/bobbyakyong/Projects/python/indonlu2/model/fasttext-cc-id/cc.id.300_no-oov_absa-prosa_uncased.txt")

model
print(model)
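
# `count_param` is not defined in this snippet; a minimal sketch of a parameter counter for a
# PyTorch module (an assumption, not necessarily the original implementation):
def count_param(module, trainable_only=False):
    # Sum the number of elements over all (optionally only trainable) parameters.
    return sum(p.numel() for p in module.parameters()
               if p.requires_grad or not trainable_only)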

print(count_param(model))
Ejemplo n.º 28
0
from serbianer.load_data.load_dataset import read_and_prepare_csv, SentenceGetter, bert_load_index_tags
from serbianer.vocab import Vocab
import matplotlib.pyplot as plt
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

configuration = BertConfig()  # default parameters and configuration for BERT


def create_model(model='bert-base-multilingual-cased'):
    ## BERT encoder
    encoder = TFBertModel.from_pretrained(model)

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)
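
    # A typical completion of this QA head (an assumption based on the standard Keras BERT
    # span-prediction pattern, not the original code):
    #
    #     end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    #     end_logits = layers.Flatten()(end_logits)
    #     start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    #     end_probs = layers.Activation(keras.activations.softmax)(end_logits)
    #     model = keras.Model(
    #         inputs=[input_ids, token_type_ids, attention_mask],
    #         outputs=[start_probs, end_probs],
    #     )
    #     model.compile(
    #         optimizer=keras.optimizers.Adam(learning_rate=5e-5),
    #         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    #     )
    #     return model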
Ejemplo n.º 29
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
    # behavior (see below)
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.

    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    # if model_args.config_name:
    #     config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    # elif model_args.model_name_or_path:
    #     config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    # else:
    #     config = CONFIG_MAPPING[model_args.model_type]()
    #     logger.warning("You are instantiating a new config instance from scratch.")
    pretrained_name = "bert-base-uncased"
    config = BertConfig.from_pretrained(pretrained_name)
    tokenizer = BertTokenizer.from_pretrained(pretrained_name)

    # if model_args.model_name_or_path:
    #     model = AutoModelForMaskedLM.from_pretrained(
    #         model_args.model_name_or_path,
    #         from_tf=bool(".ckpt" in model_args.model_name_or_path),
    #         config=config,
    #         cache_dir=model_args.cache_dir,
    #     )
    # else:
    #     logger.info("Training new model from scratch")
    #     model = AutoModelForMaskedLM.from_config(config)
    model = LitOutputBertModel(name="test",
                               config=config,
                               tokenizer=tokenizer,
                               pretrained_name=pretrained_name)
    model.bert.train()
    print("Requiring grads for bert")
    for param in model.bert.parameters(recurse=True):
        param.requires_grad = True
    for param in model.training_bert.parameters(recurse=True):
        param.requires_grad = True

    model.bert.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [
                line for line in examples["text"]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples["text"],
                padding=padding,
                truncation=True,
                max_length=data_args.max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name],
                             return_special_tokens_mask=True)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        if data_args.max_seq_length is None:
            max_seq_length = tokenizer.model_max_length
        else:
            if data_args.max_seq_length > tokenizer.model_max_length:
                logger.warning(
                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
                )
            max_seq_length = min(data_args.max_seq_length,
                                 tokenizer.model_max_length)

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could pad instead if the model supported it, and you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result
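
        # Worked example (illustrative): with max_seq_length = 512 and a batch whose concatenated
        # "input_ids" hold 1030 tokens, total_length is truncated to 1024, the result contains two
        # chunks of 512 tokens, and the trailing 6 tokens are dropped.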

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # This one will take care of randomly masking the tokens.
    tokenized_datasets["train"]._output_all_columns = True
    data_collator = CustomDataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path if
                      (model_args.model_name_or_path is not None
                       and os.path.isdir(model_args.model_name_or_path)) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_mlm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
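

# Conventional script entry point (an assumption; the original continuation is not shown here):
if __name__ == "__main__":
    main()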
Ejemplo n.º 30
0
    def __init__(self, device: str):
        self.device: str = device

        store_dataset_train = torch.load("model/dict_vocabs.pth",
                                         map_location=self.device)
        self.vocab_words = store_dataset_train["vocab_words"]
        self.vocab_pos_tags = store_dataset_train["vocab_pos_tags"]
        self.vocab_lemmas = store_dataset_train["vocab_lemmas"]
        self.vocab_predicates = store_dataset_train["vocab_predicates"]
        self.vocab_dependency_relations = store_dataset_train[
            "vocab_dependency_relations"]
        self.vocab_label = store_dataset_train["vocab_label"]
        net_configuration: dict = net_configurator(
            use_bert_embeddings=USE_BERT_EMBEDDINGS,
            use_crf=USE_CRF,
            use_biaffine_layer=USE_BIAFFINE_LAYER,
            use_pretrained=False,
            use_dependecy_heads=USE_DEPENDENCY_HEADS,
            use_predicates=False,
            use_syntagnet=USE_SYNTAGNET)

        # -- BERT --
        self.model_name: str = 'bert-base-cased'
        self.bert_config = BertConfig.from_pretrained(
            self.model_name, output_hidden_states=True)
        self.bert_tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = BertModel.from_pretrained(self.model_name,
                                                    config=self.bert_config)

        # Hyperparameters class
        @dataclass
        class HParams:
            label_vocabulary = self.vocab_label
            vocab_size_words: int = len(self.vocab_words)
            lstm_hidden_dim: int = 300
            embedding_dim_words: int = 300
            embedding_dim_lemmas: int = 300
            embedding_dim_relations: int = 300
            embedding_dim_predicates: int = 400
            embedding_dim_pos: int = 300
            gcn_output_dim: int = 143
            gcn_dropout_probability: float = 0.5
            gcn_hidden_dim: int = 250
            gcn_lstm_num_layers: int = 2
            bert_lstm_num_layers: int = 2
            bert_hidden_dim: int = self.bert_config.hidden_size
            num_classes: int = len(self.vocab_label)
            biaffine_lstm_num_layers: int = 2
            bidirectional: bool = True
            num_layers: int = 2
            dropout: float = 0.3
            lstm_dropout: float = 0.3
            vocab_size_pos_tags: int = len(self.vocab_pos_tags)
            vocab_size_lemmas: int = len(self.vocab_lemmas)
            vocab_size_dependency_relations: int = len(
                self.vocab_dependency_relations)
            vocab_size_predicates: int = len(self.vocab_predicates)
            device: str = self.device

        hyperparameters: HParams = HParams()

        self.net_configuration: dict = net_configuration
        model: SRL_final_MODEL = SRL_final_MODEL(
            hparams=hyperparameters,
            configurator=net_configuration).to(self.device)

        model.load_state_dict(
            torch.load('model/final_model_stored.pth',
                       map_location=self.device))
        self.model: SRL_final_MODEL = model
        self.model.eval()  # set the model to evaluation mode