def __init__(self, charWindowSize, maxSentenceLenth, lr, epoch, batchSize,
             emb_dim, hidden_dim, dropout, datatype, embedding_model_type,
             old_or_new, is_noised, which_epoch_to_test):
    """Store hyper-parameters and resolve a dataset's label set and file paths.

    Args:
        charWindowSize: Character co-occurrence window, i.e. how many
            characters before/after the current one are observed.
        maxSentenceLenth: Maximum length of a text.
        lr: Learning rate.
        epoch: Number of training epochs.
        batchSize: Number of samples fed to the model at once.
        emb_dim: Embedding dimension of each token.
        hidden_dim: Hidden size, i.e. the number of units of the first
            LSTM layer.
        dropout: Stored as-is.
        datatype: Dataset name; ``'ccf'``, ``'cluener'`` and ``'weibo'``
            are recognised.
        embedding_model_type: Whether the pretrained model is BERT or
            word2vec.
        old_or_new: Sub-directory of ``./resource/`` holding the corpus.
        is_noised: ``'noised'`` selects the ``*_noised_*`` corpus files,
            any other value selects the clean ones.
        which_epoch_to_test: Stored as-is.

    For an unrecognised ``datatype`` none of ``ctg_dic``, ``category_num``,
    ``trainfilepath``, ``testfilepath``, ``datapath`` or ``model_variable``
    is set.
    """
    self.charWindowSize = charWindowSize
    self.maxSentenceLenth = maxSentenceLenth
    self.lr = lr
    self.epoch = epoch
    self.batchSize = batchSize
    # ``emb_dir`` is a historical misspelling; it is kept so existing readers
    # keep working, and the correctly named ``emb_dim`` is provided as well.
    self.emb_dir = emb_dim
    self.emb_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.dropout = dropout
    self.bert_token = BertTokenizer.from_pretrained('bert-base-chinese')
    self.model_config = BertConfig.from_pretrained('bert-base-chinese')
    self.datatype = datatype
    self.embedding_model_type = embedding_model_type
    self.old_or_new = old_or_new
    self.is_noised = is_noised
    self.which_epoch_to_test = which_epoch_to_test

    # GPU information: only the device with ID 1 is made visible.
    os.environ["CUDA_VISIBLE_DEVICES"] = '1'

    # Per dataset: (entity types in label order, extra single-token tags,
    # corpus file stem, model-input data dir, trainable-variable save dir).
    dataset_specs = {
        'ccf': (  # CCF dataset
            ('position', 'name', 'organization', 'movie', 'email', 'mobile',
             'company', 'book', 'QQ', 'scene', 'address', 'game',
             'government', 'vx'),
            (),
            'ccf_14',
            r'./cache/data/ccf/',
            r'../NER/cache/variable/ccf/',
        ),
        'cluener': (  # 10-category dataset
            ('company', 'name', 'email', 'mobile', 'game', 'QQ',
             'organization', 'movie', 'position', 'address', 'government',
             'scene', 'book'),
            (),
            'cluener_10',
            r'./cache/data/cluener/',
            r'../NER/cache/variable/cluener/',
        ),
        'weibo': (  # Weibo dataset
            ('email', 'mobile', 'QQ', 'GPE', 'PER', 'ORG', 'LOC'),
            ('S-PER', 'S-GPE', 'S-LOC'),
            'weibo_4',
            r'./cache/data/weibo/',
            r'./cache/variable/weibo/',
        ),
    }
    spec = dataset_specs.get(self.datatype)
    if spec is None:
        return

    entities, single_tags, file_stem, self.datapath, self.model_variable = spec

    # Label order matters (index == class id): 'O', then B-/M-/E- for every
    # entity type in the listed order, then the single tags, then 'H'.
    self.ctg_dic = (
        ['O']
        + [prefix + '-' + entity
           for entity in entities
           for prefix in ('B', 'M', 'E')]
        + list(single_tags)
        + ['H']
    )
    # Derived from the list so the two can never disagree (44 / 41 / 26).
    self.category_num = len(self.ctg_dic)

    noise_tag = '_noised' if self.is_noised == 'noised' else ''
    resource_dir = r'./resource/' + self.old_or_new + '/'
    self.trainfilepath = resource_dir + file_stem + noise_tag + '_train.txt'  # raw training text
    self.testfilepath = resource_dir + file_stem + noise_tag + '_test.txt'  # raw test text
from torch.utils.data import TensorDataset, DataLoader, Dataset import torch.nn.functional as F from utils import * from tqdm import tqdm from trainDataloader import BertSimDataset, BertEvalSimDataset, BertEvalSimWithLabelDataset, EnsembleEvalSimWithLabelDataset from transformers import BertModel, BertConfig, BertTokenizer, BertForSequenceClassification # %% tokenizer = BertTokenizer.from_pretrained('./dataset/vocab') eval_list = load_sim_dev('./dataset/101/c_dev_with_label') myData_eval = EnsembleEvalSimWithLabelDataset(tokenizer, './dataset/std_data', 50) # %% config = BertConfig.from_json_file('./dataset/bert_config.json') config.num_labels = 2 model = BertForSequenceClassification.from_pretrained( './model/bert_pre58_3/pytorch_model.bin', config=config) # %% class SelfAttention(nn.Module): def __init__(self, hidden_dim): super().__init__() self.hidden_dim = hidden_dim self.projection = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU(True), nn.Linear(hidden_dim, hidden_dim)) def forward(self, encoder_outputs):
def __init__(self):
    """Create ``self.model`` as a ``BertModel`` built from a configuration.

    Only the ``bert-base-uncased`` configuration is fetched; the model is
    constructed from that configuration object, not through the model's
    own ``from_pretrained``.
    """
    super().__init__()
    self.model = BertModel(BertConfig.from_pretrained("bert-base-uncased"))
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = (head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) ) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters( )).dtype) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers return input_ids, position_ids, token_type_ids, inputs_embeds, \ extended_attention_mask, head_mask, encoder_hidden_states, encoder_extended_attention_mask if __name__ == "__main__": config = BertConfig.from_pretrained('../data/bert_model/bert_config.json') tokenizer = BertTokenizer.from_pretrained('../data/bert_model/vocab.txt') bert = BertModel.from_pretrained( '../data/bert_model/chinese_wwm_pytorch.bin', config=config) model = SoftMaskedBert(bert, tokenizer, 2, 1, 'cpu') text = '中国的' token = tokenizer.tokenize(text) ids = tokenizer.convert_tokens_to_ids(token) ids = torch.Tensor([ids]).long() print(ids) input_mask = torch.tensor([[1, 1, 0]]) segment_ids = torch.tensor([[0, 0, 0]]) out = model(ids, input_mask, segment_ids) # out = bert(ids) print(out)
def main(args, _=None):
    """Embed the texts of a CSV file with BERT and store them as memmaps.

    Rows with a missing ``args.txt_col`` are dropped and the remaining frame
    is written to ``{args.out_prefix}.df.csv``. Every feature returned by
    ``utils.process_bert_output`` gets its own float32 memmap
    ``{args.out_prefix}.{name}.npy`` of shape ``(num_samples, embedding_size)``,
    filled batch by batch.

    Args:
        args: Parsed command-line namespace.
        _: Unused.
    """
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    # Build the network from its config, then restore the stored weights.
    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)
    utils.unpack_checkpoint(
        checkpoint={"model_state_dict": utils.load_checkpoint(args.in_model)},
        model=model,
    )
    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    frame = pd.read_csv(args.in_csv).dropna(subset=[args.txt_col])
    frame.to_csv(f"{args.out_prefix}.df.csv", index=False)
    frame = frame.reset_index().drop("index", axis=1)
    records = list(frame.to_dict("index").values())
    num_samples = len(records)

    text_tokenizer = partial(
        utils.tokenize_text,
        strip=args.strip,
        lowercase=args.lowercase,
        remove_punctuation=args.remove_punctuation,
    )
    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=text_tokenizer,
        tokenizer=tokenizer,
        max_length=max_length,
    )
    dataloader = utils.get_loader(
        records,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    def storage_name(key):
        # String keys are used verbatim, other keys are zero-padded to 2 digits.
        return key if isinstance(key, str) else f"{key:02d}"

    features = {}
    batches = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(batches):
            batch = utils.any2device(batch, device)
            bert_output = model(**batch)

            mask = None
            if args.mask_for_max_length:
                mask = batch["attention_mask"].unsqueeze(-1)

            pooled = utils.process_bert_output(
                bert_output=bert_output,
                hidden_size=model.config.hidden_size,
                output_hidden_states=model.config.output_hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for key, value in pooled.items():
                    _, embedding_size = value.shape
                    name_ = storage_name(key)
                    features[name_] = np.memmap(
                        f"{args.out_prefix}.{name_}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            # Rows of this batch; the final batch may be shorter.
            first_row = idx * batch_size
            last_row = min(first_row + batch_size, num_samples)
            indices = np.arange(first_row, last_row)
            for key, value in pooled.items():
                features[storage_name(key)][indices] = _detach(value)
def init_encoder(cls, args, dropout: float = 0.1):
    """Instantiate ``cls`` from the ``bert-base-uncased`` checkpoint.

    Args:
        args: Not used by this method.
        dropout: When non-zero, replaces both the attention-probability and
            the hidden dropout probability of the configuration; when zero
            the configuration's own values are left untouched.

    Returns:
        The object returned by ``cls.from_pretrained``.
    """
    checkpoint = "bert-base-uncased"
    config = BertConfig.from_pretrained(checkpoint)
    if dropout != 0:
        for field in ("attention_probs_dropout_prob", "hidden_dropout_prob"):
            setattr(config, field, dropout)
    return cls.from_pretrained(checkpoint, config=config)
def train(args):
    """Load a pretrained model, prepare the train/validation data and train.

    Args:
        args: Mapping of run options. Keys read here: ``model_name``,
            ``num_labels``, ``entity_token``, ``random_seed``,
            ``use_trainer``, ``epochs``, ``lr``, ``train_batch_size``,
            ``eval_batch_size``, ``warmup_steps``, ``weight_decay``,
            ``logging_steps`` and ``label_smoothing_factor``. The whole
            mapping is also forwarded to ``tokenized_dataset`` and
            ``custom_trainer``.

    With ``args['use_trainer']`` truthy the Hugging Face ``Trainer`` is used,
    otherwise ``custom_trainer`` runs the loop.
    """
    # device setting
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load model and tokenizer
    model_name = args['model_name']
    if model_name == "xlm-roberta-large":
        tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        config = XLMRobertaConfig.from_pretrained(model_name)
        config.num_labels = args['num_labels']
        model = XLMRobertaForSequenceClassification.from_pretrained(
            model_name, config=config)
    elif model_name == "roberta-base":
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        config = RobertaConfig.from_pretrained(model_name,
                                               output_hidden_states=True)
        config.num_labels = args['num_labels']
        model = RobertaForSequenceClassification.from_pretrained(
            model_name, config=config)
    elif model_name == "bert-base-multilingual-cased":
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        config = BertConfig.from_pretrained(model_name)
        config.num_labels = args['num_labels']
        model = BertForSequenceClassification.from_pretrained(
            model_name, config=config)
    else:
        # NOTE(review): this fallback loads a plain ``AutoModel`` and never
        # applies ``args['num_labels']`` - confirm that is intended.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)

    # if you use entity_token
    if args['entity_token']:
        special_tokens_dict = {
            'additional_special_tokens': ["#", "@", '₩', '^']
        }
        tokenizer.add_special_tokens(special_tokens_dict)
        # The embedding matrix must match the tokenizer's current size.
        model.resize_token_embeddings(len(tokenizer))

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    train_dataset, valid_dataset = train_test_split(
        dataset, test_size=0.1, random_state=args['random_seed'])
    train_label = train_dataset['label'].values
    valid_label = valid_dataset['label'].values

    # pororo ner
    ner = Pororo(task="ner", lang="ko")

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer, ner, args)
    tokenized_valid = tokenized_dataset(valid_dataset, tokenizer, ner, args)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_valid_dataset = RE_Dataset(tokenized_valid, valid_label)

    # update model setting
    model.to(device)

    # There are many more options than the ones used here; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    print("use_trainer : ", args['use_trainer'])
    if args['use_trainer']:
        training_args = TrainingArguments(
            output_dir='./results',  # output directory
            save_total_limit=5,  # number of total save model.
            save_steps=500,  # model saving step.
            num_train_epochs=args['epochs'],  # total number of training epochs
            learning_rate=args['lr'],  # learning_rate
            per_device_train_batch_size=args[
                'train_batch_size'],  # batch size per device during training
            per_device_eval_batch_size=args[
                'eval_batch_size'],  # batch size for evaluation
            warmup_steps=args[
                'warmup_steps'],  # number of warmup steps for learning rate scheduler
            weight_decay=args['weight_decay'],  # strength of weight decay
            logging_dir='./logs',  # directory for storing logs
            logging_steps=args['logging_steps'],  # log saving step.
            label_smoothing_factor=args['label_smoothing_factor'],
            # evaluation strategy to adopt during training
            # `no`: No evaluation during training.
            # `steps`: Evaluate every `eval_steps`.
            # `epoch`: Evaluate every end of epoch.
            evaluation_strategy='steps',
            eval_steps=100,  # evaluation step.
        )
        trainer = Trainer(
            model=model,  # the instantiated 🤗 Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=RE_train_dataset,  # training dataset
            eval_dataset=RE_valid_dataset,  # evaluation dataset
            compute_metrics=compute_metrics  # define metrics function
        )

        # train model
        trainer.train()
    else:
        custom_trainer(model, device, RE_train_dataset, RE_valid_dataset,
                       args)
# 4 parser.add_argument('--resultpath', type=str, help='where to save the LM model') args = parser.parse_args() import pandas as pd import regex as re if args.LM == 'Bert': from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM config = BertConfig(vocab_size=28996, max_position_embeddings=512, num_attention_heads=12, num_hidden_layers=12, #type_vocab_size=2, default is 2 ) tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False) model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16', config=config) #model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config) # 12-layer, 768-hidden, 12-heads, 110M parameters. elif args.LM == 'RoBerta': from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM config = RobertaConfig(vocab_size=50265, max_position_embeddings=514, num_attention_heads=12, num_hidden_layers=12, type_vocab_size=1,
import tensorflow as tf from tensorflow.keras.utils import to_categorical # from tensorflow.keras import model`s, layers, preprocessing as kprocessing from transformers import TFBertModel, BertConfig, BertTokenizerFast import streamlit as st # from argparse import ArgumentParser import lime from lime.lime_text import LimeTextExplainer MODELS = { "BERT": "model_noprocess.h5" } model_name = 'bert-base-uncased' # Load transformers config and set output_hidden_states to False config = BertConfig.from_pretrained(model_name) config.output_hidden_states = False # Load BERT tokenizer tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config) repo_root = os.path.dirname(os.path.abspath(__file__))[:os.path.dirname(os.path.abspath(__file__)).find("Assignment_1")+13] import_model = load_model(repo_root+"/models/model_noprocess.h5") class_names = ['1', '2', '3', '4', '5'] explainer = LimeTextExplainer(class_names=class_names) print(repo_root) # Obtain the CSS for Buttons to be displayed @st.cache(suppress_st_warning=True, allow_output_mutation=True) def get_button_css(button_id): custom_css = f""" <style> #{button_id} {{
if line == '': break line = line.rstrip() label_dict[line] = line_id line_id += 1 if 'albert' in args.model: model_type = 'albert' tokenizer = AlbertTokenizer(vocab_file = args.tokenizer) config = AlbertConfig.from_json_file(args.config) model = AlbertModel.from_pretrained(pretrained_model_name_or_path = None, config = config, state_dict = torch.load(args.model)) elif 'bert' in args.model: model_type = 'bert' tokenizer = BertTokenizer(vocab_file = args.tokenizer) config = BertConfig.from_json_file(args.config) model = BertModel.from_pretrained(pretrained_model_name_or_path = None, config = config, state_dict = torch.load(args.model)) elif 'electra' in args.model: model_type = 'electra' tokenizer = ElectraTokenizer(vocab_file = args.tokenizer) config = ElectraConfig.from_json_file(args.config) model = ElectraModel.from_pretrained(pretrained_model_name_or_path = None, config = config, state_dict = torch.load(args.model)) else: raise NotImplementedError("The model is currently not supported") def process_line(line): data = json.loads(line)
# Distinct relation labels of each split, computed once and reused below.
train_relations = set(train_label)
val_relations = set(val_label)
test_relations = set(test_label)

print('there are {} kinds of relation in train.'.format(len(train_relations)))
print('there are {} kinds of relation in dev.'.format(len(val_relations)))
print('there are {} kinds of relation in test.'.format(len(test_relations)))
# `&` is set intersection, so these are the relations two splits share; the
# messages used to call this a "union", which the numbers never were.
print('number of relations shared by train and dev: {}'.format(len(train_relations & val_relations)))
print('number of relations shared by dev and test: {}'.format(len(val_relations & test_relations)))
print('number of relations shared by train and test: {}'.format(len(train_relations & test_relations)))

property2idx, idx2property, pid2vec = data_helper.generate_attribute(train_label, val_label, test_label)

print(len(training_data))
print(len(dev_data))
print(len(test_data))

# Start from the pretrained configuration and attach the extra fields that
# are set here from the command-line arguments.
bertconfig = BertConfig.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad',
                                        num_labels=len(train_relations),
                                        finetuning_task='wiki-zero-shot')
bertconfig.relation_emb_dim = 1024
bertconfig.margin = args.gamma
bertconfig.alpha = args.alpha
bertconfig.dist_func = args.dist_func

model = ZSBert.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', config=bertconfig)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

trainset = data_helper.WikiDataset('train', training_data, pid2vec, property2idx)
trainloader = DataLoader(trainset, batch_size=args.batch_size,
                         collate_fn=data_helper.create_mini_batch,
                         shuffle=True)
# Reorder the candidates by the ascending order of their entries in
# id_candidate_len_list (presumably candidate lengths - confirm upstream).
id_candidate_len_list = np.array(id_candidate_len_list)
sorted_index = np.argsort(id_candidate_len_list)
id_candidate_list_sorted = []
for i in range(len(id_candidate_list)):
    id_candidate_list_sorted.append(id_candidate_list[sorted_index[i]])

# hyperparameters
# NOTE(review): learning_rate is not what the optimizer below uses (it is
# built with lr=1e-5) - confirm which value is intended.
max_seq_len = 384
max_question_len = 64
learning_rate = 0.00005
batch_size = 960
num_epoch = 1

# build model
# Config and tokenizer come from the base checkpoint directory; the model
# weights are loaded from 'weights/epoch1/'.
model_path = '../../huggingface_pretrained/bert-base-uncased/'
config = BertConfig.from_pretrained(model_path)
config.num_labels = 5
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
model = BertForQuestionAnswering.from_pretrained('weights/epoch1/', config=config)
model.cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-5)
# amp.initialize is called before the optional DataParallel wrapping below.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

# testing
def main():
    """Command-line entry point: fine-tune the teacher or train the student.

    Parses the arguments, overrides the distillation settings according to
    ``--train_type``, builds the encoder/classifier pair and then

    * with ``--do_train``: trains; for ``finetune_teacher`` the last-epoch
      checkpoints are reloaded, evaluated on the training set and the
      result is pickled into the output directory;
    * with ``--do_eval``: evaluates on the ``dev`` set and writes
      ``test_results_.txt`` into the output directory.

    Raises:
        ValueError: for an unknown task, a non-empty output directory
            without ``--overwrite_output_dir``, or ``train_student`` without
            ``--teacher_prediction``.
    """
    parser = argparse.ArgumentParser()
    # Required: the task name is lower-cased and looked up further down.
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task for training.")
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="student bert model configuration folder")
    parser.add_argument("--encoder_checkpoint", default=None, type=str,
                        help="check point for student encoder")
    parser.add_argument("--cls_checkpoint", default=None, type=str,
                        help="check point for student classifier")
    parser.add_argument("--alpha", default=0.95, type=float,
                        help="alpha for distillation")
    parser.add_argument("--T", default=10., type=float,
                        help="temperature for distillation")
    parser.add_argument("--beta", default=0.0, type=float,
                        help="weight for AT loss")
    parser.add_argument("--fc_layer_idx", default=None, type=str,
                        help="layers ids we will put FC layers on")
    # NOTE(review): no type/action is given, so any value passed on the
    # command line arrives as a non-empty (truthy) string.
    parser.add_argument("--normalize_patience", default=False,
                        help="normalize patience or not")
    parser.add_argument("--do_train", action='store_true',
                        help="do training or not")
    parser.add_argument("--do_eval", action='store_true',
                        help="do evaluation during training or not")
    parser.add_argument("--train_type", default="finetune_teacher",
                        choices=["finetune_teacher", "train_student"],
                        help="choose which to train")
    parser.add_argument("--log_every_step", default=50, type=int,
                        help="output to log every global x training steps, default is 50")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=32, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--logging_steps', type=int, default=1000,
                        help="Log every X updates steps.")
    parser.add_argument('--student_hidden_layers', type=int, default=12,
                        help="number of transformer layers for student, default is 12")
    parser.add_argument('--teacher_prediction', type=str, default=None,
                        help="teacher prediction file to guild the student's output")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    args = parser.parse_args()

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    logger.info('actual batch size on all GPU = %d' % args.train_batch_size)

    # The training mode overrides several command-line values.
    if args.train_type == 'finetune_teacher':
        args.student_hidden_layers = 12 if 'base' in args.bert_model else 24
        args.alpha = 0.0  # alpha = 0 is equivalent to fine-tuning for KD
    elif args.train_type == "train_student":
        args.student_hidden_layers = 6
        args.kd_model = "kd.cls"
        args.alpha = 0.7
        args.beta = 500
        args.T = 10
        args.fc_layer_idx = "1,3,5,7,9"  # this for pkd-skip
        args.normalize_patience = True
    else:
        raise ValueError("please pick train_type from finetune_teacher,train_student")

    if args.encoder_checkpoint is None:
        args.encoder_checkpoint = os.path.join(args.bert_model, 'pytorch_model.bin')
        logger.info('encoder checkpoint not provided, use pre-trained at %s instead' % args.encoder_checkpoint)

    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    #args.n_gpu = 1
    logger.info("device: {} n_gpu: {}".format(args.device, args.n_gpu))

    # set seed
    set_seed(args)

    # prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    args.num_labels = len(label_list)

    # prepare tokenizer and model
    config = BertConfig()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
    config.output_hidden_states = True
    encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers)
    classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0)
    n_student_layer = len(encoder.bert.encoder.layer)
    encoder = load_model(encoder, args.encoder_checkpoint, args, 'student', verbose=True)
    logger.info('*' * 77)
    classifier = load_model(classifier, args.cls_checkpoint, args, 'classifier', verbose=True)
    n_param_student = count_parameters(encoder) + count_parameters(classifier)
    logger.info('number of layers in student model = %d' % n_student_layer)
    logger.info('num parameters in student model are %d' % n_param_student)

    # Training
    if args.do_train:
        read_set = 'train'
        if args.train_type == "train_student":
            # Explicit check instead of an assert, which `python -O` strips.
            if args.teacher_prediction is None:
                raise ValueError("--teacher_prediction is required when train_type is train_student")
            assert args.alpha > 0
            logger.info('loading teacher\'s predictoin')
            # NOTE: unpickling can run arbitrary code - only load trusted files.
            # The context manager closes the handle the old one-liner leaked.
            with open(args.teacher_prediction, 'rb') as fp:
                teacher_predictions = pickle.load(fp)['train']
            logger.info('teacher acc = %.2f, teacher loss = %.5f' % (
                teacher_predictions['acc'] * 100, teacher_predictions['loss']))
            train_examples, train_dataloader, _ = get_task_dataloader(
                args, read_set, tokenizer, SequentialSampler,
                batch_size=args.train_batch_size,
                knowledge=teacher_predictions['pred_logit'],
                extra_knowledge=teacher_predictions['feature_maps'])
        else:
            assert args.alpha == 0
            logger.info("runing teacher fine-tuning")
            train_examples, train_dataloader, _ = get_task_dataloader(
                args, read_set, tokenizer, SequentialSampler,
                batch_size=args.train_batch_size)

        global_step, tr_loss = train(args, train_dataloader, encoder, classifier, tokenizer)

        #################
        # information of teacher model (like [CLS])
        #################
        if args.train_type == "finetune_teacher":
            all_res = {'train': None}
            # Checkpoints of the final epoch (epochs are numbered from 0).
            encoder_file = os.path.join(args.output_dir, f'{args.train_type}_epoch{args.num_train_epochs-1}.encoder.pkl')
            cls_file = os.path.join(args.output_dir, f'{args.train_type}_epoch{args.num_train_epochs-1}.cls.pkl')
            # NOTE(review): prints the literal text, not the path - confirm intent.
            print("encoder_file")

            # Fresh modules, so the evaluation uses exactly the saved weights.
            encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers)
            classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0)
            encoder = load_model(encoder, encoder_file, args, 'exact', verbose=True)
            classifier = load_model(classifier, cls_file, args, 'exact', verbose=True)

            train_res = eval_model_dataloader(encoder, classifier, train_dataloader, args.device,
                                              detailed=True, verbose=False)
            all_res['train'] = train_res

            logger.info('saving teacher results')
            fname = os.path.join(args.output_dir,
                                 args.task_name + f'_teacher_{args.student_hidden_layers}layer_information.pkl')
            with open(fname, 'wb') as fp:
                pickle.dump(all_res, fp)

        # Both values only exist after training, hence inside this branch.
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Evaluation
    if args.do_eval:
        test_examples, test_dataloader, test_label_ids = get_task_dataloader(
            args, 'dev', tokenizer, SequentialSampler,
            batch_size=args.eval_batch_size)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(test_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)

        result = evaluate(args, test_label_ids, encoder, classifier, test_dataloader)

        output_test_file = os.path.join(args.output_dir, "test_results_" + '.txt')
        with open(output_test_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.warning(
    "Process device: %s, n_gpu: %s, 16-bits training: %s",
    device,
    args.n_gpu,
    args.fp16,
)

# Config, tokenizer and model all come from the same checkpoint: the one
# given on the command line, or 'bert-base-uncased' when none was given.
if args.pretrained_model is None:
    checkpoint_name = 'bert-base-uncased'
else:
    checkpoint_name = args.pretrained_model

config = BertConfig.from_pretrained(checkpoint_name, num_labels=args.num_labels)
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, config=config)
model.to(args.device)

writer = ResultWriter(args.experiments_dir)
results = {}

if args.do_train:
    # Training and validation sets share the same maximum length.
    train_dataset = IMDBDataset(args.train_file, tokenizer, args.train_max_len)
    valid_dataset = IMDBDataset(args.valid_file, tokenizer, args.train_max_len)
    unlabeled_dataset = None
use_edu = (unit == 'edu')
unit_length_limit = args.unit_length_limit
word_length_limit = args.word_length_limit
batch_size = args.batch_size

# exist_ok=True replaces the check-then-create pattern, which could fail if
# the directory appeared between the check and the creation.
os.makedirs(hyp_path, exist_ok=True)
os.makedirs(ref_path, exist_ok=True)

# Use the requested CUDA device when one is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:%d" % (args.device))
    torch.cuda.set_device(device)
else:
    device = torch.device("cpu")

config = BertConfig.from_json_file(bert_config)
print('load model')
# The model is moved to `device` once here; the second, conditional move the
# code used to perform afterwards was redundant.
model = DiscoExtSumm(args, load_pretrained_bert=False, bert_config=config).to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()

# train = SummarizationDataset(inputs_dir,is_test=False)
# train_dataloader = SummarizationDataLoader(train,is_test=False,device=1,batch_size=8)
# pos_weight = get_posweight(inputs_dir).to(device)
pos_weight = torch.FloatTensor([10.11]).to(device)

print('load data')
# attention_folder = 'nyt/attn_map_nuc_norm'
def get_kobert_config():
    """Return a ``BertConfig`` created from the ``kobert_config`` settings.

    ``kobert_config`` is defined outside this function and is handed to
    ``BertConfig.from_dict`` unchanged.
    """
    config = BertConfig.from_dict(kobert_config)
    return config
def __init__(self, args, device, checkpoint=None, bert_from_extractive=None):
    """Assemble the BERT encoder, the transformer decoder and the generator.

    Args:
        args: Option namespace; the attributes read here are ``large``,
            ``temp_dir``, ``finetune_bert``, ``encoder``, ``enc_*``,
            ``max_pos``, ``share_emb``, ``dec_*`` and ``use_bert_emb``.
        device: Device the finished module is moved to; also passed to
            ``get_generator``.
        checkpoint: Optional mapping whose ``'model'`` entry is loaded as
            the state dict of the whole module. When ``None`` the decoder
            and generator are initialised from scratch instead.
        bert_from_extractive: Optional state dict; its ``bert.model*``
            entries are loaded into the encoder.
    """
    super(AbsSummarizer, self).__init__()
    self.args = args
    self.device = device
    self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

    if bert_from_extractive is not None:
        # Keep only the encoder entries and drop their first 11 characters
        # (the length of 'bert.model.') so the keys match self.bert.model.
        self.bert.model.load_state_dict(dict([
            (n[11:], p) for n, p in bert_from_extractive.items()
            if n.startswith('bert.model')
        ]), strict=True)

    if (args.encoder == 'baseline'):
        # Replace the encoder by a BertModel built from a fresh configuration
        # that only reuses the vocabulary size of the loaded one.
        bert_config = BertConfig(
            self.bert.model.config.vocab_size,
            hidden_size=args.enc_hidden_size,
            num_hidden_layers=args.enc_layers,
            num_attention_heads=8,
            intermediate_size=args.enc_ff_size,
            hidden_dropout_prob=args.enc_dropout,
            attention_probs_dropout_prob=args.enc_dropout)
        self.bert.model = BertModel(bert_config)

    if (args.max_pos > 512):
        # Enlarge the position table: the first 512 rows are copied, every
        # further row is a copy of the last existing row.
        my_pos_embeddings = nn.Embedding(
            args.max_pos, self.bert.model.config.hidden_size)
        my_pos_embeddings.weight.data[:
                                      512] = self.bert.model.embeddings.position_embeddings.weight.data
        my_pos_embeddings.weight.data[
            512:] = self.bert.model.embeddings.position_embeddings.weight.data[
                -1][None, :].repeat(args.max_pos - 512, 1)
        self.bert.model.embeddings.position_embeddings = my_pos_embeddings
    self.vocab_size = self.bert.model.config.vocab_size
    tgt_embeddings = nn.Embedding(self.vocab_size,
                                  self.bert.model.config.hidden_size,
                                  padding_idx=0)
    if (self.args.share_emb):
        # A deep copy: the decoder starts from the encoder's word embeddings
        # but does not share the parameter object with it.
        tgt_embeddings.weight = copy.deepcopy(
            self.bert.model.embeddings.word_embeddings.weight)

    self.decoder = TransformerDecoder(self.args.dec_layers,
                                      self.args.dec_hidden_size,
                                      heads=self.args.dec_heads,
                                      d_ff=self.args.dec_ff_size,
                                      dropout=self.args.dec_dropout,
                                      embeddings=tgt_embeddings)

    self.generator = get_generator(self.vocab_size,
                                   self.args.dec_hidden_size, device)
    # Tie the first generator layer to the decoder embedding matrix.
    self.generator[0].weight = self.decoder.embeddings.weight

    if checkpoint is not None:
        self.load_state_dict(checkpoint['model'], strict=True)
    else:
        # No checkpoint: initialise decoder and generator parameters.
        for module in self.decoder.modules():
            if isinstance(module, (nn.Linear, nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        for p in self.generator.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
            else:
                p.data.zero_()
        if (args.use_bert_emb):
            # This runs after the random initialisation above, so the copied
            # BERT word embeddings are what the decoder ends up with; the
            # generator tie is renewed for the new embedding object.
            tgt_embeddings = nn.Embedding(
                self.vocab_size,
                self.bert.model.config.hidden_size,
                padding_idx=0)
            tgt_embeddings.weight = copy.deepcopy(
                self.bert.model.embeddings.word_embeddings.weight)
            self.decoder.embeddings = tgt_embeddings
            self.generator[0].weight = self.decoder.embeddings.weight

    self.to(device)
import glob
import logging
import pickle
import re
from time import time
from torch import nn
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader, Dataset
from transformers import (BertConfig, BertForMaskedLM, BertTokenizer)

# Module-level SciBERT masked-LM model, configured to also return the hidden
# states of every layer.
bert_layer = BertForMaskedLM.from_pretrained(
    'allenai/scibert_scivocab_uncased', output_hidden_states=True)
# A default BertConfig; it is not derived from the SciBERT checkpoint above.
bert_config = BertConfig()


class LoadDataSet(Dataset):
    """Dataset backed by a CSV file, paired with a SciBERT tokenizer."""

    def __init__(self, filename, maxlen=64):
        # NOTE(review): `pd` is not imported in the visible part of the file;
        # confirm pandas is imported elsewhere.
        self.df = pd.read_csv(filename, encoding='utf-8')
        self.tokenizer = BertTokenizer.from_pretrained(
            'allenai/scibert_scivocab_uncased')
        # Stored for later use; presumably the maximum token length per
        # sample — TODO confirm against the rest of __getitem__.
        self.maxlen = maxlen

    def __len__(self):
        # One sample per CSV row.
        return len(self.df)

    def __getitem__(self, index):
        # The raw text of a sample lives in the 'Processed Text' column.
        sentence = self.df.loc[index, 'Processed Text']
# Parse the command line; CUDA is used only when it is both allowed by the
# flags and available on this machine.
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

hvd.init()

if args.cuda:
    # Horovod: each process drives the GPU that matches its local rank.
    torch.cuda.set_device(hvd.local_rank())

cudnn.benchmark = True

DATAPATH = '/datasets/shshi'
pretrained_path = '{}/pretrained'.format(DATAPATH)

# Pick the JSON config that matches the requested model size.
config_file = ('bert_base_config.json'
               if args.model == 'bert_base' else 'bert_config.json')
config = BertConfig.from_json_file(config_file)

# Round the vocabulary size up to the next multiple of 8.
remainder = config.vocab_size % 8
if remainder:
    config.vocab_size += 8 - remainder
vocab_size = config.vocab_size

# The model is built from the config alone; `pretrained_path` is not used to
# load weights or a tokenizer here.
model = BertForPreTraining(config)
if args.cuda:
    model.cuda()

max_len = args.sentence_len
def train(fold_all):
    """Fine-tune a BERT sequence classifier with k-fold cross-validation.

    For each of the ``FOLD`` folds a fresh model is loaded, trained for
    ``EPOCH`` epochs on all other folds and validated on the held-out fold
    after every epoch. Whenever the validation F1 improves, the whole model
    object is written to disk with ``torch.save``.

    Args:
        fold_all: Sequence of folds; each fold is a collection of samples
            accepted by ``utils.bert_input``. Fold ``i`` is the validation
            set of round ``i`` and every other fold is used for training.
    """
    # Other configs tried previously: chinese_roberta_wwm_large_ext and
    # bert/pytorch/xs. The active one is bert-wwm-ext.
    config = BertConfig.from_pretrained(
        '../../model_lib/bert/pytorch/bert-wwm-ext/bert_config.json')
    print('开始训练...')
    for fold_index in range(FOLD):
        # Per-fold bookkeeping.
        best_f1 = 0
        best_epoch = 0
        loss_list = []
        f1_list = []
        step = 0  # batches seen in this fold (not reset between epochs)

        print('正在加载模型...')
        model = BertForSequenceClassification.from_pretrained(
            '../../model_lib/bert/pytorch/bert-wwm-ext/', config=config)
        if USE_GPU:
            model = model.cuda()

        optimizer = AdamW(model.parameters(), lr=LR, correct_bias=False)
        # TODO(review): the original author questioned whether T_TOTAL is
        # the right total step count — confirm.
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=WARMUP_STEPS,
                                         t_total=T_TOTAL)

        # Build the cross-validation split: every fold except the held-out
        # one goes into training. Iterating over `fold_all` itself avoids
        # the previously hard-coded assumption of exactly 5 folds.
        train_list = []
        for index, fold in enumerate(fold_all):
            if index != fold_index:
                train_list.extend(fold)
        dev_list = fold_all[fold_index]

        train_dataset = layers.Train_Dataset(utils.bert_input(train_list))
        dev_dataset = layers.Train_Dataset(utils.bert_input(dev_list))
        train_dataloader = DataLoader(dataset=train_dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=True)
        dev_dataloader = DataLoader(dataset=dev_dataset,
                                    batch_size=BATCH_SIZE,
                                    shuffle=False)

        for epoch in range(EPOCH):
            model.train()
            for text, label in train_dataloader:
                # Convert the collated batch to tensors.
                # NOTE(review): the transpose presumably undoes the default
                # collate layout (one tensor per token position) so that the
                # result is (batch, seq) — confirm against Train_Dataset.
                text = torch.tensor(
                    [sub_text.tolist() for sub_text in text]).t()
                label = torch.tensor([int(sub_label) for sub_label in label])
                if USE_GPU:
                    text = text.cuda()
                    label = label.cuda()

                # Forward pass; with labels the model returns (loss, logits).
                outputs = model(text, labels=label)
                loss, logits = outputs[:2]

                # Optimisation step.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()

                # Record per-batch F1 and loss.
                f1_list.append(utils.batch_f1(logits, label))
                loss_list.append(loss.item())
                step += 1

                # Every 200 batches, report and reset the running means.
                if step % 200 == 0:
                    f1_mean = np.mean(f1_list)
                    loss_mean = np.mean(loss_list)
                    f1_list = []
                    loss_list = []
                    print('fold: {} | epoch: {} | f1: {} | loss: {}'.format(
                        fold_index, epoch, f1_mean, loss_mean))

            # Validate once per epoch.
            f1_val = val(model, dev_dataloader)
            print(
                '***********************************************************************'
            )
            print('fold: {} | epoch: {} | 验证集F1值: {}'.format(
                fold_index, epoch, f1_val))
            if f1_val > best_f1:
                best_f1 = f1_val
                best_epoch = epoch
                # Keep only the best model of this fold.
                torch.save(
                    model, 'bert_wwm_ext_f5k_epoch2_lr1_ml84_bs24_' +
                    str(fold_index) + 'k_' + 'best_model.m')

        print('fold: {} | 验证集最优F1值: {}'.format(fold_index, best_f1))
        print('fold: {} | 验证集最优epoch: {}'.format(fold_index, best_epoch))
        print(
            '***********************************************************************'
        )
import pandas as pd
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, BertConfig
import datetime

# The script is hard-wired to run on CUDA.
device = torch.device('cuda')
test = pd.read_csv('../data/Dataset/test.csv')
model_path = '../user_data/pre-trained/chinese_roberta_wwm_large_ext_pytorch/'
# Pull the three test-set columns out as arrays.
test_category = test['category'].values
test_query1 = test['query1'].values
test_query2 = test['query2'].values
# output_hidden_states=True asks the encoder to also return per-layer hidden
# states (unpacked as the third value in BertForClass.forward).
bert_config = BertConfig.from_pretrained(model_path + 'bert_config.json',
                                         output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(model_path + 'vocab.txt',
                                          config=bert_config)


class BertForClass(nn.Module):
    """Pretrained BERT encoder with a linear classification head."""

    def __init__(self, n_classes=2):
        super(BertForClass, self).__init__()
        self.model_name = 'BertForClass'
        self.bert_model = BertModel.from_pretrained(model_path,
                                                    config=bert_config)
        # The head consumes twice the encoder hidden size; presumably two
        # pooled vectors are concatenated in forward — TODO confirm.
        self.classifier = nn.Linear(bert_config.hidden_size * 2, n_classes)

    def forward(self, input_ids, input_masks, segment_ids):
        # NOTE(review): the 3-way unpacking assumes the encoder returns a
        # plain tuple (sequence output, pooled output, hidden states);
        # confirm this holds for the installed transformers version.
        sequence_output, pooler_output, hidden_states = self.bert_model(
            input_ids=input_ids,
            token_type_ids=segment_ids,
            attention_mask=input_masks)
        # Average the sequence output over dim 1.
        seq_avg = torch.mean(sequence_output, dim=1)
def init_train_env(args, tbert_type):
    """Prepare device, logging and seed, then build the requested TBERT model.

    Side effects on ``args``: sets ``n_gpu``, ``device`` and ``tbert_type``.
    In distributed mode (``args.local_rank != -1``) the NCCL process group is
    initialised, and barriers make non-zero ranks wait while rank 0 builds
    the model first.

    Args:
        args: Parsed run arguments. Attributes read here: ``local_rank``,
            ``no_cuda``, ``fp16``, ``seed`` and ``code_bert``.
        tbert_type: Architecture selector: ``'twin'``/``'T'``,
            ``'siamese'``/``'I'``, ``'siamese2'``/``'I2'`` or
            ``'single'``/``'S'``.

    Returns:
        The constructed model, already moved to ``args.device``.

    Raises:
        ValueError: If ``tbert_type`` is not one of the supported selectors.
        ImportError: If ``args.fp16`` is set but apex is not installed.
    """
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args.seed, args.n_gpu)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab
        torch.distributed.barrier()

    if tbert_type in ('twin', 'T'):
        model = TBertT(BertConfig(), args.code_bert)
    elif tbert_type in ('siamese', 'I'):
        model = TBertI(BertConfig(), args.code_bert)
    elif tbert_type in ('siamese2', 'I2'):
        model = TBertI2(BertConfig(), args.code_bert)
    elif tbert_type in ('single', 'S'):
        model = TBertS(BertConfig(), args.code_bert)
    else:
        # ValueError is a subclass of Exception, so existing handlers that
        # catch Exception keep working; the message now names the bad value.
        raise ValueError(
            "TBERT type not found: {!r}".format(tbert_type))

    args.tbert_type = tbert_type
    if args.local_rank == 0:
        # Rank 0 is done building; release the ranks waiting at the barrier
        # above.
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16
    # execution of torch.einsum if args.fp16 is set. Otherwise it'll default
    # to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is
    # still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError as err:
            # Chain the original failure so the root cause stays visible.
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            ) from err
    return model
'tok_to_orig_index': [s['tok_to_orig_index'] for s in samples], 'para_offset': [s['para_offset'] for s in samples], "true_answers": [s['true_answers'] for s in samples], 'net_input': net_input, } if __name__ == "__main__": index_path = "retrieval/index_data/para_embed_3_28_c10000.npy" raw_data = "../data/nq-train.txt" from transformers import BertConfig, BertTokenizer from retrieval.retriever import BertForRetriever from config import get_args tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') bert_config = BertConfig.from_pretrained('bert-base-uncased') args = get_args() retriever = BertForRetriever(bert_config, args) from utils import load_saved retriever_path = "retrieval/logs/splits_3_28_c10000-seed42-bsz640-fp16True-retrieve-from94_c1000_continue_from_failed-lr1e-05-bert-base-uncased-filterTrue/checkpoint_best.pt" retriever = load_saved(retriever, retriever_path) retriever.cuda() sampler = OnlineSampler(index_path, raw_data, tokenizer, args.max_query_length, args.max_seq_length) sampler.shuffle() retriever.eval() for batch in sampler.load(retriever): if batch is not {}:
downstream_model_dir="nlpbook/checkpoint-paircls",
max_seq_length=64,
)
# %% load model
import torch
from transformers import BertConfig, BertForSequenceClassification
# The checkpoint tensors are mapped onto CUDA at load time.
fine_tuned_model_ckpt = torch.load(
    args.downstream_model_checkpoint_fpath,
    map_location=torch.device("cuda")
)
# The number of labels is taken from the size of the saved classifier bias.
pt_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels = fine_tuned_model_ckpt['state_dict']['model.classifier.bias'].shape.numel(),
)
model = BertForSequenceClassification(pt_model_config)
# Remove the "model." substring from the checkpoint keys before loading them
# into the bare classifier.
# NOTE(review): str.replace removes every occurrence, not just a leading
# prefix — confirm no key contains "model." elsewhere.
model.load_state_dict({k.replace("model.", ""):v for k,v in fine_tuned_model_ckpt['state_dict'].items()})
model.eval()
# %%
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name,
    do_lower_case=False,
)
# %%
def inference_fn(premise, hypothesis):
print('intent num:', len(intent_vocab)) print('tag num:', len(tag_vocab)) for data_key in ['val', 'test']: dataloader.load_data( json.load( open(os.path.join(data_dir, '{}_data.json'.format(data_key)))), data_key) print('{} set size: {}'.format(data_key, len(dataloader.data[data_key]))) if not os.path.exists(output_dir): os.makedirs(output_dir) if not os.path.exists(log_dir): os.makedirs(log_dir) bert_config = BertConfig.from_pretrained( config['model']['pretrained_weights']) model = JointBERT(bert_config, DEVICE, dataloader.tag_dim, dataloader.intent_dim, context=config['model']['context']) # model.from_pretrained(os.path.join(output_dir, 'pytorch_model.bin')) model.load_state_dict( torch.load(os.path.join(output_dir, 'pytorch_model.bin'), DEVICE)) model.to(DEVICE) model.eval() batch_size = config['model']['batch_size'] for data_key in ['val', 'test']:
def dual_bert():
    """Build and compile the two-tower BERT + BiLSTM + attention classifier.

    Two separately loaded ``bert-base-uncased`` encoders each embed one input
    sequence. Each encoder output goes through its own bidirectional LSTM,
    the two results are concatenated, passed through a third bidirectional
    LSTM and the project's ``attention`` layer, and finally classified by a
    dense head with 5 softmax outputs.

    Returns:
        The compiled Keras ``Model`` whose inputs are
        ``[id1, mask1, atn1, id2, mask2, atn2]``.
    """
    set_seed(33)
    opt = Adam(learning_rate=2e-5)

    def token_input():
        # One int32 input of fixed sequence length.
        return Input((MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    def bilstm(sequence, **lstm_kwargs):
        # Bidirectional wrapper around a 64-unit LSTM that returns the output
        # of every token (return_sequences=True), not just the last one.
        recurrent = keras.layers.LSTM(64,
                                      return_sequences=True,
                                      **lstm_kwargs)
        return keras.layers.Bidirectional(recurrent)(sequence)

    # Inputs are created in the order id1, id2, mask1, mask2, atn1, atn2.
    id1, id2, mask1, mask2, atn1, atn2 = (token_input() for _ in range(6))

    config = BertConfig()
    config.output_hidden_states = False  # Set to True to obtain hidden states

    bert_model1 = TFBertModel.from_pretrained('bert-base-uncased',
                                              config=config)
    bert_model2 = TFBertModel.from_pretrained('bert-base-uncased',
                                              config=config)

    # `mask*` feed the attention mask and `atn*` feed the token type ids.
    embedding1 = bert_model1(id1, attention_mask=mask1,
                             token_type_ids=atn1)[0]
    embedding2 = bert_model2(id2, attention_mask=mask2,
                             token_type_ids=atn2)[0]

    # Per-tower BiLSTM with He-normal kernel initialisation.
    embedding1 = bilstm(embedding1, kernel_initializer='he_normal')
    embedding2 = bilstm(embedding2, kernel_initializer='he_normal')

    print("池化后的情况", type(embedding1), type(embedding2), embedding1.shape,
          embedding2.shape)

    # Fuse both towers; the fusion BiLSTM keeps the default initialiser.
    x = Concatenate()([embedding1, embedding2])
    x = bilstm(x)
    print(x.shape)
    print(type(x))

    # Attention layer; return_sequences=False is passed to it.
    x = attention(return_sequences=False)(x)
    print('成功了')
    print(type(x))

    # Classification head.
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    out = Dense(5, activation='softmax')(x)

    model = Model(inputs=[id1, mask1, atn1, id2, mask2, atn2], outputs=out)
    model.compile(loss='mean_squared_error',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model
return param_group['lr'] def metrics_to_string(metric_dict): string_list = [] for key, value in metric_dict.items(): string_list.append('{}:{:.2f}'.format(key, value)) return ' '.join(string_list) # Set random seed set_seed(26092020) # tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1') config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1') config.num_labels = max(AspectBasedSentimentAnalysisProsaDataset.NUM_LABELS) config.num_labels_list = AspectBasedSentimentAnalysisProsaDataset.NUM_LABELS # # # # # Instantiate model model = BertForMultiLabelClassification.from_pretrained( 'indobenchmark/indobert-base-p1', config=config) # model = fasttext.load_model("/Users/bobbyakyong/Projects/python/indonlu2/model/fasttext-cc-id/cc.id.300_no-oov_absa-prosa_uncased.txt") model print(model) print(count_param(model))
from serbianer.load_data.load_dataset import read_and_prepare_csv, SentenceGetter, bert_load_index_tags
from serbianer.vocab import Vocab
import matplotlib.pyplot as plt
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

configuration = BertConfig()  # default parameters and configuration for BERT


def create_model(model='bert-base-multilingual-cased'):
    """Wire Keras inputs through a pretrained TF BERT encoder.

    Args:
        model: Name or path handed to ``TFBertModel.from_pretrained``.
    """
    ## BERT encoder
    encoder = TFBertModel.from_pretrained(model)
    ## QA Model
    # NOTE(review): `max_len` is not defined in the visible part of the
    # file; confirm it is a module-level constant.
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # Index 0 of the encoder output is used as the embedding.
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]
    # A single-unit, bias-free dense layer followed by Flatten produces the
    # start logits.
    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)
def main():
    """Run masked-language-model training and/or evaluation.

    Arguments come either from a single JSON file given on the command line
    or from regular command-line flags. The datasets are loaded, tokenised
    (line by line, or concatenated and chunked), and handed to a ``Trainer``
    wrapping a ``LitOutputBertModel`` built on ``bert-base-uncased``.

    Returns:
        dict: Evaluation results; contains ``"perplexity"`` when
        ``training_args.do_eval`` is set, otherwise empty.

    Raises:
        ValueError: If the output directory already exists, is not empty,
            training is requested and overwriting was not allowed.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = (
            parser.parse_args_into_dataclasses())

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process
    # only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training
    # and evaluation files (see below) or just provide the name of one of the
    # public datasets available on the hub at
    # https://huggingface.co/datasets/ (the dataset will be downloaded
    # automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or
    # the first column.
    #
    # In distributed training, the load_dataset function guarantees that only
    # one local process can concurrently download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            # No validation split: carve the first N% off the train split.
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # Infer the loader from whichever file was given, so an
        # evaluation-only run (no train file) does not crash on None.
        reference_file = (data_args.train_file
                          if data_args.train_file is not None else
                          data_args.validation_file)
        extension = reference_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from
    # files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer.
    # This script uses a fixed bert-base-uncased config/tokenizer and the
    # project's LitOutputBertModel; `model_args` is only consulted later for
    # the optional resume path.
    pretrained_name = "bert-base-uncased"
    config = BertConfig.from_pretrained(pretrained_name)
    tokenizer = BertTokenizer.from_pretrained(pretrained_name)

    model = LitOutputBertModel(name="test",
                               config=config,
                               tokenizer=tokenizer,
                               pretrained_name=pretrained_name)
    model.bert.train()
    print("Requiring grads for bert")
    # Make every parameter of both BERT sub-modules trainable.
    for param in model.bert.parameters(recurse=True):
        param.requires_grad = True
    for param in model.training_bert.parameters(recurse=True):
        param.requires_grad = True

    model.bert.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines. The detected text column is used (rather
            # than a hard-coded "text") so datasets whose first column has a
            # different name work too.
            examples[text_column_name] = [
                line for line in examples[text_column_name]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=data_args.max_seq_length,
                # We use this option because DataCollatorForLanguageModeling
                # (see below) is more efficient when it receives the
                # `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together
        # before splitting them in smaller parts. We use
        # `return_special_tokens_mask=True` because
        # DataCollatorForLanguageModeling (see below) is more efficient when
        # it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name],
                             return_special_tokens_mask=True)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        if data_args.max_seq_length is None:
            max_seq_length = tokenizer.model_max_length
        else:
            if data_args.max_seq_length > tokenizer.model_max_length:
                logger.warning(
                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
                )
            max_seq_length = min(data_args.max_seq_length,
                                 tokenizer.model_max_length)

        # Main data processing function that will concatenate all texts from
        # our dataset and generate chunks of max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(
                examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model
            # supported it instead of this drop.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts
        # together, so group_texts throws away a remainder for each of those
        # groups of 1,000 texts. You can adjust that batch_size here but a
        # higher value might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the
        # documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # This one will take care of randomly masking the tokens.
    # NOTE(review): this sets a private attribute of the train split and
    # requires a "train" split to exist — confirm for eval-only runs.
    tokenized_datasets["train"]._output_all_columns = True
    data_collator = CustomDataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Resume from `model_name_or_path` only when it is a local directory.
        model_path = (model_args.model_name_or_path if
                      (model_args.model_name_or_path is not None and
                       os.path.isdir(model_args.model_name_or_path)) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_mlm.txt")
        # Only the main process writes the results file.
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
def __init__(self, device: str):
    """Load the stored vocabularies, BERT components and trained SRL model.

    Args:
        device: Device string used as ``map_location`` when loading the
            stored files and as the target of the model's ``.to`` call.
    """
    self.device: str = device

    # Restore every vocabulary saved with the training dataset.
    vocab_store = torch.load("model/dict_vocabs.pth",
                             map_location=self.device)
    for vocab_name in ("vocab_words", "vocab_pos_tags", "vocab_lemmas",
                       "vocab_predicates", "vocab_dependency_relations",
                       "vocab_label"):
        setattr(self, vocab_name, vocab_store[vocab_name])

    # Feature switches of the network, driven by module-level flags.
    net_configuration: dict = net_configurator(
        use_bert_embeddings=USE_BERT_EMBEDDINGS,
        use_crf=USE_CRF,
        use_biaffine_layer=USE_BIAFFINE_LAYER,
        use_pretrained=False,
        use_dependecy_heads=USE_DEPENDENCY_HEADS,
        use_predicates=False,
        use_syntagnet=USE_SYNTAGNET)

    # -- BERT --
    self.model_name: str = 'bert-base-cased'
    self.bert_config = BertConfig.from_pretrained(self.model_name,
                                                  output_hidden_states=True)
    self.bert_tokenizer = BertTokenizer.from_pretrained(self.model_name)
    self.bert_model = BertModel.from_pretrained(self.model_name,
                                                config=self.bert_config)

    # Hyperparameters container. The defaults are evaluated once, here,
    # from the vocabularies and BERT config loaded above.
    @dataclass
    class HParams:
        label_vocabulary = self.vocab_label
        vocab_size_words: int = len(self.vocab_words)
        lstm_hidden_dim: int = 300
        embedding_dim_words: int = 300
        embedding_dim_lemmas: int = 300
        embedding_dim_relations: int = 300
        embedding_dim_predicates: int = 400
        embedding_dim_pos: int = 300
        gcn_output_dim: int = 143
        gcn_dropout_probability: float = 0.5
        gcn_hidden_dim: int = 250
        gcn_lstm_num_layers: int = 2
        bert_lstm_num_layers: int = 2
        bert_hidden_dim: int = self.bert_config.hidden_size
        num_classes: int = len(self.vocab_label)
        biaffine_lstm_num_layers: int = 2
        bidirectional: bool = True
        num_layers: int = 2
        dropout: float = 0.3
        lstm_dropout: float = 0.3
        vocab_size_pos_tags: int = len(self.vocab_pos_tags)
        vocab_size_lemmas: int = len(self.vocab_lemmas)
        vocab_size_dependency_relations: int = len(
            self.vocab_dependency_relations)
        vocab_size_predicates: int = len(self.vocab_predicates)
        device: str = self.device

    hyperparameters: HParams = HParams()
    self.net_configuration: dict = net_configuration

    # Build the network, then restore its trained weights.
    srl_model: SRL_final_MODEL = SRL_final_MODEL(
        hparams=hyperparameters,
        configurator=net_configuration).to(self.device)
    stored_weights = torch.load('model/final_model_stored.pth',
                                map_location=self.device)
    srl_model.load_state_dict(stored_weights)

    self.model: SRL_final_MODEL = srl_model
    self.model.eval()  # set model in eval settings