def train(args, train_loader, test_loader, test_json):
    """Fine-tune a 3-label ``bert-base-cased`` token classifier and return it.

    Args:
        args: Run configuration. This function reads ``lr``, ``num_epoch``
            and ``no_logs`` and forwards the object to ``evaluate`` and
            ``print_score``.
        train_loader: Iterable of batches indexable by ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``; each value is moved to
            the selected device with ``.to(device)``.
        test_loader: Forwarded to ``evaluate`` for the periodic report.
        test_json: Forwarded to ``print_human_vs_model`` together with the
            predicted rationales.

    Returns:
        The fine-tuned model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print("loading bert.")
    model = BertForTokenClassification.from_pretrained("bert-base-cased",
                                                       num_labels=3)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=args.lr)
    print("loaded. staring training.")

    tensor_keys = ("input_ids", "attention_mask", "labels")
    for epoch in range(args.num_epoch):
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            # Re-enter training mode on every step; the periodic evaluation
            # below may have left the model in a different mode.
            model.train()
            model.zero_grad()

            on_device = {key: batch[key].to(device) for key in tensor_keys}
            outputs = model(on_device["input_ids"],
                            attention_mask=on_device["attention_mask"],
                            labels=on_device["labels"])
            # The loss is the first element of the model output when labels
            # are supplied.
            outputs[0].backward()
            optimizer.step()

        # Report on every third epoch (2, 5, 8, ...) unless logging is off.
        if epoch % 3 == 2 and not args.no_logs:
            total, correct, pred_rationale = evaluate(args, model, test_loader)
            print_score(args, total, correct)
            print_human_vs_model(test_json, pred_rationale)

    return model
def __init__(self,
             config_name: str,
             model_name: str = None,
             num_tags: int = 2,
             batch_first: bool = True) -> None:
    """Build a BERT token classifier followed by a CRF layer.

    Args:
        config_name: Path to the BERT configuration; must exist on disk.
        model_name: Optional path to pretrained weights. When ``None`` the
            BERT model is created from the configuration alone.
        num_tags: Number of output tags, used both for the BERT
            classification head and for the CRF.
        batch_first: Forwarded to the CRF layer and stored on the instance.

    Raises:
        ValueError: If ``config_name`` does not exist, or ``model_name`` is
            given but does not exist. The messages are in Chinese
            ("model config file not found" / "pretrained weights file not
            found").
    """
    self.batch_first = batch_first

    # Validate paths up front; messages are runtime strings and stay as-is.
    if not os.path.exists(config_name):
        raise ValueError("未找到模型配置文件 '{}'".format(config_name))
    self.config_name = config_name

    if model_name is not None and not os.path.exists(model_name):
        raise ValueError("未找到模型预训练参数文件 '{}'".format(model_name))
    self.model_name = model_name

    super().__init__()

    config = BertConfig.from_pretrained(self.config_name)
    config.num_labels = num_tags
    self.bert_config = config
    self.model_kwargs = {'config': config}

    if self.model_name is None:
        # No weights supplied: build the model from the configuration only.
        self.bertModel = BertForTokenClassification(config)
    else:
        self.bertModel = BertForTokenClassification.from_pretrained(
            self.model_name, **self.model_kwargs)

    self.crf_model = CRF(num_tags=num_tags, batch_first=batch_first)
def __init__(self,
             config_name: str,
             model_name: str = None,
             num_tags: int = 2,
             batch_first: bool = True) -> None:
    """Build a BERT token classifier followed by a CRF layer.

    Args:
        config_name: Path to the BERT configuration; must exist on disk.
        model_name: Optional path to pretrained weights. When ``None`` the
            BERT model is created from the configuration alone.
        num_tags: Number of output tags (must be positive). Used for both
            the BERT classification head and the CRF.
        batch_first: Forwarded to the CRF layer and stored on the instance.

    Raises:
        ValueError: If ``config_name`` does not exist, ``model_name`` is
            given but does not exist, or ``num_tags`` is not positive.
    """
    self.batch_first = batch_first

    if not os.path.exists(config_name):
        raise ValueError('{} config file not found'.format(config_name))
    self.config_name = config_name

    if model_name is not None and not os.path.exists(model_name):
        raise ValueError(' {} model file not found'.format(model_name))
    self.model_name = model_name

    if num_tags <= 0:
        raise ValueError(f'invalid number of tags:{num_tags}')

    super().__init__()

    # BERT configuration file.
    self.bert_config = BertConfig.from_pretrained(self.config_name)
    # The classification head is sized from ``config.num_labels``. The
    # previous code only set ``num_tags``, which the model does not read, so
    # the head kept the configuration's own label count and could disagree
    # with the CRF's ``num_tags``.
    self.bert_config.num_labels = num_tags
    # Kept for backward compatibility with any code that reads this
    # attribute from the configuration.
    self.bert_config.num_tags = num_tags
    self.model_kwargs = {'config': self.bert_config}

    if self.model_name is not None:
        self.bertModel = BertForTokenClassification.from_pretrained(
            self.model_name, **self.model_kwargs)
    else:
        self.bertModel = BertForTokenClassification(self.bert_config)

    self.crfModel = CRF(num_tags=num_tags, batch_first=batch_first)
def build_model(self, args):
    """Create a ``BertForTokenClassification`` model for the configured task.

    Args:
        args: Namespace-like object. Reads ``task``, ``config_file``,
            ``num_labels``, ``hetseq_state_dict``,
            ``transformers_state_dict`` and, when weights are loaded,
            ``load_state_dict_strict``.

    Returns:
        The constructed model, with weights loaded when a state-dict path
        was provided.

    Raises:
        ValueError: If ``args.task`` is not ``'BertForTokenClassification'``.
    """
    if args.task != 'BertForTokenClassification':
        raise ValueError('Unknown fine_tunning task!')

    # Imported lazily so the dependency is only needed for this task.
    from transformers import BertForTokenClassification, BertConfig

    config = BertConfig.from_json_file(args.config_file)
    # The label count must be obtained from the dataset before the model is
    # built (for mention detection it is 3 by default).
    assert hasattr(args, 'num_labels')
    config.num_labels = args.num_labels
    model = BertForTokenClassification(config)

    # A hetseq checkpoint takes precedence; it stores the weights under the
    # 'model' key, whereas a transformers state dict is used directly.
    if args.hetseq_state_dict is not None:
        weights = torch.load(args.hetseq_state_dict,
                             map_location='cpu')['model']
    elif args.transformers_state_dict is not None:
        weights = torch.load(args.transformers_state_dict,
                             map_location='cpu')
    else:
        return model

    model.load_state_dict(weights, strict=bool(args.load_state_dict_strict))
    return model
class BertModel(nn.Module):
    """Thin ``nn.Module`` wrapper around ``BertForTokenClassification``."""

    def __init__(self,
                 pretrained_model_name_or_dir=None,
                 pretrained_num_classes=None,
                 fine_tune=False,
                 bert_config=None):
        """Build a BERT model for token classification.

        :param pretrained_model_name_or_dir: Name or directory of the
            pretrained model to load. Used only when ``bert_config`` is None.
        :param pretrained_num_classes: Number of classes for the pretrained
            model. When None, no ``num_labels`` override is passed and the
            label count comes from the loaded configuration.
        :param fine_tune: If true, every parameter of the wrapped model's
            ``base_model`` is frozen, so only the remaining parameters
            (the classification layer) are trained.
        :param bert_config: If not None, a BERT model is created from
            scratch using this configuration instead of loading weights.
        """
        super().__init__()

        if bert_config is not None:
            self.model = BertForTokenClassification(bert_config)
        else:
            # Forwarding ``num_labels=None`` would override the configured
            # label count with None; only pass it when a value was given.
            overrides = {}
            if pretrained_num_classes is not None:
                overrides["num_labels"] = pretrained_num_classes
            self.model = BertForTokenClassification.from_pretrained(
                pretrained_model_name_or_dir, **overrides)

        # Fine tune: freeze all weights except the classifier.
        if fine_tune:
            self._freeze_base_weights()

    def _freeze_base_weights(self):
        """Disable gradients for every parameter of ``self.model.base_model``."""
        for param in self.model.base_model.parameters():
            param.requires_grad = False

    def forward(self, *inputs):
        """Forward all positional arguments to the wrapped model."""
        return self.model(*inputs)

    def save(self, path):
        """Save the wrapped model to ``path`` via ``save_pretrained``."""
        self.model.save_pretrained(save_directory=path)
def get_text_reader(reader_name, task, num_labels):
    """Return a pretrained text reader for the requested backbone and task.

    The AILAW corpus is Korean, so the available checkpoints are limited to
    models that cover Korean (multilingual BERT, KoBERT, KoELECTRA).

    Args:
        reader_name: One of ``"bert"``, ``"kobert"`` or ``"koelectra"``.
        task: ``"classification"`` selects a sequence-classification head;
            any other value selects a token-classification (NER) head.
        num_labels: Number of output labels for the head.

    Returns:
        The model loaded with ``from_pretrained``.

    Raises:
        KeyError: If ``reader_name`` is not one of the supported names.
    """
    if reader_name == "bert":
        model_name, is_electra = "bert-base-multilingual-cased", False
    elif reader_name == "kobert":
        model_name, is_electra = "monologg/kobert", False
    elif reader_name == "koelectra":
        model_name, is_electra = "monologg/koelectra-base-discriminator", True
    else:
        raise KeyError(reader_name)

    wants_classifier = task == "classification"
    if is_electra:
        if wants_classifier:
            model_cls = ElectraForSequenceClassification
        else:  # ner
            model_cls = ElectraForTokenClassification
    else:
        if wants_classifier:
            model_cls = BertForSequenceClassification
        else:  # ner
            model_cls = BertForTokenClassification

    return model_cls.from_pretrained(model_name, num_labels=num_labels)
def launch_bert(training_flag, test_flag):
    """Train and/or test a ``bert-base-uncased`` token classifier.

    Reads the module-level ``args`` (``lr``, ``training_data``,
    ``validate_data``, ``save``) and ``tags_vals``.

    Args:
        training_flag: When not None, a model is fine-tuned via
            ``launch_training``.
        test_flag: When not None, it is used as the test path for
            ``launch_test_directory``. The model is loaded from ``args.save``
            if that is set, otherwise from the stock pretrained checkpoint.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    if training_flag is not None:
        model = BertForTokenClassification.from_pretrained(
            'bert-base-uncased', num_labels=len(tags_vals))

        # Optimizer: weight decay is applied as regularization to reduce
        # overfitting, except on biases and layer-normalisation parameters.
        FULL_FINETUNING = True
        if FULL_FINETUNING:
            param_optimizer = list(model.named_parameters())
            # 'gamma'/'beta' are the legacy layer-norm parameter names;
            # current BERT checkpoints expose them as 'LayerNorm.weight' and
            # 'LayerNorm.bias' (the latter already matches 'bias').
            no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
            # AdamW reads the per-group key 'weight_decay'. The former key
            # 'weight_decay_rate' was not consulted, so the intended rates
            # were never applied.
            optimizer_grouped_parameters = [
                {
                    'params': [
                        p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                    ],
                    'weight_decay': 0.01,
                },
                {
                    'params': [
                        p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)
                    ],
                    'weight_decay': 0.0,
                },
            ]
        else:
            # Train only the classification layer.
            param_optimizer = list(model.classifier.named_parameters())
            optimizer_grouped_parameters = [{
                "params": [p for n, p in param_optimizer]
            }]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
        launch_training(training_path=args.training_data,
                        training_epochs=4,
                        valid_path=args.validate_data,
                        training_batch_size=1,
                        model=model,
                        model_path=args.save + '/config.json',
                        tokenizer=tokenizer,
                        optimizer=optimizer)

    if test_flag is not None:
        if args.save is not None:
            # Load the fine-tuned weights with their saved configuration.
            config = BertConfig.from_json_file(args.save + '/config.json')
            model = BertForTokenClassification.from_pretrained(
                pretrained_model_name_or_path=args.save + '/pytorch_model.bin',
                config=config)
        else:
            model = BertForTokenClassification.from_pretrained(
                'bert-base-uncased', num_labels=len(tags_vals))
        launch_test_directory(test_path=test_flag,
                              model=model,
                              tokenizer=tokenizer)
def main():
    """Train a small BERT token classifier on raw ELF byte sequences.

    Reads the module-level ``args`` (``dataroot``, ``sequence_len``,
    ``batch_size``, ``hidden_size``, ``hidden_layers``, ``num_attn_heads``,
    ``epochs``) and prints the training loss and accuracy after each epoch.
    """
    ####################################################################
    ## Data
    ####################################################################
    all_datasets = [
        BinaryDataset(root_dir=dataroot,
                      binary_format='elf',
                      targets='start',
                      mode='random-chunks',
                      chunk_length=args.sequence_len)
        for dataroot in args.dataroot
    ]
    # ConcatDataset requires __len__() to be implemented by each dataset.
    dataset = torch.utils.data.ConcatDataset(all_datasets)
    print("Dataset len() = {0}".format(len(dataset)))
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             shuffle=True)

    ####################################################################
    ## Model
    ####################################################################
    config = BertConfig(
        vocab_size=256,  # one token per possible byte value
        hidden_size=args.hidden_size,
        num_hidden_layers=args.hidden_layers,
        num_attention_heads=args.num_attn_heads,
        # BERT originally uses 4x hidden size for this, so copying that.
        intermediate_size=args.hidden_size * 4,
        hidden_act='gelu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=args.sequence_len,  # sequence length max
        type_vocab_size=1,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        gradient_checkpointing=False)
    model = BertForTokenClassification(config=config).cuda()
    # model = torch.nn.DataParallel(model, dim=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    lossfn = torch.nn.CrossEntropyLoss()

    print("Beginning training")
    for epoch in range(args.epochs):
        train_loss, train_acc = train(model, lossfn, optimizer, dataloader,
                                      epoch)
        # Only training metrics are produced here; the previous message
        # referenced undefined test_loss/test_acc and raised NameError.
        print(f"Train Loss: {train_loss} | Train Acc: {train_acc}")
def create_and_check_bert_for_token_classification(self, config, input_ids,
                                                   token_type_ids, input_mask,
                                                   sequence_labels,
                                                   token_labels,
                                                   choice_labels):
    """Check the logits shape and loss of ``BertForTokenClassification``.

    Builds the model from ``config`` with ``self.num_labels`` labels, runs
    one forward pass in eval mode with ``token_labels`` as targets, and
    asserts that the logits have size
    ``[batch_size, seq_length, num_labels]``. ``sequence_labels`` and
    ``choice_labels`` are accepted but not used here.
    """
    config.num_labels = self.num_labels
    model = BertForTokenClassification(config=config)
    model.eval()

    outputs = model(input_ids,
                    attention_mask=input_mask,
                    token_type_ids=token_type_ids,
                    labels=token_labels)
    # With labels supplied the model output unpacks into (loss, logits).
    loss, logits = outputs
    result = dict(loss=loss, logits=logits)

    expected_size = [self.batch_size, self.seq_length, self.num_labels]
    self.parent.assertListEqual(list(result["logits"].size()), expected_size)
    self.check_loss_output(result)
def __init__(self,
             config_name: str = 'bert-base-chinese',
             model_name: str = None,
             num_tags: int = 2,
             batch_first: bool = True) -> None:
    """Build a BERT token classifier followed by a CRF layer.

    Args:
        config_name: ``'bert-base-chinese'`` or a path to a configuration
            that exists on disk.
        model_name: ``None`` (build from the configuration only),
            ``'bert-base-chinese'``, or a path to pretrained weights that
            exists on disk.
        num_tags: Number of output tags (must be positive). Used for both
            the BERT classification head and the CRF.
        batch_first: Forwarded to the CRF layer and stored on the instance.

    Raises:
        ValueError: If a given path does not exist or ``num_tags`` is not
            positive.
    """
    # Remember batch_first.
    self.batch_first = batch_first

    # Model configuration: the hub name is accepted as-is, anything else
    # must be an existing path.
    if config_name != 'bert-base-chinese' and not os.path.exists(config_name):
        raise ValueError(
            "Error! No model config file: '{}'".format(config_name))
    self.config_name = config_name

    # Pretrained model: None, the hub name, or an existing path.
    if (model_name is not None and model_name != 'bert-base-chinese'
            and not os.path.exists(model_name)):
        raise ValueError(
            "Error! No pretrained model: '{}'".format(model_name))
    self.model_name = model_name

    if num_tags <= 0:
        raise ValueError(f'invalid number of tags: {num_tags}')

    super().__init__()

    self.bert_config = BertConfig.from_pretrained(self.config_name)
    self.bert_config.num_labels = num_tags
    self.model_kwargs = {'config': self.bert_config}

    if self.model_name is None:
        # No pretrained weights: build the model from the configuration.
        self.bertModel = BertForTokenClassification(**self.model_kwargs)
    else:
        if self.model_name == 'bert-base-chinese':
            # Kept from the original behaviour: the hub checkpoint is
            # loaded with from_tf=True.
            self.model_kwargs["from_tf"] = True
        # Previously only the hub name was handled, so a valid local path
        # left ``self.bertModel`` undefined. Local weights are now loaded
        # with the same configuration.
        self.bertModel = BertForTokenClassification.from_pretrained(
            self.model_name, **self.model_kwargs)

    self.crf_model = CRF(num_tags=num_tags, batch_first=batch_first)
def __init__(self):
    """Set up the tag vocabulary, tokenizer and fine-tuned NER model.

    The tag set is a B-/I- scheme over eight entity types plus ``'O'``
    (17 tags, ids 0-16). Weights are loaded from ``ner.dataset.4.pth`` onto
    the CPU and the model is put in eval mode.
    """
    entity_types = ('art', 'eve', 'geo', 'gpe', 'nat', 'org', 'per', 'tim')
    # Ids run over all 'B-*' tags first, then all 'I-*' tags, then 'O'.
    tags = [
        f'{prefix}-{entity}' for prefix in ('B', 'I')
        for entity in entity_types
    ]
    tags.append('O')
    self.tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    self.idx2tag = {idx: tag for tag, idx in self.tag2idx.items()}

    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                   do_lower_case=True)
    self.model = BertForTokenClassification.from_pretrained(
        "bert-base-uncased", num_labels=len(self.tag2idx))

    state_dict = torch.load("ner.dataset.4.pth",
                            map_location=torch.device('cpu'))
    self.model.load_state_dict(state_dict)
    self.model.eval()
def main():
    """Run one sentence through an untrained 13-class BERT token classifier.

    Demonstrates that passing ``id2label`` to ``from_pretrained`` yields one
    13-dimensional prediction per token, and prints the output size.
    """
    # Named-entity recognition: every token is classified into one of the
    # 13 classes below (B-/I- over six entity types, plus 'O').
    entity_types = [
        'corporation', 'creative-work', 'group', 'location', 'person',
        'product'
    ]
    labels = ([f'B-{entity}' for entity in entity_types] +
              [f'I-{entity}' for entity in entity_types] + ['O'])
    id2label = dict(enumerate(labels))

    # Name of the pretrained BERT model to use.
    model_name = 'bert-large-cased'

    # Tokenizer matching the pretrained model.
    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=model_name)

    # Token-classification model built from the pretrained weights.
    # id2label is passed so each token gets a 13-dimensional output.
    model = BertForTokenClassification.from_pretrained(
        pretrained_model_name_or_path=model_name,
        id2label=id2label,
    )
    # A warning about uninitialised weights is expected (the classification
    # layer is new) and is ignored here.

    print('◆ 適当な文章をモデルに流してみる.→ 14トークン×13クラスの予測結果になっている(サイズが).')
    sentence = 'The Empire State Building officially opened on May 1, 1931.'
    token_ids = tokenizer.encode(sentence)
    inputs = torch.tensor([token_ids])  # wrap the id sequence as a batch of 1
    outputs = model(inputs)
    print(outputs[0].size())
def __init__(self):
    """Create the wrapped 21-label ``bert-base-chinese`` token classifier.

    The model is stored on ``self.l1`` and configured not to return
    attention weights or hidden states.
    """
    super(BertClass, self).__init__()
    # The configuration field is ``output_attentions`` (plural). The former
    # spelling ``output_attention`` is not a configuration field and was
    # therefore not applied as intended.
    self.l1 = BertForTokenClassification.from_pretrained(
        'bert-base-chinese',
        num_labels=21,
        output_attentions=False,
        output_hidden_states=False)
def main():
    """Extract entities from a corpus and write them to a TSV file.

    Command-line options are parsed with docopt from the module docstring.
    Depending on ``--ensemble``, ``--crf`` and ``--rule``, a BERT ensemble,
    a BERT+CRF model, a plain BERT token classifier or a rule-based
    extractor is used.
    """
    args = docopt(__doc__)
    processors = {
        'multi-sents': BertMultiSentProcessor,
        'uni-sent': BertUniSentProcessor
    }

    max_input_len = int(args['--max-input-len'])
    tokenizer = BertWordPieceTokenizer(str(args['--path-to-vocab']))
    processor = processors[str(args['--mode'])](max_input_len, tokenizer)

    use_crf = args['--crf']
    use_rules = args['--rule']

    # Load a model unless the rule-based extractor alone was requested.
    if args['--ensemble']:
        bert_model = BertEnsemble.load_trained(
            str(args['--path-to-model-dir']))
    elif use_crf:
        bert_model = BertWithCRF.from_pretrained(
            str(args['--path-to-model-dir']))
    elif not use_rules:
        bert_model = BertForTokenClassification.from_pretrained(
            str(args['--path-to-model-dir']))

    # A GPU index of -1 (or lower) selects the CPU.
    device_no = int(args['--gpu'])
    if device_no > -1:
        device = torch.device(f'cuda:{device_no}')
    else:
        device = torch.device('cpu')

    if use_crf:
        bert_extractor = BertWithCRFExtractor(bert_model, tokenizer,
                                              max_input_len, device)
    elif use_rules:
        bert_extractor = RuleExtractor()
    else:
        bert_extractor = BertExtractor(bert_model, tokenizer, max_input_len,
                                       device)

    corpus = read_corpus(str(args['--path-to-corpus-dir']))
    ents_table = build_ents_table(corpus,
                                  processor,
                                  bert_extractor,
                                  batch_size=int(args['--batch-size']))
    ents_table.to_csv(str(args['--path-to-output']), index=False, sep='\t')
def _train_bert(training_data_retrieval_func):
    """Fine-tune the base BERT model for tagging and store it in the cache.

    Args:
        training_data_retrieval_func: Zero-argument callable returning a
            ``(tokens, labels)`` pair used to build the training dataset.

    The tokenizer is extended with ``ADDITIONAL_SPECIAL_TOKENS`` and the
    model's embeddings are resized to match. Model and tokenizer are saved
    to the cache path ``'bert_for_SE_tagging'``.
    """
    tokenizer = BertTokenizerFast.from_pretrained(BERT_BASE_MODEL)
    tokenizer.add_tokens(ADDITIONAL_SPECIAL_TOKENS)

    tokens, labels = training_data_retrieval_func()
    train_dataset = _get_datasets(tokens, labels, tokenizer)

    model = BertForTokenClassification.from_pretrained(
        BERT_BASE_MODEL, num_labels=len(ALL_LABEL_IDS))
    # The vocabulary grew with the added tokens, so resize the embeddings.
    model.resize_token_embeddings(len(tokenizer))

    # Unique run id: timestamp plus the configured log file name.
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_name = utils.get_config('logging.filename')
    run_id = f'{timestamp}_{log_name}'

    training_args = TrainingArguments(
        output_dir=f'./bert/results/{run_id}',
        logging_dir=f'./bert/logs/{run_id}',
        logging_steps=500,
        save_steps=2000,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        learning_rate=5e-5,
        warmup_steps=0,
        weight_decay=0,
    )

    Trainer(model=model, args=training_args,
            train_dataset=train_dataset).train()

    path_to_model = utils._get_cache_path('bert_for_SE_tagging')
    model.save_pretrained(path_to_model)
    tokenizer.save_pretrained(path_to_model)
def get_predictions(filename, outputName):
    """Tag every non-empty line of a text file and dump the labels as JSON.

    Args:
        filename: Path of the text file to read; blank lines are skipped.
        outputName: Path of the JSON file to write. It receives a list with
            one list of label strings per input line, one label per token
            id produced by ``tokenizer.encode``.
    """
    label_list = ['O', 'B-CLEntity', 'I-CLEntity', 'L-CLEntity', 'U-CLEntity']
    model = BertForTokenClassification.from_pretrained(
        "bert_ner_finetuned_iliad-with-gpu-pattern2.model")
    model.eval()
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

    # Use a context manager so the input file is always closed.
    with open(filename) as book:
        stripped_lines = [line.strip() for line in book]
    book_lines = [line for line in stripped_lines if line]

    pred = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for line in book_lines:
            line_inputs = tokenizer.encode(line, return_tensors="pt")
            line_outputs = model(line_inputs).logits
            # Arg-max over the label dimension; index 0 is the only
            # sequence in the batch.
            label_ids = torch.argmax(line_outputs, dim=2)[0].tolist()
            pred.append([label_list[label_id] for label_id in label_ids])

    with open(outputName, 'w') as f:
        json.dump(pred, f)
def load(self, dirpath):
    """ Restores a trained model and its artifacts from a folder on disk.

        Parameters
        ----------
        dirpath : str
            directory holding ``label_mappings.pkl`` together with the
            saved model and tokenizer files

        Returns
        -------
        self

        Raises
        ------
        ValueError
            if ``dirpath`` does not exist
    """
    if not os.path.exists(dirpath):
        raise ValueError("Model directory not found: {:s}".format(dirpath))

    mappings_path = os.path.join(dirpath, "label_mappings.pkl")
    label_mappings = joblib.load(mappings_path)
    # Each stored entry becomes an attribute with a trailing underscore.
    for key in ("label2id", "id2label", "special_tokens"):
        setattr(self, key + "_", label_mappings[key])

    self.model_ = BertForTokenClassification.from_pretrained(
        dirpath,
        num_labels=len(self.label2id_),
        output_attentions=False,
        output_hidden_states=False)
    self.model_.to(self._device)

    self.tokenizer_ = BertTokenizer.from_pretrained(dirpath,
                                                    do_basic_tokenize=False)
    return self
def load_model(args, test):
    """Load the tokenizer and token-classification model from ``args.model_dir``.

    Args:
        args: Namespace-like object. Reads ``model_dir``, ``do_lower_case``,
            ``keep_accents``, ``device`` and ``n_gpu``. When ``test`` is
            true, ``max_length``, ``do_lower_case`` and ``keep_accents`` may
            be overwritten with the values saved at training time.
        test: Whether the model is being loaded for testing.

    Returns:
        ``(model, tokenizer)``; the model is wrapped in
        ``torch.nn.DataParallel`` when ``args.n_gpu > 1``.
    """
    if test:
        # Best effort: reuse the training-time settings if they were saved.
        saved_args_path = os.path.join(args.model_dir, "train_args.bin")
        try:
            prev_args = torch.load(saved_args_path)
            for name in ("max_length", "do_lower_case", "keep_accents"):
                setattr(args, name, getattr(prev_args, name))
        except FileNotFoundError:
            pass

    tokenizer = BertTokenizer.from_pretrained(
        args.model_dir,
        do_lower_case=args.do_lower_case,
        keep_accents=args.keep_accents,
    )

    model = BertForTokenClassification.from_pretrained(
        args.model_dir,
        finetuning_task="conll2002",
        num_labels=len(LABEL_LIST),
    )
    model = model.to(args.device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    return model, tokenizer
def __init__(self,
             num_labels=len(id2label.keys()),
             from_pretrained='bert-base-uncased'):
    """Wrap a pretrained ``BertForTokenClassification`` for value extraction.

    Args:
        num_labels: Size of the classification head. The default is the
            number of entries in the module-level ``id2label`` mapping,
            evaluated once when the method is defined.
        from_pretrained: Checkpoint name or path to load.
    """
    super().__init__()
    print(f"Loading BertForTokenClassification as {from_pretrained}")
    classifier = BertForTokenClassification.from_pretrained(
        from_pretrained,
        num_labels=num_labels,
        return_dict=True,
    )
    self.token_classifier = classifier
def __init__(self, hparams, user_tokens=['<newline>', '<bullet>']):
    """Create the tokenizer, configuration and model for the NER system.

    Args:
        hparams: Hyper-parameter namespace. Reads ``model_type`` (lower-cased
            in place), ``tokenizer_name``, ``config_name``,
            ``model_name_or_path``, ``do_lower_case``, ``cache_dir``,
            ``do_train`` and ``num_labels``.
        user_tokens: Tokens passed to the tokenizer as ``never_split``. The
            default list is shared between calls; it is not modified here.
    """
    super().__init__()
    self.hparams = hparams
    self.hparams.model_type = self.hparams.model_type.lower()

    hp = self.hparams
    # Empty/falsy settings fall back to None or to the model path.
    cache_dir = hp.cache_dir or None

    tokenizer = BertTokenizer.from_pretrained(
        hp.tokenizer_name or hp.model_name_or_path,
        never_split=user_tokens,
        do_lower_case=hp.do_lower_case,
        cache_dir=cache_dir,
    )
    config = AutoConfig.from_pretrained(
        hp.config_name or hp.model_name_or_path,
        cache_dir=cache_dir,
        output_past=not hp.do_train,
        num_labels=hp.num_labels,
    )
    # A path containing ".ckpt" is treated as a TensorFlow checkpoint.
    is_tf_checkpoint = ".ckpt" in hp.model_name_or_path
    model = BertForTokenClassification.from_pretrained(
        hp.model_name_or_path,
        from_tf=is_tf_checkpoint,
        config=config,
        cache_dir=cache_dir,
    )

    self.config = config
    self.tokenizer = tokenizer
    self.model = model
    self.loss = []  # for keeping track of average loss
    self.metrics = {}
    # Reverse lookup: token id -> token string.
    self.vocab = {
        token_id: token
        for token, token_id in self.tokenizer.get_vocab().items()
    }
def main():
    """Compute BERT embeddings for every row of ``train_table.csv``.

    Results are keyed by sentence and saved to ``data_embeddings.pt``. Rows
    whose span is null store only the embeddings; other rows store
    ``(span, row[3], row[4], embeddings)``.
    """
    data_table = pd.read_csv("train_table.csv")

    # BERT tokenizer and model; hidden states are requested from the model.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForTokenClassification.from_pretrained(
        'bert-base-uncased', output_hidden_states=True)
    model.eval()
    print(model.config)

    data_embeddings = {}
    for index, row in data_table.iterrows():
        print(index)

        sentence, span = row[1], row[2]
        span_embeddings = get_bert_embedding(model, tokenizer, sentence,
                                             span)

        if pd.isnull(span):
            # Non-propagandistic: no span, keep the embeddings only.
            entry = span_embeddings
        else:
            # Propagandistic: keep the span and its two extra columns too.
            entry = (span, row[3], row[4], span_embeddings)
        data_embeddings[sentence] = entry

    print("Writing to output file...")
    torch.save(data_embeddings, "data_embeddings.pt")
    print("Done.")
def __init__(self):
    """Load the local ``./bert-base-uncased`` token classifier into ``self.l1``.

    The number of labels equals the number of keys in the module-level
    ``tag2idx`` mapping.
    """
    super().__init__()
    checkpoint_dir = "./bert-base-uncased"
    label_count = len(tag2idx.keys())
    config = BertConfig.from_pretrained(checkpoint_dir,
                                        num_labels=label_count)
    self.l1 = BertForTokenClassification.from_pretrained(checkpoint_dir,
                                                         config=config)
def main(num_epochs, learning_rate):
    """Train a B/I/O token classifier and save dev and test predictions.

    Args:
        num_epochs: Number of epochs, forwarded to ``train_model``.
        learning_rate: Learning rate for the Adam optimizer.

    Predictions are written to ``dev_preds.txt`` and ``test_preds.txt``.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    classes = ["B", "I", "O"]
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)

    # Class tags get ids 0..2; padding is mapped to -100.
    tag_to_idx = dict(zip(classes, range(len(classes))))
    tag_to_idx['[PAD]'] = -100
    idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

    (train_dataloader, dev_dataloader, dev_sentences, test_dataloader,
     test_sentences) = parse_data(tokenizer, tag_to_idx, batch_size=16)
    print('data loaded and tokenized')

    model = BertForTokenClassification.from_pretrained(
        'bert-base-cased', num_labels=len(classes))
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    print('model instantiated')

    model, dev_preds = train_model(tokenizer, tag_to_idx, model, num_epochs,
                                   train_dataloader, optimizer, device,
                                   dev_dataloader, idx_to_tag)
    test_preds = evaluate(model, test_dataloader, device, idx_to_tag)

    save_preds('dev_preds.txt', dev_preds, dev_sentences)
    save_preds('test_preds.txt', test_preds, test_sentences)
def __init__(self, model_path, num_labels, label_map, device):
    """Wrap a pretrained token classifier and add a label-transition matrix.

    Args:
        model_path: Checkpoint name or path for ``from_pretrained``.
        num_labels: Number of labels; also the side length of the square
            ``transitions`` parameter.
        label_map: Stored on the instance as ``label_map``.
        device: Device the BERT model is moved to.
    """
    super().__init__()
    self.label_map = label_map
    self.num_labels = num_labels

    bert = BertForTokenClassification.from_pretrained(model_path,
                                                      num_labels=num_labels)
    self.model = bert.to(device)

    # Learnable (num_labels x num_labels) matrix with random normal
    # initialisation; constraints on it are meant to be added on top.
    initial_transitions = torch.randn(num_labels, num_labels)
    self.transitions = torch.nn.Parameter(initial_transitions)
def load_model(self,
               model_filepath,
               config_filepath,
               pretrained_model="bert-base-cased"):
    """
    Load cybert model.

    :param model_filepath: Filepath of the model (.pth or .bin) to be loaded
    :type model_filepath: str
    :param config_filepath: Config file (.json) to be used; its
        ``id2label`` entry provides the label map
    :type config_filepath: str
    :param pretrained_model: Name of pretrained model to be loaded from
        transformers repo, default is bert-base-cased
    :type pretrained_model: str

    Examples
    --------
    >>> from clx.analytics.cybert import Cybert
    >>> cyparse = Cybert()
    >>> cyparse.load_model('/path/to/model.pth', '/path/to/config.json')
    """
    with open(config_filepath) as f:
        config = json.load(f)

    # JSON object keys are strings; the label map is keyed by integer id.
    id2label = config["id2label"]
    self._label_map = {
        int(label_id): label
        for label_id, label in id2label.items()
    }

    model_state_dict = torch.load(model_filepath)
    label_count = len(self._label_map)
    self._model = BertForTokenClassification.from_pretrained(
        pretrained_model,
        state_dict=model_state_dict,
        num_labels=label_count,
    )
    self._model.cuda()
    self._model.eval()
def __init__(self, model_name, num_labels, lr):
    """Create the module and load a pretrained token classifier.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        num_labels: Number of labels for the classification head.
        lr: Not used directly in this method. NOTE(review): presumably
            captured by ``save_hyperparameters()`` for later use by the
            optimizer setup - confirm against the rest of the class.
    """
    super().__init__()
    # NOTE(review): looks like PyTorch Lightning's save_hyperparameters(),
    # which would record the constructor arguments - confirm. It is called
    # with no arguments, so avoid rebinding the parameters before this line.
    self.save_hyperparameters()
    self.bert_tc = BertForTokenClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
def __init__(self, hparams: Union[Dict, Namespace]):
    """Set up tokenizer, configuration and model from hyper-parameters.

    Args:
        hparams: Hyper-parameters as a ``Namespace`` or a plain dict (a dict
            is converted to a ``Namespace``). Reads ``labels``,
            ``output_dir``, ``cache_dir`` and ``model_name_or_path``, plus
            the optional ``encoder_layerdrop``, ``decoder_layerdrop``,
            ``dropout`` and ``attention_dropout`` overrides.
    """
    # NOTE: internal code may pass hparams as dict **kwargs
    if isinstance(hparams, Dict):
        hparams = Namespace(**hparams)

    self.label_ids_to_label = LabelTokenAligner.get_ids_to_label(
        hparams.labels)
    num_labels = len(self.label_ids_to_label)

    super().__init__()
    # Enable to access arguments via self.hparams
    self.save_hyperparameters(hparams)

    self.step_count = 0
    self.output_dir = Path(self.hparams.output_dir)
    self.cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
    if self.cache_dir is not None and not os.path.exists(self.cache_dir):
        # makedirs also creates missing parent directories (os.mkdir failed
        # for nested paths) and tolerates the directory appearing in the
        # meantime.
        os.makedirs(self.cache_dir, exist_ok=True)

    # AutoTokenizer
    # trf>=4.0.0: PreTrainedTokenizerFast by default
    # NOTE: AutoTokenizer doesn't load PreTrainedTokenizerFast...
    self.tokenizer_name = self.hparams.model_name_or_path
    self.tokenizer = BertTokenizerFast.from_pretrained(
        self.tokenizer_name,
        cache_dir=self.cache_dir,
        tokenize_chinese_chars=False,
        strip_accents=False,
    )

    # AutoConfig
    config_name = self.hparams.model_name_or_path
    # num_labels is always an int here (it is a len()), so it is passed
    # unconditionally.
    self.config: PretrainedConfig = BertConfig.from_pretrained(
        config_name,
        num_labels=num_labels,
        cache_dir=self.cache_dir,
    )

    # Copy optional overrides onto the configuration, but only when the
    # hyper-parameter is set (truthy) and the configuration has the field.
    extra_model_params = (
        "encoder_layerdrop",
        "decoder_layerdrop",
        "dropout",
        "attention_dropout",
    )
    for p in extra_model_params:
        if getattr(self.hparams, p, None) and hasattr(self.config, p):
            setattr(self.config, p, getattr(self.hparams, p, None))

    # AutoModelForTokenClassification
    # A path containing ".ckpt" is treated as a TensorFlow checkpoint.
    self.model: PreTrainedModel = BertForTokenClassification.from_pretrained(
        self.hparams.model_name_or_path,
        from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
        config=self.config,
        cache_dir=self.cache_dir,
    )

    self.scheduler = None
    self.optimizer = None
def main(args):
    """Train a 3-label ``bert-base-chinese`` token classifier.

    Args:
        args: Namespace-like object. Reads ``dataset`` (directory holding
            ``train_dataset.pkl``, ``valid_dataset.pkl`` and
            ``test_dataset.pkl``), ``batch_size``, ``max_epoch``,
            ``learning_rate`` and ``save_paths``.

    The validation score is printed before and after training.
    """
    current_path = os.getcwd()
    logging.info(f'current python path {current_path}...')

    logging.info('Load data...')
    datasets = {}
    for split in ("train", "valid", "test"):
        with open(f"{args.dataset}/{split}_dataset.pkl", "rb") as f:
            datasets[split] = pickle.load(f)

    def build_loader(dataset, **loader_options):
        # Each loader collates with its own dataset as the first argument.
        return DataLoader(
            dataset=dataset,
            batch_size=args.batch_size,
            collate_fn=lambda samples: Bert_dataset.collate_fn(
                dataset, samples),
            **loader_options)

    logging.info('Making dataloader...')
    train_loader = build_loader(datasets["train"], shuffle=True)
    valid_loader = build_loader(datasets["valid"])
    # Built like the others; not passed to the trainer below.
    test_loader = build_loader(datasets["test"])

    logging.info('Load model and parameters...')
    model = BertForTokenClassification.from_pretrained(
        "bert-base-chinese",
        num_labels=3,
        output_attentions=False,
        output_hidden_states=False)
    trainer = Trainer(model, train_loader, valid_loader)

    def report_validation():
        logging.info('Test validation dataset...')
        acc, total_loss = trainer.evaluation(test=False)
        print(f"device: {trainer.device} classification acc: {acc: .4f} validation loss: {total_loss:.4f}")

    report_validation()

    logging.info('Start training...')
    trainer.training_process(early_stopping=True,
                             n_iter_no_change=5,
                             max_epoch=args.max_epoch,
                             save_params=True,
                             verbose=True,
                             learning_rate=args.learning_rate,
                             save_paths=args.save_paths)
    logging.info('Training ends!')

    report_validation()
    logging.info('Finish!')
def __init__(self, vocab_size, emb_size, hidden_size, num_labels):
    """Load the Chinese NER token classifier from the class-level model path.

    Args:
        vocab_size: Accepted but not used in this method.
        emb_size: Accepted but not used in this method.
        hidden_size: Accepted but not used in this method.
        num_labels: Number of labels for the classification head.
    """
    super().__init__()
    checkpoint = bert_chinese_ner.model_path
    # ``author`` is passed to the configuration as an extra keyword.
    config = BertConfig.from_pretrained(checkpoint,
                                        num_labels=num_labels,
                                        author="lingze")
    self.bertconfig = config
    self.model = BertForTokenClassification.from_pretrained(
        checkpoint, config=self.bertconfig)
def load_frozen_bert(
        bert_pretrained_model: str,
        bert_state_dict: str = None,
        bert_config: BertConfig = None) -> BertForTokenClassification:
    """Load a token classifier whose BERT encoder is frozen.

    Args:
        bert_pretrained_model: Checkpoint name or path for
            ``from_pretrained``.
        bert_state_dict: Optional path to a fine-tuned state dict; when
            given (and non-empty) it is loaded with ``torch.load`` and
            passed to ``from_pretrained``.
        bert_config: Optional configuration passed to ``from_pretrained``.

    Returns:
        The model with ``requires_grad`` disabled on every parameter of its
        ``bert`` sub-module.
    """
    load_kwargs = {
        "pretrained_model_name_or_path": bert_pretrained_model,
        "config": bert_config,
    }
    if bert_state_dict:
        load_kwargs["state_dict"] = torch.load(bert_state_dict)

    bert_token_classifier = BertForTokenClassification.from_pretrained(
        **load_kwargs)

    # Freeze the encoder; parameters outside ``.bert`` stay trainable.
    for param in bert_token_classifier.bert.parameters():
        param.requires_grad = False

    return bert_token_classifier