def main(): print('Start') parser = argparse.ArgumentParser() # Add the arguments to the parser parser.add_argument("--model_name", required=True) parser.add_argument("--checkpoint_input_path", required=False) parser.add_argument("--checkpoint_output_path", required=True) parser.add_argument("--mnli_path", required=True) parser.add_argument("--squad_path", required=True) parser.add_argument("--train_squad", default=True) parser.add_argument("--train_mnli", default=True) parser.add_argument("--seed", default=1995) parser.add_argument("--learning_rate", default=5e-5, type=float) parser.add_argument("--batch_size", default=16, type=int) parser.add_argument("--epochs", default=3, type=int) args = vars(parser.parse_args()) random.seed(args['seed']) def read_squad(path): path = Path(path) with open(path, 'rb') as f: squad_dict = json.load(f) contexts = [] questions = [] answers = [] for group in squad_dict['data']: for passage in group['paragraphs']: context = passage['context'] for qa in passage['qas']: question = qa['question'] for answer in qa['answers']: contexts.append(context) questions.append(question) answers.append(answer) return contexts, questions, answers squad_contexts, squad_questions, squad_answers = read_squad( args['squad_path']) random_index = random.sample(range(len(squad_answers)), 16) squad_contexts = [squad_contexts[index] for index in random_index] squad_questions = [squad_questions[index] for index in random_index] squad_answers = [squad_answers[index] for index in random_index] def parse_mnli(path): sentences_a = [] sentences_b = [] labels = [] with open(path, "r+", encoding="utf8") as f: for item in jsonlines.Reader(f): sentences_a.append(item['sentence1']) sentences_b.append(item['sentence2']) labels.append(item['gold_label']) return sentences_a, sentences_b, labels mnli_a, mnli_b, mnli_labels = parse_mnli(args['mnli_path']) random_index = random.sample(range(len(mnli_a)), 16) mnli_a = [mnli_a[index] for index in random_index] mnli_b = [mnli_b[index] for index in random_index] mnli_labels = [mnli_labels[index] for index in random_index] label_encode = {'contradiction': 0, 'neutral': 1, 'entailment': 2} mnli_labels = [label_encode[label] for label in mnli_labels] print('Done importing data') from transformers import BertTokenizer, BertTokenizerFast tokenizer = BertTokenizer.from_pretrained(args['model_name'], do_lower_case=True, padding=True, truncation=True, add_special_tokens=True, model_max_length=500) tokenizer_fast = BertTokenizerFast.from_pretrained(args['model_name'], do_lower_case=True, padding=True, truncation=True, add_special_tokens=True, model_max_length=500) from squad_processing import add_end_idx, add_token_positions add_end_idx(squad_answers, squad_contexts) squad_encodings = tokenizer_fast(squad_contexts, squad_questions, add_special_tokens=True, truncation=True, padding=True, max_length=500) # Processing of token positions add_token_positions(squad_encodings, squad_answers, tokenizer_fast) # In[69]: # MNLI mnli_encodings = tokenizer(mnli_a, mnli_b, add_special_tokens=True, max_length=500, truncation=True, padding=True) mnli_encodings['labels'] = mnli_labels from torch.utils.data import Dataset class MnliDataset(Dataset): def __init__(self, encodings): self.encodings = encodings def __getitem__(self, idx): #print(self.encodings['start_positions'][idx]) #{key: torch.tensor(val[idx], dtype = torch.long) for key, val in self.encodings.items()} return { 'input_ids': torch.tensor(self.encodings['input_ids'][idx], dtype=torch.long), 'attention_mask': 
torch.tensor(self.encodings['attention_mask'][idx], dtype=torch.long), 'token_type_ids': torch.tensor(self.encodings['token_type_ids'][idx], dtype=torch.long), 'labels': torch.tensor(self.encodings['labels'][idx], dtype=torch.long) } def __len__(self): return len(self.encodings.input_ids) class SquadDataset(Dataset): def __init__(self, encodings): self.encodings = encodings def __getitem__(self, idx): #print(self.encodings['start_positions'][idx]) #{key: torch.tensor(val[idx], dtype = torch.long) for key, val in self.encodings.items()} return { 'input_ids': torch.tensor(self.encodings['input_ids'][idx], dtype=torch.long), 'attention_mask': torch.tensor(self.encodings['attention_mask'][idx], dtype=torch.long), 'start_positions': torch.tensor(self.encodings['start_positions'][idx], dtype=torch.long), 'end_positions': torch.tensor(self.encodings['end_positions'][idx], dtype=torch.long) } def __len__(self): return len(self.encodings.input_ids) train_mnli = MnliDataset(mnli_encodings) train_squad = SquadDataset(squad_encodings) from transformers import BertPreTrainedModel, BertModel from torch import nn from torch.nn import CrossEntropyLoss # In[106]: class BertForMultiLabelSequenceClassification(BertPreTrainedModel): """BERT model for classification. This module is composed of the BERT model with a linear layer on top of the pooled output. """ def __init__(self, config, num_labels=3): super().__init__(config) self.num_labels = num_labels self.bert = BertModel(config) self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) self.classifier = torch.nn.Linear(config.hidden_size, num_labels) #self.apply(self.init_bert_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): pooled_output = self.bert(input_ids, token_type_ids, attention_mask)[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) return logits def freeze_bert_encoder(self): for param in self.bert.parameters(): param.requires_grad = False def unfreeze_bert_encoder(self): for param in self.bert.parameters(): param.requires_grad = True # In[101]: mnli_model = BertForMultiLabelSequenceClassification.from_pretrained( args['model_name']) from torch.nn import DataParallel from torch.utils.data import DataLoader from transformers import AdamW device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') print(device) train_loader_mnli = DataLoader(train_mnli, batch_size=args['batch_size'], shuffle=True) mnli_model = DataParallel(mnli_model) optim = AdamW(mnli_model.parameters(), lr=args['learning_rate']) mnli_model.to(device) mnli_model.train() from barbar import Bar for epoch in range(args['epochs']): for i, batch in enumerate(Bar(train_loader_mnli)): optim.zero_grad() input_ids = batch['input_ids'].to(device, dtype=torch.long) attention_mask = batch['attention_mask'].to(device, dtype=torch.long) token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long) labels = batch['labels'].to(device, dtype=torch.long) outputs = mnli_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels) loss_fct = CrossEntropyLoss().to(device) loss = loss_fct(outputs, labels) #loss = outputs.loss loss.sum().backward() optim.step() mnli_model.eval() file_name = args['checkpoint_output_path'] + '/checkpoint_mnli.pt' torch.save(mnli_model.state_dict(), file_name) from transformers.modeling_outputs import QuestionAnsweringModelOutput class BertForQuestionAnswering(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = 
[r"pooler"] def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config, add_pooling_layer=False) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss, ) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) squad_model = BertForQuestionAnswering.from_pretrained(args['model_name']) squad_model.load_state_dict(mnli_model.state_dict(), strict=False) train_loader_squad = DataLoader(train_squad, batch_size=args['batch_size'], shuffle=True) squad_model = DataParallel(squad_model) squad_model.to(device) squad_model.train() optim = AdamW(squad_model.parameters(), lr=args['learning_rate']) # In[122]: for epoch in range(args['epochs']): for i, batch in enumerate(Bar(train_loader_squad)): optim.zero_grad() input_ids = batch['input_ids'].to(device, dtype=torch.long) attention_mask = batch['attention_mask'].to(device, dtype=torch.long) start_positions = batch['start_positions'].to(device, dtype=torch.long) end_positions = batch['end_positions'].to(device, dtype=torch.long) outputs = 
squad_model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions) loss = outputs[0] loss.sum().backward() optim.step() squad_model.eval() file_name = args['checkpoint_output_path'] + '/checkpoint_mnli_squad.pt' torch.save(squad_model.state_dict(), file_name)
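# The training script above relies on imports and an entry point that are not
# shown in this excerpt. A minimal sketch of what it likely needs, inferred
# from the names used in main() (an assumption, not the original file header):
import argparse
import json
import random
from pathlib import Path

import jsonlines  # used to parse the MNLI jsonl file
import torch

# ... main() and the helper imports shown above ...

if __name__ == '__main__':
    main()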
    # define the forward pass
    def forward(self, sent_id, mask):
        # pass the inputs to the model
        _, cls_hs = self.bert(sent_id, attention_mask=mask)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        # output layer
        x = self.fc2(x)
        # apply softmax activation
        x = self.softmax(x)
        return x


tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokens_test = tokenizer.batch_encode_plus(df["text"].tolist(),
                                          max_length=25,
                                          padding='max_length',
                                          truncation=True)
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])

path = 'saved_weights.pt'
model = BERT_Arch(bert)
model.load_state_dict(torch.load(path))

start = time.time()

# get predictions for test data
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
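# The forward pass above belongs to a small wrapper module (the BERT_Arch class
# instantiated a few lines later), which is not shown in this excerpt. A minimal
# sketch of what its constructor likely looks like, with assumed layer sizes
# (512 hidden units, 2 output classes):
import torch.nn as nn

class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = bert                  # pre-trained BertModel
        self.dropout = nn.Dropout(0.1)    # dropout on the pooled [CLS] output
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)    # 768 = BERT base hidden size
        self.fc2 = nn.Linear(512, 2)      # assumed number of classes
        self.softmax = nn.LogSoftmax(dim=1)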
def pico_extract(text, pth_path, idx2tag): ''' tup: list of tuples (token, tag) ''' spacy_tokens = [token.text for token in nlp(text)] spacy_tokens = [t for t in spacy_tokens if t != '\u2009'] ## Tokenization pre_wgts = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' tokenizer = BertTokenizerFast.from_pretrained(pre_wgts, num_labels=13) # plain text not pre-tokenised by scispaCy, so word_ids unavailable inputs = tokenizer(spacy_tokens, is_split_into_words=True, return_offsets_mapping=True, padding=False, truncation=True) word_ids = inputs.word_ids() inputs = {key: torch.tensor(value) for key, value in inputs.items()} ## Load model model = BertForTokenClassification.from_pretrained(pre_wgts, num_labels=13) # Load checkpoint checkpoint = torch.load(pth_path, map_location=torch.device('cpu')) model.load_state_dict(checkpoint['state_dict'], strict=False) model.cpu() model.eval() ## Run model outputs = model(inputs['input_ids'].unsqueeze(0), inputs['attention_mask'].unsqueeze(0)) logits = outputs[0].squeeze(0) # [seq_len, n_tags] preds = torch.argmax(logits, dim=1) # [seq_len] preds = preds.numpy().tolist()[1:-1] # len=seq_len-2, remove cls/sep token ids = inputs['input_ids'][1:-1] word_ids = word_ids[1:-1] tokens = tokenizer.convert_ids_to_tokens(ids) tags = [idx2tag[idx] for idx in preds] pre_wid = None tags_new = [] for t, wid in zip(tags, word_ids): if wid != pre_wid: tags_new.append(t) pre_wid = wid # Convert back to non-sub spacy tokens/tags tags = tags_new tokens = spacy_tokens # len(tags_new) == len(spacy_tokens) # Record span start/end idxs sidxs, eidxs = [], [] for i in range(len(tags)): if i == 0 and tags[i] != 'O': sidxs.append(0) if tags[1] == 'O': eidxs.append(0) if i > 0 and i < len(tags) - 1 and tags[i] != 'O': if tags[i - 1] == 'O' and tags[i] != 'O': sidxs.append(i) if tags[i + 1] == 'O' and tags[i] != 'O': eidxs.append(i) if tags[len(tags) - 1] != 'O': sidxs.append(len(tags) - 1) eidxs.append(len(tags) - 1) tup = [] for si, ei in zip(sidxs, eidxs): ent_tokens = tokens[si:ei + 1] ent_tags = tags[si:ei + 1] # ent_tags may include multiple type of tags ents = [t.split('-')[1] for t in ent_tags] ents_set = list(set(ents)) for ent in ents_set: indices = [ idx for idx, t in enumerate(ent_tags) if t.split('-')[1] == ent ] sub = [ent_tokens[ic] for ic in indices] # sub_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(sub)) # sub_new = [] # for i, tok in enumerate(sub): # if tok.startswith("##"): # if sub_new: # sub_new[-1] = f"{sub_new[-1]}{tok[2:]}" # else: # sub_new.append(tok) sub_text = ' '.join(sub) sub_text = re.sub(r" - ", "-", sub_text) sub_text = re.sub(r" = ", "=", sub_text) sub_text = re.sub(r" / ", "/", sub_text) sub_text = re.sub(r"\( ", "(", sub_text) sub_text = re.sub(r" \)", ")", sub_text) # Remove incomplete brackets left = [(m.start(0), m.end(0)) for m in re.finditer(r1, sub_text)] right = [(m.start(0), m.end(0)) for m in re.finditer(r2, sub_text)] if len(left) > 0 and len(right) == 0: # ( sub_text = re.sub(r"\(", "", sub_text) if len(left) == 0 and len(right) > 0: # ) sub_text = re.sub(r"\)", "", sub_text) if len(left) > 0 and len(right) > 0: # )( or () if left[0][0] > right[0][0]: # )( sub_text = re.sub(r"\)", "", sub_text) sub_text = re.sub(r"\(", "", sub_text) sub_text = re.sub(r'^[\s]', "", sub_text) sub_text = re.sub(r'[\s]$', "", sub_text) sub_text = ' '.join([s for s in sub_text.split(' ') if len(s) > 1]) tup.append((ent, sub_text)) return tup, tokens, tags
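# pico_extract above depends on module-level regex patterns r1 and r2 (used to
# detect unmatched brackets) and an idx2tag mapping that are defined elsewhere.
# A hedged sketch of plausible definitions plus a usage call; the tag names and
# checkpoint path are placeholders, not taken from the original:
r1 = r"\("   # opening bracket (assumed definition)
r2 = r"\)"   # closing bracket (assumed definition)

idx2tag_example = {0: 'O', 1: 'B-Intervention', 2: 'I-Intervention'}  # hypothetical subset of the 13 tags

# tup, tokens, tags = pico_extract(abstract_text, pth_path='pico_checkpoint.pth',
#                                  idx2tag=idx2tag_example)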
from accelerate import Accelerator

accelerator = Accelerator(fp16=True)
device = accelerator.device
# Documentation for the toolkit: https://huggingface.co/docs/accelerate/

"""## Load Model and Tokenizer"""

model = BertForQuestionAnswering.from_pretrained("bert-base-chinese").to(device)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")

# You can safely ignore the warning message (it pops up because new prediction
# heads for QA are initialized randomly)

"""## Read Data

- Training set: 26935 QA pairs
- Dev set: 3523 QA pairs
- Test set: 3492 QA pairs

- {train/dev/test}_questions:
  - List of dicts with the following keys:
    - id (int)
    - paragraph_id (int)
    - question_text (string)
    - answer_text (string)
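# A hedged sketch of how data files in the format described above could be
# read; the top-level key names ("questions", "paragraphs") and the file name
# are assumptions, not taken from the original notebook:
import json

def read_data(file):
    with open(file, 'r', encoding='utf-8') as reader:
        data = json.load(reader)
    return data['questions'], data['paragraphs']

# train_questions, train_paragraphs = read_data('train.json')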
def main(argv): if len(argv) > 1: raise app.UsageError('Too many command-line arguments.') config = get_config() os.environ['TOKENIZERS_PARALLELISM'] = 'true' tokenizer = BertTokenizerFast.from_pretrained(config.tokenizer) tokenizer.model_max_length = config.max_seq_length data_pipeline = data.ClassificationDataPipeline( lambda: tfds.builder(f'{config.dataset_path}/{config.dataset_name}'), tokenizer) num_train_examples = data_pipeline.dataset_builder.info.splits[ 'train'].num_examples num_train_steps = int(num_train_examples * config.num_train_epochs // config.train_batch_size) warmup_steps = int(config.warmup_proportion * num_train_steps) cooldown_steps = num_train_steps - warmup_steps is_regression_task = (data_pipeline.dataset_builder.info.features['label']. dtype == 'float32') if is_regression_task: num_classes = 1 compute_stats = compute_regression_stats else: num_classes = data_pipeline.dataset_builder.info.features[ 'label'].num_classes compute_stats = compute_classification_stats model = create_model(config, num_classes=num_classes) optimizer = create_optimizer(config, model) optimizer = optimizer.replicate() del model # don't keep a copy of the initial model optimizer = training.harmonize_across_hosts(optimizer) learning_rate_fn = training.create_learning_rate_scheduler( factors='constant * linear_warmup * linear_decay', base_learning_rate=config.learning_rate, warmup_steps=warmup_steps, steps_per_cycle=cooldown_steps, ) output_dir = get_output_dir(config) gfile.makedirs(output_dir) train_history = training.TrainStateHistory(learning_rate_fn) train_state = train_history.initial_state() if config.do_train: train_step_fn = training.create_train_step(compute_loss_and_metrics, clip_grad_norm=1.0) train_iter = data_pipeline.get_inputs( split='train', batch_size=config.train_batch_size, training=True) for step, batch in zip(range(0, num_train_steps), train_iter): optimizer, train_state = train_step_fn(optimizer, batch, train_state) if config.do_eval: # While our input pipelines use TFDS, we'll use metrics from the # HuggingFace datasets library instead. 
datasets.logging.set_verbosity_error() # Workaround for https://github.com/huggingface/datasets/issues/812 logging.getLogger('filelock').setLevel(logging.ERROR) eval_step = training.create_eval_fn(compute_stats) eval_results = [] if config.dataset_path == 'glue' and config.dataset_name == 'mnli': validation_splits = ['validation_matched', 'validation_mismatched'] else: validation_splits = ['validation'] for split in validation_splits: eval_iter = data_pipeline.get_inputs( split=split, batch_size=config.eval_batch_size, training=False) eval_stats = eval_step(optimizer, eval_iter) eval_metric = datasets.load_metric(config.dataset_path, config.dataset_name) eval_metric.add_batch(predictions=eval_stats['prediction'], references=eval_stats['label']) eval_metrics = eval_metric.compute() prefix = 'eval_mismatched' if split == 'validation_mismatched' else 'eval' for name, val in sorted(eval_metrics.items()): line = f'{prefix}_{name} = {val:.06f}' print(line, flush=True) eval_results.append(line) eval_results_path = os.path.join(output_dir, 'eval_results.txt') with gfile.GFile(eval_results_path, 'w') as f: for line in eval_results: f.write(line + '\n') if config.do_predict: predict_step = training.create_eval_fn(compute_stats) predict_results = [] path_map = { ('glue', 'cola', 'test'): 'CoLA.tsv', ('glue', 'mrpc', 'test'): 'MRPC.tsv', ('glue', 'qqp', 'test'): 'QQP.tsv', ('glue', 'sst2', 'test'): 'SST-2.tsv', ('glue', 'stsb', 'test'): 'STS-B.tsv', ('glue', 'mnli', 'test_matched'): 'MNLI-m.tsv', ('glue', 'mnli', 'test_mismatched'): 'MNLI-mm.tsv', ('glue', 'qnli', 'test'): 'QNLI.tsv', ('glue', 'rte', 'test'): 'RTE.tsv', # No eval on WNLI for now. BERT accuracy on WNLI is below baseline, # unless a special training recipe is used. # ('glue/wnli', 'test'): 'WNLI.tsv', } label_sets = { ('glue', 'cola'): ['0', '1'], ('glue', 'mrpc'): ['0', '1'], ('glue', 'qqp'): ['0', '1'], ('glue', 'sst2'): ['0', '1'], ('glue', 'mnli'): ['entailment', 'neutral', 'contradiction'], ('glue', 'qnli'): ['entailment', 'not_entailment'], ('glue', 'rte'): ['entailment', 'not_entailment'], } for path_map_key in path_map: candidate_dataset_path, candidate_dataset_name, split = path_map_key if (candidate_dataset_path != config.dataset_path or candidate_dataset_name != config.dataset_name): continue predict_iter = data_pipeline.get_inputs( split=split, batch_size=config.eval_batch_size, training=False) predict_stats = predict_step(optimizer, predict_iter) idxs = predict_stats['idx'] predictions = predict_stats['prediction'] tsv_path = os.path.join( output_dir, path_map[config.dataset_path, config.dataset_name, split]) with gfile.GFile(tsv_path, 'w') as f: f.write('index\tprediction\n') if is_regression_task: for idx, val in zip(idxs, predictions): f.write(f'{idx}\t{val:.06f}\n') else: label_set = label_sets[config.dataset_path, config.dataset_name] for idx, val in zip(idxs, predictions): f.write(f'{idx}\t{label_set[val]}\n') print('Wrote', tsv_path)
            answer['answer_end'] = end_idx - 1  # When the gold label is off by one character
        elif context[start_idx - 2:end_idx - 2].lower() == gold_text.lower():
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2  # When the gold label is off by two characters


add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

from transformers import BertTokenizerFast, BertModel

tokenizer = BertTokenizerFast.from_pretrained("dmis-lab/biobert-base-cased-v1.1",
                                              padding=True,
                                              truncation=True,
                                              add_special_tokens=True,
                                              model_max_length=1000000000)
#model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

train_encodings = tokenizer(train_contexts,
                            train_questions,
                            truncation=True,
                            padding=True,
                            max_length=500)
val_encodings = tokenizer(val_contexts,
                          val_questions,
                          truncation=True,
                          padding=True,
                          max_length=500)
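# The fragment above is the tail of an add_end_idx helper; the full function is
# not shown here. A sketch reconstructed from the visible branches (it follows
# the common SQuAD-style preprocessing recipe and should be read as an
# approximation of the original, not the original itself):
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx].lower() == gold_text.lower():
            answer['answer_end'] = end_idx
        elif context[start_idx - 1:end_idx - 1].lower() == gold_text.lower():
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1  # gold label off by one character
        elif context[start_idx - 2:end_idx - 2].lower() == gold_text.lower():
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2  # gold label off by two characters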
    def __init__(self, args):
        super(BonzDataModule, self).__init__()
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', max_len=512)
        self.args = args
args = Namespace(**js['args']) # args.epochs = 3 random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False idx2tag = js['idx2tag'] idx2tag = {int(idx): tag for idx, tag in idx2tag.items()} tag2idx = {tag: idx for idx, tag in idx2tag.items()} softmax = nn.Softmax(dim=1) tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-v1.1', num_labels=13) device = torch.device('cuda') if torch.cuda.is_available() else torch.device( 'cpu') #%% class SemiData(): def __init__(self): self.semi_path = '/home/qwang/pre-pico/data/semi_scores_9923.csv' self.gold_path = '/home/qwang/pre-pico/data/tsv/18mar_output/pico_18mar.json' def read_conll_tsv(self, tsv_path): ''' Read seqs/tags for one tsv file seqs[i] --> ['Leicestershire', '22', 'points', ',', ...], tags[i] --> ['B-ORG', 'O', 'O', ...] ''' dat = pd.read_csv(tsv_path, sep='\t', header=None)
configfile = "config.json" else: configfile = sys.argv[1] # Read the params with open(configfile, "r") as f: config = json.load(f) globalparams = config["global_params"] encparams = config["encoder_params"] decparams = config["decoder_params"] modelparams = config["model_params"] # Load the tokenizers en_tok_path = encparams["tokenizer_path"] en_tokenizer = BertTokenizerFast(os.path.join(en_tok_path, "vocab.txt")) de_tok_path = decparams["tokenizer_path"] de_tokenizer = BertTokenizerFast(os.path.join(de_tok_path, "vocab.txt")) # Init the dataset train_en_file = globalparams["train_en_file"] train_de_file = globalparams["train_de_file"] valid_en_file = globalparams["valid_en_file"] valid_de_file = globalparams["valid_de_file"] enc_maxlength = encparams["max_length"] dec_maxlength = decparams["max_length"] batch_size = modelparams["batch_size"] train_dataset = QADataset(train_en_file, train_de_file, en_tokenizer, de_tokenizer, enc_maxlength, dec_maxlength)
if __name__ == "__main__": util.setup_seed(6) parser = argparse.ArgumentParser(description='Kil Bert Project') parser.add_argument('-d', '--data', help='data name', default='imdb', choices=['agnews', 'imdb', 'newsgroup']) args = parser.parse_args() with open('settings.json', 'r', encoding='utf-8') as f: settings = json.load(f) config = settings["bert"][args.data] config["model_name"] = 'bert-base-uncased' tokenizer = BertTokenizerFast.from_pretrained(config["model_name"]) train, test = util.get_data(args.data) train = train.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=config["max_len"]), batched=True) train = train.map(lambda e: {'labels': e['label']}, batched=True) train.set_format( type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels']) test = test.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length',
pos = [pos_to_num[token.pos_] for token in doc] pos = [pos_to_num['SPECIAL'] ] + pos # add numerical representation of 'SPECIAL' tag pos.append(pos_to_num['SPECIAL']) sentences_tokenized.append(tokens) sent_pos_tags.append(pos) sent_marked = sentences_marked[idx] doc_marked = nlp(sent_marked) tokens_marked = [token.text for token in doc_marked] sentences_marked_tokenized.append(tokens_marked) print('Pre-tokenization with SpaCy is finished.') # Tokenize pre-tokenized sentences with BERT tokenizer tokenizer = BertTokenizerFast.from_pretrained( 'bert-base-uncased', do_lower_case=True, additional_special_tokens=['[TARGET_START]', '[TARGET_END]']) encodings = tokenizer(sentences_tokenized, return_offsets_mapping=True, is_pretokenized=True) encodings_marked = tokenizer(sentences_marked_tokenized, is_pretokenized=True) print('BERT tokenization is finished.') # For each tokenized (original) sentence create a position vector that marks target tokens with 1's and the rest # of the tokens with 0's. tokenized_marked_texts = [ tokenizer.convert_ids_to_tokens(i) for i in encodings_marked['input_ids'] ] position_vectors = [] for i in range(len(tokenized_marked_texts)):
    def __init__(self) -> None:
        self.lists = {}

        # M-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
            'bert-base-multilingual-cased').eval()
        self.lists["M-BERT"] = {
            "Tokenizer": self.bert_multilingual_tokenizer,
            "Model": self.bert_multilingual_model
        }
        print("====================================")
        print("[BERT] Google Multilingual BERT loaded")
        print("====================================")

        # KR-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.krbert_tokenizer = BertTokenizerFast.from_pretrained('snunlp/KR-Medium')
        self.krbert_model = BertForMaskedLM.from_pretrained('snunlp/KR-Medium').eval()
        self.lists["KR-Medium"] = {
            "Tokenizer": self.krbert_tokenizer,
            "Model": self.krbert_model
        }
        print("====================================")
        print("[BERT] KR-BERT loaded")
        print("====================================")

        # BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained('kykim/bert-kor-base')
        self.bert_kor_model = BertForMaskedLM.from_pretrained('kykim/bert-kor-base').eval()
        self.lists["bert-kor-base"] = {
            "Tokenizer": self.bert_kor_tokenizer,
            "Model": self.bert_kor_model
        }
        print("====================================")
        print("[BERT] BERT-kor-base loaded")
        print("====================================")

        # ALBERT
        from transformers import AlbertForMaskedLM
        self.albert_tokenizer = BertTokenizerFast.from_pretrained('kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained('kykim/albert-kor-base').eval()
        self.lists["albert-kor-base"] = {
            "Tokenizer": self.albert_tokenizer,
            "Model": self.albert_model
        }
        print("====================================")
        print("[BERT] ALBERT-kor-base loaded")
        print("====================================")

        # XLM-Roberta
        from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
        self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
        self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base').eval()
        self.lists["xlm-roberta-base"] = {
            "Tokenizer": self.xlmroberta_tokenizer,
            "Model": self.xlmroberta_model
        }
        print("====================================")
        print("[BERT] XLM-Roberta-kor loaded")
        print("====================================")

        # bertshared-kor-base
        from transformers import BertTokenizerFast, EncoderDecoderModel
        self.tokenizer_bertshared = BertTokenizerFast.from_pretrained("kykim/bertshared-kor-base")
        self.bertshared_model = EncoderDecoderModel.from_pretrained("kykim/bertshared-kor-base")
        self.lists["bertshared-kor-base"] = {
            "Tokenizer": self.tokenizer_bertshared,
            "Model": self.bertshared_model
        }
        print("====================================")
        print("[Seq2seq + BERT] bertshared-kor-base loaded")
        print("====================================")

        # gpt3-kor-small_based_on_gpt2
        from transformers import BertTokenizerFast, GPT2LMHeadModel
        self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
        self.model_gpt3 = GPT2LMHeadModel.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
        self.lists["gpt3-kor-small_based_on_gpt2"] = {
            "Tokenizer": self.tokenizer_gpt3,
            "Model": self.model_gpt3
        }
        print("====================================")
        print("[GPT3] gpt3-small-based-on-gpt2 loaded")
        print("====================================")

        # electra-base-kor
        from transformers import ElectraTokenizerFast, ElectraModel
        self.tokenizer_electra = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
        self.electra_model = ElectraModel.from_pretrained("kykim/electra-kor-base")
        self.lists["electra-kor-base"] = {
            "Tokenizer": self.tokenizer_electra,
            "Model": self.electra_model
        }
        print("====================================")
        print("[ELECTRA] electra-kor-base loaded")
        print("====================================")

        # koelectra QA
        from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
        self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.lists["electra-kor-QA"] = {
            "Tokenizer": self.electra_tokenizer_QA,
            "Model": self.electra_model_QA
        }
        print("====================================")
        print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
        print("====================================")
import streamlit as st # from argparse import ArgumentParser import lime from lime.lime_text import LimeTextExplainer MODELS = { "BERT": "model_noprocess.h5" } model_name = 'bert-base-uncased' # Load transformers config and set output_hidden_states to False config = BertConfig.from_pretrained(model_name) config.output_hidden_states = False # Load BERT tokenizer tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config) repo_root = os.path.dirname(os.path.abspath(__file__))[:os.path.dirname(os.path.abspath(__file__)).find("Assignment_1")+13] import_model = load_model(repo_root+"/models/model_noprocess.h5") class_names = ['1', '2', '3', '4', '5'] explainer = LimeTextExplainer(class_names=class_names) print(repo_root) # Obtain the CSS for Buttons to be displayed @st.cache(suppress_st_warning=True, allow_output_mutation=True) def get_button_css(button_id): custom_css = f""" <style> #{button_id} {{ background-color: rgb(255, 255, 255); color: rgb(38, 39, 48); padding: 0.25em 0.38em; position: relative;
def main(argv): if len(argv) > 1: raise app.UsageError('Too many command-line arguments.') config = FLAGS.config model = create_model(config) optimizer = create_optimizer(config, model) del model # don't keep a copy of the initial model output_dir = get_output_dir(config) gfile.makedirs(output_dir) # Restore from a local checkpoint, if one exists. optimizer = checkpoints.restore_checkpoint(output_dir, optimizer) start_step = int(optimizer.state[0].step) optimizer = optimizer.replicate() optimizer = training.harmonize_across_hosts(optimizer) os.environ['TOKENIZERS_PARALLELISM'] = 'true' tokenizer = BertTokenizerFast.from_pretrained(config.tokenizer) tokenizer.model_max_length = config.max_seq_length data_pipeline = data.PretrainingDataPipeline( glob.glob('cache/pretrain.*_of_*.tfrecord'), tokenizer, max_predictions_per_seq=config.max_predictions_per_seq) learning_rate_fn = training.create_learning_rate_scheduler( factors='constant * linear_warmup * linear_decay', base_learning_rate=config.learning_rate, warmup_steps=config.num_warmup_steps, steps_per_cycle=config.num_train_steps - config.num_warmup_steps, ) train_history = training.TrainStateHistory(learning_rate_fn) train_state = train_history.initial_state() if config.do_train: train_iter = data_pipeline.get_inputs( batch_size=config.train_batch_size, training=True) train_step_fn = training.create_train_step( compute_pretraining_loss_and_metrics, clip_grad_norm=1.0) for step, batch in zip(range(start_step, config.num_train_steps), train_iter): optimizer, train_state = train_step_fn(optimizer, batch, train_state) if jax.host_id() == 0 and (step % config.save_checkpoints_steps == 0 or step == config.num_train_steps - 1): checkpoints.save_checkpoint(output_dir, optimizer.unreplicate(), step) config_path = os.path.join(output_dir, 'config.json') if not os.path.exists(config_path): with open(config_path, 'w') as f: json.dump({'model_type': 'bert', **config.model}, f) if config.do_eval: eval_iter = data_pipeline.get_inputs(batch_size=config.eval_batch_size) eval_iter = itertools.islice(eval_iter, config.max_eval_steps) eval_fn = training.create_eval_fn(compute_pretraining_stats, sample_feature_name='input_ids') eval_stats = eval_fn(optimizer, eval_iter) eval_metrics = { 'loss': jnp.mean(eval_stats['loss']), 'masked_lm_loss': jnp.mean(eval_stats['masked_lm_loss']), 'next_sentence_loss': jnp.mean(eval_stats['next_sentence_loss']), 'masked_lm_accuracy': jnp.sum(eval_stats['masked_lm_correct']) / jnp.sum(eval_stats['masked_lm_total']), 'next_sentence_accuracy': jnp.sum(eval_stats['next_sentence_correct']) / jnp.sum(eval_stats['next_sentence_total']), } eval_results = [] for name, val in sorted(eval_metrics.items()): line = f'{name} = {val:.06f}' print(line, flush=True) eval_results.append(line) eval_results_path = os.path.join(output_dir, 'eval_results.txt') with gfile.GFile(eval_results_path, 'w') as f: for line in eval_results: f.write(line + '\n')
from thai2transformers.conf import Task
from thai2transformers import preprocess

CACHE_DIR = f'{str(Path.home())}/.cache/huggingface_datasets'

METRICS = {
    Task.MULTICLASS_CLS: classification_metrics,
    Task.MULTILABEL_CLS: multilabel_classification_metrics
}

PUBLIC_MODEL = {
    'mbert': {
        'name': 'bert-base-multilingual-cased',
        'tokenizer': BertTokenizerFast.from_pretrained('bert-base-multilingual-cased'),
        'config': BertConfig.from_pretrained('bert-base-multilingual-cased'),
    },
    'xlmr': {
        'name': 'xlm-roberta-base',
        'tokenizer': XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base'),
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-base'),
    },
    'xlmr-large': {
        'name': 'xlm-roberta-large',
        'tokenizer': XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-large'),
        # config must come from the large checkpoint, not the base one
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-large'),
    },
for file in files: if file.startswith("prediction"): prediction_file = os.path.join(Path(model_dir).parent, file) break if prediction_file is None: raise FileNotFoundError("no prediction file") print(f"loading predictions from {prediction_file}") dataset_properties = json.load( open(os.path.join(model_dir, "dataset_properties.json"))) target_vocab = dataset_properties["target_vocab"] special_tokens = dataset_properties["special_tokens"] tokenizer = PreTrainedArsenalTokenizer(target_vocab=target_vocab) source_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") source_tokenizer.add_special_tokens( {"additional_special_tokens": special_tokens}) # collect per sentence length: # - similarities (similar to edit distance) # - accuracies similarities = collections.defaultdict(list) accuracies = collections.defaultdict(list) # also collect confusions (i.e., information about how tokens got wrongly predicted) confusions = {} # 3 is the SEP token (which marks end of the sequence) - if the only difference between prediction # and true sequence is the output of additional token(s) after the complete correct sequence has been # predicted, the confusion lies in the SEP token, so we'll need to add this.
df = df.drop(df[df['label'] == 0.0].index).reset_index() df_lf_x = df_lf_x.loc[df.index] # The input size is the number of linguistic features. We get this value from the # dataframe, but we remove 2 items. One for the "TWEET" column, and other for the # class input_size = df_lf_x.shape[1] # CustomBERTModel model model = CustomBERTModel(input_size, num_classes=len(df['label'].unique()) if task_type == 'classification' else 1) model.to(device) # Get the tokenizer tokenizer = BertTokenizerFast.from_pretrained(pretrained_model) # Encode label as numbers instead of user names if task_type == 'classification': df["label"] = df["label"].astype('category').cat.codes # Encode datasets to work with transformers dataset = Dataset.from_pandas(df) # Tokenizer trainset and test dataframe with the training # The tokenize function only takes care of the "tweet" # column and will create the input_ids, token_type_ids, and # attention_mask dataset = dataset.map(tokenize, batched=True, batch_size=len(dataset)) # Finally, we "torch" the new columns. We return the rest
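# The dataset.map(...) call above uses a tokenize helper that is not included in
# this excerpt. A minimal sketch consistent with the surrounding comments (the
# "tweet" column name is taken from the comment; padding/truncation settings are
# assumptions):
def tokenize(batch):
    return tokenizer(batch['tweet'], padding='max_length', truncation=True)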
logger = DefaultLogger(config["log_path"], experiment_name, config["run_name"],
                       config["run_id"], hyper_parameters)

model_state_dict_dir = config["path_to_save_model"]
if not os.path.exists(model_state_dict_dir):
    os.makedirs(model_state_dict_dir)

assert config["encoder"] == "BERT", \
    "This script is only for the BERT model; for LSTM please run train.py"

# Read the data
train_data = json.load(open(train_data_path, "r", encoding="utf-8"))
valid_data = json.load(open(valid_data_path, "r", encoding="utf-8"))

# Data preprocessor
tokenizer = BertTokenizerFast.from_pretrained(config["bert_path"],
                                              add_special_tokens=False,
                                              do_lower_case=False)
tokenize = tokenizer.tokenize
get_tok2char_span_map = lambda text: tokenizer.encode_plus(
    text, return_offsets_mapping=True, add_special_tokens=False)["offset_mapping"]
preprocessor = Preprocessor(tokenize_func=tokenize,
                            get_tok2char_span_map_func=get_tok2char_span_map)

# train and valid max token num
max_tok_num = 0
all_data = train_data + valid_data
for sample in all_data:
    tokens = tokenize(sample["text"])
def get_tokenizer(args): return BertTokenizerFast.from_pretrained(args.tokenizer_path, max_len=args.seq_len)
    def __init__(self, vocab_path, strip_accents, clean_text, lowercase):
        common_params = {
            'strip_accents': strip_accents,
            'clean_text': clean_text,
            'lowercase': lowercase
        }
        self._tokenizer = BertTokenizerFast(vocab_file=vocab_path, **common_params)
import pylab from tensorboardX import SummaryWriter import torchvision.utils as vutils import utils import models import params import train, test from transformers import BertModel, BertConfig, BertTokenizer, BertTokenizerFast, AdamW, get_linear_schedule_with_warmup ###BERT model instead of the Extractor # create the BERTConfig, BERTTokenizer, and BERTModel model_name = "bert-base-uncased" config = BertConfig.from_pretrained(model_name, output_hidden_states=True, return_dict=True) tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True) bert = BertModel.from_pretrained(model_name, config=config) src_train_dataloader = utils.get_train_loader( '/content/drive/My Drive/Data_summarization/pytorch_DAN/data/books.csv', tokenizer) src_test_dataloader = utils.get_test_loader( '/content/drive/My Drive/Data_summarization/pytorch_DAN/data/books.csv', tokenizer) tgt_train_dataloader = utils.get_train_loader( '/content/drive/My Drive/Data_summarization/pytorch_DAN/data/dvd.csv', tokenizer) tgt_test_dataloader = utils.get_test_loader( '/content/drive/My Drive/Data_summarization/pytorch_DAN/data/dvd.csv', tokenizer)
def run(self, focused=False, focused_model=None, training_epochs=5): if focused==True: self.model_list=[focused_model] else: pass for model_name in self.model_list: training_args = TrainingArguments( output_dir='./results/'+model_name, num_train_epochs=training_epochs, per_device_train_batch_size=16, per_device_eval_batch_size=64, warmup_steps=500, weight_decay=0.01, #evaluate_during_training=True, logging_dir='./logs/'+model_name, ) model = None tokenizer = None print('Training on a dataset with ' +str(self.num_labels)+ ' labels') if model_name == "bert-base-uncased": model = BertForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels) tokenizer = BertTokenizerFast.from_pretrained(model_name) elif model_name == "albert-base-v2": tokenizer = transformers.AlbertTokenizer.from_pretrained('albert-base-v2') model = transformers.AlbertForSequenceClassification.from_pretrained('albert-base-v2', return_dict=True, num_labels=self.num_labels) elif model_name == "roberta-base": tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base') model = transformers.RobertaForSequenceClassification.from_pretrained('roberta-base', return_dict=True, num_labels=self.num_labels) elif model_name == "linear_SVM": tokenizer = None model = 'linear_SVM' parameters={ 'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (5e-2, 1e-2,5e-3, 1e-3,5e-3), 'clf__penalty': ('l2', 'l1', 'elasticnet') } classifier=SGDClassifier(loss='hinge',random_state=42,max_iter=5,tol=None) elif model_name == "multinomial_naive_bayesian": tokenizer = None model = 'multinomial_naive_bayesian' parameters= { 'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1,1e-1,1e-2, 1e-3,1e-4), 'clf__fit_prior': (True, False), } classifier=MultinomialNB() if not model or not tokenizer: #use 'assert' here instead? 
print("ERROR") def tokenize(batch): return tokenizer(batch['text'], padding='max_length', truncation=True) if tokenizer is not None: train_dataset = self.train_dataset_raw.map(tokenize, batched=True, batch_size=len(self.train_dataset_raw)) test_dataset = self.test_dataset_raw.map(tokenize, batched=True, batch_size=len(self.train_dataset_raw)) train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels']) test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels']) else: train_dataset = self.train_dataset_raw test_dataset = self.test_dataset_raw if model_name== "linear_SVM" or model_name== "multinomial_naive_bayesian": trainer=None pipeline = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', classifier), ]) gs_clf = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1) if len(train_dataset['labels'])<25: print('not enough data to use a count vectorizer, sorry!') else: gs_ind=int(len(train_dataset['labels'])/10) #use a tenth of the training dataset to do gridsearch gs_clf = gs_clf.fit(train_dataset['text'][:gs_ind], train_dataset['labels'][:gs_ind]) best_params=gs_clf.best_params_ pipeline.set_params(**best_params) pipeline.fit(train_dataset['text'], train_dataset['labels']) prediction=pipeline.predict(test_dataset['text']) precision, recall, f1, _ = precision_recall_fscore_support(test_dataset['labels'], prediction, average=None) full_report=classification_report(test_dataset['labels'], prediction) acc = accuracy_score(test_dataset['labels'], prediction) loss=hamming_loss(test_dataset['labels'], prediction) curr_metrics={ 'eval_loss': loss, 'eval_accuracy': np.mean(acc), 'eval_f1': np.mean(f1), 'eval_precision': np.mean(precision), 'eval_recall': np.mean(recall), 'eval_full_report': full_report } dump(pipeline, model_name + "_model.joblib") print('best parameters are:') print(best_params) else: trainer = Trainer(model=model, args=training_args, compute_metrics=self.compute_metrics, train_dataset=train_dataset, eval_dataset=test_dataset ) trainer.train() curr_metrics = trainer.evaluate() trainer.save_model(model_name+"_model") self.all_metrics[model_name] = curr_metrics print(curr_metrics) # adding this fully solves the out of memory (OOM) error; https://github.com/huggingface/transformers/issues/1742 del model, tokenizer, trainer # these 2 lines may not be needed gc.collect() torch.cuda.empty_cache()
def get_tokenizer(self, opt: Opt): return BertTokenizer.from_pretrained('bert-base-uncased')
    def __init__(self, model_name):
        self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
        self.model = EncoderDecoderModel.from_pretrained(model_name)
def __init__(self, dataset, model_init, batch_size, label_num=54, exclude=-1, masked_lm=False, masked_lm_ratio=0.2, dynamic_masked_lm=False, include_raw_text=False, seed=0, clf_type="multi_label_classify"): """Initialize. Args: dataset (dict): a dataset dict. model_init (str): the pre-trained model name. select from ``['bert-base-cased', 'bert-base-uncased', 'bert-large-cased', and 'bert-large-uncased']``. batch_size (int): the batch size in each step. exclude (int): exclude one category from the data. Use -1 (default) to include all categories. masked_lm (bool): whether to randomly replace words with mask tokens. masked_lm_ratio (float): the ratio of random masks. Ignored when masked_lm is False. dynamic_masked_lm (bool): whether to generate dynamic masked language model. lm ratio will be randomly sampled. ``dynamic_masked_lm`` and ``masked_lm`` should not be set True at the same time. include_raw_text (bool): whether to return the raw text. seed: random seed. """ self._buckets = [30, 50, 100, 200] self._max_len = self._buckets[-1] self._data = [[] for i in range(len(self._buckets))] self._batch_size = batch_size self._label_num = label_num self._tokenizer = BertTokenizerFast.from_pretrained( utils.get_transformers(model_init), do_lower_case="uncased" in model_init) self._seed = seed self._pad_tok_id = self._tokenizer.pad_token_id self._masked_lm = masked_lm self._masked_lm_ratio = masked_lm_ratio self._mask_tok_id = self._tokenizer.mask_token_id if dynamic_masked_lm and masked_lm: raise RuntimeError( "Cannot have dynamic_masked_lm and masked_lm both True.") self._dynamic_masked_lm = dynamic_masked_lm self._include_raw_text = include_raw_text self._clf_type = clf_type counter = 0 logger.info("DatasetForBert is processing data.") if isinstance(dataset, list): load_data = dataset elif isinstance(dataset, dict): load_data = dataset["data"] for item in tqdm.tqdm(load_data): y = item["label"] s0 = "[CLS] " + item["text0"] + " [SEP]" if "text1" in item: s1 = item["text1"] + " [SEP]" else: s1 = "" if y == exclude: continue counter += 1 s0_ids = self._tokenizer.convert_tokens_to_ids( self._tokenizer.tokenize(s0)) s1_ids = self._tokenizer.convert_tokens_to_ids( self._tokenizer.tokenize(s1)) text_ids = (s0_ids + s1_ids)[:self._max_len] for bucket_id in range(len(self._buckets)): if self._buckets[bucket_id] >= len(text_ids): self._data[bucket_id].append( (text_ids, y, len(s0_ids), len(s1_ids), s0 + s1)) break logger.info("Load %d documents. with filter %d.", counter, exclude) self._bucket_prob = np.asarray([len(x) for x in self._data]) self._bucket_prob = self._bucket_prob / np.sum(self._bucket_prob)
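# A hedged usage sketch for the DatasetForBert constructor documented above;
# the dataset layout follows the docstring, while the label value and model
# name are placeholders:
example_dataset = {"data": [{"label": 3, "text0": "an example document"}]}
batches = DatasetForBert(example_dataset, model_init="bert-base-uncased",
                         batch_size=32, label_num=54, masked_lm=False)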
import regex as re def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) if args.LM == 'Bert': from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM config = BertConfig(vocab_size=28996, max_position_embeddings=512, num_attention_heads=12, num_hidden_layers=12, #type_vocab_size=2, default is 2 ) tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False) model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e50_b16', config=config) #model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config) # 12-layer, 768-hidden, 12-heads, 110M parameters. elif args.LM == 'RoBerta': from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM config = RobertaConfig(vocab_size=50265, max_position_embeddings=514, num_attention_heads=12, num_hidden_layers=12, type_vocab_size=1, ) tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False) model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e50_b16', config=config)
@author: qwang """ import re from collections import defaultdict import spacy import torch from transformers import BertTokenizerFast, BertForTokenClassification, BertForSequenceClassification from transformers import logging logging.set_verbosity_error() nlp = spacy.load('en_core_sci_sm') sent_tokenizer = BertTokenizerFast.from_pretrained( 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract') sent_model = BertForSequenceClassification.from_pretrained( 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract') #%% PICO sentence detector def sent_detect(text, pth_path): # Split to sents and tokenization sents = list(nlp(text).sents) sents = [str(s) for s in sents] inputs = sent_tokenizer(sents, truncation=True, padding=True, return_tensors="pt") # Load checkpoint
class DataArgs: train_datapath: str = field(default='', metadata={"help": "training dataset path"}) val_datapath: str = field(default='', metadata={"help": "validation dataset path"}) init_model_path: str = field(default='', metadata={"help": "initial model path"}) block_size: int = field(default=512, metadata={"help": "block size"}) window_size: int = field(default=510, metadata={"help": "window size"}) finetune_self_attn: bool = field(default=False, metadata={"help": "finetune the self attention layer"}) if __name__ == '__main__': parser = HfArgumentParser((TrainingArguments, DataArgs, )) training_args, data_args = parser.parse_args_into_dataclasses(look_for_args_file=False) trelm_electra_model = TrelmElectraForMaskedLM.from_pretrained(data_args.init_model_path) trelm_electra_model_tokenizer = BertTokenizerFast.from_pretrained(data_args.init_model_path) if not data_args.finetune_self_attn: # fix the self-attention parameters for param in trelm_electra_model.trelm_electra.encoder.layer.parameters(): param.requires_grad = False logger.info(trelm_electra_model) logger.info('Evaluating trelm-electra for refernece ...') pretrain_and_evaluate(training_args, data_args, trelm_electra_model, trelm_electra_model_tokenizer, eval_only=True, model_path=None) logger.info(f'Pretraining trelm-electra ... ') pretrain_and_evaluate(training_args, data_args, trelm_electra_model, trelm_electra_model_tokenizer, eval_only=False, model_path=training_args.output_dir) model_path = training_args.output_dir
max_length=self.maxlen) inp_ids, type_ids = inp['input_ids'], inp['token_type_ids'] attention_mask = inp['attention_mask'] padding_length = self.maxlen - len(inp_ids) inp_ids = inp_ids + ([0] * padding_length) attention_mask = attention_mask + ([0] * padding_length) type_ids = type_ids + ([0] * padding_length) assert len(inp_ids) == self.maxlen assert len(type_ids) == self.maxlen assert len(attention_mask) == self.maxlen return torch.tensor(inp_ids), torch.tensor(type_ids), torch.tensor( attention_mask) if __name__ == '__main__': from data import KpBioDataset from transformers import BertTokenizerFast from torch.utils.data import DataLoader BATCH_SIZE = 8 * 8 tokenizer = BertTokenizerFast.from_pretrained('./albert_base') text = '''跨國文化的國家在歐洲不同國家待了快10年 就在今年簽證完結之後回國了 本以為回來是開心的 終於喝到每天念掛的珍奶跟日食 頭1 2個月在找工作還有跟朋友團聚然後突然爆發疫症 就在這個待業期間 想慢慢適應這一切 每天也在想這到底是我想待到養老的國家嗎 畢竟自己心裡是個華人 但是習慣了西方的生活方式 家人朋友也說我太獨立 已經不太合群 之前在英國住過 ''' kp = KeyphrasePredictor(tokenizer, './albert_base', ckpt='ckpt/step_8502.ckpt') print(kp.predict(text))
def test_squad_feature_extractor(dataset): print("======Squad Feature Test Case======") tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') # First test context = 'This is a sample context. BERT will find the answer words in the context by pointing the start and end token positions.' question = 'Where are the answer words?' answer = 'in the context' start_pos = context.find(answer) input_ids, token_type_ids, start_pos, end_pos = squad_features( context, question, answer, start_pos, tokenizer) assert tokenizer.convert_ids_to_tokens(input_ids) == \ ['[CLS]', 'where', 'are', 'the', 'answer', 'words', '?', '[SEP]', \ 'this', 'is', 'a', 'sample', 'context', '.', \ 'bert', 'will', 'find', 'the', 'answer', 'words', 'in', 'the', 'context', \ 'by', 'pointing', 'the', 'start', 'and', 'end', 'token', 'positions', '.', '[SEP]'], \ "Your tokenized result does not match the expected result." assert token_type_ids == \ [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], \ "Your sentence type ids do not math the expected result" assert tokenizer.convert_ids_to_tokens(input_ids[start_pos: end_pos+1]) == ['in', 'the', 'context'], \ "The start and end tokens do not point the answer position." print("The first test passed!") # Second test context = 'Sometimes, the answer could be subwords so you may need to split them manually.' question = 'What should the answer consist of' answer = 'word' start_pos = context.find(answer) input_ids, token_type_ids, start_pos, end_pos = squad_features( context, question, answer, start_pos, tokenizer) assert tokenizer.convert_ids_to_tokens(input_ids) == \ ['[CLS]', 'what', 'should', 'the', 'answer', 'consist', 'of', '[SEP]', 'sometimes', ',', 'the', 'answer', 'could', 'be', 'sub', '##word', '##s', 'so', 'you', 'may', 'need', 'to', 'split', 'them', 'manually', '.', '[SEP]'], \ "Your tokenized result does not match the expected result." assert token_type_ids == \ [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], \ "Your sentence type ids do not math the expected result" assert tokenizer.convert_ids_to_tokens(input_ids[start_pos: end_pos+1]) == ['##word'], \ "The start and end tokens do not point the answer position." print("The second test passed!") # Third test context = 'When the answer is not given, you should return None for start_pos and end_pos.' question = 'This test case does not need a question' input_ids, token_type_ids, start_pos, end_pos = squad_features( context, question, None, None, tokenizer) assert len(input_ids) == 33, \ "Your tokenized result does not match the expected result." assert start_pos is None and end_pos is None, \ "You should return None for start_pos and end_pos when the answer is not given." print("The third test passed!") # Forth test sample = dataset[0] context = sample['context'] question = sample['question'] answer = sample['answer'] start_pos = sample['start_pos'] input_ids, token_type_ids, start_pos, end_pos = squad_features( context, question, answer, start_pos, tokenizer) assert len(input_ids) == 176, \ "Your tokenized result does not match the expected result." assert tokenizer.convert_ids_to_tokens(input_ids[start_pos: end_pos+1]) == tokenizer.tokenize(answer), \ "The start and end tokens do not point the answer position." 
print("The forth test passed!") # Fifth test sample = dataset[80000] context = sample['context'] question = sample['question'] answer = sample['answer'] start_pos = sample['start_pos'] input_ids, token_type_ids, start_pos, end_pos = squad_features( context, question, answer, start_pos, tokenizer) assert len(input_ids) == 165, \ "Your tokenized result does not match the expected result." assert tokenizer.convert_ids_to_tokens(input_ids[start_pos: end_pos+1]) == tokenizer.tokenize(answer), \ "The start and end tokens do not point the answer position." print("The fifth test passed!") print("All 5 tests passed!")