def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
    model = DistilBertForMaskedLM(config=config)
    model.eval()
    loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
    self.check_loss_output(result)
def train(informal, formal):
    if torch.cuda.is_available():
        device = torch.device('cuda:1')
        print(f'Using GPU device: {device}')
    else:
        device = torch.device('cpu')
        print(f'GPU is not available, using CPU device {device}')
    wandb.init(project="NLP_BERT")
    train_config = {'batch_size': 10,
                    'n_epochs': 200,
                    'save_dir': './checkpoints/',
                    'lr_scheduler': {
                        'type': 'warmup,decay_linear',
                        'warmup_steps_part': 0.05,
                        'lr_peak': 1e-4,
                    }}
    train_dataset = FormalDataset(informal, formal)
    model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
    model.to(device)

    # Model training procedure
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.)
    n_steps = (len(train_dataset) // train_config['batch_size'] + 1) * train_config['n_epochs']
    lr_scheduler = LrScheduler(n_steps, **train_config['lr_scheduler'])
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=train_config['batch_size'],
                                  shuffle=True,
                                  num_workers=4,
                                  drop_last=True)
    criterion = nn.CrossEntropyLoss(reduction='none')

    for epoch in range(1, train_config['n_epochs'] + 1):
        print('\n' + '-' * 40)
        print(f'Epoch: {epoch}')
        print('Run training...')
        model.train()
        run_epoch(train_dataloader, model, lr_scheduler, optimizer, criterion, device=device)
        save_checkpoint(epoch, model, lr_scheduler, optimizer, train_config['save_dir'])
def __init__(
    self,
    model=None,
    tokenizer=None,
    model_name="bert-large-uncased",
    mask_token="***mask***",
    disable_gpu=False,
):
    self.mask_token = mask_token
    self.delemmatizer = Delemmatizer()
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu")
    print("using model:", model_name)
    print("device:", self.device)
    if not model:
        if "distilbert" in model_name:
            self.bert = DistilBertForMaskedLM.from_pretrained(model_name)
        else:
            self.bert = BertForMaskedLM.from_pretrained(model_name)
        self.bert.to(self.device)
    else:
        self.bert = model
    if not tokenizer:
        if "distilbert" in model_name:
            self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = BertTokenizer.from_pretrained(model_name)
    else:
        self.tokenizer = tokenizer
    self.bert.eval()
def __init__(self, segment_size, output_size, dropout):
    super(DistillBertPunc, self).__init__()
    self.bert = DistilBertForMaskedLM.from_pretrained('./models/distillbert/')
    self.bert_vocab_size = 30522
    self.bn = nn.BatchNorm1d(segment_size * self.bert_vocab_size)
    self.fc = nn.Linear(segment_size * self.bert_vocab_size, output_size)
    self.dropout = nn.Dropout(dropout)
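# A plausible forward pass for the module above (an assumption; the original forward()
# is not shown here): flatten the per-token MLM logits and classify punctuation.
def forward(self, x):
    x = self.bert(x)[0]                    # (batch, segment_size, vocab_size) MLM logits
    x = x.view(x.size(0), -1)              # flatten to (batch, segment_size * vocab_size)
    x = self.fc(self.dropout(self.bn(x)))  # (batch, output_size)
    return x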
def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
    super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
    self.model_path = model_path
    self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    self.model = DistilBertForMaskedLM.from_pretrained(model_path)
    self.model.to(self.device)
    self.model.eval()
def from_pretrained(cls, model_name: str):
    return cls(
        DistilBertForMaskedLM.from_pretrained(
            model_name,
            output_attentions=True,
            output_hidden_states=True,
            output_additional_info=True,
        ),
        DistilBertAligner.from_pretrained(model_name),
    )
def __init__(self, url=None):
    from transformers import DistilBertTokenizer, DistilBertForMaskedLM
    import torch
    self.torch = torch
    self.url = url
    if url is None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        self.bert = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
        self.bert.to(self.device)
        self.bert.eval()
def test(informal):
    if torch.cuda.is_available():
        device = torch.device('cuda:3')
        print(f'Using GPU device: {device}')
    else:
        device = torch.device('cpu')
        print(f'GPU is not available, using CPU device {device}')
    test_config = {'batch_size': 5, 'epoch': 29, 'save_dir': './checkpoints/'}
    test_dataset = FormalDataset(informal)
    dataloader = DataLoader(test_dataset,
                            batch_size=test_config['batch_size'],
                            shuffle=False,
                            num_workers=4,
                            drop_last=False)
    config = DistilBertConfig()
    model = DistilBertForMaskedLM(config)
    load_model(test_config['epoch'], model, test_config['save_dir'])
    model.to(device)
    model.eval()
    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader)):
            inp = batch['input_ids'].to(device)
            attn = batch['attention_mask'].to(device)
            logits = model(input_ids=inp, attention_mask=attn)[0]
            preds = decode_text(test_dataset.tokenizer, logits)
            for seq in preds:
                with open('test_pred.txt', 'a') as res_file:
                    res_file.writelines(seq + '\n')
def create_and_check_distilbert_for_masked_lm(
    self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = DistilBertForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, labels=token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
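# Minimal standalone sketch (not from the original test suite): the loss/logits shapes the
# check above asserts, using a tiny randomly initialized config. Assumes a recent
# transformers version where outputs expose .loss and .logits.
import torch
from transformers import DistilBertConfig, DistilBertForMaskedLM

tiny_config = DistilBertConfig(vocab_size=100, dim=32, hidden_dim=64, n_layers=2, n_heads=2)
tiny_model = DistilBertForMaskedLM(tiny_config)
tiny_model.eval()

input_ids = torch.randint(0, tiny_config.vocab_size, (2, 8))  # (batch_size, seq_length)
labels = torch.randint(0, tiny_config.vocab_size, (2, 8))     # MLM targets, same shape
out = tiny_model(input_ids, labels=labels)
print(out.loss.shape, out.logits.shape)  # scalar loss, (2, 8, vocab_size)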
def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
    super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
    try:
        import transformers
    except ModuleNotFoundError:
        raise ModuleNotFoundError('Missing transformers library. Install transformers with `pip install transformers`')
    self.model_path = model_path
    # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
    # self.model = AutoModel.from_pretrained(model_path)
    self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    self.model = DistilBertForMaskedLM.from_pretrained(model_path)
    self.model.to(self.device)
    self.model.eval()
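# Standalone usage sketch (an assumption, mirroring what the class above loads): fill a
# [MASK] token with the DistilBERT tokenizer / masked-LM pair. Assumes a recent
# transformers version where model outputs expose .logits.
import torch
from transformers import DistilBertTokenizer, DistilBertForMaskedLM

tok = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
mlm = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
mlm.eval()

inputs = tok("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = mlm(**inputs).logits  # (1, seq_len, vocab_size)
mask_pos = (inputs["input_ids"] == tok.mask_token_id).nonzero(as_tuple=True)[1]
top_ids = logits[0, mask_pos].topk(5, dim=-1).indices[0].tolist()
print(tok.convert_ids_to_tokens(top_ids))  # top-5 candidate fillers for [MASK]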
def __init__(self, args, vocab_subset=None):
    super().__init__()
    bert_model_name = args.bert_model_name
    dict_file = bert_model_name
    if args.bert_model_dir is not None:
        # load bert model from file
        bert_model_name = str(args.bert_model_dir) + "/"
        dict_file = bert_model_name + args.bert_vocab_name
        self.dict_file = dict_file
        print("loading BERT model from {}".format(bert_model_name))
    else:
        # load bert model from huggingface cache
        pass
    # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer
    do_lower_case = False
    if 'uncased' in bert_model_name:
        do_lower_case = True
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = DistilBertTokenizer.from_pretrained(dict_file)
    # original vocab
    self.map_indices = None
    self.vocab = list(self.tokenizer.ids_to_tokens.values())
    self._init_inverse_vocab()
    # Add custom tokenizer to avoid splitting the ['MASK'] token
    custom_basic_tokenizer = CustomBaseTokenizer(do_lower_case=do_lower_case)
    self.tokenizer.basic_tokenizer = custom_basic_tokenizer
    # Load pre-trained model (weights)
    # ... to get prediction/generation
    self.masked_bert_model = DistilBertForMaskedLM.from_pretrained(bert_model_name)
    self.masked_bert_model.eval()
    # ... to get hidden states
    self.bert_model = self.masked_bert_model.distilbert
    self.pad_id = self.inverse_vocab[BERT_PAD]
    self.unk_index = self.inverse_vocab[BERT_UNK]
def main():
    batch_size = 4
    dev_dataset = TorchDataset(
        file_name="./data/diverse.triplets.dev.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
    test_dataset = TorchDataset(
        file_name="./data/diverse.triplets.test.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

    # load model
    # DistilBertForSequenceClassification
    # DistilBertForMaskedLM
    model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
    model.load_state_dict(torch.load("demo_model.pt", map_location=device))
    model.to(device)
    model.eval()

    data_loader = dev_dataloader
    N = len(data_loader)
    correct, total = 0, 0
    start = time.time()
    with torch.no_grad():
        for i, (queries, pos_docs, neg_docs) in enumerate(data_loader):
            inputs = list(queries) + list(pos_docs) + list(neg_docs)
            encodings = tokenizer(
                inputs,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            )
            ids, masks = encodings["input_ids"], encodings["attention_mask"]
            ids = ids.to(device)      # (3B, MAXLEN)
            masks = masks.to(device)  # (3B, MAXLEN)
            # TODO: could add more layers after distilbert!
            outputs = model.distilbert(ids, masks)
            outputs_hidden = outputs.last_hidden_state.mean(dim=1)
            anchors, positives, negatives = outputs_hidden.view(3, len(queries), -1)
            # compute 2 distances with the l2 norm: positive_doc to query, negative_doc to query
            pos_dist = (anchors - positives).norm(dim=-1)  # B distances
            neg_dist = (anchors - negatives).norm(dim=-1)  # B distances
            # pos_dist = 1 - F.cosine_similarity(anchors, positives, dim=-1)  # B distances
            # neg_dist = 1 - F.cosine_similarity(anchors, negatives, dim=-1)  # B distances
            correct += (pos_dist < neg_dist).sum()
            total += len(queries)
            if i % 10 == 0:
                remaining_time = (time.time() - start) / (i + 1) * N - (time.time() - start)
                print(f"remaining time: {remaining_time:.2f} | est. accuracy: {correct / total:.4f}")
    print(f"accuracy {correct / total}")
def get_distilkobert_lm():
    """Return DistilBertForMaskedLM for DistilKoBERT"""
    model = DistilBertForMaskedLM.from_pretrained('monologg/distilkobert')
    return model
from transformers import \
    AlbertTokenizer, AlbertForMaskedLM, \
    DistilBertTokenizer, DistilBertForMaskedLM, \
    RobertaTokenizer, RobertaForMaskedLM

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForMaskedLM.from_pretrained('albert-base-v2').eval()

albert_large_tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
albert_large_model = AlbertForMaskedLM.from_pretrained('albert-large-v2').eval()

distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
distilbert_model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-large').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])
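# Hedged sketch of how decode() above might be driven (get_predictions and text_sentence
# are assumed names, not from the original): take the top-k token ids at the [MASK]
# position of one of the loaded models and clean them up with decode().
import torch

def get_predictions(text_sentence, top_clean=5):
    inputs = distilbert_tokenizer(text_sentence, return_tensors="pt")
    mask_idx = (inputs["input_ids"] == distilbert_tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    with torch.no_grad():
        logits = distilbert_model(**inputs)[0]  # (1, seq_len, vocab_size)
    pred_idx = logits[0, mask_idx].topk(top_k, dim=-1).indices[0].tolist()
    return decode(distilbert_tokenizer, pred_idx, top_clean)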
from transformers import DistilBertTokenizerFast, DistilBertForMaskedLM

# Load the tokenizer
# tokenizer = DistilBertTokenizerFast.from_pretrained("distilabena-base-v2-akuapem-twi-cased", max_len=512)  # the one we trained ourselves (Akuapem)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilabena-base-v2-akuapem-twi-cased", max_len=512, do_lower_case=True)  # the one we trained ourselves (Asante, lowercase everything)
# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")  # you could also use the pre-trained DistilmBERT tokenizer
# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased", do_lower_case=True)  # for Asante, lowercase the pretrained tokenizer
# tokenizer.save_vocabulary("distilabena-base-akuapem-twi-cased")  # when using the pretrained tokenizer, be sure to save it locally
tokenizer.save_vocabulary("distilabena-base-v2-asante-twi-uncased")  # save the pretrained tokenizer locally in the Asante case

# Load DistilBERT multilingual base checkpoint
# model = DistilBertForMaskedLM.from_pretrained("distilbert-base-multilingual-cased")  # pretrained DistilmBERT weights
model = DistilBertForMaskedLM.from_pretrained("distilabena-base-v2-akuapem-twi-cased")  # in the case of Asante Twi, start with the Akuapem model weights
print("Number of parameters in the model:")
print(model.num_parameters())

# Create dataset object for the JW300 dataset (Akuapem) or the Asante Twi Bible
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    # file_path="../../data/jw300.en-tw.tw",  # stage 1 - Akuapem
    file_path="../../data/asante_twi_bible.txt",  # stage 2 - Asante
    block_size=128,
)

# Create "data collator" from dataset and tokenizer - with 15% chance of masking
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Define training arguments
from transformers import Trainer, TrainingArguments
def main():
    parser = ArgumentParser('Distributed distillation example')
    parser.add_argument('--data_file', type=str, metavar='PATH', required=True,
                        help='Path to file containing the data (sequences).')
    parser.add_argument('--output_dir', type=str, metavar='PATH', required=True,
                        help='Path to the output directory (for logs, checkpoints, parameters, etc.).')
    parser.add_argument('-f', '--force', action='store_true',
                        help='Overwrite output_dir if it already exists.')
    parser.add_argument('--student_config_file', type=str, metavar='PATH', required=True,
                        help='Path to the student model configuration.')
    parser.add_argument('--student_weights_file', type=str, default=None, metavar='PATH',
                        help='Path to the student model initialization weights.')
    parser.add_argument('--teacher_type', type=str, default=None, choices={'bert-base-uncased'},
                        help='The pre-trained teacher model type to initialize.')
    parser.add_argument('--tokenizer_vocab_file', type=str, metavar='PATH', required=True,
                        help='Path to the tokenizer vocabulary.')
    parser.add_argument('--min_sequence_len', type=int, default=12, metavar='N',
                        help='The minimum length of a sequence.')
    parser.add_argument('--max_sequence_len', type=int, default=512, metavar='N',
                        help='The maximum length of a sequence.')
    parser.add_argument('--do_tokenize', action='store_true',
                        help='Whether to tokenize the input.')
    parser.add_argument('--do_lower_case', action='store_true',
                        help='Whether to lowercase the input when tokenizing.')
    parser.add_argument('-n', '--num_epochs', type=int, default=3, metavar='N',
                        help='The number of distillation epochs.')
    parser.add_argument('-b', '--batch_size', type=int, default=5, metavar='N',
                        help='The batch size.')
    parser.add_argument('--lr', '--learning_rate', type=float, default=5e-4, metavar='F',
                        help='The initial learning rate.')
    parser.add_argument('--epsilon', type=float, default=1e-6, metavar='F',
                        help="Adam's epsilon.")
    parser.add_argument('--warmup_prop', type=float, default=0.05, metavar='F',
                        help='Linear warmup proportion.')
    parser.add_argument('--num_gradient_accumulation_steps', type=int, default=50, metavar='N',
                        help='The number of gradient accumulation steps (for larger batch sizes).')
    parser.add_argument('--max_gradient_norm', type=float, default=5.0, metavar='F',
                        help='The maximum gradient norm.')
    parser.add_argument('--soft_target_alpha', type=float, default=0.33, metavar='F',
                        help='The relative weight of the soft target loss.')
    parser.add_argument('--hard_target_alpha', type=float, default=0.33, metavar='F',
                        help='The relative weight of the hard target loss.')
    parser.add_argument('--cosine_emb_alpha', type=float, default=0.33, metavar='F',
                        help='The relative weight of the cosine embedding loss.')
    parser.add_argument('--seed', type=int, default=42, metavar='N',
                        help='Random seed.')
    parser.add_argument('-c', '--use_cuda', action='store_true',
                        help='Whether to use cuda or not.')
    parser.add_argument('-d', '--use_distributed', action='store_true',
                        help='Whether to use distributed training (distillation) or not.')
    parser.add_argument('--local_rank', type=int, default=-1, metavar='N',
                        help='Local process rank.')
    params = parser.parse_args()

    if not params.use_distributed:
        params.local_rank = 0
    params.is_master = params.local_rank == 0

    # make output_dir
    if Path(params.output_dir).is_dir() and not params.force:
        raise ValueError(
            f'Output directory {params.output_dir} already exists. '
            'Use `--force` if you want to overwrite it.')
    if params.is_master:
        Path(params.output_dir).mkdir(parents=True, exist_ok=params.force)

        # dump params
        json.dump(
            vars(params),
            open(Path(params.output_dir) / 'params.json', 'w'),
            indent=4,
            sort_keys=True
        )
    params.output_dir = Path(params.output_dir)

    # initialize multi-GPU
    if params.use_distributed:
        if params.is_master:
            logger.info('Initializing PyTorch distributed')
        torch.cuda.set_device(params.local_rank)
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='env://'
        )

    # set seed(s)
    if params.is_master:
        logger.info('Setting random seed(s)')
    random.seed(params.seed)
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    if params.use_distributed:
        torch.cuda.manual_seed_all(params.seed)

    # initialize the student
    if params.is_master:
        logger.info('Initializing the student')
    student_config = DistilBertConfig.from_pretrained(params.student_config_file)
    student_config.output_hidden_states = True
    if params.student_weights_file is not None:
        student = DistilBertForMaskedLM.from_pretrained(
            params.student_weights_file,
            config=student_config
        )
    else:
        student = DistilBertForMaskedLM(student_config)

    # initialize the teacher
    if params.is_master:
        logger.info('Initializing the teacher')
    teacher = BertForMaskedLM.from_pretrained(params.teacher_type, output_hidden_states=True)

    # initialize the tokenizer
    if params.is_master:
        logger.info('Initializing the tokenizer')
    tokenizer = BertWordPieceTokenizer(
        params.tokenizer_vocab_file,
        lowercase=params.do_lower_case
    )

    # initialize the dataset
    if params.is_master:
        logger.info('Initializing the dataset')
    dataset = LanguageModelingDataset(
        path=params.data_file,
        tokenizer=tokenizer,
        do_tokenize=params.do_tokenize,
        min_sequence_len=params.min_sequence_len,
        max_sequence_len=params.max_sequence_len
    )

    # initialize the sampler
    if params.is_master:
        logger.info('Initializing the sampler')
    group_bins = list(range(3, params.max_sequence_len, 4))
    group_idxs = quantize(dataset.lengths, group_bins)
    sampler = GroupedBatchSampler(
        sampler=DistributedSampler(dataset) if params.use_distributed else RandomSampler(dataset),
        group_idxs=group_idxs,
        batch_size=params.batch_size,
        drop_last=False
    )

    # initialize the dataloader
    if params.is_master:
        logger.info('Initializing the dataloader')
    dataloader = DataLoader(
        dataset=dataset,
        batch_sampler=sampler,
        collate_fn=dataset.sequences_collate_fn
    )

    # initialize the loss function
    if params.is_master:
        logger.info('Initializing the loss function')
    loss_fn = SanhLoss(
        alphas=(
            params.soft_target_alpha,
            params.hard_target_alpha,
            params.cosine_emb_alpha
        ),
        reduction=('batchmean', 'mean', 'mean')
    )

    # compute token counts
    if params.is_master:
        logger.info('Computing token counts')
    counter = Counter()
    for sequence in dataset.sequences:
        counter.update(sequence)
    token_counts = [0] * dataset._tokenizer.get_vocab_size()
    for k, v in counter.items():
        token_counts[k] = v
    del counter

    # compute token probabilities
    if params.is_master:
        logger.info('Computing token probabilities')
    token_probabilities = np.maximum(token_counts, 1) ** -0.7
    # give special tokens a zero probability
    for idx in dataset.special_tokens_map.values():
        token_probabilities[idx] = 0.0
    # convert to torch.FloatTensor
    token_probabilities = torch.FloatTensor(token_probabilities)

    # initialize the distiller
    if params.is_master:
        logger.info('Initializing the distiller')
    distiller = SanhDistiller(
        student=student,
        teacher=teacher,
        dataloader=dataloader,
        token_probabilities=token_probabilities,
        loss_fn=loss_fn,
        num_epochs=params.num_epochs,
        num_gradient_accumulation_steps=params.num_gradient_accumulation_steps,
        max_gradient_norm=params.max_gradient_norm,
        use_cuda=params.use_cuda,
        local_rank=params.local_rank,
        use_distributed=params.use_distributed,
        is_master=params.is_master,
        use_tqdm=True,
        logger=logger,
    )

    # start the distillation
    if params.is_master:
        logger.info('Starting the distillation')
    distiller.distill()

    # save the student model config and weights
    if params.is_master:
        logger.info('Saving the student model config')
        json.dump(
            vars(student.config),
            open(params.output_dir / 'distilled_bert_config.json', 'w'),
            indent=4,
            sort_keys=True
        )
        logger.info('Saving the student model weights')
        model_to_save = student.module if hasattr(student, 'module') else student  # Take care of distributed/parallel training
        torch.save(
            model_to_save.state_dict(),
            params.output_dir / 'distilled_bert_weights.pth'
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp", type=int, default=7)
    parser.add_argument("--save", type=str, default="./model2_best_diverse_mean_maskedLM.pt")
    args = parser.parse_args()

    # Data and Tokenization
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    batch_size = 4
    train_dataset = TorchDataset(
        file_name="./data/diverse.triplets.train.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_dataset = TorchDataset(
        file_name="./data/diverse.triplets.dev.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

    # Model Training and Evaluation
    NUM_EPOCHS = 1
    LEARNING_RATE = 0.00003

    # load model
    model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
    # if args.exp == 1:
    #     pass
    # elif args.exp == 2:
    #     ...
    # elif args.exp == 3:
    #     ...
    if args.exp == 7:
        # For Experiment 7: average
        model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
        triplet_loss = nn.TripletMarginLoss(margin=1.0)
    elif args.exp == 6:
        # For Experiment 6: base + cosine
        triplet_loss = nn.TripletMarginWithDistanceLoss(
            distance_function=lambda x, y: 1 - F.cosine_similarity(x, y, dim=-1),
            margin=1.0,
        )
    elif args.exp == 5:
        # For Experiment 5: base + margin = 0.1
        triplet_loss = nn.TripletMarginLoss(margin=0.1)
    elif args.exp == 4:
        # For Experiment 4: base
        triplet_loss = nn.TripletMarginLoss(margin=1.0)
    model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.distilbert.parameters(), lr=LEARNING_RATE)

    def evaluate(inputs, model, tokenizer):
        encodings = tokenizer(
            inputs,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        ids, masks = encodings["input_ids"], encodings["attention_mask"]
        outputs = model.distilbert(ids.to(device), masks.to(device))
        if args.exp < 7:
            # Experiment: using the first index of the last layer
            outputs_hidden = outputs.last_hidden_state[:, 0]
        else:
            # Averaging the last layer
            outputs_hidden = outputs.last_hidden_state.mean(dim=1)
        return outputs_hidden.view(3, len(queries), -1)

    dataloader = train_dataloader
    N = len(dataloader)
    lowest_loss = float("inf")
    start = time.time()
    learning_curve_y = []
    learning_curve_x = []
    for epoch in range(NUM_EPOCHS):
        epoch_loss = 0
        for i, (queries, pos_docs, neg_docs) in enumerate(dataloader):
            # readability
            # train()
            # evaluate()
            # print()
            optimizer.zero_grad()  # set gradient to zero
            anchors, positives, negatives = evaluate(
                inputs=list(queries + pos_docs + neg_docs),
                model=model,
                tokenizer=tokenizer,
            )
            loss = triplet_loss(anchors, positives, negatives)
            loss.backward()
            optimizer.step()
            epoch_loss += float(loss)
            if i % 10 == 0:
                elapsed_time = time.time() - start
                remaining_time = elapsed_time * (1 / (i + 1) * N - 1)
                print(f"{i}: remaining time: {remaining_time:.1f} | est. epoch loss: {epoch_loss / (i + 1):.4f}")
            if i % 100 == 0:
                with torch.no_grad():
                    correct = total = 0
                    val_start = time.time()
                    for dq, dp, dn in dev_dataloader:
                        anchors, positives, negatives = evaluate(
                            inputs=list(dq + dp + dn),
                            model=model,
                            tokenizer=tokenizer,
                        )
                        if args.exp == 6:
                            # cosine distance
                            pos_dist = 1 - F.cosine_similarity(anchors, positives, dim=-1)
                            neg_dist = 1 - F.cosine_similarity(anchors, negatives, dim=-1)
                        else:
                            # using l2 norm
                            pos_dist = (anchors - positives).norm(dim=-1)  # B distances
                            neg_dist = (anchors - negatives).norm(dim=-1)  # B distances
                        correct += float((pos_dist < neg_dist).sum())
                        total += len(dq)
                        if time.time() - val_start > 15:
                            break
                    print(f"{i}: est. validation accuracy: {correct / total:.4f}")
                    learning_curve_y.append(correct / total)
                    learning_curve_x.append(i * batch_size)
            # epoch normally
            if (epoch_loss / (i + 1)) < lowest_loss:
                if args.exp == 4:
                    torch.save(model.state_dict(), "model2_best_diverse_base.pt")
                elif args.exp == 5:
                    torch.save(model.state_dict(), "model2_best_diverse_margin.pt")
                elif args.exp == 6:
                    torch.save(model.state_dict(), "model2_best_diverse_cosine.pt")
                elif args.exp == 7:
                    torch.save(model.state_dict(), "model2_best_diverse_mean_maskedLM.pt")
                lowest_loss = epoch_loss / (i + 1)
        print(f"loss for epoch {epoch} is {epoch_loss}")
    generate_data_for_plot(learning_curve_y, learning_curve_x)
def test_runner():
    """Test that runner executes"""
    train_df = pd.read_csv("data/train.csv")
    valid_df = pd.read_csv("data/valid.csv")

    teacher_config = AutoConfig.from_pretrained("bert-base-uncased",
                                                output_hidden_states=True,
                                                output_logits=True)
    teacher = BertForMaskedLM.from_pretrained("bert-base-uncased", config=teacher_config)

    student_config = AutoConfig.from_pretrained(
        "distilbert-base-uncased",
        output_hidden_states=True,
        output_logits=True,
    )
    student = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased", config=student_config)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    train_dataset = LanguageModelingDataset(train_df["text"], tokenizer)
    valid_dataset = LanguageModelingDataset(valid_df["text"], tokenizer)

    collate_fn = DataCollatorForLanguageModeling(tokenizer)
    train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=2)
    valid_dataloader = DataLoader(valid_dataset, collate_fn=collate_fn, batch_size=2)
    loaders = {"train": train_dataloader, "valid": valid_dataloader}

    callbacks = {
        "masked_lm_loss": MaskedLanguageModelCallback(),
        "mse_loss": MSELossCallback(),
        "cosine_loss": CosineLossCallback(),
        "kl_div_loss": KLDivLossCallback(),
        "loss": MetricAggregationCallback(
            prefix="loss",
            mode="weighted_sum",
            metrics={
                "cosine_loss": 1.0,
                "masked_lm_loss": 1.0,
                "kl_div_loss": 1.0,
                "mse_loss": 1.0,
            },
        ),
        "optimizer": dl.OptimizerCallback(),
        "perplexity": PerplexityMetricCallbackDistillation(),
    }

    model = torch.nn.ModuleDict({"teacher": teacher, "student": student})
    runner = DistilMLMRunner()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders=loaders,
        verbose=True,
        check=True,
        callbacks=callbacks,
    )
    assert True
config = OmegaConf.load(args.config_path)
print(OmegaConf.to_yaml(config))
os.environ['WANDB_DISABLED'] = 'true'

tokenizer = PreTrainedTokenizerFast(tokenizer_file=config.tokenizer_path)
tokenizer.mask_token = '[MASK]'
tokenizer.pad_token = "[PAD]"
tokenizer.sep_token = "[SEP]"
tokenizer.cls_token = "[CLS]"
tokenizer.unk_token = "[UNK]"

distilbert_config = DistilBertConfig(vocab_size=config.vocab_size, n_heads=8, dim=512, hidden_dim=2048)
model = DistilBertForMaskedLM(distilbert_config)

dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=DATA_PATH, block_size=64)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=config.mlm_probability)

training_args = TrainingArguments(
    output_dir=config.output_path,
    overwrite_output_dir=True,
    num_train_epochs=config.num_train_epochs,
    learning_rate=config.learning_rate,
def main():
    parser = argparse.ArgumentParser(description="Training")
    parser.add_argument("--dump_path", type=str, required=True,
                        help="The output directory (log, checkpoints, parameters, etc.)")
    parser.add_argument("--data_file", type=str, required=True,
                        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
    parser.add_argument("--token_counts", type=str, required=True,
                        help="The token counts in the data_file for MLM.")
    parser.add_argument("--force", action='store_true',
                        help="Overwrite dump_path if it already exists.")
    parser.add_argument("--vocab_size", default=30522, type=int,
                        help="The vocabulary size.")
    parser.add_argument("--max_position_embeddings", default=512, type=int,
                        help="Maximum sequence length we can model (including [CLS] and [SEP]).")
    parser.add_argument("--sinusoidal_pos_embds", action='store_false',
                        help="If true, the position embeddings are simply fixed with sinusoidal embeddings.")
    parser.add_argument("--n_layers", default=6, type=int,
                        help="Number of Transformer blocks.")
    parser.add_argument("--n_heads", default=12, type=int,
                        help="Number of heads in the self-attention module.")
    parser.add_argument("--dim", default=768, type=int,
                        help="Dimension through the network. Must be divisible by n_heads.")
    parser.add_argument("--hidden_dim", default=3072, type=int,
                        help="Intermediate dimension in the FFN.")
    parser.add_argument("--dropout", default=0.1, type=float, help="Dropout.")
    parser.add_argument("--attention_dropout", default=0.1, type=float,
                        help="Dropout in self-attention.")
    parser.add_argument("--activation", default='gelu', type=str,
                        help="Activation to use in self-attention.")
    parser.add_argument("--tie_weights_", action='store_false',
                        help="If true, we tie the embedding matrix with the projection over the vocabulary matrix. Default is true.")
    parser.add_argument("--from_pretrained_weights", default=None, type=str,
                        help="Load student initialization checkpoint.")
    parser.add_argument("--from_pretrained_config", default=None, type=str,
                        help="Load student initialization architecture config.")
    parser.add_argument("--teacher_type", default="bert", choices=["bert", "roberta"],
                        help="Teacher type (BERT, RoBERTa).")
    parser.add_argument("--teacher_name", default="bert-base-uncased", type=str,
                        help="The teacher model.")
    parser.add_argument("--temperature", default=2., type=float,
                        help="Temperature for the softmax temperature.")
    parser.add_argument("--alpha_ce", default=0.5, type=float,
                        help="Linear weight for the distillation loss. Must be >=0.")
    parser.add_argument("--alpha_mlm", default=0.5, type=float,
                        help="Linear weight for the MLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse", default=0.0, type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
    parser.add_argument("--alpha_cos", default=0.0, type=float,
                        help="Linear weight of the cosine embedding loss. Must be >=0.")
    parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
                        help="Proportion of tokens for which we need to make a prediction.")
    parser.add_argument("--word_mask", default=0.8, type=float,
                        help="Proportion of tokens to mask out.")
    parser.add_argument("--word_keep", default=0.1, type=float,
                        help="Proportion of tokens to keep.")
    parser.add_argument("--word_rand", default=0.1, type=float,
                        help="Proportion of tokens to randomly replace.")
    parser.add_argument("--mlm_smoothing", default=0.7, type=float,
                        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
    parser.add_argument("--restrict_ce_to_mask", action='store_true',
                        help="If true, compute the distillation loss only on the [MLM] prediction distribution.")
    parser.add_argument("--n_epoch", type=int, default=3,
                        help="Number of passes over the whole dataset.")
    parser.add_argument("--batch_size", type=int, default=5,
                        help="Batch size (for each process).")
    parser.add_argument("--tokens_per_batch", type=int, default=-1,
                        help="If specified, modify the batches so that they have approximately this number of tokens.")
    parser.add_argument("--shuffle", action='store_false',
                        help="If true, shuffle the sequence order. Default is true.")
    parser.add_argument("--group_by_size", action='store_false',
                        help="If true, group sequences that have similar length into the same batch. Default is true.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=50,
                        help="Gradient accumulation for larger training batches.")
    parser.add_argument("--warmup_prop", default=0.05, type=float,
                        help="Linear warmup proportion.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--learning_rate", default=5e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=5.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--initializer_range", default=0.02, type=float,
                        help="Random initialization range.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit.")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--n_gpu", type=int, default=1,
                        help="Number of GPUs in the node.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Distributed training - Local rank")
    parser.add_argument("--seed", type=int, default=56, help="Random seed")
    parser.add_argument("--log_interval", type=int, default=500,
                        help="Tensorboard logging interval.")
    parser.add_argument("--checkpoint_interval", type=int, default=4000,
                        help="Checkpoint interval.")
    args = parser.parse_args()

    ## ARGS ##
    init_gpu_params(args)
    set_seed(args)
    if args.is_master:
        if os.path.exists(args.dump_path):
            if not args.force:
                raise ValueError(
                    f'Serialization dir {args.dump_path} already exists, but you have not specified whether to overwrite it. '
                    'Use `--force` if you want to overwrite it.')
            else:
                shutil.rmtree(args.dump_path)
        if not os.path.exists(args.dump_path):
            os.makedirs(args.dump_path)
        logger.info(f'Experiment will be dumped and logged in {args.dump_path}')

        ### SAVE PARAMS ###
        logger.info(f'Param: {args}')
        with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
        git_log(args.dump_path)
    assert (args.from_pretrained_weights is None and args.from_pretrained_config is None) or \
           (args.from_pretrained_weights is not None and args.from_pretrained_config is not None)

    ### TOKENIZER ###
    if args.teacher_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.teacher_name)
    elif args.teacher_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
    special_tok_ids = {}
    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
        idx = tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids

    ## DATA LOADER ##
    logger.info(f'Loading data from {args.data_file}')
    with open(args.data_file, 'rb') as fp:
        data = pickle.load(fp)

    assert os.path.isfile(args.token_counts)
    logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
    with open(args.token_counts, 'rb') as fp:
        counts = pickle.load(fp)
    assert len(counts) == args.vocab_size
    token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
    for idx in special_tok_ids.values():
        token_probs[idx] = 0.  # do not predict special tokens
    token_probs = torch.from_numpy(token_probs)

    train_dataloader = Dataset(params=args, data=data)
    logger.info('Data loader created.')

    ## STUDENT ##
    if args.from_pretrained_weights is not None:
        assert os.path.isfile(args.from_pretrained_weights)
        assert os.path.isfile(args.from_pretrained_config)
        logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
        logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
        stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
        stu_architecture_config.output_hidden_states = True
        student = DistilBertForMaskedLM.from_pretrained(
            args.from_pretrained_weights, config=stu_architecture_config)
    else:
        args.vocab_size_or_config_json_file = args.vocab_size
        stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True)
        student = DistilBertForMaskedLM(stu_architecture_config)
    if args.n_gpu > 0:
        student.to(f'cuda:{args.local_rank}')
    logger.info('Student loaded.')

    ## TEACHER ##
    if args.teacher_type == 'bert':
        teacher = BertForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
    elif args.teacher_type == 'roberta':
        teacher = RobertaForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
    logger.info(f'Teacher loaded from {args.teacher_name}.')

    ## DISTILLER ##
    torch.cuda.empty_cache()
    distiller = Distiller(params=args,
                          dataloader=train_dataloader,
                          token_probs=token_probs,
                          student=student,
                          teacher=teacher)
    distiller.train()
    logger.info("Let's go get some drinks.")
# pos_docs = (...)  # commented-out tuple of example positive passages (long literals omitted)
# neg_docs = (...)  # commented-out tuple of example negative passages (long literals omitted)
#
# inputs = list(queries) + list(pos_docs) + list(neg_docs)
# empty_list = [""] * len(inputs)
#
# for i, zipped in enumerate(inputs):
#     encodings = tokenizer(
#         zipped,
#         return_tensors="pt",
#         truncation=True,
#         padding=True,
#         max_length=512,
#     )
#     print(i, encodings["input_ids"].shape, tokenizer.decode(encodings["input_ids"][0]))

model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load("demo_model.pt", map_location=device))
model.to(device)
model.eval()

with torch.no_grad():
    inputs = queries + documents
    # Tokenize queries and documents
    encodings = tokenizer(inputs, **tokenizer_options).to(device)
    ids, masks = encodings["input_ids"], encodings["attention_mask"]
    # Inference pass through model
    outputs = model.distilbert(ids, masks)
    outputs_hidden = outputs.last_hidden_state.mean(dim=1)  # [:, 0]
    vec_queries = outputs_hidden[:len(queries)].unsqueeze(1)
    vec_documents = outputs_hidden[len(queries):].unsqueeze(0)
def train_MLM(config):
    conf = SimpleNamespace(**config)
    data_l = pd.read_pickle(conf.datapath_l)
    data_r = pd.read_pickle(conf.datapath_r)
    model_out = f"/lfs/1/sahaana/enrichment/ember/pretraining/models/{conf.model_name}"

    if "MARCO-1K" in conf.model_name:
        supervision = pd.read_pickle(conf.supervision)  # this is just using all of the provided BM25 to seed
        # So that in __getitem__ you can .loc by these guys
        data_l = data_l.set_index("QID")
        data_r = data_r.set_index("PID")
        bm25_argsort = None
    elif "MARCO" in conf.model_name:
        data_l = data_l.set_index("QID")
        data_r = data_r.set_index("PID")
        bm25_argsort = pd.read_pickle(conf.bm25_argsort_path)
        supervision = None
    else:  # if ("SQuAD" in conf.model_name) or ('imdb_wiki' in conf.model_name):
        # For these workloads, I've made sure that the index is already set, so no need to change index
        bm25_argsort = pd.read_pickle(conf.bm25_argsort_path)
        supervision = None
    """else:  # really just for deepmatcher
        bm25_argsort = np.load(conf.bm25_argsort_path)
        supervision = None"""

    # Tokenizer
    bert_tokenizer = AutoTokenizer.from_pretrained(f'{conf.model_type}-base-uncased')
    data_collator = DataCollatorForEnrich(tokenizer=bert_tokenizer,
                                          mlm=True,
                                          mlm_probability=conf.mlm_probability,
                                          masking=conf.mlm_masking,
                                          num_seps=conf.mlm_num_seps)

    # Model
    if conf.model_type == 'distilbert':
        model_config = DistilBertConfig()
        if conf.from_scratch:
            model = DistilBertForMaskedLM(config=model_config)
        else:
            model = DistilBertForMaskedLM(config=model_config).from_pretrained(
                f"distilbert-base-{conf.tokenizer_casing}")
    elif conf.model_type == 'bert':
        model_config = BertConfig()
        if conf.from_scratch:
            model = BertForMaskedLM(config=model_config)
        else:
            model = BertForMaskedLM(config=model_config).from_pretrained(
                f"bert-base-{conf.tokenizer_casing}")

    # Training Data
    if conf.num_test == 0:
        train_data_l = data_l
        train_data_r = data_r
        train_bm25 = bm25_argsort
    else:
        train_idx, test_idx = sequential_tt_split(len(data_l), conf.num_train, conf.num_test)
        train_data_l = data_l.iloc[train_idx]
        ###test_data_l = data_l.iloc[test_idx]
        train_data_r = data_r.iloc[train_idx]
        ###test_data_r = data_r.iloc[test_idx]
        train_bm25 = bm25_argsort[train_idx]
        ###test_bm25 = bm25_argsort[test_idx]

    # Training Configs
    if "MARCO-1K" in conf.model_name:
        train_dataset = MARCO_BM25MLMDataset(train_data_l, train_data_r, supervision,
                                             bert_tokenizer, data_col=conf.data_column)
    else:  # if ("MARCO" in conf.model_name) or ("SQuAD" in conf.model_name) or ("imdb_wiki" in conf.model_name):
        train_dataset = MARCO_MLMDataset(train_data_l, train_data_r, bert_tokenizer,
                                         data_col=conf.data_column, bm25_argsort=train_bm25)
    """else:  # deepmatcher
        train_dataset = DM_Okapi25MLMDataset(train_data_l, train_data_r, bert_tokenizer,
                                             data_col=conf.data_column, index_bm25=False,
                                             bm25_argsort=train_bm25)"""

    training_args = TrainingArguments(output_dir=model_out,
                                      overwrite_output_dir=True,
                                      num_train_epochs=conf.train_epochs,
                                      per_device_train_batch_size=conf.batch_size,
                                      save_steps=10_000)
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset)

    # Train and save
    trainer.train()
    trainer.save_model(model_out)
import torch
import flask
import joblib
import functools
import time
from flask import Flask
from flask import request
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, BertForMaskedLM, BertTokenizer

app = Flask(__name__)

DEVICE = "cpu"
MODEL_NAME = 'distilbert-base-uncased'
MODEL = DistilBertForMaskedLM.from_pretrained(MODEL_NAME)
PREDICTION_DICT = dict()
memory = joblib.Memory("../input/", verbose=0)


def predict_from_cache(sentence, mask):
    if sentence in PREDICTION_DICT:
        return PREDICTION_DICT[sentence]
    else:
        result = mask_prediction(sentence, mask)
        PREDICTION_DICT[sentence] = result
        return result


@memory.cache
def mask_prediction(sentence, mask):
    tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
    mask_index = int(mask)  # are
    text = str(sentence)  # "Hello how [MASK] you doing?"
    add_cross_attention=True, is_decoder=True)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# create tokenizer...
tokenizer = DistilBertTokenizer.from_pretrained("bert-large-uncased")

input_ids = tokenizer('This is a long article to summarize',
                      add_special_tokens=False,
                      return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

config.attention_type = 'performer'
model = DistilBertForMaskedLM.from_pretrained(bert2bert, config=config)

wiki = load_dataset("wikipedia", "20200501.fr", split='train[:3%]')
train_encodings = tokenizer(wiki['text'], padding=True, truncation=True)


class WikiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = [torch.tensor(val[idx]) for key, val in self.encodings.items()][0]
        return item
def softmax(x):
    return x.exp() / (x.exp().sum(-1)).unsqueeze(-1)


model = None
tokenizer = None
model_name = "bert-large-uncased"
disable_gpu = False

device = torch.device("cpu")
print("device:", device)

if not model:
    print("using model:", model_name)
    if "distilbert" in model_name:
        bert = DistilBertForMaskedLM.from_pretrained(model_name)
    else:
        bert = BertForMaskedLM.from_pretrained(model_name)
    bert.to(device)
else:
    print("using custom model:", model.config.architectures)
    bert = model
    bert.to(device)

if not tokenizer:
    if "distilbert" in model_name:
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    else:
        tokenizer = BertTokenizer.from_pretrained(model_name)
else:
    tokenizer = tokenizer
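# Hedged illustration (not in the original snippet): use the softmax helper above to turn
# the logits at a masked position into a probability distribution over the vocabulary.
bert.eval()
text = f"The weather today is {tokenizer.mask_token}."
inputs = tokenizer(text, return_tensors="pt").to(device)
with torch.no_grad():
    logits = bert(**inputs)[0]  # (1, seq_len, vocab_size)
mask_pos = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
probs = softmax(logits[0, mask_pos])  # rows sum to 1
print(probs.topk(5, dim=-1))          # most likely fillers and their probabilities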
import torch
from transformers import DistilBertForMaskedLM, DistilBertTokenizer, DistilBertConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'

config = DistilBertConfig.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)

config.attention_type = 'performer'
model = DistilBertForMaskedLM.from_pretrained(model_name, config=config)

wiki = load_dataset("wikipedia", "20200501.fr", split='train[:10%]')
train_encodings = tokenizer(wiki['text'], max_length=8192, padding='max_length', truncation=True)


class WikiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = [torch.tensor(val[idx]) for key, val in self.encodings.items()][0]
        return item

    def __len__(self):
        length = [len(val) for key, val in self.encodings.items()][0]
        return length
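# A hedged continuation sketch (assumed, mirroring the other MLM training examples in this
# collection; output_dir and the hyperparameters are placeholders): wrap the encodings in
# the dataset above and fine-tune with the HF Trainer and an MLM collator.
train_dataset = WikiDataset(train_encodings)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
training_args = TrainingArguments(output_dir="./mlm_out", num_train_epochs=1,
                                  per_device_train_batch_size=4)
trainer = Trainer(model=model, args=training_args,
                  data_collator=data_collator, train_dataset=train_dataset)
trainer.train()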
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import os
import torch

print(torch.cuda.is_available())

from transformers import DistilBertConfig
config = DistilBertConfig()

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

from transformers import DistilBertForMaskedLM
model = DistilBertForMaskedLM(config=config)
model.num_parameters()

from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../results_file.txt",
    block_size=128,
)

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
train_df = df.iloc[:3000]

# Getting the teacher's tokenizer and preparing the data
teacher_model_name = "bert-base-uncased"
student_model_name = "distilbert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(teacher_model_name)
dataset = LanguageModelingDataset(train_df["sentence"], teacher_model_name, sort=False)
collate_fn = DataCollatorForLanguageModeling(tokenizer)
dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32)

# Getting the teacher and student models
teacher = BertForMaskedLM.from_pretrained(teacher_model_name)
student = DistilBertForMaskedLM.from_pretrained(student_model_name)

# needed parameters for training
params = {
    "n_epoch": 3,
    "temperature": 2.0,
    "alpha_ce": 0.5,
    "alpha_mlm": 2.0,
    "alpha_cos": 1.0,
    "alpha_mse": 1.0,
    "gradient_accumulation_steps": 50,
    "learning_rate": 5e-4,
    "adam_epsilon": 1e-6,
    "weight_decay": 0.0,
    "warmup_prop": 0.05,
    "max_grad_norm": 5.0,