Example #1
 def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
     model = DistilBertForMaskedLM(config=config)
     model.eval()
     loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
     result = {
         "loss": loss,
         "prediction_scores": prediction_scores,
     }
     self.parent.assertListEqual(
         list(result["prediction_scores"].size()),
         [self.batch_size, self.seq_length, self.vocab_size])
     self.check_loss_output(result)
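This snippet targets the older transformers API, where the masked-LM targets were passed as `masked_lm_labels` and the model returned a plain tuple; in transformers v4+ the argument is `labels` and the outputs come back as a `MaskedLMOutput` (the newer test snippet further down shows this). A minimal sketch of the same forward pass against the v4 API:

import torch
from transformers import DistilBertConfig, DistilBertForMaskedLM

config = DistilBertConfig()                                # default vocab_size=30522
model = DistilBertForMaskedLM(config)                      # randomly initialized weights
model.eval()

input_ids = torch.randint(0, config.vocab_size, (2, 8))    # toy batch: 2 sequences of 8 tokens
labels = input_ids.clone()                                 # predict the inputs themselves
with torch.no_grad():
    out = model(input_ids, labels=labels)
print(out.loss.item(), out.logits.shape)                   # logits: (batch, seq_len, vocab_size)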
Example #2
def train(informal, formal):
    
    if torch.cuda.is_available():
        device = torch.device('cuda:1')
        print(f'Using GPU device: {device}')
    else:
        device = torch.device('cpu')
        print(f'GPU is not available, using CPU device {device}')
    wandb.init(project="NLP_BERT")
    
    train_config = {'batch_size': 10, 'n_epochs': 200, 'save_dir':'./checkpoints/', 'lr_scheduler': {
        'type': 'warmup,decay_linear',
        'warmup_steps_part': 0.05,
        'lr_peak': 1e-4,
    }}
    
    train_dataset = FormalDataset(informal, formal)
    model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
    model.to(device)

    #Model training procedure
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.,)
    n_steps = (len(train_dataset) // train_config['batch_size'] + 1) * train_config['n_epochs']
    lr_scheduler = LrScheduler(n_steps, **train_config['lr_scheduler'])
    train_dataloader = DataLoader(train_dataset, batch_size=train_config['batch_size'], shuffle=True, num_workers=4, drop_last=True)
    criterion = nn.CrossEntropyLoss(reduction='none')
    
    for epoch in range(1,train_config['n_epochs']+1):
        print('\n' + '-'*40)
        print(f'Epoch: {epoch}')
        print(f'Run training...')
        model.train()
        run_epoch(train_dataloader, model,
                  lr_scheduler, optimizer, criterion,device=device)
        save_checkpoint(epoch, model, lr_scheduler, optimizer, train_config['save_dir'])
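FormalDataset, run_epoch and save_checkpoint are project-local helpers that are not part of this snippet. A minimal sketch of the LrScheduler configured above, assuming 'warmup,decay_linear' means a linear warmup to lr_peak followed by a linear decay to zero (the project's actual class is not shown here):

class LrScheduler:
    def __init__(self, n_steps, type='warmup,decay_linear', warmup_steps_part=0.05, lr_peak=1e-4):
        # Only the warmup + linear-decay variant named in the config is sketched here.
        self.n_steps = n_steps
        self.warmup_steps = max(1, int(warmup_steps_part * n_steps))
        self.lr_peak = lr_peak
        self.step_num = 0

    def step(self, optimizer):
        # Set the learning rate for the current step on every parameter group.
        self.step_num += 1
        if self.step_num <= self.warmup_steps:
            lr = self.lr_peak * self.step_num / self.warmup_steps
        else:
            remaining = max(0, self.n_steps - self.step_num)
            lr = self.lr_peak * remaining / max(1, self.n_steps - self.warmup_steps)
        for group in optimizer.param_groups:
            group['lr'] = lr
        return lr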
Example #3
File: fitb.py Project: yyht/fitbert
    def __init__(
        self,
        model=None,
        tokenizer=None,
        model_name="bert-large-uncased",
        mask_token="***mask***",
        disable_gpu=False,
    ):
        self.mask_token = mask_token
        self.delemmatizer = Delemmatizer()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu")
        print("using model:", model_name)
        print("device:", self.device)

        if not model:
            if "distilbert" in model_name:
                self.bert = DistilBertForMaskedLM.from_pretrained(model_name)
            else:
                self.bert = BertForMaskedLM.from_pretrained(model_name)
            self.bert.to(self.device)
        else:
            self.bert = model

        if not tokenizer:
            if "distilbert" in model_name:
                self.tokenizer = DistilBertTokenizer.from_pretrained(
                    model_name)
            else:
                self.tokenizer = BertTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = tokenizer

        self.bert.eval()
Example #4
 def __init__(self, segment_size, output_size, dropout):
     super(DistillBertPunc, self).__init__()
     self.bert = DistilBertForMaskedLM.from_pretrained(
         './models/distillbert/')
     self.bert_vocab_size = 30522
     self.bn = nn.BatchNorm1d(segment_size * self.bert_vocab_size)
     self.fc = nn.Linear(segment_size * self.bert_vocab_size, output_size)
     self.dropout = nn.Dropout(dropout)
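Only the constructor is included in this snippet; a plausible forward pass, assuming the flattened MLM logits over the segment feed the batch-norm and linear head (my reconstruction, not the original code):

 def forward(self, x):
     # x: (batch, segment_size) token ids; index [0] keeps this compatible with both
     # tuple outputs (transformers v3) and ModelOutput objects (v4).
     logits = self.bert(x)[0]                    # (batch, segment_size, vocab_size)
     feats = logits.view(x.size(0), -1)          # (batch, segment_size * vocab_size)
     feats = self.dropout(self.bn(feats))
     return self.fc(feats)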
Example #5
    def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
        self.model = DistilBertForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()
Example #6
 def from_pretrained(cls, model_name: str):
     return cls(
         DistilBertForMaskedLM.from_pretrained(
             model_name,
             output_attentions=True,
             output_hidden_states=True,
             output_additional_info=True,
         ),
         DistilBertAligner.from_pretrained(model_name),
     )
Example #7
 def __init__(self, url=None):
     from transformers import DistilBertTokenizer, DistilBertForMaskedLM
     import torch
     self.torch = torch
     self.url = url
     if url is None:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
         self.bert = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
         self.bert.to(self.device)
         self.bert.eval()
Example #8
def test(informal):
    if torch.cuda.is_available():
        device = torch.device('cuda:3')
        print(f'Using GPU device: {device}')
    else:
        device = torch.device('cpu')
        print(f'GPU is not available, using CPU device {device}')

    test_config = {'batch_size': 5, 'epoch': 29, 'save_dir': './checkpoints/'}

    test_dataset = FormalDataset(informal)
    dataloader = DataLoader(test_dataset,
                            batch_size=test_config['batch_size'],
                            shuffle=False,
                            num_workers=4,
                            drop_last=False)
    config = DistilBertConfig()
    model = DistilBertForMaskedLM(config)
    load_model(test_config['epoch'], model, test_config['save_dir'])
    model.to(device)
    model.eval()
    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader)):
            inp = batch['input_ids'].to(device)
            attn = batch['attention_mask'].to(device)
            logits = model(input_ids=inp, attention_mask=attn)[0]
            preds = decode_text(test_dataset.tokenizer, logits)
            for seq in preds:
                with open('test_pred.txt', 'a') as res_file:
                    res_file.writelines(seq + '\n')
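FormalDataset, load_model and decode_text are project helpers that are not included in the snippet. A minimal sketch of what decode_text presumably does, assuming a greedy argmax over the MLM logits:

def decode_text(tokenizer, logits):
    # logits: (batch, seq_len, vocab_size) -> greedy token ids -> decoded strings
    pred_ids = logits.argmax(dim=-1)
    return tokenizer.batch_decode(pred_ids, skip_special_tokens=True)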
Example #9
 def create_and_check_distilbert_for_masked_lm(
     self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     model = DistilBertForMaskedLM(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids, attention_mask=input_mask, labels=token_labels)
     self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
Example #10
    def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
        try:
            import transformers
        except ModuleNotFoundError:
            raise ModuleNotFoundError('The transformers library is missing. Install it with `pip install transformers`')
            
        self.model_path = model_path

        # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # self.model = AutoModel.from_pretrained(model_path)
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
        self.model = DistilBertForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()
Example #11
    def __init__(self, args, vocab_subset=None):
        super().__init__()

        bert_model_name = args.bert_model_name
        dict_file = bert_model_name

        if args.bert_model_dir is not None:
            # load bert model from file
            bert_model_name = str(args.bert_model_dir) + "/"
            dict_file = bert_model_name + args.bert_vocab_name
            self.dict_file = dict_file
            print("loading BERT model from {}".format(bert_model_name))
        else:
            # load bert model from huggingface cache
            pass

        # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer
        do_lower_case = False
        if 'uncased' in bert_model_name:
            do_lower_case = True

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = DistilBertTokenizer.from_pretrained(dict_file)

        # original vocab
        self.map_indices = None
        self.vocab = list(self.tokenizer.ids_to_tokens.values())
        self._init_inverse_vocab()

        # Add custom tokenizer to avoid splitting the ['MASK'] token
        custom_basic_tokenizer = CustomBaseTokenizer(
            do_lower_case=do_lower_case)
        self.tokenizer.basic_tokenizer = custom_basic_tokenizer

        # Load pre-trained model (weights)
        # ... to get prediction/generation
        self.masked_bert_model = DistilBertForMaskedLM.from_pretrained(
            bert_model_name)

        self.masked_bert_model.eval()

        # ... to get hidden states
        self.bert_model = self.masked_bert_model.distilbert

        self.pad_id = self.inverse_vocab[BERT_PAD]

        self.unk_index = self.inverse_vocab[BERT_UNK]
Example #12
def main():
    batch_size = 4

    dev_dataset = TorchDataset(
        file_name="./data/diverse.triplets.dev.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=batch_size,
                                shuffle=False)

    test_dataset = TorchDataset(
        file_name="./data/diverse.triplets.test.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 shuffle=False)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased")

    # load model
    # DistilBertForSequenceClassification
    # DistilBertForMaskedLM
    model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
    model.load_state_dict(torch.load("demo_model.pt", map_location=device))
    model.to(device)

    model.eval()

    data_loader = dev_dataloader
    N = len(data_loader)

    correct, total = 0, 0
    start = time.time()
    with torch.no_grad():
        for i, (queries, pos_docs, neg_docs) in enumerate(data_loader):

            inputs = list(queries) + list(pos_docs) + list(neg_docs)

            encodings = tokenizer(
                inputs,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            )
            ids, masks = encodings["input_ids"], encodings["attention_mask"]

            ids = ids.to(device)  # (3B, MAXLEN)
            masks = masks.to(device)  # (3B, MAXLEN)

            # TODO: could add more layers after distilbert!
            outputs = model.distilbert(ids, masks)
            outputs_hidden = outputs.last_hidden_state.mean(dim=1)
            anchors, positives, negatives = outputs_hidden.view(
                3, len(queries), -1)

            # compute 2 distance: positive_doc to query, negative_doc to query using l2 distance
            pos_dist = (anchors - positives).norm(dim=-1)  # B distances
            neg_dist = (anchors - negatives).norm(dim=-1)  # B distances

            # pos_dist = 1 - F.cosine_similarity(anchors, positives, dim=-1)  # B distances
            # neg_dist = 1 - F.cosine_similarity(anchors, negatives, dim=-1)  # B distances

            correct += (pos_dist < neg_dist).sum()
            total += len(queries)

            if i % 10 == 0:
                remaining_time = (time.time() - start) / (i + 1) * N - (
                    time.time() - start)
                print(
                    f"remaining time: {remaining_time:.2f} | est. accuracy: {correct / total:.4f}"
                )

        print(f"accuracy {correct / total}")
Example #13
def get_distilkobert_lm():
    """ Return DistilBertForMaskedLM for DistilKobert """
    model = DistilBertForMaskedLM.from_pretrained('monologg/distilkobert')
    return model
Example #14
import string

from transformers import \
    AlbertTokenizer, AlbertForMaskedLM,\
    DistilBertTokenizer, DistilBertForMaskedLM, \
    RobertaTokenizer, RobertaForMaskedLM

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForMaskedLM.from_pretrained('albert-base-v2').eval()

albert_large_tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
albert_large_model = AlbertForMaskedLM.from_pretrained(
    'albert-large-v2').eval()

distilbert_tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-cased')
distilbert_model = DistilBertForMaskedLM.from_pretrained(
    'distilbert-base-cased').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-large').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])
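Illustrative usage, not part of the original file: score a masked slot with the DistilBERT pair loaded above and clean the top-k candidates with decode().

import torch

text = f"The capital of France is {distilbert_tokenizer.mask_token}."
inputs = distilbert_tokenizer(text, return_tensors='pt')
mask_pos = (inputs['input_ids'][0] == distilbert_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
with torch.no_grad():
    logits = distilbert_model(**inputs)[0]       # (1, seq_len, vocab_size)
pred_idx = logits[0, mask_pos[0]].topk(top_k).indices
print(decode(distilbert_tokenizer, pred_idx, top_clean=5))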
Example #15
from transformers import DistilBertTokenizerFast, DistilBertForMaskedLM

# Load the tokenizer
#tokenizer = DistilBertTokenizerFast.from_pretrained("distilabena-base-v2-akuapem-twi-cased", max_len=512) # the one we trained ourselves (akuapem)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilabena-base-v2-akuapem-twi-cased", max_len=512, do_lower_case=True) # the one we trained ourselves (asante, lowercase everything)
#tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased") # you could also use pre-trained DistilmBERT tokenizer
#tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased", do_lower_case=True) # for asante, lowercase pretrained tokenizer
#tokenizer.save_vocabulary("distilabena-base-akuapem-twi-cased") # when using pretrained tokenizer, be sure to save it locally
tokenizer.save_vocabulary("distilabena-base-v2-asante-twi-uncased") # saving pretrained tokenizer locally in case of asante 

# Load DistilBERT multilingual base checkpoint
#model = DistilBertForMaskedLM.from_pretrained("distilbert-base-multilingual-cased") # pretrained DistilmBERT weights
model = DistilBertForMaskedLM.from_pretrained("distilabena-base-v2-akuapem-twi-cased") # in the case of Asante Twi, start with Akuapem model weights
print("Number of parameters in the model:")
print(model.num_parameters())

# Create dataset object for JW300 dataset (Akuapem) or Asante Twi Bible 
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
#    file_path="../../data/jw300.en-tw.tw", # stage 1 - akuapem
    file_path="../../data/asante_twi_bible.txt", # stage 2 - asante
    block_size=128,
)

# Create "data collator" from dataset and tokenizer - with 15% chance of masking
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Define training arguments
from transformers import Trainer, TrainingArguments
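# The scraped example stops at the import above. A minimal, assumed continuation is
# sketched below; output_dir and the hyperparameter values are illustrative, not from
# the original script.
training_args = TrainingArguments(
    output_dir="distilabena-base-v2-asante-twi-uncased",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=1000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
trainer.save_model("distilabena-base-v2-asante-twi-uncased")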
Example #16
def main():
    parser = ArgumentParser('Distributed distillation example')
    parser.add_argument('--data_file', type=str, metavar='PATH',
                        required=True, help='Path to file containing the data (sequences).')
    parser.add_argument('--output_dir', type=str, metavar='PATH', required=True,
                        help='Path to the output directory (for logs, checkpoints, parameters, etc.).')
    parser.add_argument('-f', '--force', action='store_true',
                        help='Overwrite output_dir if it already exists.')
    parser.add_argument('--student_config_file', type=str, metavar='PATH',
                        required=True, help='Path to the student model configuration.')
    parser.add_argument('--student_weights_file', type=str, default=None,
                        metavar='PATH', help='Path to the student model initialization weights.')
    parser.add_argument('--teacher_type', type=str, default=None,
                        choices={'bert-base-uncased'}, help='The pre-trained teacher model type to initialize.')
    parser.add_argument('--tokenizer_vocab_file', type=str, metavar='PATH',
                        required=True, help='Path to the tokenizer vocabulary.')
    parser.add_argument('--min_sequence_len', type=int, default=12,
                        metavar='N', help='The minimum length of a sequence.')
    parser.add_argument('--max_sequence_len', type=int, default=512,
                        metavar='N', help='The maximum length of a sequence.')
    parser.add_argument('--do_tokenize', action='store_true',
                        help='Whether to tokenize the input.')
    parser.add_argument('--do_lower_case', action='store_true',
                        help='Whether to lowercase the input when tokenizing.')
    parser.add_argument('-n', '--num_epochs', type=int, default=3,
                        metavar='N', help='The number of distillation epochs.')
    parser.add_argument('-b', '--batch_size', type=int,
                        default=5, metavar='N', help='The batch size.')
    parser.add_argument('--lr', '--learning_rate', type=float,
                        default=5e-4, metavar='F', help='The initial learning rate.')
    parser.add_argument('--epsilon', type=float, default=1e-6,
                        metavar='F', help="Adam's epsilon.")
    parser.add_argument('--warmup_prop', type=float, default=0.05,
                        metavar='F', help='Linear warmup proportion.')
    parser.add_argument('--num_gradient_accumulation_steps', type=int, default=50, metavar='N',
                        help='The number of gradient accumulation steps (for larger batch sizes).')
    parser.add_argument('--max_gradient_norm', type=float,
                        default=5.0, metavar='F', help='The maximum gradient norm.')
    parser.add_argument('--soft_target_alpha', type=float, default=0.33,
                        metavar='F', help='The relative weight of the soft target loss.')
    parser.add_argument('--hard_target_alpha', type=float, default=0.33,
                        metavar='F', help='The relative weight of the hard target loss.')
    parser.add_argument('--cosine_emb_alpha', type=float, default=0.33,
                        metavar='F', help='The relative weight of the cosine embedding loss.')
    parser.add_argument('--seed', type=int, default=42,
                        metavar='N', help='Random seed.')
    parser.add_argument('-c', '--use_cuda', action='store_true',
                        help='Whether to use cuda or not.')
    parser.add_argument('-d', '--use_distributed', action='store_true',
                        help='Whether to use distributed training (distillation) or not.')
    parser.add_argument('--local_rank', type=int, default=-1,
                        metavar='N', help='Local process rank.')
    params = parser.parse_args()

    if not params.use_distributed:
        params.local_rank = 0
    params.is_master = params.local_rank == 0

    # make output_dir
    if Path(params.output_dir).is_dir() and not params.force:
        raise ValueError(
            f'Output directory {params.output_dir} already exists. Use `--force` if you want to overwrite it.')
    if params.is_master:
        Path(params.output_dir).mkdir(parents=True, exist_ok=params.force)

        # dump params
        json.dump(
            vars(params),
            open(Path(params.output_dir) / 'params.json', 'w'),
            indent=4,
            sort_keys=True
        )
    params.output_dir = Path(params.output_dir)

    # initialize multi-GPU
    if params.use_distributed:
        if params.is_master:
            logger.info('Initializing PyTorch distributed')
        torch.cuda.set_device(params.local_rank)
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='env://'
        )

    # set seed(s)
    if params.is_master:
        logger.info('Setting random seed(s)')
    random.seed(params.seed)
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    if params.use_distributed:
        torch.cuda.manual_seed_all(params.seed)

    # initialize the student
    if params.is_master:
        logger.info('Initializing the student')
    student_config = DistilBertConfig.from_pretrained(
        params.student_config_file)
    student_config.output_hidden_states = True
    if params.student_weights_file is not None:
        student = DistilBertForMaskedLM.from_pretrained(
            params.student_weights_file,
            config=student_config
        )
    else:
        student = DistilBertForMaskedLM(student_config)

    # initialize the teacher
    if params.is_master:
        logger.info('Initializing the teacher')
    teacher = BertForMaskedLM.from_pretrained(
        params.teacher_type, output_hidden_states=True)

    # initialize the tokenizer
    if params.is_master:
        logger.info('Initializing the tokenizer')
    tokenizer = BertWordPieceTokenizer(
        params.tokenizer_vocab_file,
        lowercase=params.do_lower_case
    )

    # initialize the dataset
    if params.is_master:
        logger.info('Initializing the dataset')
    dataset = LanguageModelingDataset(
        path=params.data_file,
        tokenizer=tokenizer,
        do_tokenize=params.do_tokenize,
        min_sequence_len=params.min_sequence_len,
        max_sequence_len=params.max_sequence_len
    )

    # initialize the sampler
    if params.is_master:
        logger.info('Initializing the sampler')
    group_bins = list(range(3, params.max_sequence_len, 4))
    group_idxs = quantize(dataset.lengths, group_bins)
    sampler = GroupedBatchSampler(
        sampler=DistributedSampler(dataset) if params.use_distributed else RandomSampler(dataset),
        group_idxs=group_idxs,
        batch_size=params.batch_size,
        drop_last=False
    )

    # initialize the dataloader
    if params.is_master:
        logger.info('Initializing the dataloader')
    dataloader = DataLoader(
        dataset=dataset,
        batch_sampler=sampler,
        collate_fn=dataset.sequences_collate_fn
    )

    # initialize the loss function
    if params.is_master:
        logger.info('Initializing the loss function')
    loss_fn = SanhLoss(
        alphas=(
            params.soft_target_alpha,
            params.hard_target_alpha,
            params.cosine_emb_alpha
        ),
        reduction=('batchmean', 'mean', 'mean')
    )

    # compute token counts
    if params.is_master:
        logger.info('Computing token counts')
    counter = Counter()
    for sequence in dataset.sequences:
        counter.update(sequence)
    token_counts = [0] * dataset._tokenizer.get_vocab_size()
    for k, v in counter.items():
        token_counts[k] = v
    del counter

    # compute token probabilities
    if params.is_master:
        logger.info('Computing token probabilities')
    token_probabilities = np.maximum(token_counts, 1) ** -0.7

    # give special tokens a zero probability
    for idx in dataset.special_tokens_map.values():
        token_probabilities[idx] = 0.0

    # convert to torch.FloatTensor
    token_probabilities = torch.FloatTensor(token_probabilities)

    # initialize the distiller
    if params.is_master:
        logger.info('Initializing the distiller')
    distiller = SanhDistiller(
        student=student,
        teacher=teacher,
        dataloader=dataloader,
        token_probabilities=token_probabilities,
        loss_fn=loss_fn,
        num_epochs=params.num_epochs,
        num_gradient_accumulation_steps=params.num_gradient_accumulation_steps,
        max_gradient_norm=params.max_gradient_norm,
        use_cuda=params.use_cuda,
        local_rank=params.local_rank,
        use_distributed=params.use_distributed,
        is_master=params.is_master,
        use_tqdm=True,
        logger=logger,
    )

    # start the distillation
    if params.is_master:
        logger.info('Starting the distillation')
    distiller.distill()

    # save the student model config and weights
    if params.is_master:
        logger.info('Saving the student model config')
        json.dump(
            vars(student.config),
            open(params.output_dir / 'distilled_bert_config.json', 'w'),
            indent=4,
            sort_keys=True
        )

        logger.info('Saving the student model weights')
        model_to_save = student.module if hasattr(student, 'module') else student  # Take care of distributed/parallel training
        torch.save(
            model_to_save.state_dict(),
            params.output_dir / 'distilled_bert_weights.pth'
        )
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp", type=int, default=7)
    parser.add_argument("--save",
                        type=str,
                        default="./model2_best_diverse_mean_maskedLM.pt")
    args = parser.parse_args()

    # Data and Tokenization
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased")

    batch_size = 4
    train_dataset = TorchDataset(
        file_name="./data/diverse.triplets.train.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)

    dev_dataset = TorchDataset(
        file_name="./data/diverse.triplets.dev.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=batch_size,
                                shuffle=False)

    # Model Training and Evaluation
    NUM_EPOCHS = 1
    LEARNING_RATE = 0.00003

    # load model
    model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

    # if args.exp == 1:
    #     pass
    # elif args.exp == 2:
    #
    # elif args.exp == 3:

    if args.exp == 7:
        # For Experiment7: average
        model = DistilBertForMaskedLM.from_pretrained(
            "distilbert-base-uncased")
        triplet_loss = nn.TripletMarginLoss(margin=1.0)

    elif args.exp == 6:
        # For Experiment6: base + cosine
        triplet_loss = nn.TripletMarginWithDistanceLoss(
            distance_function=lambda x, y: 1 - F.cosine_similarity(
                x, y, dim=-1),
            margin=1.0,
        )

    elif args.exp == 5:
        # For Experiment5: base + margin = 0.1
        triplet_loss = nn.TripletMarginLoss(margin=0.1)

    elif args.exp == 4:
        # For Experiment4: base
        triplet_loss = nn.TripletMarginLoss(margin=1.0)

    model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.distilbert.parameters(),
                                 lr=LEARNING_RATE)

    def evaluate(inputs, model, tokenizer):
        encodings = tokenizer(
            inputs,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        ids, masks = encodings["input_ids"], encodings["attention_mask"]
        outputs = model.distilbert(ids.to(device), masks.to(device))
        if args.exp < 7:
            # Experiment: using the first index of the last layers
            outputs_hidden = outputs.last_hidden_state[:, 0]
        else:
            # Averaging last layers
            outputs_hidden = outputs.last_hidden_state.mean(dim=1)

        return outputs_hidden.view(3, len(inputs) // 3, -1)  # batch size from inputs, not the outer-scope queries

    dataloader = train_dataloader
    N = len(dataloader)
    lowest_loss = float("inf")
    start = time.time()
    learning_curve_y = []
    learning_curve_x = []

    for epoch in range(NUM_EPOCHS):
        epoch_loss = 0
        for i, (queries, pos_docs, neg_docs) in enumerate(dataloader):
            # readability
            # train()
            # evaluate()
            # print()
            optimizer.zero_grad()  # set gradient to zero
            anchors, positives, negatives = evaluate(
                inputs=list(queries + pos_docs + neg_docs),
                model=model,
                tokenizer=tokenizer,
            )

            loss = triplet_loss(anchors, positives, negatives)
            loss.backward()
            optimizer.step()

            epoch_loss += float(loss)

            if i % 10 == 0:
                elapsed_time = time.time() - start
                remaining_time = elapsed_time * (1 / (i + 1) * N - 1)
                print(
                    f"{i}: remaining time: {remaining_time:.1f} | est. epoch loss: {epoch_loss / (i + 1):.4f}"
                )

            if i % 100 == 0:
                with torch.no_grad():
                    correct = total = 0
                    val_start = time.time()
                    for dq, dp, dn in dev_dataloader:
                        anchors, positives, negatives = evaluate(
                            inputs=list(dq + dp + dn),
                            model=model,
                            tokenizer=tokenizer,
                        )
                        if args.exp == 6:
                            # cosine distance
                            pos_dist = 1 - F.cosine_similarity(
                                anchors, positives, dim=-1)
                            neg_dist = 1 - F.cosine_similarity(
                                anchors, negatives, dim=-1)
                        else:
                            # using l2 norm
                            pos_dist = (anchors - positives).norm(
                                dim=-1)  # B distances
                            neg_dist = (anchors - negatives).norm(
                                dim=-1)  # B distances

                        correct += float((pos_dist < neg_dist).sum())
                        total += len(dq)
                        if time.time() - val_start > 15:
                            break
                    print(
                        f"{i}: est. validation accuracy: {correct / total:.4f}"
                    )
                    learning_curve_y.append(correct / total)
                    learning_curve_x.append(i * batch_size)  # epoch normally

            if (epoch_loss / (i + 1)) < lowest_loss:
                if args.exp == 4:
                    torch.save(model.state_dict(),
                               "model2_best_diverse_base.pt")
                elif args.exp == 5:
                    torch.save(model.state_dict(),
                               "model2_best_diverse_margin.pt")
                elif args.exp == 6:
                    torch.save(model.state_dict(),
                               "model2_best_diverse_cosine.pt")
                elif args.exp == 7:
                    torch.save(model.state_dict(),
                               "model2_best_diverse_mean_maskedLM.pt")

                lowest_loss = epoch_loss / (i + 1)

        print(f"loss for epoch {epoch} is {epoch_loss}")

        generate_data_for_plot(learning_curve_y, learning_curve_x)
Example #18
def test_runner():
    """Test that runner executes"""
    train_df = pd.read_csv("data/train.csv")
    valid_df = pd.read_csv("data/valid.csv")
    teacher_config = AutoConfig.from_pretrained("bert-base-uncased",
                                                output_hidden_states=True,
                                                output_logits=True)
    teacher = BertForMaskedLM.from_pretrained("bert-base-uncased",
                                              config=teacher_config)

    student_config = AutoConfig.from_pretrained(
        "distilbert-base-uncased",
        output_hidden_states=True,
        output_logits=True,
    )
    student = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased",
                                                    config=student_config)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    train_dataset = LanguageModelingDataset(train_df["text"], tokenizer)
    valid_dataset = LanguageModelingDataset(valid_df["text"], tokenizer)

    collate_fn = DataCollatorForLanguageModeling(tokenizer)
    train_dataloader = DataLoader(train_dataset,
                                  collate_fn=collate_fn,
                                  batch_size=2)
    valid_dataloader = DataLoader(valid_dataset,
                                  collate_fn=collate_fn,
                                  batch_size=2)
    loaders = {"train": train_dataloader, "valid": valid_dataloader}

    callbacks = {
        "masked_lm_loss":
        MaskedLanguageModelCallback(),
        "mse_loss":
        MSELossCallback(),
        "cosine_loss":
        CosineLossCallback(),
        "kl_div_loss":
        KLDivLossCallback(),
        "loss":
        MetricAggregationCallback(
            prefix="loss",
            mode="weighted_sum",
            metrics={
                "cosine_loss": 1.0,
                "masked_lm_loss": 1.0,
                "kl_div_loss": 1.0,
                "mse_loss": 1.0,
            },
        ),
        "optimizer":
        dl.OptimizerCallback(),
        "perplexity":
        PerplexityMetricCallbackDistillation(),
    }

    model = torch.nn.ModuleDict({"teacher": teacher, "student": student})
    runner = DistilMLMRunner()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders=loaders,
        verbose=True,
        check=True,
        callbacks=callbacks,
    )
    assert True
Example #19
config = OmegaConf.load(args.config_path)
print(OmegaConf.to_yaml(config))

os.environ['WANDB_DISABLED'] = 'true'

tokenizer = PreTrainedTokenizerFast(tokenizer_file=config.tokenizer_path)
tokenizer.mask_token = '[MASK]'
tokenizer.pad_token = "[PAD]"
tokenizer.sep_token = "[SEP]"
tokenizer.cls_token = "[CLS]"
tokenizer.unk_token = "[UNK]"

distilbert_config = DistilBertConfig(vocab_size=config.vocab_size,
                                     n_heads=8, dim=512, hidden_dim=2048)
model = DistilBertForMaskedLM(distilbert_config)

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=DATA_PATH,
    block_size=64)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=config.mlm_probability)

training_args = TrainingArguments(
    output_dir=config.output_path,
    overwrite_output_dir=True,
    num_train_epochs=config.num_train_epochs,
    learning_rate=config.learning_rate,
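    per_device_train_batch_size=32,  # assumed value; the original call is truncated at this point
)

# Assumed continuation of the truncated snippet: build the Trainer and run MLM training.
# Trainer is expected to be imported from transformers alongside TrainingArguments.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
trainer.save_model(config.output_path)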
Example #20
def main():
    parser = argparse.ArgumentParser(description="Training")

    parser.add_argument(
        "--dump_path",
        type=str,
        required=True,
        help="The output directory (log, checkpoints, parameters, etc.)")
    parser.add_argument(
        "--data_file",
        type=str,
        required=True,
        help=
        "The binarized file (tokenized + tokens_to_ids) and grouped by sequence."
    )
    parser.add_argument("--token_counts",
                        type=str,
                        required=True,
                        help="The token counts in the data_file for MLM.")
    parser.add_argument("--force",
                        action='store_true',
                        help="Overwrite dump_path if it already exists.")

    parser.add_argument("--vocab_size",
                        default=30522,
                        type=int,
                        help="The vocabulary size.")
    parser.add_argument(
        "--max_position_embeddings",
        default=512,
        type=int,
        help="Maximum sequence length we can model (including [CLS] and [SEP])."
    )
    parser.add_argument(
        "--sinusoidal_pos_embds",
        action='store_false',
        help=
        "If true, the position embeddings are simply fixed with sinusoidal embeddings."
    )
    parser.add_argument("--n_layers",
                        default=6,
                        type=int,
                        help="Number of Transformer blocks.")
    parser.add_argument("--n_heads",
                        default=12,
                        type=int,
                        help="Number of heads in the self-attention module.")
    parser.add_argument(
        "--dim",
        default=768,
        type=int,
        help="Dimension through the network. Must be divisible by n_heads")
    parser.add_argument("--hidden_dim",
                        default=3072,
                        type=int,
                        help="Intermediate dimension in the FFN.")
    parser.add_argument("--dropout", default=0.1, type=float, help="Dropout.")
    parser.add_argument("--attention_dropout",
                        default=0.1,
                        type=float,
                        help="Dropout in self-attention.")
    parser.add_argument("--activation",
                        default='gelu',
                        type=str,
                        help="Activation to use in self-attention")
    parser.add_argument(
        "--tie_weights_",
        action='store_false',
        help=
        "If true, we tie the embeddings matrix with the projection over the vocabulary matrix. Default is true."
    )

    parser.add_argument("--from_pretrained_weights",
                        default=None,
                        type=str,
                        help="Load student initialization checkpoint.")
    parser.add_argument(
        "--from_pretrained_config",
        default=None,
        type=str,
        help="Load student initialization architecture config.")
    parser.add_argument("--teacher_type",
                        default="bert",
                        choices=["bert", "roberta"],
                        help="Teacher type (BERT, RoBERTa).")
    parser.add_argument("--teacher_name",
                        default="bert-base-uncased",
                        type=str,
                        help="The teacher model.")

    parser.add_argument("--temperature",
                        default=2.,
                        type=float,
                        help="Temperature for the softmax temperature.")
    parser.add_argument(
        "--alpha_ce",
        default=0.5,
        type=float,
        help="Linear weight for the distillation loss. Must be >=0.")
    parser.add_argument("--alpha_mlm",
                        default=0.5,
                        type=float,
                        help="Linear weight for the MLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse",
                        default=0.0,
                        type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
    parser.add_argument(
        "--alpha_cos",
        default=0.0,
        type=float,
        help="Linear weight of the cosine embedding loss. Must be >=0.")
    parser.add_argument(
        "--mlm_mask_prop",
        default=0.15,
        type=float,
        help="Proportion of tokens for which we need to make a prediction.")
    parser.add_argument("--word_mask",
                        default=0.8,
                        type=float,
                        help="Proportion of tokens to mask out.")
    parser.add_argument("--word_keep",
                        default=0.1,
                        type=float,
                        help="Proportion of tokens to keep.")
    parser.add_argument("--word_rand",
                        default=0.1,
                        type=float,
                        help="Proportion of tokens to randomly replace.")
    parser.add_argument(
        "--mlm_smoothing",
        default=0.7,
        type=float,
        help=
        "Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec)."
    )
    parser.add_argument(
        "--restrict_ce_to_mask",
        action='store_true',
        help=
        "If true, compute the distilation loss only the [MLM] prediction distribution."
    )

    parser.add_argument("--n_epoch",
                        type=int,
                        default=3,
                        help="Number of pass on the whole dataset.")
    parser.add_argument("--batch_size",
                        type=int,
                        default=5,
                        help="Batch size (for each process).")
    parser.add_argument(
        "--tokens_per_batch",
        type=int,
        default=-1,
        help=
        "If specified, modify the batches so that they have approximately this number of tokens."
    )
    parser.add_argument(
        "--shuffle",
        action='store_false',
        help="If true, shuffle the sequence order. Default is true.")
    parser.add_argument(
        "--group_by_size",
        action='store_false',
        help=
        "If true, group sequences that have similar length into the same batch. Default is true."
    )

    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=50,
        help="Gradient accumulation for larger training batches.")
    parser.add_argument("--warmup_prop",
                        default=0.05,
                        type=float,
                        help="Linear warmup proportion.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--learning_rate",
                        default=5e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon",
                        default=1e-6,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=5.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--initializer_range",
                        default=0.02,
                        type=float,
                        help="Random initialization range.")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--n_gpu",
                        type=int,
                        default=1,
                        help="Number of GPUs in the node.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="Distributed training - Local rank")
    parser.add_argument("--seed", type=int, default=56, help="Random seed")

    parser.add_argument("--log_interval",
                        type=int,
                        default=500,
                        help="Tensorboard logging interval.")
    parser.add_argument("--checkpoint_interval",
                        type=int,
                        default=4000,
                        help="Checkpoint interval.")
    args = parser.parse_args()

    ## ARGS ##
    init_gpu_params(args)
    set_seed(args)
    if args.is_master:
        if os.path.exists(args.dump_path):
            if not args.force:
                raise ValueError(
                    f'Serialization dir {args.dump_path} already exists, but you have not specified whether to overwrite it. '
                    'Use `--force` if you want to overwrite it.')
            else:
                shutil.rmtree(args.dump_path)

        if not os.path.exists(args.dump_path):
            os.makedirs(args.dump_path)
        logger.info(
            f'Experiment will be dumped and logged in {args.dump_path}')

        ### SAVE PARAMS ###
        logger.info(f'Param: {args}')
        with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
        git_log(args.dump_path)
    assert (args.from_pretrained_weights is None and args.from_pretrained_config is None) or \
           (args.from_pretrained_weights is not None and args.from_pretrained_config is not None)

    ### TOKENIZER ###
    if args.teacher_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.teacher_name)
    elif args.teacher_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
    special_tok_ids = {}
    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
        idx = tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids

    ## DATA LOADER ##
    logger.info(f'Loading data from {args.data_file}')
    with open(args.data_file, 'rb') as fp:
        data = pickle.load(fp)

    assert os.path.isfile(args.token_counts)
    logger.info(
        f'Loading token counts from {args.token_counts} (already pre-computed)'
    )
    with open(args.token_counts, 'rb') as fp:
        counts = pickle.load(fp)
        assert len(counts) == args.vocab_size
    token_probs = np.maximum(counts, 1)**-args.mlm_smoothing
    for idx in special_tok_ids.values():
        token_probs[idx] = 0.  # do not predict special tokens
    token_probs = torch.from_numpy(token_probs)

    train_dataloader = Dataset(params=args, data=data)
    logger.info(f'Data loader created.')

    ## STUDENT ##
    if args.from_pretrained_weights is not None:
        assert os.path.isfile(args.from_pretrained_weights)
        assert os.path.isfile(args.from_pretrained_config)
        logger.info(
            f'Loading pretrained weights from {args.from_pretrained_weights}')
        logger.info(
            f'Loading pretrained config from {args.from_pretrained_config}')
        stu_architecture_config = DistilBertConfig.from_json_file(
            args.from_pretrained_config)
        stu_architecture_config.output_hidden_states = True
        student = DistilBertForMaskedLM.from_pretrained(
            args.from_pretrained_weights, config=stu_architecture_config)
    else:
        args.vocab_size_or_config_json_file = args.vocab_size
        stu_architecture_config = DistilBertConfig(**vars(args),
                                                   output_hidden_states=True)
        student = DistilBertForMaskedLM(stu_architecture_config)

    if args.n_gpu > 0:
        student.to(f'cuda:{args.local_rank}')
    logger.info(f'Student loaded.')

    ## TEACHER ##
    if args.teacher_type == 'bert':
        teacher = BertForMaskedLM.from_pretrained(args.teacher_name,
                                                  output_hidden_states=True)
    elif args.teacher_type == 'roberta':
        teacher = RobertaForMaskedLM.from_pretrained(args.teacher_name,
                                                     output_hidden_states=True)
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
    logger.info(f'Teacher loaded from {args.teacher_name}.')

    ## DISTILLER ##
    torch.cuda.empty_cache()
    distiller = Distiller(params=args,
                          dataloader=train_dataloader,
                          token_probs=token_probs,
                          student=student,
                          teacher=teacher)
    distiller.train()
    logger.info("Let's go get some drinks.")
Example #21
# pos_docs = (...)  # long tuple of sample positive passages, abridged
# neg_docs = (...)  # long tuple of sample negative passages, abridged
#
# inputs = list(queries) + list(pos_docs) + list(neg_docs)
# empty_list = [""] * len(inputs)
#
# for i, zipped in enumerate(inputs):
#     encodings = tokenizer(
#         zipped,
#         return_tensors="pt",
#         truncation=True,
#         padding=True,
#         max_length=512,
#     )
#     print(i, encodings["input_ids"].shape, tokenizer.decode(encodings["input_ids"][0]))
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load("demo_model.pt", map_location=device))
model.to(device)
model.eval()

with torch.no_grad():
    inputs = queries + documents
    # Tokenize queries and documents
    encodings = tokenizer(inputs, **tokenizer_options).to(device)
    ids, masks = encodings["input_ids"], encodings["attention_mask"]

    # Inference pass through the DistilBERT encoder only (the MLM head is not needed for embeddings)
    outputs = model.distilbert(ids, masks)
    # Mean-pool the token embeddings into one vector per input (an alternative is CLS-style pooling via [:, 0])
    outputs_hidden = outputs.last_hidden_state.mean(dim=1)
    # Add broadcast dimensions so every query can be compared against every document
    vec_queries = outputs_hidden[:len(queries)].unsqueeze(1)       # (n_queries, 1, hidden)
    vec_documents = outputs_hidden[len(queries):].unsqueeze(0)     # (1, n_documents, hidden)
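    # --- Added sketch (not part of the original snippet): the code above stops after building
    # --- the broadcastable query/document tensors, so a plausible next step is a pairwise
    # --- cosine-similarity ranking; the variable names below are assumptions.
    scores = torch.nn.functional.cosine_similarity(vec_queries, vec_documents, dim=-1)  # (n_queries, n_documents)
    best_doc_per_query = scores.argmax(dim=1)
    for q_idx, d_idx in enumerate(best_doc_per_query.tolist()):
        print(f"query {q_idx} -> best document {d_idx} (cosine {scores[q_idx, d_idx]:.3f})")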
Example #22
0
def train_MLM(config):
    conf = SimpleNamespace(**config)

    data_l = pd.read_pickle(conf.datapath_l)
    data_r = pd.read_pickle(conf.datapath_r)
    model_out = f"/lfs/1/sahaana/enrichment/ember/pretraining/models/{conf.model_name}"

    if "MARCO-1K" in conf.model_name:
        supervision = pd.read_pickle(
            conf.supervision
        )  # this just uses all of the provided BM25 results as the seed supervision
        # Set the indexes so that __getitem__ can look rows up with .loc
        data_l = data_l.set_index("QID")
        data_r = data_r.set_index("PID")
        bm25_argsort = None
    elif "MARCO" in conf.model_name:
        data_l = data_l.set_index("QID")
        data_r = data_r.set_index("PID")
        bm25_argsort = pd.read_pickle(conf.bm25_argsort_path)
        supervision = None
    else:  #if ("SQuAD" in conf.model_name) or ('imdb_wiki' in conf.model_name):
        # For these workloads, I've made sure that the index is already set, so no need to change index
        bm25_argsort = pd.read_pickle(conf.bm25_argsort_path)
        supervision = None
    """else: #really just for deepmatcher
        bm25_argsort = np.load(conf.bm25_argsort_path)
        supervision = None"""

    # Tokenizer
    bert_tokenizer = AutoTokenizer.from_pretrained(
        f'{conf.model_type}-base-uncased')
    data_collator = DataCollatorForEnrich(tokenizer=bert_tokenizer,
                                          mlm=True,
                                          mlm_probability=conf.mlm_probability,
                                          masking=conf.mlm_masking,
                                          num_seps=conf.mlm_num_seps)

    # Model
    if conf.model_type == 'distilbert':
        model_config = DistilBertConfig()
        if conf.from_scratch:
            model = DistilBertForMaskedLM(config=model_config)
        else:
            model = DistilBertForMaskedLM.from_pretrained(
                f"distilbert-base-{conf.tokenizer_casing}")

    elif conf.model_type == 'bert':
        model_config = BertConfig()
        if conf.from_scratch:
            model = BertForMaskedLM(config=model_config)
        else:
            model = BertForMaskedLM.from_pretrained(
                f"bert-base-{conf.tokenizer_casing}")

    # Training Data
    if conf.num_test == 0:
        train_data_l = data_l
        train_data_r = data_r
        train_bm25 = bm25_argsort
    else:
        train_idx, test_idx = sequential_tt_split(len(data_l), conf.num_train,
                                                  conf.num_test)
        train_data_l = data_l.iloc[train_idx]
        ###test_data_l = data_l.iloc[test_idx]
        train_data_r = data_r.iloc[train_idx]
        ###test_data_r = data_r.iloc[test_idx]
        train_bm25 = bm25_argsort[train_idx]
        ###test_bm25 = bm25_argsort[test_idx]

    # Training Configs
    if "MARCO-1K" in conf.model_name:
        train_dataset = MARCO_BM25MLMDataset(train_data_l,
                                             train_data_r,
                                             supervision,
                                             bert_tokenizer,
                                             data_col=conf.data_column)
    else:  #if ("MARCO" in conf.model_name) or ("SQuAD" in conf.model_name) or ("imdb_wiki" in conf.model_name):
        train_dataset = MARCO_MLMDataset(train_data_l,
                                         train_data_r,
                                         bert_tokenizer,
                                         data_col=conf.data_column,
                                         bm25_argsort=train_bm25)
    """else: # deepmatcher
        train_dataset = DM_Okapi25MLMDataset(train_data_l, train_data_r, 
                                             bert_tokenizer, data_col=conf.data_column, 
                                             index_bm25=False, bm25_argsort=train_bm25)"""

    training_args = TrainingArguments(
        output_dir=model_out,
        overwrite_output_dir=True,
        num_train_epochs=conf.train_epochs,
        per_device_train_batch_size=conf.batch_size,
        save_steps=10_000)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset)

    # Train and save
    trainer.train()
    trainer.save_model(model_out)
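train_MLM reads every setting from a single config dict; the sketch below shows the shape such a dict could take, where all of the paths and values are placeholders (assumptions), not settings from the original project.

# Hypothetical config for train_MLM; the keys mirror the conf.* attributes read above,
# but every path and value here is a placeholder, not taken from the original project.
marco_config = {
    "model_name": "MARCO-demo",                       # routed through the elif "MARCO" branch above
    "model_type": "distilbert",                       # or "bert"
    "tokenizer_casing": "uncased",
    "from_scratch": False,
    "datapath_l": "/path/to/queries.pkl",             # DataFrame with a QID column
    "datapath_r": "/path/to/passages.pkl",            # DataFrame with a PID column
    "bm25_argsort_path": "/path/to/bm25_argsort.pkl",
    "supervision": None,                              # only read by the "MARCO-1K" branch
    "data_column": "text",
    "mlm_probability": 0.15,
    "mlm_masking": "random",                          # whatever DataCollatorForEnrich expects; placeholder
    "mlm_num_seps": 1,
    "num_train": 90_000,
    "num_test": 0,                                    # 0 -> train on everything
    "train_epochs": 3,
    "batch_size": 16,
}

train_MLM(marco_config)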
Example #23
0
import torch
import flask
import joblib
import functools
import time
from flask import Flask
from flask import request
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, BertForMaskedLM, BertTokenizer

app = Flask(__name__)

DEVICE = "cpu"
MODEL_NAME = 'distilbert-base-uncased'
MODEL = DistilBertForMaskedLM.from_pretrained(MODEL_NAME)
PREDICTION_DICT = dict()

memory = joblib.Memory("../input/", verbose=0)

def predict_from_cache(sentence, mask):
    if sentence in PREDICTION_DICT:
        return PREDICTION_DICT[sentence]
    else:
        result = mask_prediction(sentence, mask)
        PREDICTION_DICT[sentence] = result
        return result

@memory.cache
def mask_prediction(sentence, mask):
    tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
    mask_index = int(mask)  # are
    text = str(sentence)  # "Hello how [MASK] you doing?"
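    # --- Added sketch: the original function body is cut off at this point, so the lines
    # --- below are only a plausible completion (top-5 candidates for the [MASK] slot),
    # --- not the original code.
    encoded = tokenizer(text, return_tensors="pt")
    mask_positions = (encoded["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    with torch.no_grad():
        logits = MODEL(**encoded).logits
    top_ids = logits[0, mask_positions[0]].topk(5).indices.tolist()
    return {
        "sentence": text,
        "mask_index": mask_index,
        "predictions": tokenizer.convert_ids_to_tokens(top_ids),
    }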
Example #24
0
                                                add_cross_attention=True,
                                                is_decoder=True)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
# create tokenizer...
tokenizer = DistilBertTokenizer.from_pretrained("bert-large-uncased")
input_ids = tokenizer('This is a long article to summarize',
                      add_special_tokens=False,
                      return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
# train...
# loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
# loss.backward()

config.attention_type = 'performer'

# The checkpoint name below is an assumption: passing the bert2bert model object as the
# pretrained path (as the original line did) fails, so a named DistilBERT checkpoint is used instead.
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased", config=config)

wiki = load_dataset("wikipedia", "20200501.fr", split='train[:3%]')

train_encodings = tokenizer(wiki['text'], padding=True, truncation=True)


class WikiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = [torch.tensor(val[idx])
                for key, val in self.encodings.items()][0]
        return item
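    # --- Added sketch: the snippet is cut off here; the matching __len__ (same as the
    # --- WikiDataset in the example below) would be:
    def __len__(self):
        return [len(val) for key, val in self.encodings.items()][0]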
Example #25
0
def softmax(x):
    return x.exp() / (x.exp().sum(-1)).unsqueeze(-1)


model = None
tokenizer = None
model_name = "bert-large-uncased"

disable_gpu = False
device = torch.device("cuda" if torch.cuda.is_available() and not disable_gpu else "cpu")
print("device:", device)
if not model:
    print("using model:", model_name)
    if "distilbert" in model_name:
        bert = DistilBertForMaskedLM.from_pretrained(model_name)
    else:
        bert = BertForMaskedLM.from_pretrained(model_name)
    bert.to(device)
else:
    print("using custom model:", model.config.architectures)
    bert = model
    bert.to(device)

if not tokenizer:
    if "distilbert" in model_name:
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    else:
        tokenizer = BertTokenizer.from_pretrained(model_name)
else:
    tokenizer = tokenizer
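The snippet above only loads the model and tokenizer; the sketch below shows one way this setup and the softmax helper are typically used to rank candidate words for a masked slot (the sentence and candidate list are invented for illustration).

# Added usage sketch (not part of the original snippet): score candidate words for a [MASK] position.
sentence = "The weather today is really [MASK]."   # made-up example
candidates = ["nice", "terrible", "purple"]         # made-up candidates

inputs = tokenizer(sentence, return_tensors="pt").to(device)
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0][0]

with torch.no_grad():
    logits = bert(**inputs).logits                  # (1, seq_len, vocab_size)

probs = softmax(logits[0, mask_pos])                # distribution over the vocabulary
cand_ids = tokenizer.convert_tokens_to_ids(candidates)
ranked = sorted(zip(candidates, probs[cand_ids].tolist()), key=lambda p: p[1], reverse=True)
print(ranked)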
Example #26
0
import torch
from transformers import DistilBertForMaskedLM, DistilBertTokenizer, DistilBertConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'

config = DistilBertConfig.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)
config.attention_type = 'performer'

model = DistilBertForMaskedLM.from_pretrained(model_name, config=config)

wiki = load_dataset("wikipedia", "20200501.fr", split='train[:10%]')

train_encodings = tokenizer(wiki['text'], max_length=8192, padding='max_length', truncation=True)


class WikiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = [torch.tensor(val[idx]) for key, val in self.encodings.items()][0]
        return item

    def __len__(self):
        length = [len(val) for key, val in self.encodings.items()][0]
        return length
Example #27
0
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import os
import torch
print(torch.cuda.is_available())
from transformers import DistilBertConfig

config = DistilBertConfig()

from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

from transformers import DistilBertForMaskedLM

model = DistilBertForMaskedLM(config=config)

model.num_parameters()

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../results_file.txt",
    block_size=128,
)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True, mlm_probability=0.15)
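# --- Added sketch (the original snippet is truncated above): the usual way such an MLM
# --- pretraining script continues, mirroring the Trainer setup in Example #22; the output
# --- directory and hyperparameters below are placeholders.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./distilbert-mlm-from-scratch",   # placeholder path
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()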
Example #28
0
train_df = df.iloc[:3000]

# Getting teachers tokenizer and preparing data
teacher_model_name = "bert-base-uncased"
student_model_name = "distilbert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(teacher_model_name)

dataset = LanguageModelingDataset(train_df["sentence"],
                                  teacher_model_name,
                                  sort=False)
collate_fn = DataCollatorForLanguageModeling(tokenizer)
dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32)

# Getting teacher and student model
teacher = BertForMaskedLM.from_pretrained(teacher_model_name)
student = DistilBertForMaskedLM.from_pretrained(student_model_name)

# parameters needed for training
params = {
    "n_epoch": 3,
    "temperature": 2.0,
    "alpha_ce": 0.5,
    "alpha_mlm": 2.0,
    "alpha_cos": 1.0,
    "alpha_mse": 1.0,
    "gradient_accumulation_steps": 50,
    "learning_rate": 5e-4,
    "adam_epsilon": 1e-6,
    "weight_decay": 0.0,
    "warmup_prop": 0.05,
    "max_grad_norm": 5.0,