def __init__(self,  
              num_classes=1):
     super(NaiveSummarizer, self).__init__()
     
     self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
     self.tokenizer.padding_side = 'left'
     
     self.embedder = LSTM(self.tokenizer.vocab_size)
     self.lstm = nn.LSTM(128, 64, 1, batch_first=True, bidirectional=False)
     self.fc = nn.Linear(64, num_classes)
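
A minimal usage sketch (not part of the original model code) showing what the left padding configured above produces; the sample strings are placeholders:

from transformers import AlbertTokenizer

# Hedged sketch: with padding_side='left', pad ids are prepended, so each row's
# attention_mask looks like [0, ..., 0, 1, ..., 1].
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
tokenizer.padding_side = 'left'
batch = tokenizer(["a short sentence", "a somewhat longer example sentence"],
                  padding=True, return_tensors='pt')
print(batch['input_ids'])
print(batch['attention_mask'])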
Example No. 2
def get_tokenizer(model_type='BERT'):
    if model_type == 'distilBERT':
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
    elif model_type == 'BERT':
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    elif model_type == 'alBERT':
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    else:
        # Raise instead of printing; otherwise `tokenizer` would be unbound here.
        raise ValueError(f'model_type not allowed: {model_type}')
    return tokenizer
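
A brief, assumed usage of get_tokenizer (the sentence and max_length are arbitrary):

# Hedged sketch: select the ALBERT tokenizer and encode a single sentence.
tokenizer = get_tokenizer(model_type='alBERT')
encoded = tokenizer("ALBERT shares parameters across its transformer layers.",
                    truncation=True, max_length=32)
print(encoded['input_ids'])   # SentencePiece ids wrapped with [CLS]/[SEP]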
Example No. 3
def create_tokenizer(model_type: str) -> PreTrainedTokenizer:
    if model_type == "albert":
        return AlbertTokenizer.from_pretrained("albert-base-v2")
    elif model_type == "bert":
        return BertTokenizer.from_pretrained("bert-base-uncased")
    elif model_type == "electra":
        return BertTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError(
            f"model_type={model_type} must be one of ['albert', 'bert', 'electra']"
        )
Example No. 4
def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(
        model, config, args.checkpoint)
    model.save_pretrained(args.output)

    tokenizer = AlbertTokenizer.from_pretrained(args.spiece, keep_accents=True)
    tokenizer.save_pretrained(args.output)
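
A hedged sketch of how the arguments consumed by main() might be wired up; the flag names mirror the attributes used above but are otherwise assumed, not taken from the original script:

import argparse

# Illustrative argument parser; the real conversion script may define these differently.
parser = argparse.ArgumentParser(description="Convert a TF ALBERT checkpoint to PyTorch")
parser.add_argument("--config", required=True, help="path to the ALBERT config JSON")
parser.add_argument("--checkpoint", required=True, help="path to the TensorFlow checkpoint")
parser.add_argument("--spiece", required=True, help="SentencePiece model used to build the tokenizer")
parser.add_argument("--output", required=True, help="directory passed to save_pretrained()")

if __name__ == "__main__":
    main(parser.parse_args())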
Example No. 5
 def getAlBertEmbeddings(self):
     model = AlbertModel.from_pretrained('albert-base-v2')
     tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
     model.eval()
     tokens_tensor, segments_tensors = self.getIndexs(tokenizer)
     with torch.no_grad():
         last_hidden_states = model(tokens_tensor,
                                    attention_mask=segments_tensors)
     # Use the [CLS] position of the last hidden layer as the sentence embedding.
     features = last_hidden_states[0][:, 0, :].numpy()
     # Flatten the (1, hidden_size) array into a plain vector.
     features = np.reshape(features, features.shape[1])
     return features.tolist()
Example No. 6
def init_model(cachedir='~/hashtag/', no_cuda=True):
    global tokenizer, model

    f_cachedir = os.path.expanduser(cachedir)
    bert_config = AlbertConfig.from_pretrained(f_cachedir)
    model = HashtagClassifier.from_pretrained(f_cachedir, config=bert_config)
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
    model.to(device)
    model.eval()

    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
Example No. 7
def get_bert(bert_model, bert_do_lower_case):
    # Avoid a hard dependency on BERT by only importing it if it's being used.
    # Note: the arguments are currently unused; the Malay BERT checkpoint is hard-coded,
    # and its SentencePiece vocabulary is loaded through AlbertTokenizer.
    from transformers import AlbertTokenizer, BertModel
    model = BertModel.from_pretrained('huseinzol05/bert-base-bahasa-cased')
    tokenizer = AlbertTokenizer.from_pretrained(
        'huseinzol05/bert-base-bahasa-cased',
        unk_token = '[UNK]',
        pad_token = '[PAD]',
        do_lower_case = False,
    )
    return tokenizer, model
Example No. 8
def download_albert_base():
    file = '../input/albert-base-v2'

    config = AlbertConfig.from_pretrained('albert-base-v2')
    config.save_pretrained(file)
    
    model = AlbertModel.from_pretrained('albert-base-v2')
    model.save_pretrained(file)

    tkn = AlbertTokenizer.from_pretrained('albert-base-v2')
    tkn.save_pretrained(file)
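
A short follow-up sketch (assumed, not from the original snippet) confirming the saved artifacts can be reloaded offline from the same directory:

# Hedged sketch: reload config, model, and tokenizer from the local copy.
local_dir = '../input/albert-base-v2'
config = AlbertConfig.from_pretrained(local_dir)
model = AlbertModel.from_pretrained(local_dir, config=config)
tokenizer = AlbertTokenizer.from_pretrained(local_dir)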
Example No. 9
def _test(_hparams):
    model = BERT.load_from_checkpoint(
        checkpoint_path=_hparams.weight_path,
        tags_csv=_hparams.cfg_path
    )
    print('model loaded.')
    model.eval()
    model.freeze()

    if _hparams.pretrained_model.startswith('distilbert'):
        tokenizer = DistilBertTokenizer.from_pretrained(_hparams.pretrained_model)
    elif _hparams.pretrained_model.startswith('bert'):
        tokenizer = BertTokenizer.from_pretrained(_hparams.pretrained_model)
    elif _hparams.pretrained_model.startswith('albert'):
        tokenizer = AlbertTokenizer.from_pretrained(_hparams.pretrained_model)
    else:
        raise ValueError('Unrecognized model name.')

    y_all, y_hat_all = [], []

    error_analysis_f = None
    if _hparams.error_analysis:
        error_analysis_f = open(MiningConfig.error_analysis_path % (_hparams.name, _hparams.dataset), 'w')

    for input_ids, attention_mask, token_type_ids, y in model.test_dataloader():
        y_hat, attn = model(input_ids, attention_mask, token_type_ids)
        a, y_hat = torch.max(y_hat, dim=1)
        for i in range(input_ids.size(0)):
            y_single = y.cpu().numpy()[i]
            y_hat_single = y_hat.cpu().numpy()[i]
            text = tokenizer.decode(input_ids[i]).\
                replace('[CLS]', '').replace('[SEP]', '').replace('[PAD]', '').\
                replace('\t', '').replace('\n', '').strip()
            y_all.append(y_single)
            y_hat_all.append(y_hat_single)

            if 'STANCE' not in _hparams.dataset:
                if _hparams.error_analysis:
                    # Treating label 1 as the positive class: predicting 1 on a true 0 is a
                    # false positive, predicting 0 on a true 1 is a false negative.
                    if y_single == 0 and y_hat_single == 1:
                        error_analysis_f.write('FP' + '\t' + text + '\n')
                    if y_single == 1 and y_hat_single == 0:
                        error_analysis_f.write('FN' + '\t' + text + '\n')
            else:
                if _hparams.error_analysis:
                    if y_single != y_hat_single:
                        error_analysis_f.write('%s-->%s' % (LABEL_MAP['STANCE'][y_single], LABEL_MAP['STANCE'][y_hat_single])
                                               + '\t' + text + '\n')

    if _hparams.error_analysis:
        error_analysis_f.close()

    test_acc = accuracy_score(y_all, y_hat_all)
    test_f1 = f1_score(y_all, y_hat_all, average='macro')
    print(test_acc, test_f1)
Example No. 10
 def _test_TFAlbert(self, size, large=False):
     from transformers import AlbertTokenizer, TFAlbertModel
     tokenizer = AlbertTokenizer.from_pretrained(size)
     model = TFAlbertModel.from_pretrained(size)
     input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
     spec, input_dict = self.spec_and_pad(input_dict)
     outputs = ["last_hidden_state"]
     self.run_test(model,
                   input_dict,
                   input_signature=spec,
                   outputs=outputs,
                   large=large)
Example No. 11
 def init_predict_tokenizer(self, tokenizer: PreTrainedTokenizer,
                            ckpt: str) -> None:
     self.inf_session.tokenizer = tokenizer if tokenizer else \
         AlbertTokenizer.from_pretrained(self.inf_session.config.model.sp_model,
                                         max_len=self.inf_session.config.data_source.max_seq_length, truncation=True)
     self.inf_session.special_token_mask = [
         self.inf_session.tokenizer.unk_token_id,
         self.inf_session.tokenizer.sep_token_id,
         self.inf_session.tokenizer.pad_token_id,
         self.inf_session.tokenizer.cls_token_id
     ]
     logger.info(f'Predictions from model weights: {ckpt}')
Example No. 12
 def __init__(self, 
              in_dim, 
              hidden_dim, 
              out_dim, 
              num_heads, 
              num_classes=2):
     super(BasicSummarizer, self).__init__()
     
     self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
     self.tokenizer.padding_side = 'left'
     self.embedder = LSTM(self.tokenizer.vocab_size)
     self.gat_classifier = GATClassifier(in_dim, hidden_dim, out_dim, num_heads, num_classes)
Example No. 13
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if tokenizer_name.startswith(
            "bert-"
    ) or 'rubert' in tokenizer_name or '/bert-' in tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith(
            "roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the "model-before-preprocess" reorganization;
    # we can pass the tokenizer created in the model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
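
A small illustrative aside (not part of the function above) on why len(tokenizer) is preferred over tokenizer.vocab_size once special tokens have been added:

from transformers import GPT2Tokenizer

# Hedged sketch: vocab_size reflects only the base vocabulary,
# while len(tokenizer) also counts tokens registered via add_special_tokens().
tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.add_special_tokens({"bos_token": "<start>", "sep_token": "<delim>", "cls_token": "<extract>"})
print(tok.vocab_size)   # unchanged base vocabulary size
print(len(tok))         # base vocabulary plus the three newly added special tokens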
Example No. 14
    def test_tokenization_albert(self):
        # Given
        self.base_tokenizer = AlbertTokenizer.from_pretrained(
            'albert-base-v2', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyAlbertTokenizer(get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
            ['albert-base-v2']),
                                                do_lower_case=True,
                                                strip_accents=True)

        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        # Note: the original sentence piece tokenizer strips trailing spaces
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a.strip() for example in self.examples],
            max_len=256,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust,
                  baseline) in enumerate(zip(output_rust, output_baseline)):
            if rust.token_ids != baseline['input_ids']:
                if len(rust.token_ids) == len(baseline['input_ids']):
                    if Counter(rust.token_ids) != Counter(
                            baseline['input_ids']):
                        raise AssertionError(
                            f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                            f'Sentence a: {self.examples[idx].text_a} \n'
                            f'Sentence b: {self.examples[idx].text_b} \n'
                            f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                            f'Rust: {rust.token_ids} \n'
                            f'Python {baseline["input_ids"]}')
                else:
                    raise AssertionError(
                        f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                        f'Sentence a: {self.examples[idx].text_a} \n'
                        f'Sentence b: {self.examples[idx].text_b} \n'
                        f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                        f'Rust: {rust.token_ids} \n'
                        f'Python {baseline["input_ids"]}')
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
Example No. 15
    def __init__(self, batch_size, epoch_num, model_name, is_test):
        self.BATCH_SIZE = batch_size
        self.EPOCHS = epoch_num
        self.NUM_LABELS = 4
        self.model_name = model_name

        if self.model_name == "bert":
            self.model_version = 'bert-base-cased'
            self.tokenizer = BertTokenizer.from_pretrained(self.model_version)
            if is_test:
                self.model = BertForSequenceClassification.from_pretrained(
                    model_name + "_model", num_labels=self.NUM_LABELS)
            else:
                self.model = BertForSequenceClassification.from_pretrained(
                    self.model_version, num_labels=self.NUM_LABELS)
        elif self.model_name == "robert":
            self.model_version = 'roberta-base'
            self.tokenizer = RobertaTokenizer.from_pretrained(
                self.model_version)
            if is_test:
                self.model = RobertaForSequenceClassification.from_pretrained(
                    model_name + "_model", num_labels=self.NUM_LABELS)
            else:
                self.model = RobertaForSequenceClassification.from_pretrained(
                    self.model_version, num_labels=self.NUM_LABELS)
        elif self.model_name == "albert":
            self.model_version = 'albert-base-v2'
            self.tokenizer = AlbertTokenizer.from_pretrained(
                self.model_version)
            if is_test:
                self.model = AlbertForSequenceClassification.from_pretrained(
                    model_name + "_model", num_labels=self.NUM_LABELS)
            else:
                self.model = AlbertForSequenceClassification.from_pretrained(
                    self.model_version, num_labels=self.NUM_LABELS)

        if is_test:
            self.testset = FakeNewsDataset("test", tokenizer=self.tokenizer)
            self.testloader = DataLoader(self.testset,
                                         batch_size=self.BATCH_SIZE,
                                         collate_fn=create_mini_batch)
        else:
            self.trainset = FakeNewsDataset("train", tokenizer=self.tokenizer)
            self.trainloader = DataLoader(self.trainset,
                                          batch_size=self.BATCH_SIZE,
                                          collate_fn=create_mini_batch)
            self.model.train()
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
Example No. 16
 def encode_text(cls, tlc_text: np.array, sequence_length):
     tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
     instances_tlc = tokenizer.batch_encode_plus(
         tlc_text,
         max_length=sequence_length,
         pad_to_max_length=True,
         return_attention_masks=True,
         return_token_type_ids=False)
     input_ids_tlc = torch.tensor(instances_tlc['input_ids'],
                                  dtype=torch.int32)
     attention_mask_tlc = torch.tensor(instances_tlc['attention_mask'],
                                       dtype=torch.int32)
     return input_ids_tlc, attention_mask_tlc
Example No. 17
 def __init__(self,
              path='src/Bert',
              model_type='huseinzol05/bert-base-bahasa-cased'):
     self.path = path
     self.model_type = model_type
     self.tokenizer = AlbertTokenizer.from_pretrained(self.path,
                                                      do_lower_case=True)
     self.model = BertForSequenceClassification.from_pretrained(
         self.path, num_labels=3)
     self.device = "cuda" if torch.cuda.is_available() else "cpu"
     # self.device = "cpu"
     self.model.to(self.device)
     self.model.eval()
Example No. 18
def compute_input_ids_masks_albert(train_set, val_test_set_emb, MAX_LEN):
    from transformers import AlbertTokenizer
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', padding_side='left')
    text_batch_train = train_set['review'].apply(dataset_preprocessing.clean_text).to_list()
    encoding = tokenizer(text_batch_train, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN)
    train_inputs = encoding['input_ids']
    train_masks = encoding['attention_mask']

    text_batch_val = val_test_set_emb['review'].apply(dataset_preprocessing.clean_text).to_list()
    encoding = tokenizer(text_batch_val, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN)
    val_test_inputs = encoding['input_ids']
    val_test_masks = encoding['attention_mask']
    return train_inputs.numpy(), train_masks.numpy(), val_test_inputs.numpy(), val_test_masks.numpy()
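
A hedged usage sketch for compute_input_ids_masks_albert; the DataFrames only need a 'review' column, and dataset_preprocessing.clean_text is assumed to be importable as in the original module:

import pandas as pd

# Hedged sketch with tiny placeholder data.
train_df = pd.DataFrame({'review': ["great movie, loved it", "terrible pacing and acting"]})
val_test_df = pd.DataFrame({'review': ["surprisingly good"]})
train_ids, train_masks, val_ids, val_masks = compute_input_ids_masks_albert(train_df, val_test_df, 128)
print(train_ids.shape, train_masks.shape)   # both (2, padded_len)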
Example No. 19
 def __init__(self, args):
     self.args = args
     self.tokenizer = AlbertTokenizer.from_pretrained('huseinzol05/bert-base-bahasa-cased', 
             unk_token = '[UNK]', pad_token='[PAD]', do_lower_case=False)
     self.sep_token = '[SEP]'
     self.cls_token = '[CLS]'
     self.pad_token = '[PAD]'
     self.tgt_bos = '[CLS]'
     self.tgt_eos = '[SEP]'
     self.tgt_sent_split = '[CLS]'
     self.sep_vid = self.tokenizer.vocab[self.sep_token]
     self.cls_vid = self.tokenizer.vocab[self.cls_token]
     self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example No. 20
    def __init__(self, in_dim, hidden_dim, out_dim, num_heads, num_classes=2):
        super(Summarizer, self).__init__()

        albert_base_configuration = AlbertConfig(
            hidden_size=256,
            num_attention_heads=4,
            intermediate_size=1024,
        )

        self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        self.embedder = AlbertModel(albert_base_configuration)
        self.gat_classifier = GATClassifier(in_dim, hidden_dim, out_dim,
                                            num_heads, num_classes)
Example No. 21
 def __init__(self, model_name, max_length, device):
     super(TransformerRLN, self).__init__()
     self.max_length = max_length
     self.device = device
     if model_name == 'albert':
         self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
         self.encoder = AlbertModel.from_pretrained('albert-base-v2')
     elif model_name == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         self.encoder = BertModel.from_pretrained('bert-base-uncased')
     else:
         raise NotImplementedError
     self.to(self.device)
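
A minimal sketch (assumed, not from the original code) of pushing one small batch through the encoder configured above:

import torch

# Hedged sketch: tokenize a batch and run the underlying ALBERT encoder.
rln = TransformerRLN(model_name='albert', max_length=64, device='cpu')
batch = rln.tokenizer(["a first sentence", "a second, longer example sentence"],
                      padding=True, truncation=True, max_length=rln.max_length,
                      return_tensors='pt')
with torch.no_grad():
    outputs = rln.encoder(**batch)
print(outputs[0].shape)   # (batch, seq_len, hidden_size) last hidden states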
Example No. 22
def read_train_inputs(train_file, delimiter, max_len, max_negatives,
                      num_dev_samples):
    f = open(train_file, 'r', encoding='utf8')
    unique_entity_map, all_samples = {}, []
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    for line in f.readlines():
        info = line.strip().split('\t')
        sentence, entity_id, canonical_name, negative_samples = (
            info[0], info[1], info[2], info[3].split(delimiter))
        if len(negative_samples) < max_negatives:
            negative_samples = negative_samples + [''] * (
                max_negatives - len(negative_samples))
        else:
            negative_samples = negative_samples[:max_negatives]
        token_info = tokenizer.encode_plus(sentence,
                                           max_length=max_len,
                                           pad_to_max_length=True)
        sentence_tokens = token_info['input_ids']
        sentence_attention_mask = np.array(token_info['attention_mask'])
        negative_tokens, negative_attention_masks = [], []
        for negative_sample in negative_samples:
            negative_token_info = tokenizer.encode_plus(negative_sample,
                                                        max_length=max_len,
                                                        pad_to_max_length=True)
            negative_tokens.append(negative_token_info['input_ids'])
            negative_attention_masks.append(
                np.array(negative_token_info['attention_mask']))
        train_sample = TrainSample(sentence, entity_id, negative_samples,
                                   sentence_tokens, negative_tokens,
                                   sentence_attention_mask,
                                   negative_attention_masks)
        all_samples.append(train_sample)
        if entity_id not in unique_entity_map:
            entity_token_info = tokenizer.encode_plus(canonical_name,
                                                      max_length=max_len,
                                                      pad_to_max_length=True)
            new_entity = EntityObj(
                entity_id, canonical_name, entity_token_info['input_ids'],
                np.array(entity_token_info['attention_mask']))
            unique_entity_map[entity_id] = new_entity
            new_entity.utterances.append(sentence_tokens)
            new_entity.masks.append(sentence_attention_mask)
        else:
            unique_entity_map[entity_id].utterances.append(sentence_tokens)
            unique_entity_map[entity_id].masks.append(sentence_attention_mask)
    random.shuffle(all_samples)
    split_idx = len(all_samples) - num_dev_samples
    train_samples, dev_samples = all_samples[:split_idx], all_samples[split_idx:]
    return train_samples, dev_samples, unique_entity_map
Example No. 23
 def __init__(self,
              data_dir,
              task,
              max_len,
              bert_name,
              bert_type,
              mode='train'):
     self.mode = mode
     if bert_type == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained(bert_name)
     elif bert_type == 'albert':
         self.tokenizer = AlbertTokenizer.from_pretrained(bert_name)
     self.data = self.convert_data(max_len)
     self.num_class = 3
Example No. 24
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True):
        super(ALBERT, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 510:
            logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). Value will be set to 510")
            max_seq_length = 510
        self.max_seq_length = max_seq_length

        self.bert = AlbertModel.from_pretrained(model_name_or_path)
        self.tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case)
        self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0]
        self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0]
Example No. 25
def train(args):
    wandb.init(config=args, project='CXR-BERT')

    set_seed(args.seed)

    # TODO: bert-base,small,tiny tokenizer
    if args.bert_model == "albert-base-v2":
        tokenizer = AlbertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=True).tokenize
    elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":  # same with Bert-base-cased model
        tokenizer = AutoTokenizer.from_pretrained(args.bert_model).tokenize
    elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
        tokenizer = AutoTokenizer.from_pretrained(args.bert_model).tokenize
    elif args.bert_model == "bert-small-scratch":
        tokenizer = BertTokenizer.from_pretrained(
            "google/bert_uncased_L-4_H-512_A-8", do_lower_case=True).tokenize
    elif args.bert_model == "bert-base-scratch":
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                                  do_lower_case=True).tokenize
    else:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                  do_lower_case=True).tokenize

    transforms = get_transforms(args)

    print("Load Train dataset", args.train_dataset)
    train_dataset = CXRDataset(args.train_dataset, tokenizer, transforms, args)

    print("Load Test dataset", args.test_dataset)
    test_dataset = CXRDataset(args.test_dataset, tokenizer, transforms, args) \
        if args.test_dataset is not None else None

    print("Create DataLoader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False) \
        if test_dataset is not None else None

    print("Creating BERT Trainer")
    trainer = CXRBERT_Trainer(args,
                              train_dataloader=train_data_loader,
                              test_dataloader=test_data_loader)

    print("Training Start!")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)
Example No. 26
    def test_tokenization_albert(self):
        # Given
        self.base_tokenizer = AlbertTokenizer.from_pretrained(
            'albert-base-v2', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyAlbertTokenizer(get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
            ['albert-base-v2']),
                                                do_lower_case=True,
                                                strip_accents=True)

        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        # Note: the original sentence piece tokenizer strips trailing spaces
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a.strip() for example in self.examples],
            max_len=256,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust,
                  baseline) in enumerate(zip(output_rust, output_baseline)):
            if rust.token_ids != baseline['input_ids']:
                for pos, (rust_id, baseline_id) in enumerate(
                        zip(rust.token_ids, baseline['input_ids'])):
                    # This check is required because SentencePiece can be ambiguous in very rare cases
                    # (e.g. "eee" -> "e, ee" or "ee, e" have the same score)
                    if rust_id != baseline_id:
                        if pos < len(baseline):
                            if (rust_id != baseline['input_ids'][pos + 1]) & \
                                    (rust_id != baseline['input_ids'][pos - 1]):
                                raise AssertionError(
                                    f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                                    f'Sentence a: {self.examples[idx].text_a} \n'
                                    f'Sentence b: {self.examples[idx].text_b} \n'
                                    f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                                    f'Rust: {rust.token_ids} \n'
                                    f'Python {baseline["input_ids"]}')
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
Example No. 27
    def __init__(
        self,
        model_name=Defaults.model_name,
        measure=Defaults.measure,
        gap=Defaults.gap,
        gap_mask=Defaults.gap_mask,
        gap_tune=Defaults.gap_tune,
        gap_mask_tune=Defaults.gap_mask_tune,
        min_token_length_normal=Defaults.min_token_length_normal,
        min_token_length_lead=Defaults.min_token_length_lead,
        min_token_length_followup=Defaults.min_token_length_followup,
        min_token_length_normal_tune=Defaults.min_token_length_normal_tune,
        min_token_length_lead_tune=Defaults.min_token_length_lead_tune,
        min_token_length_followup_tune=Defaults.min_token_length_followup_tune,
        device=Defaults.device,
        inference_batch_size=Defaults.inference_batch_size,
        inference_mask_evenly=Defaults.inference_mask_evenly,
        len_sent_allow_cut=Defaults.len_sent_allow_cut,
        p_mask=Defaults.p_mask,
        show_progress_bar=Defaults.show_progress_bar,
    ):
        """This class should not be instantiated directly: instead use BlancHelp or BlancTune"""
        self.model_name = model_name
        self.measure = measure
        self.gap = gap
        self.gap_mask = gap_mask
        self.gap_tune = gap_tune
        self.gap_mask_tune = gap_mask_tune
        self.min_token_length_normal = min_token_length_normal
        self.min_token_length_lead = min_token_length_lead
        self.min_token_length_followup = min_token_length_followup
        self.min_token_length_normal_tune = min_token_length_normal_tune
        self.min_token_length_lead_tune = min_token_length_lead_tune
        self.min_token_length_followup_tune = min_token_length_followup_tune
        self.device = device
        self.inference_batch_size = inference_batch_size
        self.inference_mask_evenly = inference_mask_evenly
        self.len_sent_allow_cut = len_sent_allow_cut
        self.p_mask = p_mask
        self.show_progress_bar = show_progress_bar

        # Negative tune values mean "not given": fall back to the non-tune counterparts.
        self.gap_tune = self.gap if self.gap_tune < 0 else self.gap_tune
        self.gap_mask_tune = self.gap_mask if self.gap_mask_tune < 0 else self.gap_mask_tune

        if self.model_name.lower().find('albert') >= 0:
            self.model_tokenizer = AlbertTokenizer.from_pretrained(model_name)
        else:
            self.model_tokenizer = BertTokenizer.from_pretrained(model_name)
Example No. 28
 def load_ds_from_cache(self) -> None:
     ds_metadata = utils.core_utils.load_json(self.ds_meta)
     self.dataset_conf['num_train_recs'], self.dataset_conf['num_val_recs'], self.dataset_conf[
         'num_test_recs'] = \
         ds_metadata["train_recs"], ds_metadata["val_recs"], ds_metadata["test_recs"]
     for k in self.dataset_conf[self.target_ds_structure].keys():
         self.dataset_conf[f'{k}_start_date'] = ds_metadata[f'{k}_start_date'] \
             if f'{k}_start_date' in ds_metadata.keys() else None
         self.dataset_conf[f'{k}_end_date'] = ds_metadata[f'{k}_end_date'] \
             if f'{k}_end_date' in ds_metadata.keys() else None
     self.dataset_conf['dsid'] = ds_metadata[
         'dsid'] if 'dsid' in ds_metadata.keys() else None
     self.dataset_conf['albert_tokenizer'] = \
         AlbertTokenizer.from_pretrained(self.config.model.sp_model, max_len=self.config.data_source.max_seq_length,
                                         truncation=True)
Example No. 29
    def _get_lm_model_tokenizer(self, lm_model="albert"):
        if getattr(self, "_lm_model_tokenizer", None) is not None:
            return self._lm_model_tokenizer

        if self._train_dl is not None and self._train_dl.dataset is not None:
            self._lm_model_tokenizer = self._train_dl.dataset.lm_model_tokenizer

        # Only load a fresh tokenizer if the dataloader did not already provide one;
        # otherwise the assignment above would be clobbered immediately.
        if getattr(self, "_lm_model_tokenizer", None) is None:
            if lm_model == "albert":
                self._lm_model_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
            else:
                raise NotImplementedError(
                    f"{lm_model} lm model is not supported. Only albert is supported at this moment."
                )

        return self._lm_model_tokenizer
Example No. 30
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}):
        super(ALBERT, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 510:
            logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). Value will be set to 510")
            max_seq_length = 510
        self.max_seq_length = max_seq_length

        if self.do_lower_case is not None:
            tokenizer_args['do_lower_case'] = do_lower_case

        self.albert = AlbertModel.from_pretrained(model_name_or_path, **model_args)
        self.tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
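
A short, assumed usage sketch for this ALBERT wrapper module (do_lower_case is forwarded into tokenizer_args by the constructor):

# Hedged sketch; assumes the surrounding ALBERT class definition is importable.
albert = ALBERT('albert-base-v2', max_seq_length=256, do_lower_case=True)
print(albert.max_seq_length)                       # 256
print(albert.tokenizer.tokenize("Hello, world!"))  # SentencePiece pieces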