def main():

    print('Start')
    parser = argparse.ArgumentParser()

    # Add the arguments to the parser
    parser.add_argument("--model_name", required=True)
    parser.add_argument("--checkpoint_input_path", required=False)
    parser.add_argument("--checkpoint_output_path", required=True)
    parser.add_argument("--mnli_path", required=True)
    parser.add_argument("--squad_path", required=True)
    parser.add_argument("--train_squad", default=True)
    parser.add_argument("--train_mnli", default=True)
    parser.add_argument("--seed", default=1995)
    parser.add_argument("--learning_rate", default=5e-5, type=float)
    parser.add_argument("--batch_size", default=16, type=int)
    parser.add_argument("--epochs", default=3, type=int)

    args = vars(parser.parse_args())

    random.seed(args['seed'])

    def read_squad(path):
        path = Path(path)
        with open(path, 'rb') as f:
            squad_dict = json.load(f)
        contexts = []
        questions = []
        answers = []
        for group in squad_dict['data']:
            for passage in group['paragraphs']:
                context = passage['context']
                for qa in passage['qas']:
                    question = qa['question']
                    for answer in qa['answers']:
                        contexts.append(context)
                        questions.append(question)
                        answers.append(answer)

        return contexts, questions, answers

    squad_contexts, squad_questions, squad_answers = read_squad(
        args['squad_path'])
    random_index = random.sample(range(len(squad_answers)), 16)
    squad_contexts = [squad_contexts[index] for index in random_index]
    squad_questions = [squad_questions[index] for index in random_index]
    squad_answers = [squad_answers[index] for index in random_index]

    def parse_mnli(path):
        sentences_a = []
        sentences_b = []
        labels = []
        with open(path, "r+", encoding="utf8") as f:
            for item in jsonlines.Reader(f):
                sentences_a.append(item['sentence1'])
                sentences_b.append(item['sentence2'])
                labels.append(item['gold_label'])

        return sentences_a, sentences_b, labels

    mnli_a, mnli_b, mnli_labels = parse_mnli(args['mnli_path'])

    random_index = random.sample(range(len(mnli_a)), 16)
    mnli_a = [mnli_a[index] for index in random_index]
    mnli_b = [mnli_b[index] for index in random_index]
    mnli_labels = [mnli_labels[index] for index in random_index]

    label_encode = {'contradiction': 0, 'neutral': 1, 'entailment': 2}
    mnli_labels = [label_encode[label] for label in mnli_labels]

    print('Done importing data')

    from transformers import BertTokenizer, BertTokenizerFast

    tokenizer = BertTokenizer.from_pretrained(args['model_name'],
                                              do_lower_case=True,
                                              padding=True,
                                              truncation=True,
                                              add_special_tokens=True,
                                              model_max_length=500)

    tokenizer_fast = BertTokenizerFast.from_pretrained(args['model_name'],
                                                       do_lower_case=True,
                                                       padding=True,
                                                       truncation=True,
                                                       add_special_tokens=True,
                                                       model_max_length=500)

    from squad_processing import add_end_idx, add_token_positions

    add_end_idx(squad_answers, squad_contexts)

    squad_encodings = tokenizer_fast(squad_contexts,
                                     squad_questions,
                                     add_special_tokens=True,
                                     truncation=True,
                                     padding=True,
                                     max_length=500)

    # Processing of token positions
    add_token_positions(squad_encodings, squad_answers, tokenizer_fast)
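    # A rough sketch (illustrative; the actual squad_processing implementation
    # may differ) of what add_token_positions does: it converts the
    # character-level answer spans into token indices with the fast
    # tokenizer's char_to_token() mapping, e.g.
    #
    #     start_tok = squad_encodings.char_to_token(i, answer['answer_start'])
    #     end_tok = squad_encodings.char_to_token(i, answer['answer_end'] - 1)
    #     # answers lost to truncation fall back to tokenizer.model_max_length
    #
    # and stores the resulting 'start_positions'/'end_positions' lists on the
    # encodings, which SquadDataset reads below.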


    # MNLI

    mnli_encodings = tokenizer(mnli_a,
                               mnli_b,
                               add_special_tokens=True,
                               max_length=500,
                               truncation=True,
                               padding=True)
    mnli_encodings['labels'] = mnli_labels

    from torch.utils.data import Dataset

    class MnliDataset(Dataset):
        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            #print(self.encodings['start_positions'][idx])
            #{key: torch.tensor(val[idx], dtype = torch.long) for key, val in self.encodings.items()}
            return {
                'input_ids':
                torch.tensor(self.encodings['input_ids'][idx],
                             dtype=torch.long),
                'attention_mask':
                torch.tensor(self.encodings['attention_mask'][idx],
                             dtype=torch.long),
                'token_type_ids':
                torch.tensor(self.encodings['token_type_ids'][idx],
                             dtype=torch.long),
                'labels':
                torch.tensor(self.encodings['labels'][idx], dtype=torch.long)
            }

        def __len__(self):
            return len(self.encodings.input_ids)

    class SquadDataset(Dataset):
        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            #print(self.encodings['start_positions'][idx])
            #{key: torch.tensor(val[idx], dtype = torch.long) for key, val in self.encodings.items()}
            return {
                'input_ids':
                torch.tensor(self.encodings['input_ids'][idx],
                             dtype=torch.long),
                'attention_mask':
                torch.tensor(self.encodings['attention_mask'][idx],
                             dtype=torch.long),
                'start_positions':
                torch.tensor(self.encodings['start_positions'][idx],
                             dtype=torch.long),
                'end_positions':
                torch.tensor(self.encodings['end_positions'][idx],
                             dtype=torch.long)
            }

        def __len__(self):
            return len(self.encodings.input_ids)

    train_mnli = MnliDataset(mnli_encodings)
    train_squad = SquadDataset(squad_encodings)

    from transformers import BertPreTrainedModel, BertModel
    from torch import nn
    from torch.nn import CrossEntropyLoss


    class BertForMultiLabelSequenceClassification(BertPreTrainedModel):
        """BERT model for classification.
	    This module is composed of the BERT model with a linear layer on top of
	    the pooled output.
	    """
        def __init__(self, config, num_labels=3):
            super().__init__(config)
            self.num_labels = num_labels
            self.bert = BertModel(config)
            self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
            self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
            #self.apply(self.init_bert_weights)

        def forward(self,
                    input_ids,
                    token_type_ids=None,
                    attention_mask=None,
                    labels=None):
            # Pass these by keyword: BertModel's forward signature is
            # (input_ids, attention_mask, token_type_ids, ...), so the original
            # positional call silently swapped the mask and the segment ids.
            pooled_output = self.bert(input_ids,
                                      attention_mask=attention_mask,
                                      token_type_ids=token_type_ids)[1]
            pooled_output = self.dropout(pooled_output)
            logits = self.classifier(pooled_output)

            return logits

        def freeze_bert_encoder(self):
            for param in self.bert.parameters():
                param.requires_grad = False

        def unfreeze_bert_encoder(self):
            for param in self.bert.parameters():
                param.requires_grad = True


    mnli_model = BertForMultiLabelSequenceClassification.from_pretrained(
        args['model_name'])

    from torch.nn import DataParallel
    from torch.utils.data import DataLoader
    from transformers import AdamW

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    print(device)

    train_loader_mnli = DataLoader(train_mnli,
                                   batch_size=args['batch_size'],
                                   shuffle=True)
    mnli_model = DataParallel(mnli_model)

    optim = AdamW(mnli_model.parameters(), lr=args['learning_rate'])

    mnli_model.to(device)
    mnli_model.train()

    from barbar import Bar
    for epoch in range(args['epochs']):
        for i, batch in enumerate(Bar(train_loader_mnli)):
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device,
                                                        dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device,
                                                        dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = mnli_model(input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=labels)

            loss_fct = CrossEntropyLoss().to(device)
            loss = loss_fct(outputs, labels)
            #loss = outputs.loss
            loss.sum().backward()
            optim.step()
    mnli_model.eval()

    # mnli_model is wrapped in DataParallel, so save the underlying module's
    # state_dict (otherwise every key carries a "module." prefix).
    file_name = args['checkpoint_output_path'] + '/checkpoint_mnli.pt'
    torch.save(mnli_model.module.state_dict(), file_name)

    from transformers.modeling_outputs import QuestionAnsweringModelOutput

    class BertForQuestionAnswering(BertPreTrainedModel):

        _keys_to_ignore_on_load_unexpected = [r"pooler"]

        def __init__(self, config):
            super().__init__(config)
            self.num_labels = config.num_labels

            self.bert = BertModel(config, add_pooling_layer=False)
            self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

            self.init_weights()

        def forward(
            self,
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            start_positions=None,
            end_positions=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
        ):
            r"""
	        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
	            Labels for position (index) of the start of the labelled span for computing the token classification loss.
	            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
	            sequence are not taken into account for computing the loss.
	        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
	            Labels for position (index) of the end of the labelled span for computing the token classification loss.
	            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
	            sequence are not taken into account for computing the loss.
	        """
            return_dict = return_dict if return_dict is not None else self.config.use_return_dict

            outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            sequence_output = outputs[0]

            logits = self.qa_outputs(sequence_output)
            start_logits, end_logits = logits.split(1, dim=-1)
            start_logits = start_logits.squeeze(-1)
            end_logits = end_logits.squeeze(-1)

            total_loss = None
            if start_positions is not None and end_positions is not None:
                # If we are on multi-GPU, split add a dimension
                if len(start_positions.size()) > 1:
                    start_positions = start_positions.squeeze(-1)
                if len(end_positions.size()) > 1:
                    end_positions = end_positions.squeeze(-1)
                # sometimes the start/end positions are outside our model inputs, we ignore these terms
                ignored_index = start_logits.size(1)
                start_positions.clamp_(0, ignored_index)
                end_positions.clamp_(0, ignored_index)

                loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
                start_loss = loss_fct(start_logits, start_positions)
                end_loss = loss_fct(end_logits, end_positions)
                total_loss = (start_loss + end_loss) / 2

            if not return_dict:
                output = (start_logits, end_logits) + outputs[2:]
                return ((total_loss, ) +
                        output) if total_loss is not None else output

            return QuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

    squad_model = BertForQuestionAnswering.from_pretrained(args['model_name'])

    # Initialise the QA model from the MNLI-tuned encoder. Unwrap DataParallel
    # first; otherwise the "module."-prefixed keys would match nothing and no
    # weights would be transferred (strict=False hides the mismatch silently).
    squad_model.load_state_dict(mnli_model.module.state_dict(), strict=False)

    train_loader_squad = DataLoader(train_squad,
                                    batch_size=args['batch_size'],
                                    shuffle=True)

    squad_model = DataParallel(squad_model)

    squad_model.to(device)
    squad_model.train()
    optim = AdamW(squad_model.parameters(), lr=args['learning_rate'])


    for epoch in range(args['epochs']):
        for i, batch in enumerate(Bar(train_loader_squad)):
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device,
                                                        dtype=torch.long)
            start_positions = batch['start_positions'].to(device,
                                                          dtype=torch.long)
            end_positions = batch['end_positions'].to(device, dtype=torch.long)
            outputs = squad_model(input_ids,
                                  attention_mask=attention_mask,
                                  start_positions=start_positions,
                                  end_positions=end_positions)
            loss = outputs[0]
            loss.sum().backward()
            optim.step()
    squad_model.eval()

    file_name = args['checkpoint_output_path'] + '/checkpoint_mnli_squad.pt'

    # Unwrap DataParallel again so the checkpoint loads cleanly into a plain
    # BertForQuestionAnswering later.
    torch.save(squad_model.module.state_dict(), file_name)
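
# The excerpt ends here; presumably the script is invoked as a command-line
# entry point with the usual guard:
if __name__ == '__main__':
    main()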
Example #2
    # forward pass of the BERT_Arch classification head used below; its
    # __init__ (which defines self.fc1, self.relu, self.dropout, self.fc2 and
    # self.softmax) is not shown in this excerpt
    def forward(self, sent_id, mask):
        # pass the inputs through BERT; cls_hs is the pooled [CLS] representation
        # (tuple unpacking assumes the model was loaded with return_dict=False)
        _, cls_hs = self.bert(sent_id, attention_mask=mask)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        # output layer
        x = self.fc2(x)
        # apply softmax activation
        x = self.softmax(x)
        return x


tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokens_test = tokenizer.batch_encode_plus(df["text"].tolist(),
                                          max_length=25,
                                          padding='max_length',
                                          truncation=True)
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])

path = 'saved_weights.pt'
model = BERT_Arch(bert)
model.load_state_dict(torch.load(path))

start = time.time()
# get predictions for test data
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
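
# Illustrative continuation (not part of the original excerpt): turn the
# LogSoftmax outputs into class predictions and report the elapsed time.
pred_labels = preds.argmax(dim=1).cpu().tolist()
print(f"inference took {time.time() - start:.2f}s; first predictions: {pred_labels[:5]}")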
Example #3
def pico_extract(text, pth_path, idx2tag):
    ''' Extract PICO entity spans from raw text.
        Returns (tup, tokens, tags), where tup is a list of
        (entity_type, span_text) tuples.
    '''
    spacy_tokens = [token.text for token in nlp(text)]
    spacy_tokens = [t for t in spacy_tokens if t != '\u2009']

    ## Tokenization
    pre_wgts = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
    tokenizer = BertTokenizerFast.from_pretrained(pre_wgts, num_labels=13)
    # the text is pre-tokenised with spaCy and passed with
    # is_split_into_words=True, so word_ids() can map sub-word tokens back to
    # the spaCy tokens
    inputs = tokenizer(spacy_tokens,
                       is_split_into_words=True,
                       return_offsets_mapping=True,
                       padding=False,
                       truncation=True)
    word_ids = inputs.word_ids()
    inputs = {key: torch.tensor(value) for key, value in inputs.items()}

    ## Load model
    model = BertForTokenClassification.from_pretrained(pre_wgts, num_labels=13)
    # Load checkpoint
    checkpoint = torch.load(pth_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    model.cpu()
    model.eval()

    ## Run model
    outputs = model(inputs['input_ids'].unsqueeze(0),
                    inputs['attention_mask'].unsqueeze(0))
    logits = outputs[0].squeeze(0)  # [seq_len, n_tags]
    preds = torch.argmax(logits, dim=1)  # [seq_len]
    preds = preds.numpy().tolist()[1:-1]  # len=seq_len-2, remove cls/sep token

    ids = inputs['input_ids'][1:-1]
    word_ids = word_ids[1:-1]
    tokens = tokenizer.convert_ids_to_tokens(ids)
    tags = [idx2tag[idx] for idx in preds]
    pre_wid = None
    tags_new = []
    for t, wid in zip(tags, word_ids):
        if wid != pre_wid:
            tags_new.append(t)
        pre_wid = wid
    # Convert back to non-sub spacy tokens/tags
    tags = tags_new
    tokens = spacy_tokens
    # len(tags_new) == len(spacy_tokens)

    # Record span start/end idxs
    sidxs, eidxs = [], []
    for i in range(len(tags)):
        if i == 0 and tags[i] != 'O':
            sidxs.append(0)
            if len(tags) > 1 and tags[1] == 'O':
                eidxs.append(0)
        if i > 0 and i < len(tags) - 1 and tags[i] != 'O':
            if tags[i - 1] == 'O' and tags[i] != 'O':
                sidxs.append(i)
            if tags[i + 1] == 'O' and tags[i] != 'O':
                eidxs.append(i)

    # Handle a span that runs up to the final token (this check belongs outside
    # the per-position loop; as written inside it, the last index would be
    # appended once per iteration).
    if tags[-1] != 'O':
        if len(tags) > 1 and tags[-2] == 'O':
            sidxs.append(len(tags) - 1)
        eidxs.append(len(tags) - 1)

    tup = []
    for si, ei in zip(sidxs, eidxs):
        ent_tokens = tokens[si:ei + 1]
        ent_tags = tags[si:ei + 1]

        # ent_tags may include multiple types of tags
        ents = [t.split('-')[1] for t in ent_tags]
        ents_set = list(set(ents))
        for ent in ents_set:
            indices = [
                idx for idx, t in enumerate(ent_tags) if t.split('-')[1] == ent
            ]
            sub = [ent_tokens[ic] for ic in indices]
            # sub_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(sub))
            # sub_new = []
            # for i, tok in enumerate(sub):
            # if tok.startswith("##"):
            #     if sub_new:
            #         sub_new[-1] = f"{sub_new[-1]}{tok[2:]}"
            # else:
            #     sub_new.append(tok)

            sub_text = ' '.join(sub)
            sub_text = re.sub(r" - ", "-", sub_text)
            sub_text = re.sub(r" = ", "=", sub_text)
            sub_text = re.sub(r" / ", "/", sub_text)
            sub_text = re.sub(r"\( ", "(", sub_text)
            sub_text = re.sub(r" \)", ")", sub_text)

            # Remove incomplete brackets (r1 and r2, defined elsewhere in the
            # original module, are presumably regex patterns for "(" and ")")
            left = [(m.start(0), m.end(0)) for m in re.finditer(r1, sub_text)]
            right = [(m.start(0), m.end(0)) for m in re.finditer(r2, sub_text)]

            if len(left) > 0 and len(right) == 0:  # (
                sub_text = re.sub(r"\(", "", sub_text)
            if len(left) == 0 and len(right) > 0:  # )
                sub_text = re.sub(r"\)", "", sub_text)
            if len(left) > 0 and len(right) > 0:  # )( or ()
                if left[0][0] > right[0][0]:  # )(
                    sub_text = re.sub(r"\)", "", sub_text)
                    sub_text = re.sub(r"\(", "", sub_text)
            sub_text = re.sub(r'^[\s]', "", sub_text)
            sub_text = re.sub(r'[\s]$', "", sub_text)
            sub_text = ' '.join([s for s in sub_text.split(' ') if len(s) > 1])
            tup.append((ent, sub_text))
    return tup, tokens, tags
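
# Hypothetical usage (the checkpoint path and tag map below are placeholders,
# not values taken from the original source):
#     idx2tag = {0: 'O', 1: 'B-Population', 2: 'I-Population', ...}
#     spans, tokens, tags = pico_extract(abstract_text, 'pico.pth', idx2tag)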
Example #4
    from accelerate import Accelerator
    accelerator = Accelerator(fp16=True)
    device = accelerator.device

# Documentation for the toolkit:  https://huggingface.co/docs/accelerate/

"""## Load Model and Tokenizer




 
"""

model = BertForQuestionAnswering.from_pretrained("bert-base-chinese").to(device)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")

# You can safely ignore the warning message (it pops up because new prediction heads for QA are initialized randomly)
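
# Illustrative only (not from the original notebook; assumes torch is already
# imported): a single QA forward pass with the model and tokenizer above,
# decoding the highest-scoring answer span.
question = "書的作者是誰?"
paragraph = "這本書的作者是王小明,於 2010 年出版。"
inputs = tokenizer(question, paragraph, return_tensors="pt").to(device)
with torch.no_grad():
    qa_out = model(**inputs)
start = int(qa_out.start_logits.argmax(dim=-1))
end = int(qa_out.end_logits.argmax(dim=-1))
print(tokenizer.decode(inputs["input_ids"][0][start:end + 1]))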

"""## Read Data

- Training set: 26935 QA pairs
- Dev set: 3523  QA pairs
- Test set: 3492  QA pairs

- {train/dev/test}_questions:	
  - List of dicts with the following keys:
   - id (int)
   - paragraph_id (int)
   - question_text (string)
   - answer_text (string)
Example #5
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    config = get_config()

    os.environ['TOKENIZERS_PARALLELISM'] = 'true'
    tokenizer = BertTokenizerFast.from_pretrained(config.tokenizer)
    tokenizer.model_max_length = config.max_seq_length
    data_pipeline = data.ClassificationDataPipeline(
        lambda: tfds.builder(f'{config.dataset_path}/{config.dataset_name}'),
        tokenizer)

    num_train_examples = data_pipeline.dataset_builder.info.splits[
        'train'].num_examples
    num_train_steps = int(num_train_examples * config.num_train_epochs //
                          config.train_batch_size)
    warmup_steps = int(config.warmup_proportion * num_train_steps)
    cooldown_steps = num_train_steps - warmup_steps

    is_regression_task = (data_pipeline.dataset_builder.info.features['label'].
                          dtype == 'float32')
    if is_regression_task:
        num_classes = 1
        compute_stats = compute_regression_stats
    else:
        num_classes = data_pipeline.dataset_builder.info.features[
            'label'].num_classes
        compute_stats = compute_classification_stats

    model = create_model(config, num_classes=num_classes)
    optimizer = create_optimizer(config, model)
    optimizer = optimizer.replicate()
    del model  # don't keep a copy of the initial model
    optimizer = training.harmonize_across_hosts(optimizer)

    learning_rate_fn = training.create_learning_rate_scheduler(
        factors='constant * linear_warmup * linear_decay',
        base_learning_rate=config.learning_rate,
        warmup_steps=warmup_steps,
        steps_per_cycle=cooldown_steps,
    )

    output_dir = get_output_dir(config)
    gfile.makedirs(output_dir)

    train_history = training.TrainStateHistory(learning_rate_fn)
    train_state = train_history.initial_state()

    if config.do_train:
        train_step_fn = training.create_train_step(compute_loss_and_metrics,
                                                   clip_grad_norm=1.0)
        train_iter = data_pipeline.get_inputs(
            split='train', batch_size=config.train_batch_size, training=True)

        for step, batch in zip(range(0, num_train_steps), train_iter):
            optimizer, train_state = train_step_fn(optimizer, batch,
                                                   train_state)

    if config.do_eval:
        # While our input pipelines use TFDS, we'll use metrics from the
        # HuggingFace datasets library instead.
        datasets.logging.set_verbosity_error()
        # Workaround for https://github.com/huggingface/datasets/issues/812
        logging.getLogger('filelock').setLevel(logging.ERROR)

        eval_step = training.create_eval_fn(compute_stats)
        eval_results = []

        if config.dataset_path == 'glue' and config.dataset_name == 'mnli':
            validation_splits = ['validation_matched', 'validation_mismatched']
        else:
            validation_splits = ['validation']

        for split in validation_splits:
            eval_iter = data_pipeline.get_inputs(
                split=split, batch_size=config.eval_batch_size, training=False)
            eval_stats = eval_step(optimizer, eval_iter)
            eval_metric = datasets.load_metric(config.dataset_path,
                                               config.dataset_name)
            eval_metric.add_batch(predictions=eval_stats['prediction'],
                                  references=eval_stats['label'])
            eval_metrics = eval_metric.compute()
            prefix = 'eval_mismatched' if split == 'validation_mismatched' else 'eval'
            for name, val in sorted(eval_metrics.items()):
                line = f'{prefix}_{name} = {val:.06f}'
                print(line, flush=True)
                eval_results.append(line)

        eval_results_path = os.path.join(output_dir, 'eval_results.txt')
        with gfile.GFile(eval_results_path, 'w') as f:
            for line in eval_results:
                f.write(line + '\n')

    if config.do_predict:
        predict_step = training.create_eval_fn(compute_stats)
        predict_results = []

        path_map = {
            ('glue', 'cola', 'test'): 'CoLA.tsv',
            ('glue', 'mrpc', 'test'): 'MRPC.tsv',
            ('glue', 'qqp', 'test'): 'QQP.tsv',
            ('glue', 'sst2', 'test'): 'SST-2.tsv',
            ('glue', 'stsb', 'test'): 'STS-B.tsv',
            ('glue', 'mnli', 'test_matched'): 'MNLI-m.tsv',
            ('glue', 'mnli', 'test_mismatched'): 'MNLI-mm.tsv',
            ('glue', 'qnli', 'test'): 'QNLI.tsv',
            ('glue', 'rte', 'test'): 'RTE.tsv',
            # No eval on WNLI for now. BERT accuracy on WNLI is below baseline,
            # unless a special training recipe is used.
            # ('glue/wnli', 'test'): 'WNLI.tsv',
        }
        label_sets = {
            ('glue', 'cola'): ['0', '1'],
            ('glue', 'mrpc'): ['0', '1'],
            ('glue', 'qqp'): ['0', '1'],
            ('glue', 'sst2'): ['0', '1'],
            ('glue', 'mnli'): ['entailment', 'neutral', 'contradiction'],
            ('glue', 'qnli'): ['entailment', 'not_entailment'],
            ('glue', 'rte'): ['entailment', 'not_entailment'],
        }

        for path_map_key in path_map:
            candidate_dataset_path, candidate_dataset_name, split = path_map_key
            if (candidate_dataset_path != config.dataset_path
                    or candidate_dataset_name != config.dataset_name):
                continue

            predict_iter = data_pipeline.get_inputs(
                split=split, batch_size=config.eval_batch_size, training=False)
            predict_stats = predict_step(optimizer, predict_iter)
            idxs = predict_stats['idx']
            predictions = predict_stats['prediction']

            tsv_path = os.path.join(
                output_dir, path_map[config.dataset_path, config.dataset_name,
                                     split])
            with gfile.GFile(tsv_path, 'w') as f:
                f.write('index\tprediction\n')
                if is_regression_task:
                    for idx, val in zip(idxs, predictions):
                        f.write(f'{idx}\t{val:.06f}\n')
                else:
                    label_set = label_sets[config.dataset_path,
                                           config.dataset_name]
                    for idx, val in zip(idxs, predictions):
                        f.write(f'{idx}\t{label_set[val]}\n')
            print('Wrote', tsv_path)
Example #6
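# The fragment below is the tail of an add_end_idx helper (compare the
# squad_processing import in Example #1). A plausible reconstruction of the
# missing beginning, following the usual HuggingFace SQuAD preprocessing
# recipe, would be:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx].lower() == gold_text.lower():
            answer['answer_end'] = end_idx  # Gold label aligns exactly
        elif context[start_idx - 1:end_idx - 1].lower() == gold_text.lower():
            answer['answer_start'] = start_idx - 1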
            answer[
                'answer_end'] = end_idx - 1  # When the gold label is off by one character
        elif context[start_idx - 2:end_idx - 2].lower() == gold_text.lower():
            answer['answer_start'] = start_idx - 2
            answer[
                'answer_end'] = end_idx - 2  # When the gold label is off by two characters


add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

from transformers import BertTokenizerFast, BertModel

tokenizer = BertTokenizerFast.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    padding=True,
    truncation=True,
    add_special_tokens=True,
    model_max_length=1000000000)

#model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

train_encodings = tokenizer(train_contexts,
                            train_questions,
                            truncation=True,
                            padding=True,
                            max_length=500)
val_encodings = tokenizer(val_contexts,
                          val_questions,
                          truncation=True,
                          padding=True,
                          max_length=500)
Example #7
 def __init__(self, args):
     super(BonzDataModule, self).__init__()
     self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', max_len=512)
     self.args = args
Example #8
args = Namespace(**js['args'])
# args.epochs = 3

random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

idx2tag = js['idx2tag']
idx2tag = {int(idx): tag for idx, tag in idx2tag.items()}
tag2idx = {tag: idx for idx, tag in idx2tag.items()}

softmax = nn.Softmax(dim=1)
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-v1.1',
                                              num_labels=13)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device(
    'cpu')


#%%
class SemiData():
    def __init__(self):
        self.semi_path = '/home/qwang/pre-pico/data/semi_scores_9923.csv'
        self.gold_path = '/home/qwang/pre-pico/data/tsv/18mar_output/pico_18mar.json'

    def read_conll_tsv(self, tsv_path):
        ''' Read seqs/tags for one tsv file
            seqs[i] --> ['Leicestershire', '22', 'points', ',', ...], tags[i] --> ['B-ORG', 'O', 'O', ...]
        '''
        dat = pd.read_csv(tsv_path, sep='\t', header=None)
    configfile = "config.json"
else:
    configfile = sys.argv[1]

# Read the params
with open(configfile, "r") as f:
    config = json.load(f)

globalparams = config["global_params"]
encparams = config["encoder_params"]
decparams = config["decoder_params"]
modelparams = config["model_params"]

# Load the tokenizers
en_tok_path = encparams["tokenizer_path"]
en_tokenizer = BertTokenizerFast(os.path.join(en_tok_path, "vocab.txt"))
de_tok_path = decparams["tokenizer_path"]
de_tokenizer = BertTokenizerFast(os.path.join(de_tok_path, "vocab.txt"))

# Init the dataset
train_en_file = globalparams["train_en_file"]
train_de_file = globalparams["train_de_file"]
valid_en_file = globalparams["valid_en_file"]
valid_de_file = globalparams["valid_de_file"]

enc_maxlength = encparams["max_length"]
dec_maxlength = decparams["max_length"]

batch_size = modelparams["batch_size"]
train_dataset = QADataset(train_en_file, train_de_file, en_tokenizer,
                          de_tokenizer, enc_maxlength, dec_maxlength)
Example #10
if __name__ == "__main__":
    util.setup_seed(6)
    parser = argparse.ArgumentParser(description='Kil Bert Project')
    parser.add_argument('-d',
                        '--data',
                        help='data name',
                        default='imdb',
                        choices=['agnews', 'imdb', 'newsgroup'])
    args = parser.parse_args()

    with open('settings.json', 'r', encoding='utf-8') as f:
        settings = json.load(f)
    config = settings["bert"][args.data]
    config["model_name"] = 'bert-base-uncased'
    tokenizer = BertTokenizerFast.from_pretrained(config["model_name"])

    train, test = util.get_data(args.data)
    train = train.map(lambda e: tokenizer(e['text'],
                                          truncation=True,
                                          padding='max_length',
                                          max_length=config["max_len"]),
                      batched=True)
    train = train.map(lambda e: {'labels': e['label']}, batched=True)
    train.set_format(
        type='torch',
        columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

    test = test.map(lambda e: tokenizer(e['text'],
                                        truncation=True,
                                        padding='max_length',
Example #11
    pos = [pos_to_num[token.pos_] for token in doc]
    pos = [pos_to_num['SPECIAL']
           ] + pos  # add numerical representation of 'SPECIAL' tag
    pos.append(pos_to_num['SPECIAL'])
    sentences_tokenized.append(tokens)
    sent_pos_tags.append(pos)
    sent_marked = sentences_marked[idx]
    doc_marked = nlp(sent_marked)
    tokens_marked = [token.text for token in doc_marked]
    sentences_marked_tokenized.append(tokens_marked)

print('Pre-tokenization with SpaCy is finished.')

# Tokenize pre-tokenized sentences with BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
    additional_special_tokens=['[TARGET_START]', '[TARGET_END]'])
encodings = tokenizer(sentences_tokenized,
                      return_offsets_mapping=True,
                      is_pretokenized=True)
encodings_marked = tokenizer(sentences_marked_tokenized, is_pretokenized=True)

print('BERT tokenization is finished.')

# For each tokenized (original) sentence create a position vector that marks target tokens with 1's and the rest
# of the tokens with 0's.
tokenized_marked_texts = [
    tokenizer.convert_ids_to_tokens(i) for i in encodings_marked['input_ids']
]
position_vectors = []
for i in range(len(tokenized_marked_texts)):
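    # (The loop body is truncated in the original listing.) A plausible
    # completion, based on the comment above: mark tokens that fall strictly
    # between [TARGET_START] and [TARGET_END] with 1, then drop the two marker
    # positions so the vector aligns with the unmarked tokenization.
    marked = tokenized_marked_texts[i]
    start = marked.index('[TARGET_START]')
    end = marked.index('[TARGET_END]')
    vec = [1 if start < j < end else 0 for j in range(len(marked))]
    vec = vec[:start] + vec[start + 1:end] + vec[end + 1:]
    position_vectors.append(vec)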
Example #12
    def __init__(self) -> None:
        self.lists = {}

        # M-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
            'bert-base-multilingual-cased').eval()
        self.lists["M-BERT"] = {
            "Tokenizer": self.bert_multilingual_tokenizer,
            "Model": self.bert_multilingual_model
        }
        print("====================================")
        print("[BERT] Google Multilingual BERT 로드 완료")
        print("====================================")

        # KR-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
            'snunlp/KR-Medium')
        self.krbert_model = BertForMaskedLM.from_pretrained(
            'snunlp/KR-Medium').eval()
        self.lists["KR-Medium"] = {
            "Tokenizer": self.krbert_tokenizer,
            "Model": self.krbert_model
        }
        print("====================================")
        print("[BERT] KR-BERT 로드 완료")
        print("====================================")

        # BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_kor_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.lists["bert-kor-base"] = {
            "Tokenizer": self.bert_kor_tokenizer,
            "Model": self.bert_kor_model
        }
        print("====================================")
        print("[BERT] BERT-kor-base 로드 완료")
        print("====================================")

        # ALBERT
        from transformers import AlbertForMaskedLM
        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.lists["albert-kor-base"] = {
            "Tokenizer": self.albert_tokenizer,
            "Model": self.albert_model
        }
        print("====================================")
        print("[BERT] ALBERT-kor-base 로드 완료")
        print("====================================")

        # XLM-Roberta
        from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
        self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
            'xlm-roberta-base')
        self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
            'xlm-roberta-base').eval()
        self.lists["xlm-roberta-base"] = {
            "Tokenizer": self.xlmroberta_tokenizer,
            "Model": self.xlmroberta_model
        }
        print("====================================")
        print("[BERT] XLM-Roberta-kor 로드 완료")
        print("====================================")

        from transformers import BertTokenizerFast, EncoderDecoderModel
        self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
            "kykim/bertshared-kor-base")
        self.bertshared_model = EncoderDecoderModel.from_pretrained(
            "kykim/bertshared-kor-base")
        self.lists["bertshared-kor-base"] = {
            "Tokenizer": self.tokenizer_bertshared,
            "Model": self.bertshared_model
        }
        print("====================================")
        print("[Seq2seq + BERT] bertshared-kor-base 로드 완료")
        print("====================================")

        # gpt3-kor-small_based_on_gpt2
        from transformers import BertTokenizerFast, GPT2LMHeadModel
        self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.lists["gpt3-kor-small_based_on_gpt2"] = {
            "Tokenizer": self.tokenizer_gpt3,
            "Model": self.model_gpt3
        }
        print("====================================")
        print("[GPT3] gpt3-small-based-on-gpt2 로드 완료")
        print("====================================")

        # electra-base-kor
        from transformers import ElectraTokenizerFast, ElectraModel
        self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
            "kykim/electra-kor-base")
        self.electra_model = ElectraModel.from_pretrained(
            "kykim/electra-kor-base")
        self.lists["electra-kor-base"] = {
            "Tokenizer": self.tokenizer_electra,
            "Model": self.electra_model
        }
        print("====================================")
        print("[ELECTRA] electra-kor-base 로드 완료")
        print("====================================")

        from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
        self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.lists["electra-kor-QA"] = {
            "Tokenizer": self.electra_tokenizer_QA,
            "Model": self.electra_model_QA
        }
        print("====================================")
        print("[ELECTRA] koelectra-base-v3-finetuned-korquad 로드 완료")
        print("====================================")
Example #13
import streamlit as st
# from argparse import ArgumentParser
import lime
from lime.lime_text import LimeTextExplainer

MODELS = {
    "BERT": "model_noprocess.h5"
}
model_name = 'bert-base-uncased'

# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
repo_root = os.path.dirname(os.path.abspath(__file__))[:os.path.dirname(os.path.abspath(__file__)).find("Assignment_1")+13]
import_model = load_model(repo_root+"/models/model_noprocess.h5")
class_names = ['1', '2', '3', '4', '5']
explainer = LimeTextExplainer(class_names=class_names)
print(repo_root)
# Obtain the CSS for Buttons to be displayed
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def get_button_css(button_id):
    custom_css = f"""
        <style>
            #{button_id} {{
                background-color: rgb(255, 255, 255);
                color: rgb(38, 39, 48);
                padding: 0.25em 0.38em;
                position: relative;
Example #14
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    config = FLAGS.config

    model = create_model(config)
    optimizer = create_optimizer(config, model)
    del model  # don't keep a copy of the initial model

    output_dir = get_output_dir(config)
    gfile.makedirs(output_dir)

    # Restore from a local checkpoint, if one exists.
    optimizer = checkpoints.restore_checkpoint(output_dir, optimizer)
    start_step = int(optimizer.state[0].step)

    optimizer = optimizer.replicate()
    optimizer = training.harmonize_across_hosts(optimizer)

    os.environ['TOKENIZERS_PARALLELISM'] = 'true'
    tokenizer = BertTokenizerFast.from_pretrained(config.tokenizer)
    tokenizer.model_max_length = config.max_seq_length

    data_pipeline = data.PretrainingDataPipeline(
        glob.glob('cache/pretrain.*_of_*.tfrecord'),
        tokenizer,
        max_predictions_per_seq=config.max_predictions_per_seq)

    learning_rate_fn = training.create_learning_rate_scheduler(
        factors='constant * linear_warmup * linear_decay',
        base_learning_rate=config.learning_rate,
        warmup_steps=config.num_warmup_steps,
        steps_per_cycle=config.num_train_steps - config.num_warmup_steps,
    )

    train_history = training.TrainStateHistory(learning_rate_fn)
    train_state = train_history.initial_state()

    if config.do_train:
        train_iter = data_pipeline.get_inputs(
            batch_size=config.train_batch_size, training=True)
        train_step_fn = training.create_train_step(
            compute_pretraining_loss_and_metrics, clip_grad_norm=1.0)

        for step, batch in zip(range(start_step, config.num_train_steps),
                               train_iter):
            optimizer, train_state = train_step_fn(optimizer, batch,
                                                   train_state)
            if jax.host_id() == 0 and (step % config.save_checkpoints_steps
                                       == 0
                                       or step == config.num_train_steps - 1):
                checkpoints.save_checkpoint(output_dir,
                                            optimizer.unreplicate(), step)
                config_path = os.path.join(output_dir, 'config.json')
                if not os.path.exists(config_path):
                    with open(config_path, 'w') as f:
                        json.dump({'model_type': 'bert', **config.model}, f)

    if config.do_eval:
        eval_iter = data_pipeline.get_inputs(batch_size=config.eval_batch_size)
        eval_iter = itertools.islice(eval_iter, config.max_eval_steps)
        eval_fn = training.create_eval_fn(compute_pretraining_stats,
                                          sample_feature_name='input_ids')
        eval_stats = eval_fn(optimizer, eval_iter)

        eval_metrics = {
            'loss':
            jnp.mean(eval_stats['loss']),
            'masked_lm_loss':
            jnp.mean(eval_stats['masked_lm_loss']),
            'next_sentence_loss':
            jnp.mean(eval_stats['next_sentence_loss']),
            'masked_lm_accuracy':
            jnp.sum(eval_stats['masked_lm_correct']) /
            jnp.sum(eval_stats['masked_lm_total']),
            'next_sentence_accuracy':
            jnp.sum(eval_stats['next_sentence_correct']) /
            jnp.sum(eval_stats['next_sentence_total']),
        }

        eval_results = []
        for name, val in sorted(eval_metrics.items()):
            line = f'{name} = {val:.06f}'
            print(line, flush=True)
            eval_results.append(line)

        eval_results_path = os.path.join(output_dir, 'eval_results.txt')
        with gfile.GFile(eval_results_path, 'w') as f:
            for line in eval_results:
                f.write(line + '\n')
Example #15
from thai2transformers.conf import Task
from thai2transformers import preprocess

CACHE_DIR = f'{str(Path.home())}/.cache/huggingface_datasets'

METRICS = {
    Task.MULTICLASS_CLS: classification_metrics,
    Task.MULTILABEL_CLS: multilabel_classification_metrics
}

PUBLIC_MODEL = {
    'mbert': {
        'name':
        'bert-base-multilingual-cased',
        'tokenizer':
        BertTokenizerFast.from_pretrained('bert-base-multilingual-cased'),
        'config':
        BertConfig.from_pretrained('bert-base-multilingual-cased'),
    },
    'xlmr': {
        'name': 'xlm-roberta-base',
        'tokenizer':
        XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base'),
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-base'),
    },
    'xlmr-large': {
        'name': 'xlm-roberta-large',
        'tokenizer':
        XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-large'),
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-large'),
    },
Example #16
    for file in files:
        if file.startswith("prediction"):
            prediction_file = os.path.join(Path(model_dir).parent, file)
            break
    if prediction_file is None:
        raise FileNotFoundError("no prediction file")

print(f"loading predictions from {prediction_file}")

dataset_properties = json.load(
    open(os.path.join(model_dir, "dataset_properties.json")))
target_vocab = dataset_properties["target_vocab"]
special_tokens = dataset_properties["special_tokens"]

tokenizer = PreTrainedArsenalTokenizer(target_vocab=target_vocab)
source_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
source_tokenizer.add_special_tokens(
    {"additional_special_tokens": special_tokens})

# collect per sentence length:
# - similarities (similar to edit distance)
# - accuracies
similarities = collections.defaultdict(list)
accuracies = collections.defaultdict(list)

# also collect confusions (i.e., information about how tokens got wrongly predicted)
confusions = {}

# 3 is the SEP token (which marks end of the sequence) - if the only difference between prediction
# and true sequence is the output of additional token(s) after the complete correct sequence has been
# predicted, the confusion lies in the SEP token, so we'll need to add this.
Example #17

        df = df.drop(df[df['label'] == 0.0].index).reset_index()
        df_lf_x = df_lf_x.loc[df.index]

    # The input size is the number of linguistic features. We get this value
    # from the dataframe, but we remove 2 items: one for the "TWEET" column and
    # the other for the class label.
    input_size = df_lf_x.shape[1]

    # CustomBERTModel model
    model = CustomBERTModel(input_size,
                            num_classes=len(df['label'].unique())
                            if task_type == 'classification' else 1)
    model.to(device)

    # Get the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_model)

    # Encode label as numbers instead of user names
    if task_type == 'classification':
        df["label"] = df["label"].astype('category').cat.codes

    # Encode datasets to work with transformers
    dataset = Dataset.from_pandas(df)

    # Tokenizer trainset and test dataframe with the training
    # The tokenize function only takes care of the "tweet"
    # column and will create the input_ids, token_type_ids, and
    # attention_mask
    dataset = dataset.map(tokenize, batched=True, batch_size=len(dataset))

    # Finally, we "torch" the new columns. We return the rest
Example #18
    logger = DefaultLogger(config["log_path"], experiment_name,
                           config["run_name"], config["run_id"],
                           hyper_parameters)
    model_state_dict_dir = config["path_to_save_model"]
    if not os.path.exists(model_state_dict_dir):
        os.makedirs(model_state_dict_dir)

assert config["encoder"] == "BERT", "此程序仅用于Bert模型,若使用LSTM请运行train.py"

# 读取数据
train_data = json.load(open(train_data_path, "r", encoding="utf-8"))
valid_data = json.load(open(valid_data_path, "r", encoding="utf-8"))

# 数据预处理器
tokenizer = BertTokenizerFast.from_pretrained(config["bert_path"],
                                              add_special_tokens=False,
                                              do_lower_case=False)
tokenize = tokenizer.tokenize
get_tok2char_span_map = lambda text: tokenizer.encode_plus(
    text, return_offsets_mapping=True, add_special_tokens=False)[
        "offset_mapping"]

preprocessor = Preprocessor(tokenize_func=tokenize,
                            get_tok2char_span_map_func=get_tok2char_span_map)

# train and valid max token num
max_tok_num = 0
all_data = train_data + valid_data

for sample in all_data:
    tokens = tokenize(sample["text"])
Example #19
def get_tokenizer(args):
    return BertTokenizerFast.from_pretrained(args.tokenizer_path, max_len=args.seq_len)
Example #20
 def __init__(self, vocab_path, strip_accents, clean_text, lowercase):
     common_params = {'strip_accents': strip_accents, 'clean_text': clean_text, 'lowercase': lowercase}
     self._tokenizer = BertTokenizerFast(
         vocab_file=vocab_path, **common_params
     )
Example #21
import pylab
from tensorboardX import SummaryWriter
import torchvision.utils as vutils
import utils
import models
import params
import train, test
from transformers import BertModel, BertConfig, BertTokenizer, BertTokenizerFast, AdamW, get_linear_schedule_with_warmup

###BERT model instead of the Extractor
# create the BERTConfig, BERTTokenizer, and BERTModel
model_name = "bert-base-uncased"
config = BertConfig.from_pretrained(model_name,
                                    output_hidden_states=True,
                                    return_dict=True)
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
bert = BertModel.from_pretrained(model_name, config=config)

src_train_dataloader = utils.get_train_loader(
    '/content/drive/My Drive/Data_summarization/pytorch_DAN/data/books.csv',
    tokenizer)
src_test_dataloader = utils.get_test_loader(
    '/content/drive/My Drive/Data_summarization/pytorch_DAN/data/books.csv',
    tokenizer)
tgt_train_dataloader = utils.get_train_loader(
    '/content/drive/My Drive/Data_summarization/pytorch_DAN/data/dvd.csv',
    tokenizer)
tgt_test_dataloader = utils.get_test_loader(
    '/content/drive/My Drive/Data_summarization/pytorch_DAN/data/dvd.csv',
    tokenizer)
Example #22
	def run(self, focused=False, focused_model=None, training_epochs=5):
		if focused==True:
			self.model_list=[focused_model]
		else:
			pass
		for model_name in self.model_list:

			training_args = TrainingArguments(
			output_dir='./results/'+model_name,
			num_train_epochs=training_epochs,
			per_device_train_batch_size=16,
			per_device_eval_batch_size=64,
			warmup_steps=500,
			weight_decay=0.01,
			#evaluate_during_training=True,
			logging_dir='./logs/'+model_name,
			)

			model = None
			tokenizer = None
			print('Training on a dataset with ' +str(self.num_labels)+ ' labels')
			if model_name == "bert-base-uncased":
				model = BertForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels)
				tokenizer = BertTokenizerFast.from_pretrained(model_name)
			elif model_name == "albert-base-v2":
				tokenizer = transformers.AlbertTokenizer.from_pretrained('albert-base-v2')
				model = transformers.AlbertForSequenceClassification.from_pretrained('albert-base-v2', return_dict=True, num_labels=self.num_labels)
			elif model_name == "roberta-base":
				tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base')
				model = transformers.RobertaForSequenceClassification.from_pretrained('roberta-base', return_dict=True, num_labels=self.num_labels)
			elif model_name == "linear_SVM":
				tokenizer = None
				model = 'linear_SVM'
				parameters={
				  'vect__ngram_range': [(1, 1), (1, 2)],
			  	'tfidf__use_idf': (True, False),
				  'clf__alpha': (5e-2, 1e-2,5e-3, 1e-3,5e-3),
			  	'clf__penalty': ('l2', 'l1', 'elasticnet')
				}
				classifier=SGDClassifier(loss='hinge',random_state=42,max_iter=5,tol=None)
			elif model_name == "multinomial_naive_bayesian":
				tokenizer = None
				model = 'multinomial_naive_bayesian'
				parameters= {
				  'vect__ngram_range': [(1, 1), (1, 2)],
				  'tfidf__use_idf': (True, False),
				  'clf__alpha': (1,1e-1,1e-2, 1e-3,1e-4),
				  'clf__fit_prior': (True, False),
				}
				classifier=MultinomialNB()


			if not model or not tokenizer: #use 'assert' here instead?
				print("ERROR")


			def tokenize(batch):
				return tokenizer(batch['text'], padding='max_length', truncation=True)

			if tokenizer is not None:

				train_dataset = self.train_dataset_raw.map(tokenize, batched=True, batch_size=len(self.train_dataset_raw))
				test_dataset = self.test_dataset_raw.map(tokenize, batched=True, batch_size=len(self.train_dataset_raw))
				train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
				test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
			else:
				train_dataset = self.train_dataset_raw
				test_dataset = self.test_dataset_raw


			if model_name== "linear_SVM" or model_name== "multinomial_naive_bayesian":
				trainer=None
				pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', classifier),
                     ])
				gs_clf = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
				if len(train_dataset['labels'])<25:
				  print('not enough data to use a count vectorizer, sorry!')
				else:
				  gs_ind=int(len(train_dataset['labels'])/10)	#use a tenth of the training dataset to do gridsearch
				  gs_clf = gs_clf.fit(train_dataset['text'][:gs_ind], train_dataset['labels'][:gs_ind])
				  best_params=gs_clf.best_params_
				  pipeline.set_params(**best_params)
				  pipeline.fit(train_dataset['text'], train_dataset['labels'])
				  prediction=pipeline.predict(test_dataset['text'])
				  precision, recall, f1, _ = precision_recall_fscore_support(test_dataset['labels'], prediction, average=None)
				  full_report=classification_report(test_dataset['labels'], prediction)
				  acc = accuracy_score(test_dataset['labels'], prediction)
				  loss=hamming_loss(test_dataset['labels'], prediction)
				  curr_metrics = {
				      'eval_loss': loss,
				      'eval_accuracy': np.mean(acc),
				      'eval_f1': np.mean(f1),
				      'eval_precision': np.mean(precision),
				      'eval_recall': np.mean(recall),
				      'eval_full_report': full_report
				  }
				  dump(pipeline, model_name + "_model.joblib")
				  print('best parameters are:')
				  print(best_params)

			else:
				trainer = Trainer(model=model,
				                      args=training_args,
				                      compute_metrics=self.compute_metrics,
				                      train_dataset=train_dataset,
				                      eval_dataset=test_dataset
				)
				trainer.train()
				curr_metrics = trainer.evaluate()
				trainer.save_model(model_name+"_model")

			self.all_metrics[model_name] = curr_metrics
			print(curr_metrics)



			# adding this fully solves the out of memory (OOM) error; https://github.com/huggingface/transformers/issues/1742
			del model, tokenizer, trainer

			# these 2 lines may not be needed
			gc.collect()
			torch.cuda.empty_cache()
Example #23
 def get_tokenizer(self, opt: Opt):
     return BertTokenizer.from_pretrained('bert-base-uncased')
 def __init__(self, model_name):
     self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
     self.model = EncoderDecoderModel.from_pretrained(model_name)
    def __init__(self,
                 dataset,
                 model_init,
                 batch_size,
                 label_num=54,
                 exclude=-1,
                 masked_lm=False,
                 masked_lm_ratio=0.2,
                 dynamic_masked_lm=False,
                 include_raw_text=False,
                 seed=0,
                 clf_type="multi_label_classify"):
        """Initialize.

        Args:
            dataset (dict): a dataset dict.
            model_init (str): the pre-trained model name. select from ``['bert-base-cased',
                'bert-base-uncased', 'bert-large-cased', and 'bert-large-uncased']``.
            batch_size (int): the batch size in each step.
            label_num (int): the number of labels in the classification task.
            exclude (int): exclude one category from the data.
                Use -1 (default) to include all categories.
            masked_lm (bool): whether to randomly replace words with mask tokens.
            masked_lm_ratio (float): the ratio of random masks. Ignored when masked_lm is False.
            dynamic_masked_lm (bool): whether to use dynamic masked language modeling, in which
                the mask ratio is randomly sampled. ``dynamic_masked_lm`` and ``masked_lm`` must
                not both be True.
            include_raw_text (bool): whether to return the raw text.
            seed: random seed.
            clf_type (str): the classification task type, e.g. ``multi_label_classify``.
        """
        self._buckets = [30, 50, 100, 200]
        self._max_len = self._buckets[-1]
        self._data = [[] for i in range(len(self._buckets))]

        self._batch_size = batch_size
        self._label_num = label_num
        self._tokenizer = BertTokenizerFast.from_pretrained(
            utils.get_transformers(model_init),
            do_lower_case="uncased" in model_init)

        self._seed = seed
        self._pad_tok_id = self._tokenizer.pad_token_id

        self._masked_lm = masked_lm
        self._masked_lm_ratio = masked_lm_ratio
        self._mask_tok_id = self._tokenizer.mask_token_id

        if dynamic_masked_lm and masked_lm:
            raise RuntimeError(
                "Cannot have dynamic_masked_lm and masked_lm both True.")

        self._dynamic_masked_lm = dynamic_masked_lm
        self._include_raw_text = include_raw_text

        self._clf_type = clf_type

        counter = 0
        logger.info("DatasetForBert is processing data.")

        if isinstance(dataset, list):
            load_data = dataset
        elif isinstance(dataset, dict):
            load_data = dataset["data"]
        else:
            raise ValueError("dataset must be a list of items or a dict with a 'data' key.")

        for item in tqdm.tqdm(load_data):
            y = item["label"]
            s0 = "[CLS] " + item["text0"] + " [SEP]"
            if "text1" in item:
                s1 = item["text1"] + " [SEP]"
            else:
                s1 = ""

            if y == exclude:
                continue

            counter += 1

            s0_ids = self._tokenizer.convert_tokens_to_ids(
                self._tokenizer.tokenize(s0))
            s1_ids = self._tokenizer.convert_tokens_to_ids(
                self._tokenizer.tokenize(s1))
            text_ids = (s0_ids + s1_ids)[:self._max_len]

            for bucket_id in range(len(self._buckets)):
                if self._buckets[bucket_id] >= len(text_ids):
                    self._data[bucket_id].append(
                        (text_ids, y, len(s0_ids), len(s1_ids), s0 + s1))
                    break

        logger.info("Load %d documents. with filter %d.", counter, exclude)
        self._bucket_prob = np.asarray([len(x) for x in self._data])
        self._bucket_prob = self._bucket_prob / np.sum(self._bucket_prob)
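
The constructor above places each tokenized example in the smallest length bucket that fits it and then samples buckets in proportion to how many examples they hold. A small standalone sketch of that bucketing step; the bucket limits are copied from the class, while the toy examples and seed are made up for illustration:

import numpy as np

buckets = [30, 50, 100, 200]            # same limits as self._buckets above
data = [[] for _ in buckets]

examples = [list(range(n)) for n in (12, 45, 45, 180, 90)]  # pretend token-id lists
for text_ids in examples:
    for bucket_id, limit in enumerate(buckets):
        if limit >= len(text_ids):
            data[bucket_id].append(text_ids)  # smallest bucket that fits
            break

# Sample buckets proportionally to how many examples they contain.
bucket_prob = np.asarray([len(x) for x in data], dtype=float)
bucket_prob /= bucket_prob.sum()
rng = np.random.default_rng(0)
print(rng.choice(len(buckets), size=5, p=bucket_prob))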
Example #26
import regex as re

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


if args.LM == 'Bert':
    from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM

    config = BertConfig(vocab_size=28996,
                        max_position_embeddings=512,
                        num_attention_heads=12,
                        num_hidden_layers=12,
                        #type_vocab_size=2, default is 2
                        )
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e50_b16', config=config)
    #model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.

elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM

    config = RobertaConfig(vocab_size=50265,
                           max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e50_b16', config=config)
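
Whichever branch runs, model ends up holding a masked-LM checkpoint, so the count_parameters helper defined at the top of this example can report its trainable size. The embedding-freezing step below is purely illustrative and not part of the original script:

print(f"{args.LM}: {count_parameters(model):,} trainable parameters")

# Freezing a sub-module (here: the embeddings) shrinks the trainable count accordingly.
for p in model.base_model.embeddings.parameters():
    p.requires_grad = False
print(f"after freezing embeddings: {count_parameters(model):,} trainable parameters")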
Example #27
"""
@author: qwang
"""

import re
from collections import defaultdict

import spacy
import torch

from transformers import BertTokenizerFast, BertForTokenClassification, BertForSequenceClassification
from transformers import logging
logging.set_verbosity_error()

nlp = spacy.load('en_core_sci_sm')
sent_tokenizer = BertTokenizerFast.from_pretrained(
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract')
sent_model = BertForSequenceClassification.from_pretrained(
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract')


#%% PICO sentence detector
def sent_detect(text, pth_path):
    # Split to sents and tokenization
    sents = list(nlp(text).sents)
    sents = [str(s) for s in sents]
    inputs = sent_tokenizer(sents,
                            truncation=True,
                            padding=True,
                            return_tensors="pt")

    # Load checkpoint
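    # --- hedged reconstruction: the excerpt breaks off at this point ---
    # A typical continuation would restore the fine-tuned weights from pth_path and
    # score each sentence; the checkpoint layout and the argmax decoding here are
    # assumptions, not part of the original source.
    state_dict = torch.load(pth_path, map_location='cpu')
    sent_model.load_state_dict(state_dict)
    sent_model.eval()
    with torch.no_grad():
        logits = sent_model(**inputs).logits
    return torch.argmax(logits, dim=1).tolist()  # one predicted class per sentence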
from dataclasses import dataclass, field


@dataclass  # the field() defaults below require the class to be a dataclass
class DataArgs:
    train_datapath: str = field(default='', metadata={"help": "training dataset path"})
    val_datapath: str = field(default='', metadata={"help": "validation dataset path"})
    init_model_path: str = field(default='', metadata={"help": "initial model path"})
    block_size: int = field(default=512, metadata={"help": "block size"})
    window_size: int = field(default=510, metadata={"help": "window size"})
    finetune_self_attn: bool = field(default=False, metadata={"help": "finetune the self attention layer"})

if __name__ == '__main__':

    parser = HfArgumentParser((TrainingArguments, DataArgs, ))

    training_args, data_args = parser.parse_args_into_dataclasses(look_for_args_file=False)

    trelm_electra_model = TrelmElectraForMaskedLM.from_pretrained(data_args.init_model_path)
    trelm_electra_model_tokenizer = BertTokenizerFast.from_pretrained(data_args.init_model_path)

    if not data_args.finetune_self_attn:
        # fix the self-attention parameters
        for param in trelm_electra_model.trelm_electra.encoder.layer.parameters():
            param.requires_grad = False

    logger.info(trelm_electra_model)

    logger.info('Evaluating trelm-electra for reference ...')
    pretrain_and_evaluate(training_args, data_args, trelm_electra_model, trelm_electra_model_tokenizer, eval_only=True, model_path=None)

    logger.info('Pretraining trelm-electra ...')
    pretrain_and_evaluate(training_args, data_args, trelm_electra_model, trelm_electra_model_tokenizer, eval_only=False, model_path=training_args.output_dir)

    model_path = training_args.output_dir
                                         max_length=self.maxlen)
        inp_ids, type_ids = inp['input_ids'], inp['token_type_ids']
        attention_mask = inp['attention_mask']

        padding_length = self.maxlen - len(inp_ids)
        inp_ids = inp_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        type_ids = type_ids + ([0] * padding_length)

        assert len(inp_ids) == self.maxlen
        assert len(type_ids) == self.maxlen
        assert len(attention_mask) == self.maxlen
        return torch.tensor(inp_ids), torch.tensor(type_ids), torch.tensor(
            attention_mask)


if __name__ == '__main__':
    from data import KpBioDataset
    from transformers import BertTokenizerFast
    from torch.utils.data import DataLoader

    BATCH_SIZE = 8 * 8
    tokenizer = BertTokenizerFast.from_pretrained('./albert_base')

    text = '''跨國文化的國家在歐洲不同國家待了快10年 就在今年簽證完結之後回國了 本以為回來是開心的 終於喝到每天念掛的珍奶跟日食 頭1 2個月在找工作還有跟朋友團聚然後突然爆發疫症 就在這個待業期間 想慢慢適應這一切 每天也在想這到底是我想待到養老的國家嗎 畢竟自己心裡是個華人 但是習慣了西方的生活方式 家人朋友也說我太獨立 已經不太合群 之前在英國住過    '''

    kp = KeyphrasePredictor(tokenizer,
                            './albert_base',
                            ckpt='ckpt/step_8502.ckpt')
    print(kp.predict(text))
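
As an aside to the padding logic earlier in this example: a fast tokenizer can produce the same fixed-length input_ids / token_type_ids / attention_mask tensors directly. A minimal sketch; it reuses the local ./albert_base checkpoint loaded above, while the max length of 64 is a placeholder for self.maxlen:

from transformers import BertTokenizerFast

tok = BertTokenizerFast.from_pretrained('./albert_base')
enc = tok('sample input text',
          padding='max_length',
          truncation=True,
          max_length=64,              # placeholder for self.maxlen
          return_tensors='pt')
inp_ids, type_ids, attention_mask = enc['input_ids'], enc['token_type_ids'], enc['attention_mask']
assert inp_ids.shape[-1] == 64        # already padded; no manual loop needed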
Example #30
def test_squad_feature_extractor(dataset):
    print("======Squad Feature Test Case======")
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    # First test
    context = 'This is a sample context. BERT will find the answer words in the context by pointing the start and end token positions.'
    question = 'Where are the answer words?'
    answer = 'in the context'
    start_pos = context.find(answer)
    input_ids, token_type_ids, start_pos, end_pos = squad_features(
        context, question, answer, start_pos, tokenizer)

    assert tokenizer.convert_ids_to_tokens(input_ids) == \
        ['[CLS]', 'where', 'are', 'the', 'answer', 'words', '?', '[SEP]', \
         'this', 'is', 'a', 'sample', 'context', '.', \
         'bert', 'will', 'find', 'the', 'answer', 'words', 'in', 'the', 'context', \
         'by', 'pointing', 'the', 'start', 'and', 'end', 'token', 'positions', '.', '[SEP]'], \
             "Your tokenized result does not match the expected result."

    assert token_type_ids == \
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], \
        "Your sentence type ids do not math the expected result"

    assert tokenizer.convert_ids_to_tokens(input_ids[start_pos: end_pos+1]) == ['in', 'the', 'context'], \
        "The start and end tokens do not point the answer position."

    print("The first test passed!")

    # Second test
    context = 'Sometimes, the answer could be subwords so you may need to split them manually.'
    question = 'What should the answer consist of'
    answer = 'word'
    start_pos = context.find(answer)
    input_ids, token_type_ids, start_pos, end_pos = squad_features(
        context, question, answer, start_pos, tokenizer)

    assert tokenizer.convert_ids_to_tokens(input_ids) == \
        ['[CLS]', 'what', 'should', 'the', 'answer', 'consist', 'of', '[SEP]',
         'sometimes', ',', 'the', 'answer', 'could', 'be', 'sub', '##word', '##s',
         'so', 'you', 'may', 'need', 'to', 'split', 'them', 'manually', '.', '[SEP]'], \
             "Your tokenized result does not match the expected result."

    assert token_type_ids == \
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], \
        "Your sentence type ids do not math the expected result"

    assert tokenizer.convert_ids_to_tokens(input_ids[start_pos: end_pos+1]) == ['##word'], \
        "The start and end tokens do not point the answer position."

    print("The second test passed!")

    # Third test
    context = 'When the answer is not given, you should return None for start_pos and end_pos.'
    question = 'This test case does not need a question'
    input_ids, token_type_ids, start_pos, end_pos = squad_features(
        context, question, None, None, tokenizer)

    assert len(input_ids) == 33, \
        "Your tokenized result does not match the expected result."

    assert start_pos is None and end_pos is None, \
        "You should return None for start_pos and end_pos when the answer is not given."

    print("The third test passed!")

    # Fourth test
    sample = dataset[0]
    context = sample['context']
    question = sample['question']
    answer = sample['answer']
    start_pos = sample['start_pos']

    input_ids, token_type_ids, start_pos, end_pos = squad_features(
        context, question, answer, start_pos, tokenizer)

    assert len(input_ids) == 176, \
        "Your tokenized result does not match the expected result."

    assert tokenizer.convert_ids_to_tokens(input_ids[start_pos: end_pos+1]) == tokenizer.tokenize(answer), \
        "The start and end tokens do not point the answer position."

    print("The forth test passed!")

    # Fifth test
    sample = dataset[80000]
    context = sample['context']
    question = sample['question']
    answer = sample['answer']
    start_pos = sample['start_pos']

    input_ids, token_type_ids, start_pos, end_pos = squad_features(
        context, question, answer, start_pos, tokenizer)

    assert len(input_ids) == 165, \
        "Your tokenized result does not match the expected result."

    assert tokenizer.convert_ids_to_tokens(input_ids[start_pos: end_pos+1]) == tokenizer.tokenize(answer), \
        "The start and end tokens do not point the answer position."

    print("The fifth test passed!")

    print("All 5 tests passed!")