sep_sentence.remove("<pad>")
        pad_num += 1

    for i, j in enumerate(sep_sentence[1:-2]):
        out[i + pad_num + 1] = max(char_label[current_idx:current_idx + len(j)])

        if j == "<unk>":
            current_idx = current_idx + 1
        else:
            current_idx = current_idx + len(j)

    return out.tolist()


config = AutoConfig.from_pretrained(model_path)
tokenizer = XLNetTokenizer.from_pretrained(model_path, unk_token=unk_token)
model = XLNetForTokenClassification.from_pretrained(model_path, num_labels=13)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

model.to(device)

train_input_ids = []
train_labels = []
train_masks = []
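
A minimal sketch, not part of the original snippet, of how these lists could be filled; train_sentences, char_labels, and max_len are assumed names, and the per-token labels would come from the character-to-token alignment helper shown at the top of this example:

for sent, char_label in zip(train_sentences, char_labels):
    ids = tokenizer.encode(sent, add_special_tokens=True)[:max_len]
    ids = ids + [tokenizer.pad_token_id] * (max_len - len(ids))
    train_input_ids.append(ids)
    train_masks.append([int(i != tokenizer.pad_token_id) for i in ids])
    # train_labels.append(...)  # produced by the alignment helper above (assumed)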
Example #2
def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/train.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Set the seed value all over the place to make this reproducible.
    seed_val = args.seed
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    # Choose device
    device = get_default_device()

    prompts_train_idxs = np.loadtxt(args.train_prompts_idxs_path,
                                    dtype=np.int64)
    topics_dist = np.loadtxt(args.unique_prompts_distribution_path,
                             dtype=np.int32)

    # Normalise
    topics_dist = topics_dist / np.linalg.norm(topics_dist, 1)

    # Load the XLNet tokenizer.
    print('Loading XLNet tokenizer...')
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                               do_lower_case=True)

    with open(args.unique_prompts_path) as f:
        topics = f.readlines()
    # Remove whitespaces and convert to lowercase
    topics = [x.strip().lower() for x in topics]

    with open(args.train_resps_path) as f:
        responses = f.readlines()
    # Remove whitespaces and convert to lower case
    responses = [x.strip().lower() for x in responses]

    # Tokenize all the prompts and the responses and then map the tokens to their word IDs
    topic_ids = []
    for sent in topics:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        topic_ids.append(encoded_sent)

    resp_ids = []
    for sent in responses:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        resp_ids.append(encoded_sent)

    MAX_LEN_topic = max([len(sen) for sen in topic_ids])
    MAX_LEN_resp = max([len(sen) for sen in resp_ids])
    print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token,
                                                   tokenizer.pad_token_id))

    # Pad our input tokens with value 0.
    # "post" indicates that we want to pad and truncate at the end of the sequence,
    # as opposed to the beginning.
    topic_ids = pad_sequences(topic_ids,
                              maxlen=MAX_LEN_topic,
                              dtype="long",
                              value=0,
                              truncating="post",
                              padding="post")

    resp_ids = pad_sequences(resp_ids,
                             maxlen=MAX_LEN_resp,
                             dtype="long",
                             value=0,
                             truncating="post",
                             padding="post")

    # The attention mask simply makes it explicit which tokens are actual words versus which are padding.
    attention_masks_topic = []
    # For each sentence...
    for sent in topic_ids:
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]
        # Store the attention mask for this sentence.
        attention_masks_topic.append(att_mask)
    attention_masks_resp = []
    for sent in resp_ids:
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]
        # Store the attention mask for this sentence.
        attention_masks_resp.append(att_mask)

    # Convert to torch tensors

    prompts_train_idxs = torch.from_numpy(prompts_train_idxs)
    prompts_train_idxs = prompts_train_idxs.long()

    topic_ids = torch.tensor(topic_ids)
    topic_ids = topic_ids.long()
    topic_ids = topic_ids.to(device)

    attention_masks_topic = torch.tensor(attention_masks_topic)
    attention_masks_topic = attention_masks_topic.long()
    attention_masks_topic = attention_masks_topic.to(device)

    resp_ids = torch.tensor(resp_ids)
    resp_ids = resp_ids.long()
    resp_ids = resp_ids.to(device)

    attention_masks_resp = torch.tensor(attention_masks_resp)
    attention_masks_resp = attention_masks_resp.long()
    attention_masks_resp = attention_masks_resp.to(device)

    # Create the DataLoader for our training set.
    print(prompts_train_idxs.size(0))
    print(resp_ids.size(0))
    print(attention_masks_resp.size(0))
    train_data = TensorDataset(prompts_train_idxs, resp_ids,
                               attention_masks_resp)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)

    # Load XLNetForSequenceClassification, the pretrained XLNet model with a single
    # linear classification layer on top.
    model = XLNetForSequenceClassification.from_pretrained(
        "xlnet-base-cased",  # Use the 12-layer XLNet base model with a cased vocab.
        num_labels=2,  # The number of output labels--2 for binary classification.
        # You can increase this for multi-class tasks.
        output_attentions=False,  # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
    )
    model.to(device)

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch);
    # the 'W' refers to the weight-decay fix.
    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)

    loss_values = []

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.n_epochs
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    for epoch in range(args.n_epochs):
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(
            epoch + 1, args.n_epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))
            p_id = batch[0].to(device)
            r = batch[1].to(device)
            r_msk = batch[2].to(device)
            # Perform dynamic shuffling
            p_id, r, r_msk, y_true, batch_size = _shuffle(
                p_id, r, r_msk, topics_dist, args.num_topics, device)
            # Get the prompts from the topics
            p, p_msk = _get_prompts(p_id, topic_ids, attention_masks_topic)
            p, p_msk = p.to(device), p_msk.to(device)
            # Concatenate prompts and responses
            pr_resp, pr_resp_msk = _join_pr_resp(p, p_msk, r, r_msk,
                                                 args.reverse)
            pr_resp, pr_resp_msk = pr_resp.to(device), pr_resp_msk.to(device)
            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because we
            # have provided the `labels`.
            # The documentation for `XLNetForSequenceClassification` follows the same
            # pattern as the BERT version described here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(pr_resp,
                            token_type_ids=None,
                            attention_mask=pr_resp_msk,
                            labels=y_true)

            # The call to `model` always returns a tuple, so we need to pull the
            # loss value out of the tuple.
            loss = outputs[0]
            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            total_loss += loss.item()
            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() -
                                                              t0)))

        # NEED TO DO THE VALIDATION CODE NOW - see the rest of the tutorial at
        # https://medium.com/@aniruddha.choudhury94/part-2-bert-fine-tuning-tutorial-with-pytorch-for-text-classification-on-the-corpus-of-linguistic-18057ce330e1

    # Save the model to a file
    file_path = args.save_path + 'xlnet_classifier_seed' + str(
        args.seed) + '.pt'
    torch.save(model, file_path)
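
Because the script above serializes the whole module with torch.save(model, file_path), a later process can restore it directly. A minimal sketch, assuming the XLNet classifier class is importable in that process:

import torch

model = torch.load(file_path, map_location='cpu')  # file_path as constructed above
model.eval()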
Example #3
            res2['Patient_is_Pro_Vaccination__c']),
        'Hesitancy_Classification__c':
        max(res1['Hesitancy_Classification__c'],
            res2['Hesitancy_Classification__c']),
        'timestamp':
        str(datetime.now())
    }
    return model_res


checkpoint = torch.load("xlnet_vaccine.bin")
model_state_dict = checkpoint['state_dict']
model = XLNetForMultiLabelSequenceClassification(
    num_labels=model_state_dict["classifier.weight"].size()[0])
model.load_state_dict(model_state_dict)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                           do_lower_case=True)
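
A minimal sketch of turning one free-text response into model inputs for the multi-label classifier loaded above; encode_response and max_len are hypothetical names, not part of the original:

import torch

def encode_response(text, tokenizer, max_len=128):
    ids = tokenizer.encode(text, add_special_tokens=True)[:max_len]
    ids = ids + [tokenizer.pad_token_id] * (max_len - len(ids))
    mask = [int(i != tokenizer.pad_token_id) for i in ids]
    return torch.tensor([ids]), torch.tensor([mask])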


@api_view(['GET', 'POST'])
def predictXLNET(request):
    sessionID = str(request.GET.get('session'))
    print(sessionID)
    surveydata = read_bq(sessionID)
    print(surveydata)

    label_cols = [
        'Conspiracy: Distrust of government, organizations, big pharma',
        'Fear of Critical side-effects (Autism, Brain Damage, SIDS/Death)',
        'Fear of Non-critical side-effects (Rash, Pain, Fever, GI problems, Bump on arm)',
        'Holistic or alternative medicine', 'Logistic Concerns', 'Pro-vax',
        'Religious Beliefs', 'Right to choose',
Example #4
def main():
    num_embeddings = 512
    # Select a batch size for training
    batch_size = 32
    """
    train_mode: True  ==> training
      or        False ==> predict
    """
    train_mode = True

    load_trained = False

    train = pd.read_csv("./data/train.csv")
    test = pd.read_csv("./data/test.csv")

    label_cols = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    if len(sys.argv) < 3:
        print("Example: python3 XLNet.py <label> <device_no(int)>")
        sys.exit()

    label_start = int(sys.argv[1])
    device_no = sys.argv[2]

    print("GPU Available: {}".format(torch.cuda.is_available()))
    n_gpu = torch.cuda.device_count()
    print("Number of GPU Available: {}".format(n_gpu))
    device = torch.device(
        "cuda:{}".format(device_no) if torch.cuda.is_available() else "cpu")
    print("using device: {}".format(device))

    if not os.path.exists("./submission"):
        os.mkdir("./submission")

    sample = pd.read_csv("./data/sample_submission.csv")

    for label in label_cols[label_start:label_start + 3]:
        print("Label: {}".format(label))
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                                   do_lower_case=True)
        train_text_list = train["comment_text"].values
        test_text_list = test["comment_text"].values

        train_input_ids = tokenize_inputs(train_text_list,
                                          tokenizer,
                                          num_embeddings=num_embeddings)
        test_input_ids = tokenize_inputs(test_text_list,
                                         tokenizer,
                                         num_embeddings=num_embeddings)

        train_attention_masks = create_attn_masks(train_input_ids)
        test_attention_masks = create_attn_masks(test_input_ids)

        # add input ids and attention masks to the dataframe
        train["features"] = train_input_ids.tolist()
        train["masks"] = train_attention_masks

        test["features"] = test_input_ids.tolist()
        test["masks"] = test_attention_masks

        Y_true = y_split(train, label)

        # train valid split
        training, valid = train_test_split(train,
                                           test_size=0.2,
                                           random_state=23)

        X_train = training["features"].values.tolist()
        X_valid = valid["features"].values.tolist()

        Y_train = y_split(training, label)
        Y_valid = y_split(valid, label)

        train_masks = training["masks"].values.tolist()
        valid_masks = valid["masks"].values.tolist()

        # Convert all of our input ids and attention masks into
        # torch tensors, the required datatype
        X_train = torch.tensor(X_train)
        X_valid = torch.tensor(X_valid)

        Y_train = torch.tensor(Y_train, dtype=torch.long)
        Y_valid = torch.tensor(Y_valid, dtype=torch.long)

        train_masks = torch.tensor(train_masks, dtype=torch.long)
        valid_masks = torch.tensor(valid_masks, dtype=torch.long)

        # Create an iterator of our data with torch DataLoader. This helps save on
        # memory during training because, unlike a for loop,
        # with an iterator the entire dataset does not need to be loaded into memory

        train_data = TensorDataset(X_train, train_masks, Y_train)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=batch_size)

        validation_data = TensorDataset(X_valid, valid_masks, Y_valid)
        validation_sampler = SequentialSampler(validation_data)
        validation_dataloader = DataLoader(validation_data,
                                           sampler=validation_sampler,
                                           batch_size=batch_size)

        num_labels = 2
        num_epochs = 5

        # load model: xlnet_label_3ep_weight.bin (trained on 2.4.2020  score: 0.84)
        model_save_path = "xlnet_{}_{}embed_{}ep_weights.bin".format(
            label, num_embeddings, 3)

        if load_trained:
            model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist = load_model(
                model_save_path)
            # print(model)
        else:
            model = XLNetForMultiLabelSequenceClassification(
                num_labels=num_labels)

        # Freeze pretrained xlnet parameters
        # model.freeze_xlnet_decoder()
        model.unfreeze_xlnet_decoder()

        optimizer = AdamW(model.parameters(),
                          lr=2e-5,
                          weight_decay=0.01,
                          correct_bias=False)

        if train_mode:
            model, train_loss_set, valid_loss_set = train_model(
                model,
                num_epochs=num_epochs,
                optimizer=optimizer,
                train_dataloader=train_dataloader,
                valid_dataloader=validation_dataloader,
                model_save_path=model_save_path,
                device=device)
        else:
            # load model
            model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist = load_model(
                model_save_path)
            # print(model)

        # validation
        valid_preds = generate_predictions(model,
                                           valid,
                                           num_labels,
                                           device=device,
                                           batch_size=batch_size)
        score = roc_auc_score_FIXED(Y_valid, valid_preds)
        print("Label: {}, ROC_AUC: {}".format(label, score))

        predicts = generate_predictions(model,
                                        test,
                                        num_labels,
                                        device=device,
                                        batch_size=batch_size)

        sample[label] = predicts
        output_filename = "submission_XLNET_{}_{}_{}ep.csv".format(
            datetime.datetime.now().date(), label, num_epochs)
        sample.to_csv(output_filename, index=False)
        print("Label: {}, Output: {}".format(label, output_filename))

        # print(predicts)

        sample[label] = predicts
        sample.to_csv("submission_XLNET_{}_{}ep.csv".format(label, num_epochs),
                      index=False)
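
The helper roc_auc_score_FIXED used above is not shown; a common pattern, sketched here under the assumption that it simply guards against validation folds that contain a single class:

import numpy as np
from sklearn.metrics import roc_auc_score

def roc_auc_score_FIXED(y_true, y_pred):
    # roc_auc_score raises a ValueError when only one class is present; fall back to 0.5
    if len(np.unique(np.asarray(y_true))) == 1:
        return 0.5
    return roc_auc_score(y_true, y_pred)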
Example #5
    def __init__(self, vocab_path, do_lower_case):
        self.tokenizer = XLNetTokenizer(vocab_path, do_lower_case)

    def test_full_tokenizer(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "9",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "é",
                ".",
            ],
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "<unk>",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "<unk>",
                ".",
            ],
        )
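
The test above runs against a tiny sample SentencePiece vocabulary; with the released xlnet-base-cased vocabulary the same calls look like this sketch (exact pieces and ids depend on the downloaded vocab):

from transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
tokens = tokenizer.tokenize('This is a test')  # SentencePiece marks word starts with '▁'
ids = tokenizer.convert_tokens_to_ids(tokens)
text = tokenizer.decode(tokenizer.encode('This is a test', add_special_tokens=True))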
Example #7
    'epoch_size': 4
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        default='xlnet-base-cased',
                        help='model name or path')
    args = parser.parse_args()

    config = XLNetConfig.from_pretrained(args.model)
    model = XLNetModel.from_pretrained(args.model, config=config)
    tokenizer = XLNetTokenizer.from_pretrained(args.model)

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI'
    ]
    results = se.eval(transfer_tasks)

    sts_task_list = [
Example #8
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print('gpu count:', n_gpu)

    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(random_seed)

    os.makedirs(output_dir, exist_ok=True)

    model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
    model.to(device)
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

    no_decay = ['bias', 'LayerNorm.weight']
    ## note: no weight decay according to XLNet paper
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)

    train_data = load_and_cache_examples(data_path, 'race', tokenizer)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)

    num_train_steps = len(
        train_dataloader) // gradient_accumulation_steps * num_train_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataloader))
    logger.info("  Batch size = %d", train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    global_step = 0

    for ep in range(int(num_train_epochs)):
        model.train()
        max_score = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            output = model(input_ids=input_ids,
                           token_type_ids=segment_ids,
                           attention_mask=input_mask,
                           labels=label_ids)
            loss = output.loss
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                scheduler.step()
                model.zero_grad()
                global_step += 1

            if step % 800 == 0:
                logger.info("Training loss: {}, global step: {}".format(
                    tr_loss / nb_tr_steps, global_step))

        eval_data = load_and_cache_examples(data_path,
                                            'race',
                                            tokenizer,
                                            evaluate=True)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=eval_batch_size)

        logger.info("***** Running Dev Evaluation *****")
        logger.info("  Num examples = %d", len(eval_dataloader))
        logger.info("  Batch size = %d", eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        logits_all = []
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                eval_output = model(input_ids=input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
            tmp_eval_loss = eval_output.loss
            logits = eval_output.logits
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            for i in range(len(logits)):
                logits_all += [logits[i]]

            tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': tr_loss / nb_tr_steps
        }
        logger.info(" Epoch: %d", (ep + 1))
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))

        output_eval_file = os.path.join(output_dir, "results.txt")
        with open(output_eval_file, "a+") as writer:
            writer.write(" Epoch: " + str(ep + 1))
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(
            output_dir, "pytorch_model_{}epoch.bin".format(ep + 1))
        torch.save(model_to_save.state_dict(), output_model_file)
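
The accuracy helper used in the evaluation loop above is not shown; since its results are summed and later divided by the number of examples, a consistent sketch (an assumption, not the original code) is:

import numpy as np

def accuracy(logits, labels):
    # number of correct predictions in the batch; averaged over examples outside the loop
    preds = np.argmax(logits, axis=1)
    return np.sum(preds == labels)
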
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "fasta":
            FASTA_DATASET = True

            datasets = load_dataset_fasta(data_files, data_args.max_seq_length)
        else:
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = XLNetConfig()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = XLNetTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    tokenized_datasets = dict()
    for dataset_key, dataset in datasets.items():
        # Tokenize
        encodings = tokenizer(
            dataset['sequences'],
            truncation=True,
            padding='max_length', # TODO get from args passed in
            max_length=data_args.max_seq_length,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_attention_mask=False
        )
        
        torch_dataset = FastaDataset(encodings)
        tokenized_datasets[dataset_key] = torch_dataset


    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
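
FastaDataset and load_dataset_fasta are helpers defined elsewhere in this script; a minimal sketch of the dataset wrapper, consistent with how it is built from the tokenizer output above, under that assumption:

import torch
from torch.utils.data import Dataset

class FastaDataset(Dataset):
    """Wraps the tokenizer output (a dict of per-example lists) as a map-style dataset."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}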
Example #10
def make_xlnet_tokenizer() -> PreTrainedTokenizer:
    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    return tokenizer
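
A quick usage sketch, assuming a transformers version where tokenizers are callable:

tok = make_xlnet_tokenizer()
enc = tok('XLNet uses SentencePiece.', return_tensors='pt')
# enc['input_ids'] and enc['attention_mask'] can be fed straight to an XLNet model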
Example #11
    parser = ArgumentParser()
    parser.add_argument('--path')
    parser.add_argument('--xlnet', action='store_true', default=False, help='whether using xlnet tokenizer for preprocessing')
    parser.add_argument('--roberta', action='store_true', default=False)
    parser.add_argument('--bert', action='store_true', default=False)
    args = parser.parse_args()

    return args


if __name__ == '__main__':

    args = argparser()

    index, text, gold = load_data(args.path)
    if args.xlnet:
        model_version = 'xlnet-base-cased'
        tokenizer = XLNetTokenizer.from_pretrained(model_version, do_lower_case=True)

    elif args.bert:
        model_version = 'bert-base-uncased'
        tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)

    elif args.roberta:
        model_version = 'roberta-base'
        tokenizer = RobertaTokenizer.from_pretrained(model_version, do_lower_case=True)

    context, text_attention_mask = tokenization(text, tokenizer, args)
    save_preprocessing(context, text_attention_mask, gold)
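
load_data, tokenization, and save_preprocessing are not shown in this example; a minimal sketch of tokenization, under the assumption that it pads every text to a fixed length and returns token ids plus attention masks:

def tokenization(texts, tokenizer, args, max_len=512):
    ids, masks = [], []
    for t in texts:
        enc = tokenizer.encode(t, add_special_tokens=True)[:max_len]
        enc = enc + [tokenizer.pad_token_id] * (max_len - len(enc))
        ids.append(enc)
        masks.append([int(i != tokenizer.pad_token_id) for i in enc])
    return ids, masks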
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return self.examples[item]


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--overwrite_cache",
        default=False,
        type=bool,
    )
    parser.add_argument(
        "--model_type",
        type=str,
        default="xlnet",
        help="The model architecture to be trained or fine-tuned.",
    )
    args = parser.parse_args()

    path = "/Users/eyalorbach/data/movie_plots_short/valid"
    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased",
                                               cache_dir="/tmp/cache")
    mds = MaskedPlotDataset(tokenizer, args, path)
Example #13
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print('gpu count:', n_gpu)

    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(random_seed)

    os.makedirs(output_dir, exist_ok=True)

    model_state_dict = torch.load(output_model_file, map_location=device)
    model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased',
                                                   state_dict=model_state_dict)
    model.to(device)
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

    eval_data = load_and_cache_examples(data_path,
                                        'mc500',
                                        tokenizer,
                                        test=True)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=eval_batch_size)

    logger.info("***** Running Evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataloader))
    logger.info("  Batch size = %d", eval_batch_size)
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    logits_all = []
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            eval_output = model(input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
        tmp_eval_loss = eval_output.loss
        logits = eval_output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        for i in range(len(logits)):
            logits_all += [logits[i]]

        tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples

    result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy}
    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))

    output_eval_file = os.path.join(output_dir, "results.txt")
    with open(output_eval_file, "a+") as writer:
        for key in sorted(result.keys()):
            writer.write("%s = %s\n" % (key, str(result[key])))
Example #14
        summary = splitted[SUMMARY_INDEX].strip()
        text = splitted[TEXT_INDEX]
        for junk in JUNK_HEADER_TEXT:
            text = text.replace(junk, "").strip()

        # Don't accept content with too small of text content or title content. Often these are very bad examples.
        if len(text) < 1024:
            return None
        if len(summary) < 30:
            return None

        return {"summary": summary, "text": text}


tok = XLNetTokenizer.from_pretrained("xlnet-base-cased")


# This is a map function for processing reviews. It returns a dict:
#  { 'text' { input_ids_as_tensor },
#    'target' { input_ids_as_tensor } }
def map_tokenize_news(processed):
    text = processed["text"]
    text_enc = tok.encode(text,
                          add_special_tokens=False,
                          max_length=None,
                          pad_to_max_length=False)

    title = processed["summary"]
    # Insert the title as the second sentence, forcing the proper token types.
    title_enc = tok.encode(title,
Example #15
    def load(cls,
             pretrained_model_name_or_path,
             revision=None,
             tokenizer_class=None,
             use_fast=True,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        model config or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False), for the model types below where a fast tokenizer is available.
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        kwargs["revision"] = revision

        if tokenizer_class is None:
            tokenizer_class = cls._infer_tokenizer_class(
                pretrained_model_name_or_path)

        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if "AlbertTokenizer" in tokenizer_class:
            if use_fast:
                ret = AlbertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "XLMRobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLMRobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = RobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "XLNetTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLNetTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error(
                    'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
                )
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "CamembertTokenizer" in tokenizer_class:
            if use_fast:
                ret = CamembertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRContextEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRContextEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
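
A usage sketch for the loader above (the surrounding class is assumed to be FARM's Tokenizer):

tokenizer = Tokenizer.load('xlnet-base-cased', use_fast=False)
# the tokenizer class is inferred from the model name, so this returns an
# XLNetTokenizer loaded with keep_accents=True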
Example #16
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained XLNet model).")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--discr",
                        default=False,
                        action='store_true',
                        help="Whether to do discriminative fine-tuning.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--layers',
        type=int,
        nargs='+',
        default=[-2],
        help="choose the layers that used for downstream tasks, "
        "-2 means use pooled output, -1 means all layer,"
        "else means the detail layers. default is -2")
    parser.add_argument('--num_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--num_test_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--pooling_type',
                        default=None,
                        type=str,
                        choices=[None, 'mean', 'max'])
    args = parser.parse_args()

    processors = {
        "sst": SSTProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    summary_writer = SummaryWriter(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")

    model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased")

    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    #no_decay = ['bias', 'gamma', 'beta']

    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate':
        0.0
    }]

    optimizer = AdamW(optimizer_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)

    global_step = 0
    global_train_step = 0

    all_examples = processor.get_all_examples(args.data_dir)

    all_features = convert_examples_to_features(all_examples, label_list,
                                                args.max_seq_length, tokenizer)

    all_input_ids = all_features['input_ids']
    all_input_mask = all_features['attention_mask']
    all_segment_ids = all_features['token_type_ids']
    all_label_ids = all_features['labels']

    all_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                             all_label_ids)

    train_data, eval_data = random_split(all_data, [100000, 12428])

    eval_dataloader = DataLoader(eval_data,
                                 batch_size=args.eval_batch_size,
                                 shuffle=False)

    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Batch size = %d", args.train_batch_size)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        print("TOTAL STEPS: ",
              (len(train_dataloader) * int(args.num_train_epochs)))

        epoch = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            epoch += 1
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, token_type_ids, label_ids = batch
                # print("Input ids shape:", input_ids.shape)
                # print("Input mask shape:", input_mask.shape)
                # print("Tok type Ids shape:", segment_ids.shape)
                # print("Labels shape:", label_ids.shape)
                loss, _ = model(input_ids,
                                attention_mask=input_mask,
                                token_type_ids=token_type_ids,
                                labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Clip gradients before the update so the clipping actually
                    # takes effect (possibly comment this out).
                    max_grad_norm = 1.0
                    _clip_grad_norm(optimizer_parameters, max_grad_norm)

                    optimizer.step()  # We have accumulated enough gradients
                    # scheduler.step()

                    summary_writer.add_scalar('Loss/train', loss.item(),
                                              global_step)

                    model.zero_grad()
                    global_step += 1

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            pos_eval_prec, pos_eval_recall, pos_eval_f1 = 0, 0, 0
            neg_eval_prec, neg_eval_recall, neg_eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            with open(
                    os.path.join(args.output_dir,
                                 "results_ep" + str(epoch) + ".txt"),
                    "w") as f:
                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        eval_dataloader, desc="Evaluate"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(
                            input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.detach().to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output in outputs:
                        f.write(str(output) + "\n")
                    tmp_eval_accuracy = np.sum(outputs == label_ids)
                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_neg_sent(
                        outputs, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy
                    neg_eval_prec += tmp_eval_prec
                    neg_eval_recall += tmp_eval_recall
                    neg_eval_f1 += tmp_eval_f1

                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_pos_sent(
                        outputs, label_ids)
                    pos_eval_prec += tmp_eval_prec
                    pos_eval_recall += tmp_eval_recall
                    pos_eval_f1 += tmp_eval_f1

                    global_train_step += 1

                    summary_writer.add_scalar("Loss/test",
                                              tmp_eval_loss.mean().item(),
                                              global_train_step)
                    summary_writer.add_scalar("Accuracy/test",
                                              tmp_eval_accuracy,
                                              global_train_step)

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples

            pos_eval_prec = pos_eval_prec / nb_eval_steps
            pos_eval_recall = pos_eval_recall / nb_eval_steps
            pos_eval_f1 = pos_eval_f1 / nb_eval_steps

            neg_eval_prec = neg_eval_prec / nb_eval_steps
            neg_eval_recall = neg_eval_recall / nb_eval_steps
            neg_eval_f1 = neg_eval_f1 / nb_eval_steps

            result = {
                'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps,
                'pos_eval_precision': pos_eval_prec,
                'neg_eval_precision': neg_eval_prec,
                'pos_eval_recall': pos_eval_recall,
                'neg_eval_recall': neg_eval_recall,
                'pos_eval_f1': pos_eval_f1,
                'neg_eval_f1': neg_eval_f1
            }

            summary_writer.add_scalar("Epoch_loss/train", tr_loss, epoch)
            summary_writer.add_scalar("Epoch_loss/test", eval_loss, epoch)
            summary_writer.add_scalar("Epoch_accuracy/test", eval_accuracy,
                                      epoch)

            summary_writer.add_scalar("Epoch_positive_precision/test",
                                      pos_eval_prec, epoch)
            summary_writer.add_scalar("Epoch_negative_precision/test",
                                      neg_eval_prec, epoch)

            summary_writer.add_scalar("Epoch_positive_recall/test",
                                      pos_eval_recall, epoch)
            summary_writer.add_scalar("Epoch_negative_recall/test",
                                      neg_eval_recall, epoch)

            summary_writer.add_scalar("Epoch_positive_f1/test", pos_eval_f1,
                                      epoch)
            summary_writer.add_scalar("Epoch_negative_f1/test", neg_eval_f1,
                                      epoch)

            output_eval_file = os.path.join(
                args.output_dir, "eval_results_ep" + str(epoch) + ".txt")
            print("output_eval_file=", output_eval_file)
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
            print("Saving model")
            torch.save(
                model.module.state_dict(),
                os.path.join(
                    args.output_dir, "sst-phrases-finetuned-xlnet-model_" +
                    str(epoch) + ".pth"))
Beispiel #17
0
                                           'test.json')

    args.embedding_path = os.path.join(args.tokenizer_dir, 'embedding.bin')
    args.config = os.path.join(args.tokenizer_dir, 'knowledge_config.json')
    print(args)

    if tokenization == 'BERT':
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_dir)
        banwords = tokenizer.convert_tokens_to_ids(
            ['It', 'She', 'They', 'He', 'it', 'she', 'he', 'they'])
    elif tokenization == 'GPT2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_dir)
        banwords = tokenizer.convert_tokens_to_ids(
            ['It', 'She', 'They', 'He', 'it', 'she', 'he', 'they'])
    elif tokenization == 'XLNET':
        tokenizer = XLNetTokenizer.from_pretrained(args.tokenizer_dir)
        banwords = tokenizer.convert_tokens_to_ids(
            ['It', 'She', 'They', 'He', 'it', 'she', 'he', 'they'])
    else:
        raise NotImplementedError

    with open(args.config, 'r') as f:
        knowledge_config = json.load(f)
        config = SimpleNamespace(**knowledge_config)
    print(config)

    if args.option == 'compute_bleu':
        with open('decoded_results.json', 'r') as f:
            results = json.load(f)
        with open(args.test_path, 'r') as f:
            references = json.load(f)
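# --- Illustrative sketch (not part of the original snippet) --------------------
# The snippet is cut off before the actual BLEU computation. A minimal sketch of
# how a corpus-level BLEU score could be computed from the two files loaded
# above, assuming both JSON files hold parallel lists of plain strings (the real
# file layout and metric may differ):
from nltk.translate.bleu_score import corpus_bleu

hypotheses = [hyp.split() for hyp in results]                 # generated sentences
list_of_references = [[ref.split()] for ref in references]   # one reference each
print('corpus BLEU: {:.4f}'.format(corpus_bleu(list_of_references, hypotheses)))
# --------------------------------------------------------------------------------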
Beispiel #18
0
"""
import torch
from transformers import XLNetTokenizer

from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
            confusion_matrix, classification_report
import time
import sys

sys.path.append('/home/xijian/pycharm_projects/document-level-classification/')
from xlnet_hierarchical_attn.config import *
from xlnet_hierarchical_attn.prepare_data import load_data
from xlnet_hierarchical_attn.train import MyXLNetModel, printbar

tokenizer = XLNetTokenizer.from_pretrained(xlnet_model_dir+'spiece.model')

ngpu = 4 # 4

use_cuda = torch.cuda.is_available()  # check whether a GPU is available
device = torch.device("cuda:0" if (use_cuda and ngpu>0) else "cpu")
print('*'*8, 'device:', device)


# checkpoint = save_dir + 'epoch011_valacc0.971_ckpt.tar'
checkpoint = save_dir + last_new_checkpoint


@torch.no_grad()
def eval_step(model, inps, labs):
    input_ids, token_type_ids, attention_mask = inps
Beispiel #19
0
def load_tokenizer() -> XLNetTokenizer:
    tokenizer = XLNetTokenizer.from_pretrained(
        configs.data.path, max_len=configs.model.max_length, add_special_token=False)
    tokenizer.return_attention_mask = None
    return tokenizer
Beispiel #20
0
    parser.add_argument('--dataset',
                        type=str,
                        default='one-billion-words',
                        choices=['yelp', 'amazon', 'one-billion-words'])
    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    forward_model_path = '../checkpoints/forward_xlnet/{}'.format(args.dataset)

    backward_model_path = '../checkpoints/backward_xlnet/{}'.format(
        args.dataset)

    forward_model = XLNetLMHeadModel.from_pretrained(forward_model_path)
    backward_model = XLNetLMHeadModel.from_pretrained(backward_model_path)

    forward_tokenizer = XLNetTokenizer.from_pretrained(forward_model_path)
    backward_tokenizer = XLNetTokenizer.from_pretrained(backward_model_path)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)
    forward_model = forward_model.to(device)
    backward_model = backward_model.to(device)

    forward_testset = XLNetDataset(
        args.dataset,
        "test",
        tokenizer=forward_tokenizer,
        max_sentence_length=args.max_sentence_length,
        is_forward=1)
    backward_testset = XLNetDataset(
        args.dataset,
Beispiel #21
0
def run(gpu_id, options, distributed=False):
    if distributed:
        dist.init_process_group(
            backend="nccl",
            rank=gpu_id,
            world_size=options.num_gpus,
            init_method="env://",
        )
        torch.cuda.set_device(gpu_id)
    torch.manual_seed(options.seed)
    use_cuda = torch.cuda.is_available() and not options.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    logger = lavd.Logger(options.name, disabled=gpu_id != 0)
    # Parser needs to be rebuilt, since it can't be serialised and it is needed to even
    # detect the number of GPUs, but here it's only used to log it.
    parser = build_parser() if gpu_id == 0 else None

    spinner = logger.spinner("Initialising")
    spinner.start()

    checkpoint = (default_checkpoint
                  if options.checkpoint is None else load_checkpoint(
                      os.path.join(options.checkpoint, "stats.pt")))
    # Either use the checkpoint directory as the configuration or use one of the
    # available pre-trained models.
    pre_trained = options.checkpoint or options.pre_trained

    # All but the primary GPU wait here, so that only the primary process loads the
    # pre-trained model and the rest uses the cached version.
    if distributed and gpu_id != 0:
        torch.distributed.barrier()

    model_kind = checkpoint["model"].get("kind") or options.model_kind
    use_special = True
    masked_lm = True
    if model_kind == "bert":
        if pre_trained is None:
            pre_trained = "bert-base-german-cased"
        config = BertConfig.from_pretrained(pre_trained)
        model = BertForMaskedLM.from_pretrained(pre_trained, config=config)
        tokeniser = BertTokenizer.from_pretrained(pre_trained)
    elif model_kind == "bert-scratch":
        # The pre_trained here is only used for the configuration (num layers etc.);
        # the weights are not loaded.
        if pre_trained is None:
            pre_trained = "bert-base-german-cased"
        # Use either the provided vocabulary or the pre_trained one.
        vocab = options.vocab or pre_trained
        tokeniser = BertTokenizer.from_pretrained(vocab)
        config = BertConfig.from_pretrained(pre_trained)
        config.vocab_size = tokeniser.vocab_size
        model = BertForMaskedLM(config)
    elif model_kind == "gpt2":
        if pre_trained is None:
            pre_trained = "gpt2"
        config = GPT2Config.from_pretrained(pre_trained)
        model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config)
        tokeniser = GPT2Tokenizer.from_pretrained(pre_trained)
        masked_lm = False
        use_special = False
    elif model_kind == "gpt2-german":
        assert pre_trained is not None, "--pre-trained must be given for gpt2-german"
        config = GPT2Config.from_pretrained(pre_trained)
        model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config)
        # Using the XLNetTokenizer because the pre-trained German GPT-2 model uses
        # SentencePiece and that's the easiest way to use it.
        # That also means that the automatic tokenisation cannot be done, because
        # XLNet places the special tokens differently.
        tokeniser = XLNetTokenizer.from_pretrained(
            pre_trained,
            keep_accents=True,
            unk_token="<unk>",
            # start and end of sequence use the same token
            bos_token="<endoftext>",
            eos_token="<endoftext>",
        )
        masked_lm = False
        use_special = False
    elif model_kind == "gpt2-scratch":
        # The pre_trained here is only used for the configuration (num layers etc.);
        # the weights are not loaded.
        if pre_trained is None:
            pre_trained = "gpt2"
        # Use either the provided vocabulary or the pre_trained one.
        vocab = options.vocab or pre_trained
        tokeniser = GPT2Tokenizer.from_pretrained(vocab)
        config = GPT2Config.from_pretrained(pre_trained)
        config.vocab_size = tokeniser.vocab_size
        model = GPT2LMHeadModel(config)
        masked_lm = False
        use_special = False
    else:
        raise Exception("No model available for {}".format(model_kind))
    model = model.to(device)

    # Primary process has loaded the model and the other can now load the cached
    # version.
    if distributed and gpu_id == 0:
        torch.distributed.barrier()

    train_dataset = TextDataset(
        options.train_text,
        tokeniser,
        use_special=use_special,
        manual_special=model_kind == "gpt2-german",
    )
    train_sampler = (DistributedSampler(train_dataset,
                                        num_replicas=options.num_gpus,
                                        rank=gpu_id) if distributed else None)
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=options.batch_size,
        # Only shuffle when not using a sampler
        shuffle=train_sampler is None,
        num_workers=options.actual_num_workers,
        sampler=train_sampler,
        pin_memory=True,
    )

    validation_data_loaders = []
    for val_file in options.validation_text:
        vals = val_file.split("=", 1)
        if len(vals) > 1:
            # Remove whitespace around the name
            name = vals[0].strip()
            # Expand the ~ to the full path as it won't be done automatically since it's
            # not at the beginning of the word.
            file_path = os.path.expanduser(vals[1])
        else:
            name = None
            file_path = vals[0]
        validation_dataset = TextDataset(
            file_path,
            tokeniser,
            name=name,
            use_special=use_special,
            manual_special=model_kind == "gpt2-german",
        )
        validation_sampler = (DistributedSampler(
            validation_dataset, num_replicas=options.num_gpus, rank=gpu_id)
                              if distributed else None)
        validation_data_loader = DataLoader(
            validation_dataset,
            batch_size=options.batch_size,
            # Only shuffle when not using a sampler
            shuffle=validation_sampler is None,
            num_workers=options.actual_num_workers,
            sampler=validation_sampler,
            pin_memory=True,
        )
        validation_data_loaders.append(validation_data_loader)

    initial_lr = options.lr
    # Only restore the learning rate if resuming from a checkpoint and not manually
    # resetting the learning rate.
    if len(checkpoint["train"]["lr"]) > 0 and not options.reset_lr:
        initial_lr = checkpoint["train"]["lr"][-1]

    no_decay = ["bias", "LayerNorm.weight"]
    optimiser_grouped_parameters = [
        {
            "params": [
                param for name, param in model.named_parameters()
                if not any(nd in name for nd in no_decay)
            ],
            "weight_decay": options.weight_decay,
        },
        {
            "params": [
                param for name, param in model.named_parameters()
                if any(nd in name for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimiser = AdamW(optimiser_grouped_parameters,
                      lr=initial_lr,
                      eps=options.adam_eps)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimiser,
        num_warmup_steps=options.lr_warmup,
        num_training_steps=options.num_epochs,
    )

    amp_scaler = amp.GradScaler() if use_cuda and options.fp16 else None

    if distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[gpu_id],
                                        find_unused_parameters=True)

    validation_details = [
        OrderedDict(
            name=data_loader.dataset.name,
            path=data_loader.dataset.path,
            size=len(data_loader.dataset),
        ) for data_loader in validation_data_loaders
    ]
    experiment = OrderedDict(
        model_kind=model_kind,
        train=OrderedDict(path=train_dataset.path, size=len(train_dataset)),
        validation=validation_details,
        options=options,
    )
    log_experiment(logger, experiment)

    logger.log_command(parser, options)

    # Wait for all processes to load everything before starting training.
    # Not strictly necessary, since they will wait once the actual model is run, but
    # this makes it nicer to show the spinner until all of them are ready.
    if distributed:
        torch.distributed.barrier()
    spinner.stop()

    if options.checkpoint is not None:
        resume_text = "Resuming from - Epoch {epoch}".format(
            epoch=checkpoint["epoch"])
        logger.set_prefix(resume_text)
        epoch_results = [
            OrderedDict(
                name="Train",
                stats=OrderedDict(
                    loss=checkpoint["train"]["stats"]["loss"][-1],
                    perplexity=checkpoint["train"]["stats"]["perplexity"][-1],
                ),
            )
        ] + [
            OrderedDict(
                name=val_name,
                stats=OrderedDict(
                    loss=val_result["stats"]["loss"][-1],
                    perplexity=val_result["stats"]["perplexity"][-1],
                ),
            ) for val_name, val_result in checkpoint["validation"].items()
        ]
        log_epoch_stats(logger, epoch_results, metrics)

    train(
        logger,
        model,
        optimiser,
        train_data_loader,
        validation_data_loaders,
        lr_scheduler=lr_scheduler,
        device=device,
        num_epochs=options.num_epochs,
        checkpoint=checkpoint,
        model_kind=model_kind,
        amp_scaler=amp_scaler,
        masked_lm=masked_lm,
    )
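# --- Illustrative sketch (not part of the original snippet) --------------------
# For "gpt2-german" the TextDataset handles special tokens manually
# (manual_special=True), because the XLNetTokenizer used for its SentencePiece
# vocabulary appends <sep>/<cls> at the end, which does not match GPT-2's
# conventions. A minimal sketch of such manual handling, assuming the boundary
# token "<endoftext>" (passed as bos/eos above) simply wraps each line
# (the real TextDataset may do this differently):
def encode_line_manually(tokeniser, line):
    # Tokenise without letting the tokeniser add its own <sep>/<cls> pair.
    ids = tokeniser.encode(line, add_special_tokens=False)
    eot = tokeniser.convert_tokens_to_ids("<endoftext>")
    # GPT-2 style: the boundary token marks both the start and the end.
    return [eot] + ids + [eot]
# --------------------------------------------------------------------------------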
Beispiel #22
0
    args.output_file = os.path.join(output_path, suffix)
    if args.started_sentence_id==1 and os.path.exists(args.output_file):
        os.remove(args.output_file)
    print('The output file is ', args.output_file)

    args.input_file = os.path.join(args.input_file, f'''{args.dataset}/{args.keywords}keywords.txt''')

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)

    if args.random==0:
        classifier_model_path = '../checkpoints/xlnet_classifier/{}'.format(args.dataset)
        args.classifier_model_path = classifier_model_path
        classifier_model = XLNetForTokenClassification.from_pretrained(classifier_model_path,num_labels=4)
        classifier_model_tokenizer = XLNetTokenizer.from_pretrained(classifier_model_path)

        logger.logger.info('Initialize backward XLNetForTokenClassification from checkpoint {}.'.format(classifier_model_path))
        classifier_model = classifier_model.to(device)
        classifier_model.eval()
    else:
        classifier_model = None
        classifier_model_tokenizer = None


    if args.model_name == 'LSTMLMGenerate':
        forward_lm_path = '../checkpoints/forward_lstm_lm/{}/best.pt'.format(args.dataset)
        backward_lm_path = '../checkpoints/backward_lstm_lm/{}/best.pt'.format(args.dataset)
        args.forward_lm_path = forward_lm_path
        args.backward_lm_path = backward_lm_path
Beispiel #23
0
class XlnetProcessor(object):
    """Base class for data converters for sequence classification data sets."""
    def __init__(self, vocab_path, do_lower_case):
        self.tokenizer = XLNetTokenizer(vocab_path, do_lower_case)

    def get_train(self, data_file):
        """Gets a collection of `InputExample`s for the train set."""
        return self.read_data(data_file)

    def get_dev(self, data_file):
        """Gets a collection of `InputExample`s for the dev set."""
        return self.read_data(data_file)

    def get_test(self, lines):
        return lines

    def get_labels(self):
        """Gets the list of labels for this data set."""
        return [
            "toxic", "severe_toxic", "obscene", "threat", "insult",
            "identity_hate"
        ]

    @classmethod
    def read_data(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        if 'pkl' in str(input_file):
            lines = load_pickle(input_file)
        else:
            lines = input_file
        return lines

    def truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def create_examples(self, lines, example_type, cached_examples_file):
        '''
        Creates examples for data
        '''
        pbar = ProgressBar(n_total=len(lines))
        if cached_examples_file.exists():
            logger.info("Loading examples from cached file %s",
                        cached_examples_file)
            examples = torch.load(cached_examples_file)
        else:
            examples = []
            for i, line in enumerate(lines):
                guid = '%s-%d' % (example_type, i)
                text_a = line[0]
                label = line[1]
                if isinstance(label, str):
                    label = [float(x) for x in label.split(",")]
                else:
                    label = [float(x) for x in list(label)]
                text_b = None
                example = InputExample(guid=guid,
                                       text_a=text_a,
                                       text_b=text_b,
                                       label=label)
                examples.append(example)
                pbar.batch_step(step=i, info={}, bar_type='create examples')
            logger.info("Saving examples into cached file %s",
                        cached_examples_file)
            torch.save(examples, cached_examples_file)
        return examples

    def create_features(self, examples, max_seq_len, cached_features_file):
        '''
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        '''
        # Load data features from cache or dataset file
        pbar = ProgressBar(n_total=len(examples))
        if cached_features_file.exists():
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            features = torch.load(cached_features_file)
        else:
            features = []
            pad_token = self.tokenizer.convert_tokens_to_ids(
                [self.tokenizer.pad_token])[0]
            cls_token = self.tokenizer.cls_token
            sep_token = self.tokenizer.sep_token
            cls_token_segment_id = 2
            pad_token_segment_id = 4

            for ex_id, example in enumerate(examples):
                tokens_a = self.tokenizer.tokenize(example.text_a)
                tokens_b = None
                label_id = example.label

                if example.text_b:
                    tokens_b = self.tokenizer.tokenize(example.text_b)
                    # Modifies `tokens_a` and `tokens_b` in place so that the total
                    # length is less than the specified length.
                    # Account for [CLS], [SEP], [SEP] with "- 3"
                    self.truncate_seq_pair(tokens_a,
                                           tokens_b,
                                           max_length=max_seq_len - 3)
                else:
                    # Account for [CLS] and [SEP] with '-2'
                    if len(tokens_a) > max_seq_len - 2:
                        tokens_a = tokens_a[:max_seq_len - 2]

                # xlnet has a cls token at the end
                tokens = tokens_a + [sep_token]
                segment_ids = [0] * len(tokens)
                if tokens_b:
                    tokens += tokens_b + [sep_token]
                    segment_ids += [1] * (len(tokens_b) + 1)
                tokens += [cls_token]
                segment_ids += [cls_token_segment_id]

                input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                input_mask = [1] * len(input_ids)
                input_len = len(input_ids)
                padding_len = max_seq_len - len(input_ids)

                # pad on the left for xlnet
                input_ids = ([pad_token] * padding_len) + input_ids
                input_mask = ([0] * padding_len) + input_mask
                segment_ids = ([pad_token_segment_id] *
                               padding_len) + segment_ids

                assert len(input_ids) == max_seq_len
                assert len(input_mask) == max_seq_len
                assert len(segment_ids) == max_seq_len

                if ex_id < 2:
                    logger.info("*** Example ***")
                    logger.info(f"guid: {example.guid}" % ())
                    logger.info(
                        f"tokens: {' '.join([str(x) for x in tokens])}")
                    logger.info(
                        f"input_ids: {' '.join([str(x) for x in input_ids])}")
                    logger.info(
                        f"input_mask: {' '.join([str(x) for x in input_mask])}"
                    )
                    logger.info(
                        f"segment_ids: {' '.join([str(x) for x in segment_ids])}"
                    )

                feature = InputFeature(input_ids=input_ids,
                                       input_mask=input_mask,
                                       segment_ids=segment_ids,
                                       label_id=label_id,
                                       input_len=input_len)
                features.append(feature)
                pbar.batch_step(step=ex_id,
                                info={},
                                bar_type='create features')
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)
        return features

    def create_dataset(self, features, is_sorted=False):
        # Convert to Tensors and build dataset
        if is_sorted:
            logger.info("sorted data by th length of input")
            features = sorted(features,
                              key=lambda x: x.input_len,
                              reverse=True)
        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)
        return dataset
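# --- Illustrative sketch (not part of the original snippet) --------------------
# A self-contained sketch of the XLNet input layout built by create_features for
# one short sentence and max_seq_len = 8, using plain lists instead of the real
# tokenizer (token strings are made up for illustration):
tokens_a = ['▁the', '▁dog', '▁is', '▁hairy']
max_seq_len = 8

tokens = tokens_a + ['<sep>'] + ['<cls>']            # XLNet puts <cls> at the end
segment_ids = [0] * (len(tokens_a) + 1) + [2]        # cls_token_segment_id = 2
input_mask = [1] * len(tokens)

padding_len = max_seq_len - len(tokens)
tokens = ['<pad>'] * padding_len + tokens            # pad on the left for xlnet
input_mask = [0] * padding_len + input_mask
segment_ids = [4] * padding_len + segment_ids        # pad_token_segment_id = 4

print(tokens)       # ['<pad>', '<pad>', '▁the', '▁dog', '▁is', '▁hairy', '<sep>', '<cls>']
print(input_mask)   # [0, 0, 1, 1, 1, 1, 1, 1]
print(segment_ids)  # [4, 4, 0, 0, 0, 0, 0, 2]
# --------------------------------------------------------------------------------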
Beispiel #24
0
    def load_tokenizer(self, model_path):
        tokenizer = XLNetTokenizer.from_pretrained(model_path)
        return tokenizer
    parser.add_argument('--load_from_checkpoint', type=str)
    parser.add_argument('--continue_training', type=str)
    parser.add_argument('--output_directory', type=str)
    parser.add_argument('--tokenizer_path', type=str)
    parser.add_argument('--max_len', type=int, default=256)
    parser.add_argument('--max_steps', type=int, default=500)
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--num_gpus', type=int, default=4)
    args = parser.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    task = 'offense_rating'
    path2spiece = r'xlnet_base_cased\spiece.model'  # raw string so the backslash is kept literally
    max_len = 64
    tokenizer = XLNetTokenizer(vocab_file=path2spiece, do_lower_case=False)
    data_path = r'C:\Users\krish\hamze\SemEval-2021-Task-7-Hahackathon\xlnet\data\train.csv'
    df_data = pd.read_csv(data_path,sep=",",encoding="utf-8", usecols=['text', 'offense_rating'])
    # print(df_data.columns)
    print(df_data.head(n=20))
    # print(df_data.offense_rating.unique())
    # print(df_data.offense_rating.value_counts())
    sentences = df_data.text.to_list()
    labels = df_data.offense_rating.to_list()
    print(sentences[0], labels[0])
    tag2idx={'0': 0, '1': 1}
    tag2name={tag2idx[key] : key for key in tag2idx.keys()}


    
# Tokenization and Segmentation
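# --- Illustrative sketch (not part of the original snippet) --------------------
# The snippet breaks off here. A minimal sketch of what the tokenization step
# could look like for the `sentences` loaded above, assuming encode_plus with
# fixed-length padding to `max_len` (recent transformers API; the original
# script may have done this differently):
input_ids, attention_masks = [], []
for sent in sentences:
    enc = tokenizer.encode_plus(sent,
                                add_special_tokens=True,
                                max_length=max_len,
                                padding='max_length',
                                truncation=True)
    input_ids.append(enc['input_ids'])
    attention_masks.append(enc['attention_mask'])
# --------------------------------------------------------------------------------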
Beispiel #26
0
    def __init__(self, args: dict, doLower: bool, train_batchSize: int, testval_batchSize:int, learningRate: float, doLearningRateScheduler: bool, target_columns: list, smartBatching: bool = True, mixedPrecision: bool = True, labelSentences: dict = None, max_label_len= None, model= None, optimizer= None, loss_fct= None, device= "cpu"):
        self.args = args
        self.labelSentences = labelSentences
        self.tokenizer = None
        self.device = device
        self.train_batchSize = train_batchSize
        self.testval_batchSize = testval_batchSize
        self.learningRate = learningRate
        self.optimizer = optimizer
        self.doLearningRateScheduler = doLearningRateScheduler
        self.learningRateScheduler = None
        self.smartBatching = smartBatching
        self.mixedPrecision = mixedPrecision
        self.max_label_len = max_label_len
        self.target_columns = target_columns
        self.input_multiclass_as_one = False


        if self.args["model"] in ["distilbert", "bert", "xlnet", "lstm", "roberta", "distilroberta"]:
            # define loss function
            if loss_fct:
                self.loss_fct = loss_fct
            else:
                self.loss_fct = BCEWithLogitsLoss()

            # define how many labels need to be classified
            if self.args["binaryClassification"]:
                self.num_labels = 1
            else:
                self.num_labels = len(self.labelSentences.keys())

        # build model from the model_str
        if self.args["model"] == "distilbert":
            if doLower:
                self.model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
            else:
                self.model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

        elif self.args["model"] == "bert":
            if doLower:
                self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            else:
                self.model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

        elif self.args["model"] == "xlnet":
            if doLower:
                # no lowercased version exists, therefore the cased version is used in the doLower case as well
                self.model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
            else:
                self.model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

        elif self.args["model"] == "roberta":
            if doLower:
                self.model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
            else:
                self.model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        elif self.args["model"] == "distilroberta":
            if doLower:
                self.model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
            else:
                self.model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

        #elif self.args["model"] == "CNN":
        #    self.model = MyLSTM(num_labels=self.num_labels)

        elif self.args["model"] == "gradboost":
            self.model = GradientBoostingClassifier(learning_rate= self.learningRate, n_estimators= self.args["n_estimators"], max_depth= self.args["max_depth"], verbose=1)
            self.input_multiclass_as_one = True

        elif self.args["model"] == "randomforest":
            self.model = RandomForestClassifier(n_estimators= self.args["n_estimators"], max_depth= self.args["max_depth"], verbose=1, n_jobs= -1)
            self.input_multiclass_as_one = True

        elif self.args["model"] == "naivebayes":
            self.model = OneVsRestClassifier(MultinomialNB(alpha= self.learningRate))

        elif self.args["model"] == "naivebayes_norm":
            self.model = Pipeline([
                ("nb_norm", MinMaxScaler()),
                ("nb_clf", OneVsRestClassifier(MultinomialNB(alpha= self.learningRate)))
                ])

        elif self.args["model"] == "sgd":
            self.model = OneVsRestClassifier(SGDClassifier(alpha= self.learningRate, loss='hinge', penalty='l2'))

        else:
            logging.error("Define a model in the args dict.")
            sys.exit("Define a model in the args dict.")
Beispiel #27
0
}

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                               lowercase=True,
                                               add_special_tokens=True)

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2',
                                                   lowercase=True,
                                                   add_special_tokens=True)

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base',
                                                     lowercase=True,
                                                     add_special_tokens=True)

xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                                 lowercase=True,
                                                 add_special_tokens=True)


def data_generator(f_path, params):
    with open(f_path) as f:
        for line in f:
            line = line.rstrip()
            text, slot_intent = line.split('\t')
            words = text.split()[1:-1]
            slot_intent = slot_intent.split()
            slots, intent = slot_intent[1:-1], slot_intent[-1]
            words = [
                params['word2idx'].get(w, len(params['word2idx']))
                for w in words
            ]
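# --- Illustrative sketch (not part of the original snippet) --------------------
# An example of the line format data_generator expects, assuming an ATIS-style
# "BOS ... EOS\tO ... slots intent" layout (the example line is made up; the
# real data files may differ):
line = 'BOS i want to fly to boston EOS\tO O O O O O B-toloc.city_name atis_flight'
text, slot_intent = line.split('\t')
words = text.split()[1:-1]                 # drop BOS / EOS
slot_intent = slot_intent.split()
slots, intent = slot_intent[1:-1], slot_intent[-1]
print(words)   # ['i', 'want', 'to', 'fly', 'to', 'boston']
print(slots)   # ['O', 'O', 'O', 'O', 'O', 'B-toloc.city_name']
print(intent)  # 'atis_flight'
# --------------------------------------------------------------------------------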
Beispiel #28
0
def setupXLNetSentimentAnalysis(modelName):
    tokenizer = XLNetTokenizer.from_pretrained(modelName)
    model = XLNetForSequenceClassification.from_pretrained(modelName)
    return pipeline(task="sentiment-analysis",
                    model=model,
                    tokenizer=tokenizer)
Beispiel #29
0
from transformers.modeling_utils import (WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
                             SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
from transformers import XLNetTokenizer, XLNetForSequenceClassification, XLNetPreTrainedModel, XLNetModel
from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from torch.utils.data.dataset import ConcatDataset
from XLNet import XLNetForMultiSequenceClassification, Dataset_multi, Dataset_3Way, get_predictions

import pandas as pd
import numpy as np
import random
from IPython.display import clear_output
from tqdm.notebook import tqdm, trange

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
trainset = Dataset_3Way("RTE5_train", tokenizer=tokenizer, three_tasks=False)
train_sampler = RandomSampler(trainset)
train_dataloader = DataLoader(trainset, sampler=train_sampler, batch_size=1)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

PRETRAINED_MODEL_NAME = "xlnet-base-cased"
model = XLNetForMultiSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,
                                                            output_attentions=True,)
Beispiel #30
0
    def __init__(self,
                 model='bert',
                 model_size='base',
                 cased=True,
                 fine_tune=False,
                 use_proj=False,
                 proj_dim=256):
        super(Encoder, self).__init__()
        assert (model in MODEL_LIST)

        self.base_name = model
        self.model = None
        self.tokenizer = None
        self.num_layers = None
        self.hidden_size = None
        self.fine_tune = fine_tune

        # First initialize the model and tokenizer
        model_name = ''

        # Do we want the tokenizer to lower case or not
        do_lower_case = False
        if model == 'bert' and (not cased):
            # For other models this choice doesn't make sense since they are trained
            # on the cased version of the text.
            do_lower_case = True

        # Model is one of the BERT variants
        if 'bert' in model:
            assert (model_size in BERT_MODEL_SIZES)
            model_name = model + "-" + model_size
            if model == 'bert' and not cased:
                # Only original BERT supports uncased models
                model_name += '-uncased'
            elif model == 'roberta':
                # RoBERTa model types have no casing suffix in the HuggingFace map,
                # so we don't modify the model name.
                pass
            else:
                model_name += '-cased'

            if model == 'bert':
                self.model = BertModel.from_pretrained(
                    model_name, output_hidden_states=True)
                self.tokenizer = BertTokenizer.from_pretrained(
                    model_name, do_lower_case=do_lower_case)
            elif model == 'roberta':
                self.model = RobertaModel.from_pretrained(
                    model_name, output_hidden_states=True)
                self.tokenizer = RobertaTokenizer.from_pretrained(
                    model_name, do_lower_case=do_lower_case)
            elif model == 'spanbert':
                # Model is loaded in a different way
                # Earlier "pytorch_transformers" required a .tar.gz URL/file.
                # Updated library "transformers" requires pytorch_model.bin and config.json
                # separately. That's why we have to keep the SpanBERT codebase around and initialize
                # the model using that codebase (based on pytorch_pretrained_bert).
                # NOTE: By default transformer models are initialized to eval() mode!
                # Not using the eval() mode will result in randomness.
                self.model = SpanbertModel.from_pretrained(model_name).eval()
                # SpanBERT uses the same tokenizer as BERT (that's why the slicing in model name).
                # We use the tokenizer from "transformers" since it provides an almost unified API.
                self.tokenizer = BertTokenizer.from_pretrained(
                    model_name[4:], do_lower_case=do_lower_case)

            self.num_layers = self.model.config.num_hidden_layers + 1
            self.hidden_size = self.model.config.hidden_size

        elif model == "xlnet":
            model_name = model + "-" + model_size + "-cased"
            self.model = XLNetModel.from_pretrained(model_name,
                                                    output_hidden_states=True)
            self.tokenizer = XLNetTokenizer.from_pretrained(
                model_name, do_lower_case=do_lower_case)
            self.num_layers = self.model.config.num_hidden_layers + 1
            self.hidden_size = self.model.config.hidden_size

        # Set the model name
        self.model_name = model_name

        # Set shift size due to introduction of special tokens
        if self.base_name == 'xlnet':
            self.start_shift = 0
            self.end_shift = 2
        else:
            self.start_shift = (1 if self.tokenizer._cls_token else 0)
            self.end_shift = (1 if self.tokenizer._sep_token else 0)

        # Set requires_grad to False if not fine tuning
        if not fine_tune:
            for param in self.model.parameters():
                param.requires_grad = False

        if use_proj:
            # Apply a projection layer to output of pretrained models
            self.proj = nn.Linear(self.hidden_size, proj_dim)
            # Update the hidden size
            self.hidden_size = proj_dim
        else:
            self.proj = None
        # Set parameters required on top of pre-trained models
        self.weighing_params = nn.Parameter(torch.ones(self.num_layers))
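# --- Illustrative sketch (not part of the original snippet) --------------------
# weighing_params is presumably used to mix the hidden states of all layers into
# a single representation (ELMo-style scalar mixing), optionally followed by the
# projection layer. A sketch of such a mixing step, assuming `hidden_states` is
# the tuple of per-layer outputs obtained with output_hidden_states=True (the
# real forward pass of this Encoder may differ):
import torch
import torch.nn.functional as F

def mix_layers(hidden_states, weighing_params, proj=None):
    # hidden_states: tuple of [batch, seq_len, hidden] tensors, one per layer
    weights = F.softmax(weighing_params, dim=0)
    stacked = torch.stack(hidden_states, dim=0)               # [layers, batch, seq, hidden]
    mixed = (weights.view(-1, 1, 1, 1) * stacked).sum(dim=0)  # [batch, seq, hidden]
    if proj is not None:
        mixed = proj(mixed)
    return mixed
# --------------------------------------------------------------------------------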