Example #1
def parse(jsonl_path, tokenizer, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    with open(jsonl_path, encoding="utf-8") as f:
        rows = [json.loads(row) for row in f]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # sentence1 text
    sentence1s = []
    # sentence2 text
    sentence2s = []
    # label
    labels = []

    xlnet_token_ids = []
    xlnet_token_masks = []
    xlnet_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        index = row["idx"]
        sentence1 = row["premise"]
        sentence2 = row["hypothesis"]
        label = row["label"] if "label" in row else 'entailment'

        sentence1s.append(sentence1)
        sentence2s.append(sentence2)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sentence1)
        sent2_tokens = tokenizer.tokenize(sentence2)

        if len(sent1_tokens) + len(sent2_tokens) > max_len:
            max_len = len(sent1_tokens) + len(sent2_tokens)

        while True:
            total_length = len(sent1_tokens) + len(sent2_tokens)
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if total_length <= max_sequence_length - 3:
                break
            if len(sent1_tokens) > len(sent2_tokens):
                sent1_tokens.pop()
            else:
                sent2_tokens.pop()

        # Convert to XLNET manner
        tokens = ["[CLS]"] + sent1_tokens + ["[SEP]"]
        token_segments = [0] * len(tokens)

        tokens += sent2_tokens + ["[SEP]"]
        token_segments += [1] * (len(sent2_tokens) + 1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        xlnet_token_ids.append(torch.LongTensor(token_ids))
        xlnet_token_masks.append(torch.LongTensor(token_masks))
        xlnet_token_segments.append(torch.LongTensor(token_segments))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sentence1": sentence1s,
            "sentence2": sentence2s,
            "token_ids": xlnet_token_ids,
            "token_masks": xlnet_token_masks,
            "token_segments": xlnet_token_segments,
        },
        Y_dict={"labels": labels},
    )
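
The parse() above also relies on module-level names from its source repository (logger, json, torch, np, TASK_NAME, SuperGLUE_LABEL_MAPPING, MultitaskDataset). A minimal, hypothetical call site might look like the following sketch; the file name and tokenizer choice are assumptions, since any tokenizer exposing tokenize() and convert_tokens_to_ids() fits the interface.

from transformers import BertTokenizer

# Hypothetical usage sketch; "train.jsonl" and the tokenizer are assumptions, not from the repo.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = parse(
    jsonl_path="train.jsonl",      # SuperGLUE-style JSONL split
    tokenizer=tokenizer,           # must provide tokenize()/convert_tokens_to_ids()
    max_data_samples=None,         # keep every row
    max_sequence_length=128,       # truncation budget incl. [CLS]/[SEP]/[SEP]
)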
Example #2
def parse(jsonl_path, tokenizer, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    with open(jsonl_path, encoding="utf-8") as f:
        rows = [json.loads(row) for row in f]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # sentence1
    sent1s = []
    # sentence2
    sent2s = []
    # choice1
    choice1s = []
    # choice2
    choice2s = []

    labels = []

    xlnet_token1_ids = []
    xlnet_token2_ids = []

    # Check the maximum token length
    max_len = -1

    for sample in rows:
        index = sample["idx"]
        sent1 = sample["premise"]
        sent2 = sample["question"]

        sent2 = ("What was the cause of this?"
                 if sent2 == "cause" else "What happened as a result?")

        choice1 = sample["choice1"]
        choice2 = sample["choice2"]
        label = sample["label"] if "label" in sample else True
        sent1s.append(sent1)
        sent2s.append(sent2)
        choice1s.append(choice1)
        choice2s.append(choice2)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sent1)
        sent2_tokens = tokenizer.tokenize(sent2)

        # Tokenize choices
        choice1_tokens = tokenizer.tokenize(choice1)
        choice2_tokens = tokenizer.tokenize(choice2)

        # Convert to XLNET manner
        tokens1 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                   ["[SEP]"] + choice1_tokens + ["[SEP]"])
        tokens2 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                   ["[SEP]"] + choice2_tokens + ["[SEP]"])

        token1_ids = tokenizer.convert_tokens_to_ids(tokens1)
        token2_ids = tokenizer.convert_tokens_to_ids(tokens2)

        if len(token1_ids) > max_len:
            max_len = len(token1_ids)
        if len(token2_ids) > max_len:
            max_len = len(token2_ids)

        xlnet_token1_ids.append(torch.LongTensor(token1_ids))
        xlnet_token2_ids.append(torch.LongTensor(token2_ids))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sentence1": sent1s,
            "sentence2": sent2s,
            "choice1": choice1s,
            "choice2": choice2s,
            "token1_ids": xlnet_token1_ids,
            "token2_ids": xlnet_token2_ids,
        },
        Y_dict={"labels": labels},
    )
Example #3
def parse(csv_path, tokenizer, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {csv_path}.")
    rows = pd.read_csv(csv_path)

    # for i in range(2):
    #     logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # sentence1
    sent1s = []
    # sentence2
    sent2s = []
    # choice1
    choice1s = []
    # choice2
    choice2s = []
    # choice3
    choice3s = []
    # choice4
    choice4s = []

    labels = []

    bert_token1_ids = []
    bert_token2_ids = []
    bert_token3_ids = []
    bert_token4_ids = []

    # Check the maximum token length
    max_len = -1

    for ex_idx, ex in rows.iterrows():
        sent1 = ex["sent1"]
        sent2 = ex["sent2"]

        choice1 = ex["ending0"]
        choice2 = ex["ending1"]
        choice3 = ex["ending2"]
        choice4 = ex["ending3"]

        label = ex["label"] if "label" in ex else 0

        sent1s.append(sent1)
        sent2s.append(sent2)
        choice1s.append(choice1)
        choice2s.append(choice2)
        choice3s.append(choice3)
        choice4s.append(choice4)

        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sent1)
        sent2_tokens = tokenizer.tokenize(sent2)
        choice1_tokens = tokenizer.tokenize(choice1)
        choice2_tokens = tokenizer.tokenize(choice2)
        choice3_tokens = tokenizer.tokenize(choice3)
        choice4_tokens = tokenizer.tokenize(choice4)

        # Convert to BERT manner
        bert_token1 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                       choice1_tokens + ["[SEP]"])
        bert_token2 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                       choice2_tokens + ["[SEP]"])
        bert_token3 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                       choice3_tokens + ["[SEP]"])
        bert_token4 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                       choice4_tokens + ["[SEP]"])

        token1_ids = tokenizer.convert_tokens_to_ids(bert_token1)
        token2_ids = tokenizer.convert_tokens_to_ids(bert_token2)
        token3_ids = tokenizer.convert_tokens_to_ids(bert_token3)
        token4_ids = tokenizer.convert_tokens_to_ids(bert_token4)

        if len(token1_ids) > max_len:
            max_len = len(token1_ids)
        if len(token2_ids) > max_len:
            max_len = len(token2_ids)
        if len(token3_ids) > max_len:
            max_len = len(token3_ids)
        if len(token4_ids) > max_len:
            max_len = len(token4_ids)

        bert_token1_ids.append(torch.LongTensor(token1_ids))
        bert_token2_ids.append(torch.LongTensor(token2_ids))
        bert_token3_ids.append(torch.LongTensor(token3_ids))
        bert_token4_ids.append(torch.LongTensor(token4_ids))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sent1": sent1s,
            "sent2": sent2s,
            "choice1": choice1s,
            "choice2": choice2s,
            "choice3": choice3s,
            "choice4": choice4s,
            "token1_ids": bert_token1_ids,
            "token2_ids": bert_token2_ids,
            "token3_ids": bert_token3_ids,
            "token4_ids": bert_token4_ids,
        },
        Y_dict={"labels": labels},
    )
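
Each example here yields four variable-length id tensors, one per candidate ending. A sketch (not from the repo) of how they might be padded and stacked into the (batch, num_choices, seq_len) layout that multiple-choice heads typically expect:

import torch
from torch.nn.utils.rnn import pad_sequence

def stack_choice_ids(choice_id_lists, pad_id=0):
    # choice_id_lists: e.g. [bert_token1_ids, ..., bert_token4_ids],
    # each a list of 1-D LongTensors of varying length.
    padded = [pad_sequence(ids, batch_first=True, padding_value=pad_id)
              for ids in choice_id_lists]
    seq_len = max(p.size(1) for p in padded)
    padded = [torch.nn.functional.pad(p, (0, seq_len - p.size(1)), value=pad_id)
              for p in padded]
    return torch.stack(padded, dim=1)  # (batch, num_choices, seq_len)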
Example #4
def parse_from_rows(rows, tokenizer, max_sequence_length):

    # paragraph ids
    pids = []
    # question ids
    qids = []
    # answer ids
    aids = []

    # paragraph text
    paras = []
    # question text
    questions = []
    # answer text
    answers = []
    # labels
    labels = []

    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        pid = row["pid"]
        qid = row["qid"]
        aid = row["aid"]

        para_token = tokenizer.tokenize(row["paragraph"])[:max_sequence_length - 2]
        question_token = tokenizer.tokenize(row["question"])[:max_sequence_length - 2]
        answer_token = tokenizer.tokenize(row["answer"])[:max_sequence_length - 2]

        # Generate tokens
        tokens = (["[CLS]"] + para_token + ["[SEP]"] + question_token +
                  answer_token + ["[SEP]"])
        # Single-segment input: every token gets segment id 0
        token_segments = [0] * len(tokens)
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        token_masks = [1] * len(token_ids)

        if len(tokens) > max_len:
            max_len = len(tokens)

        # Add to list
        paras.append(row["paragraph"])
        questions.append(row["question"])
        answers.append(row["answer"])

        label = row["label"]
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        pids.append(pid)
        qids.append(qid)
        aids.append(aid)

        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "pid": pids,
            "qid": qids,
            "aid": aids,
            "para": paras,
            "question": questions,
            "answer": answers,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
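
Because the returned token_ids, token_masks, and token_segments are lists of LongTensors with different lengths, they still need padding before batching. A minimal sketch, assuming zero-padding and batch-first layout (not part of the repo):

from torch.nn.utils.rnn import pad_sequence

def pad_token_fields(token_ids, token_masks, token_segments, pad_id=0):
    # Pads each list of 1-D LongTensors to the longest sequence in the batch.
    return (
        pad_sequence(token_ids, batch_first=True, padding_value=pad_id),
        pad_sequence(token_masks, batch_first=True, padding_value=0),
        pad_sequence(token_segments, batch_first=True, padding_value=0),
    )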
Example #5
def parse(jsonl_path, tokenizer, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    with open(jsonl_path, encoding="utf-8") as f:
        rows = [json.loads(row) for row in f]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # sentence text
    sentences = []
    # span1
    span1s = []
    # span2
    span2s = []
    # span1 idx
    span1_idxs = []
    # span2 idx
    span2_idxs = []
    # label
    labels = []

    token1_idxs = []
    token2_idxs = []

    bert_tokens = []
    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        index = row["idx"]

        text = row["text"]
        span1_text = row["target"]["span1_text"]
        span2_text = row["target"]["span2_text"]
        span1_index = row["target"]["span1_index"]
        span2_index = row["target"]["span2_index"]

        label = row["label"] if "label" in row else True

        span1_char_index = get_char_index(text, span1_text, span1_index)
        span2_char_index = get_char_index(text, span2_text, span2_index)

        assert span1_char_index is not None, f"Check example {id} in {jsonl_path}"
        assert span2_char_index is not None, f"Check example {id} in {jsonl_path}"

        # Tokenize sentences
        bert_tokens_sub1 = tokenizer.tokenize(
            text[:min(span1_char_index[0], span2_char_index[0])])

        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub2 = tokenizer.tokenize(
                text[span1_char_index[0]:span1_char_index[1]])
            token1_idx = [
                len(bert_tokens_sub1) + 1,
                len(bert_tokens_sub1) + len(bert_tokens_sub2),
            ]
        else:
            bert_tokens_sub2 = tokenizer.tokenize(
                text[span2_char_index[0]:span2_char_index[1]])
            token2_idx = [
                len(bert_tokens_sub1) + 1,
                len(bert_tokens_sub1) + len(bert_tokens_sub2),
            ]

        sub3_st = (span1_char_index[1]
                   if span1_char_index[0] < span2_char_index[0] else
                   span2_char_index[1])
        sub3_ed = (span1_char_index[0]
                   if span1_char_index[0] > span2_char_index[0] else
                   span2_char_index[0])

        bert_tokens_sub3 = tokenizer.tokenize(text[sub3_st:sub3_ed])
        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub4 = tokenizer.tokenize(
                text[span2_char_index[0]:span2_char_index[1]])
            cur_len = (len(bert_tokens_sub1) + len(bert_tokens_sub2) +
                       len(bert_tokens_sub3))
            token2_idx = [cur_len + 1, cur_len + len(bert_tokens_sub4)]
        else:
            bert_tokens_sub4 = tokenizer.tokenize(
                text[span1_char_index[0]:span1_char_index[1]])
            cur_len = (len(bert_tokens_sub1) + len(bert_tokens_sub2) +
                       len(bert_tokens_sub3))
            token1_idx = [cur_len + 1, cur_len + len(bert_tokens_sub4)]

        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub5 = tokenizer.tokenize(text[span2_char_index[1]:])
        else:
            bert_tokens_sub5 = tokenizer.tokenize(text[span1_char_index[1]:])

        tokens = (["[CLS]"] + bert_tokens_sub1 + bert_tokens_sub2 +
                  bert_tokens_sub3 + bert_tokens_sub4 + bert_tokens_sub5 +
                  ["[SEP]"])

        if len(tokens) > max_len:
            max_len = len(tokens)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        token_segments = [0] * len(token_ids)
        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        token1_idxs.append(token1_idx)
        token2_idxs.append(token2_idx)

        sentences.append(text)
        span1s.append(span1_text)
        span2s.append(span2_text)
        span1_idxs.append(span1_index)
        span2_idxs.append(span2_index)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        bert_tokens.append(tokens)
        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    token1_idxs = torch.from_numpy(np.array(token1_idxs))
    token2_idxs = torch.from_numpy(np.array(token2_idxs))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sentence": sentences,
            "span1": span1s,
            "span2": span2s,
            "span1_idx": span1_idxs,
            "span2_idx": span2_idxs,
            "token1_idx": token1_idxs,
            "token2_idx": token2_idxs,
            "tokens": bert_tokens,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
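
get_char_index() is defined elsewhere in the source repository and is not shown here; the assertions above only require that it map (text, span_text, word-level span_index) to a (start_char, end_char) pair, or None when the span cannot be located. A purely illustrative sketch under that assumption (not the repo's implementation):

def get_char_index_sketch(text, span_text, span_index):
    # Illustrative only: find the character offsets of span_text, assuming
    # span_index is the whitespace-token position where the span starts.
    offset = 0
    for i, word in enumerate(text.split()):
        start = text.index(word, offset)
        offset = start + len(word)
        if i == span_index:
            if text[start:start + len(span_text)] == span_text:
                return start, start + len(span_text)
            return None
    return None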
Example #6
def parse_from_rows(rows, tokenizer, max_sequence_length):

    # sentence1 text
    sentence1s = []
    # sentence2 text
    sentence2s = []
    # sentence1 idx
    sentence1_idxs = []
    # sentence2 idx
    sentence2_idxs = []
    # word in common
    words = []
    # pos tag
    poses = []
    # label
    labels = []

    token1_idxs = []
    token2_idxs = []

    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        index = row["idx"]

        sentence1 = row["sentence1"]
        sentence2 = row["sentence2"]
        word = row["word"]
        pos = row["pos"]
        sentence1_idx = row["sentence1_idx"]
        sentence2_idx = row["sentence2_idx"]
        label = row["label"]

        sentence1s.append(sentence1)
        sentence2s.append(sentence2)
        sentence1_idxs.append(sentence1_idx)
        sentence2_idxs.append(sentence2_idx)
        words.append(word)
        poses.append(pos)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sentence1)
        sent2_tokens = tokenizer.tokenize(sentence2)

        word_tokens_in_sent1 = tokenizer.tokenize(sentence1.split()[sentence1_idx])
        word_tokens_in_sent2 = tokenizer.tokenize(sentence2.split()[sentence2_idx])

        while True:
            total_length = len(sent1_tokens) + len(sent2_tokens)
            if total_length > max_len:
                max_len = total_length
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if total_length <= max_sequence_length - 3:
                break
            if len(sent1_tokens) > len(sent2_tokens):
                sent1_tokens.pop()
            else:
                sent2_tokens.pop()

        for idx in range(sentence1_idx - 1, len(sent1_tokens)):
            if (
                sent1_tokens[idx : idx + len(word_tokens_in_sent1)]
                == word_tokens_in_sent1
            ):
                token1_idxs.append(idx + 1)  # Add [CLS]
                break

        for idx in range(sentence2_idx - 1, len(sent2_tokens)):
            if (
                sent2_tokens[idx : idx + len(word_tokens_in_sent2)]
                == word_tokens_in_sent2
            ):
                token2_idxs.append(
                    idx + len(sent1_tokens) + 2
                )  # Add the length of the first sentence and [CLS] + [SEP]
                break

        # Convert to BERT manner
        tokens = ["[CLS]"] + sent1_tokens + ["[SEP]"]
        token_segments = [0] * len(tokens)

        tokens += sent2_tokens + ["[SEP]"]
        token_segments += [1] * (len(sent2_tokens) + 1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    token1_idxs = torch.from_numpy(np.array(token1_idxs))
    token2_idxs = torch.from_numpy(np.array(token2_idxs))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sentence1": sentence1s,
            "sentence2": sentence2s,
            "word": words,
            "pos": poses,
            "sentence1_idx": sentence1_idxs,
            "sentence2_idx": sentence2_idxs,
            "token1_idx": token1_idxs,
            "token2_idx": token2_idxs,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
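
token1_idx and token2_idx point at the marked word in each tokenized sentence pair. The following is a sketch of one typical downstream use (an assumption, not code from the repo): selecting the corresponding hidden states from an encoder output with advanced indexing.

import torch

hidden = torch.randn(4, 128, 768)           # stand-in for encoder output (batch, seq_len, dim)
token1_idx = torch.tensor([3, 7, 1, 12])    # one word position per example
token2_idx = torch.tensor([20, 15, 9, 30])

batch = torch.arange(hidden.size(0))
word1_repr = hidden[batch, token1_idx]      # (batch, dim)
word2_repr = hidden[batch, token2_idx]      # (batch, dim)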