import json
import logging

import numpy as np
import pandas as pd
import torch

logger = logging.getLogger(__name__)

# Assumed to be provided elsewhere in the surrounding repo:
# SuperGLUE_LABEL_MAPPING, TASK_NAME, MultitaskDataset, and get_char_index.


def parse(jsonl_path, tokenizer, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    rows = [json.loads(row) for row in open(jsonl_path, encoding="utf-8")]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # sentence1 text
    sentence1s = []
    # sentence2 text
    sentence2s = []
    # label
    labels = []

    xlnet_token_ids = []
    xlnet_token_masks = []
    xlnet_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        index = row["idx"]
        sentence1 = row["premise"]
        sentence2 = row["hypothesis"]
        label = row["label"] if "label" in row else "entailment"

        sentence1s.append(sentence1)
        sentence2s.append(sentence2)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sentence1)
        sent2_tokens = tokenizer.tokenize(sentence2)

        if len(sent1_tokens) + len(sent2_tokens) > max_len:
            max_len = len(sent1_tokens) + len(sent2_tokens)

        # Trim the longer sentence, one token at a time, until the pair fits
        while True:
            total_length = len(sent1_tokens) + len(sent2_tokens)
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if total_length <= max_sequence_length - 3:
                break
            if len(sent1_tokens) > len(sent2_tokens):
                sent1_tokens.pop()
            else:
                sent2_tokens.pop()

        # Convert to XLNet manner
        tokens = ["[CLS]"] + sent1_tokens + ["[SEP]"]
        token_segments = [0] * len(tokens)
        tokens += sent2_tokens + ["[SEP]"]
        token_segments += [1] * (len(sent2_tokens) + 1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        xlnet_token_ids.append(torch.LongTensor(token_ids))
        xlnet_token_masks.append(torch.LongTensor(token_masks))
        xlnet_token_segments.append(torch.LongTensor(token_segments))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sentence1": sentence1s,
            "sentence2": sentence2s,
            "token_ids": xlnet_token_ids,
            "token_masks": xlnet_token_masks,
            "token_segments": xlnet_token_segments,
        },
        Y_dict={"labels": labels},
    )
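# Usage sketch (an addition, not from the original repo): how the parser above
# might be driven with a Hugging Face tokenizer. The JSONL path, sample count,
# and sequence length below are hypothetical placeholders.
def _demo_parse():
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    dataset = parse(
        jsonl_path="data/CB/val.jsonl",  # hypothetical path
        tokenizer=tokenizer,
        max_data_samples=8,  # keep only the first 8 rows for a quick check
        max_sequence_length=128,
    )
    logger.info(f"Parsed {len(dataset.X_dict['token_ids'])} examples")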
def parse(jsonl_path, tokenizer, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    rows = [json.loads(row) for row in open(jsonl_path, encoding="utf-8")]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # sentence1
    sent1s = []
    # sentence2
    sent2s = []
    # choice1
    choice1s = []
    # choice2
    choice2s = []

    labels = []

    xlnet_token1_ids = []
    xlnet_token2_ids = []

    # Check the maximum token length
    max_len = -1

    for sample in rows:
        index = sample["idx"]
        sent1 = sample["premise"]
        sent2 = sample["question"]
        # Map the question field to a natural-language prompt
        sent2 = (
            "What was the cause of this?"
            if sent2 == "cause"
            else "What happened as a result?"
        )
        choice1 = sample["choice1"]
        choice2 = sample["choice2"]
        label = sample["label"] if "label" in sample else True

        sent1s.append(sent1)
        sent2s.append(sent2)
        choice1s.append(choice1)
        choice2s.append(choice2)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sent1)
        sent2_tokens = tokenizer.tokenize(sent2)

        # Tokenize choices
        choice1_tokens = tokenizer.tokenize(choice1)
        choice2_tokens = tokenizer.tokenize(choice2)

        # Convert to XLNet manner: one candidate sequence per choice
        tokens1 = (
            ["[CLS]"]
            + sent1_tokens
            + ["[SEP]"]
            + sent2_tokens
            + ["[SEP]"]
            + choice1_tokens
            + ["[SEP]"]
        )
        tokens2 = (
            ["[CLS]"]
            + sent1_tokens
            + ["[SEP]"]
            + sent2_tokens
            + ["[SEP]"]
            + choice2_tokens
            + ["[SEP]"]
        )

        token1_ids = tokenizer.convert_tokens_to_ids(tokens1)
        token2_ids = tokenizer.convert_tokens_to_ids(tokens2)

        if len(token1_ids) > max_len:
            max_len = len(token1_ids)
        if len(token2_ids) > max_len:
            max_len = len(token2_ids)

        xlnet_token1_ids.append(torch.LongTensor(token1_ids))
        xlnet_token2_ids.append(torch.LongTensor(token2_ids))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sentence1": sent1s,
            "sentence2": sent2s,
            "choice1": choice1s,
            "choice2": choice2s,
            "token1_ids": xlnet_token1_ids,
            "token2_ids": xlnet_token2_ids,
        },
        Y_dict={"labels": labels},
    )
def parse(csv_path, tokenizer, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {csv_path}.")
    rows = pd.read_csv(csv_path)
    # for i in range(2):
    #     logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # sentence1
    sent1s = []
    # sentence2
    sent2s = []
    # choice1
    choice1s = []
    # choice2
    choice2s = []
    # choice3
    choice3s = []
    # choice4
    choice4s = []

    labels = []

    bert_token1_ids = []
    bert_token2_ids = []
    bert_token3_ids = []
    bert_token4_ids = []

    # Check the maximum token length
    max_len = -1

    for ex_idx, ex in rows.iterrows():
        sent1 = ex["sent1"]
        sent2 = ex["sent2"]

        choice1 = ex["ending0"]
        choice2 = ex["ending1"]
        choice3 = ex["ending2"]
        choice4 = ex["ending3"]

        label = ex["label"] if "label" in ex else 0

        sent1s.append(sent1)
        sent2s.append(sent2)
        choice1s.append(choice1)
        choice2s.append(choice2)
        choice3s.append(choice3)
        choice4s.append(choice4)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sent1)
        sent2_tokens = tokenizer.tokenize(sent2)

        # Tokenize choices
        choice1_tokens = tokenizer.tokenize(choice1)
        choice2_tokens = tokenizer.tokenize(choice2)
        choice3_tokens = tokenizer.tokenize(choice3)
        choice4_tokens = tokenizer.tokenize(choice4)

        # Convert to BERT manner: one candidate sequence per ending
        bert_token1 = (
            ["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens + choice1_tokens + ["[SEP]"]
        )
        bert_token2 = (
            ["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens + choice2_tokens + ["[SEP]"]
        )
        bert_token3 = (
            ["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens + choice3_tokens + ["[SEP]"]
        )
        bert_token4 = (
            ["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens + choice4_tokens + ["[SEP]"]
        )

        token1_ids = tokenizer.convert_tokens_to_ids(bert_token1)
        token2_ids = tokenizer.convert_tokens_to_ids(bert_token2)
        token3_ids = tokenizer.convert_tokens_to_ids(bert_token3)
        token4_ids = tokenizer.convert_tokens_to_ids(bert_token4)

        if len(token1_ids) > max_len:
            max_len = len(token1_ids)
        if len(token2_ids) > max_len:
            max_len = len(token2_ids)
        if len(token3_ids) > max_len:
            max_len = len(token3_ids)
        if len(token4_ids) > max_len:
            max_len = len(token4_ids)

        bert_token1_ids.append(torch.LongTensor(token1_ids))
        bert_token2_ids.append(torch.LongTensor(token2_ids))
        bert_token3_ids.append(torch.LongTensor(token3_ids))
        bert_token4_ids.append(torch.LongTensor(token4_ids))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sent1": sent1s,
            "sent2": sent2s,
            "choice1": choice1s,
            "choice2": choice2s,
            "choice3": choice3s,
            "choice4": choice4s,
            "token1_ids": bert_token1_ids,
            "token2_ids": bert_token2_ids,
            "token3_ids": bert_token3_ids,
            "token4_ids": bert_token4_ids,
        },
        Y_dict={"labels": labels},
    )
def parse_from_rows(rows, tokenizer, max_sequence_length):
    # paragraph ids
    pids = []
    # question ids
    qids = []
    # answer ids
    aids = []
    # paragraph text
    paras = []
    # question text
    questions = []
    # answer text
    answers = []
    # labels
    labels = []

    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        pid = row["pid"]
        qid = row["qid"]
        aid = row["aid"]

        # Note: each field is truncated to max_sequence_length - 2 on its own,
        # so the combined sequence can still exceed max_sequence_length.
        para_token = tokenizer.tokenize(row["paragraph"])[: max_sequence_length - 2]
        question_token = tokenizer.tokenize(row["question"])[: max_sequence_length - 2]
        answer_token = tokenizer.tokenize(row["answer"])[: max_sequence_length - 2]

        # Generate tokens
        tokens = (
            ["[CLS]"] + para_token + ["[SEP]"] + question_token + answer_token + ["[SEP]"]
        )
        # No token segments: both halves use segment id 0
        token_segments = [0] * (len(para_token) + 2) + [0] * (
            len(question_token) + len(answer_token) + 1
        )
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        if len(tokens) > max_len:
            max_len = len(tokens)

        # Add to list
        paras.append(row["paragraph"])
        questions.append(row["question"])
        answers.append(row["answer"])

        label = row["label"]
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        pids.append(pid)
        qids.append(qid)
        aids.append(aid)

        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "pid": pids,
            "qid": qids,
            "aid": aids,
            "para": paras,
            "question": questions,
            "answer": answers,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
def parse(jsonl_path, tokenizer, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    rows = [json.loads(row) for row in open(jsonl_path, encoding="utf-8")]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # sentence text
    sentences = []
    # span1
    span1s = []
    # span2
    span2s = []
    # span1 idx
    span1_idxs = []
    # span2 idx
    span2_idxs = []
    # label
    labels = []

    token1_idxs = []
    token2_idxs = []

    bert_tokens = []
    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        index = row["idx"]

        text = row["text"]
        span1_text = row["target"]["span1_text"]
        span2_text = row["target"]["span2_text"]
        span1_index = row["target"]["span1_index"]
        span2_index = row["target"]["span2_index"]

        label = row["label"] if "label" in row else True

        span1_char_index = get_char_index(text, span1_text, span1_index)
        span2_char_index = get_char_index(text, span2_text, span2_index)

        assert span1_char_index is not None, f"Check example {index} in {jsonl_path}"
        assert span2_char_index is not None, f"Check example {index} in {jsonl_path}"

        # Tokenize the text in five chunks: everything before the first span,
        # the first span, the text between the spans, the second span, and
        # everything after it, so each span's wordpiece offsets are known.
        bert_tokens_sub1 = tokenizer.tokenize(
            text[: min(span1_char_index[0], span2_char_index[0])]
        )

        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub2 = tokenizer.tokenize(
                text[span1_char_index[0] : span1_char_index[1]]
            )
            token1_idx = [
                len(bert_tokens_sub1) + 1,
                len(bert_tokens_sub1) + len(bert_tokens_sub2),
            ]
        else:
            bert_tokens_sub2 = tokenizer.tokenize(
                text[span2_char_index[0] : span2_char_index[1]]
            )
            token2_idx = [
                len(bert_tokens_sub1) + 1,
                len(bert_tokens_sub1) + len(bert_tokens_sub2),
            ]

        sub3_st = (
            span1_char_index[1]
            if span1_char_index[0] < span2_char_index[0]
            else span2_char_index[1]
        )
        sub3_ed = (
            span1_char_index[0]
            if span1_char_index[0] > span2_char_index[0]
            else span2_char_index[0]
        )

        bert_tokens_sub3 = tokenizer.tokenize(text[sub3_st:sub3_ed])

        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub4 = tokenizer.tokenize(
                text[span2_char_index[0] : span2_char_index[1]]
            )
            cur_len = (
                len(bert_tokens_sub1) + len(bert_tokens_sub2) + len(bert_tokens_sub3)
            )
            token2_idx = [cur_len + 1, cur_len + len(bert_tokens_sub4)]
        else:
            bert_tokens_sub4 = tokenizer.tokenize(
                text[span1_char_index[0] : span1_char_index[1]]
            )
            cur_len = (
                len(bert_tokens_sub1) + len(bert_tokens_sub2) + len(bert_tokens_sub3)
            )
            token1_idx = [cur_len + 1, cur_len + len(bert_tokens_sub4)]

        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub5 = tokenizer.tokenize(text[span2_char_index[1] :])
        else:
            bert_tokens_sub5 = tokenizer.tokenize(text[span1_char_index[1] :])

        tokens = (
            ["[CLS]"]
            + bert_tokens_sub1
            + bert_tokens_sub2
            + bert_tokens_sub3
            + bert_tokens_sub4
            + bert_tokens_sub5
            + ["[SEP]"]
        )

        if len(tokens) > max_len:
            max_len = len(tokens)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        token_segments = [0] * len(token_ids)
        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        token1_idxs.append(token1_idx)
        token2_idxs.append(token2_idx)

        sentences.append(text)
        span1s.append(span1_text)
        span2s.append(span2_text)
        span1_idxs.append(span1_index)
        span2_idxs.append(span2_index)

        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        bert_tokens.append(tokens)
        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    token1_idxs = torch.from_numpy(np.array(token1_idxs))
    token2_idxs = torch.from_numpy(np.array(token2_idxs))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sentence": sentences,
            "span1": span1s,
            "span2": span2s,
            "span1_idx": span1_idxs,
            "span2_idx": span2_idxs,
            "token1_idx": token1_idxs,
            "token2_idx": token2_idxs,
            "tokens": bert_tokens,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
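# get_char_index is referenced by the parser above but defined elsewhere in
# the repo. A minimal sketch of what it plausibly does (an assumption, not the
# repo's actual implementation): map a span's word-level index in the raw text
# to (start, end) character offsets, searching outward from the stated index
# in case it is slightly off.
def _get_char_index_sketch(text, span_text, span_index):
    """Return (start, end) character offsets of span_text, or None."""
    tokens = text.split()
    # Walk the whitespace tokens, tracking their character offsets
    offsets = []
    pos = 0
    for token in tokens:
        start = text.index(token, pos)
        offsets.append((start, start + len(token)))
        pos = start + len(token)
    span_tokens = span_text.split()
    # Try candidate positions nearest the stated word index first
    for idx in sorted(range(len(tokens)), key=lambda i: abs(i - span_index)):
        window = " ".join(tokens[idx : idx + len(span_tokens)])
        if window.lower().startswith(span_text.lower()):
            start = offsets[idx][0]
            return start, start + len(span_text)
    return None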
def parse_from_rows(rows, tokenizer, max_sequence_length):
    # sentence1 text
    sentence1s = []
    # sentence2 text
    sentence2s = []
    # sentence1 idx
    sentence1_idxs = []
    # sentence2 idx
    sentence2_idxs = []
    # word in common
    words = []
    # pos tag
    poses = []
    # label
    labels = []

    token1_idxs = []
    token2_idxs = []

    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        index = row["idx"]

        sentence1 = row["sentence1"]
        sentence2 = row["sentence2"]
        word = row["word"]
        pos = row["pos"]
        sentence1_idx = row["sentence1_idx"]
        sentence2_idx = row["sentence2_idx"]
        label = row["label"]

        sentence1s.append(sentence1)
        sentence2s.append(sentence2)
        sentence1_idxs.append(sentence1_idx)
        sentence2_idxs.append(sentence2_idx)
        words.append(word)
        poses.append(pos)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sentence1)
        sent2_tokens = tokenizer.tokenize(sentence2)

        # Tokenize the target word as it appears in each sentence
        word_tokens_in_sent1 = tokenizer.tokenize(sentence1.split()[sentence1_idx])
        word_tokens_in_sent2 = tokenizer.tokenize(sentence2.split()[sentence2_idx])

        # Trim the longer sentence, one token at a time, until the pair fits
        while True:
            total_length = len(sent1_tokens) + len(sent2_tokens)
            if total_length > max_len:
                max_len = total_length
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if total_length <= max_sequence_length - 3:
                break
            if len(sent1_tokens) > len(sent2_tokens):
                sent1_tokens.pop()
            else:
                sent2_tokens.pop()

        # Locate the target word's first wordpiece in each tokenized sentence
        for idx in range(sentence1_idx - 1, len(sent1_tokens)):
            if (
                sent1_tokens[idx : idx + len(word_tokens_in_sent1)]
                == word_tokens_in_sent1
            ):
                token1_idxs.append(idx + 1)  # Add [CLS]
                break

        for idx in range(sentence2_idx - 1, len(sent2_tokens)):
            if (
                sent2_tokens[idx : idx + len(word_tokens_in_sent2)]
                == word_tokens_in_sent2
            ):
                token2_idxs.append(
                    idx + len(sent1_tokens) + 2
                )  # Add the length of the first sentence and [CLS] + [SEP]
                break

        # Convert to BERT manner
        tokens = ["[CLS]"] + sent1_tokens + ["[SEP]"]
        token_segments = [0] * len(tokens)
        tokens += sent2_tokens + ["[SEP]"]
        token_segments += [1] * (len(sent2_tokens) + 1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    token1_idxs = torch.from_numpy(np.array(token1_idxs))
    token2_idxs = torch.from_numpy(np.array(token2_idxs))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return MultitaskDataset(
        name="SuperGLUE",
        X_dict={
            "sentence1": sentence1s,
            "sentence2": sentence2s,
            "word": words,
            "pos": poses,
            "sentence1_idx": sentence1_idxs,
            "sentence2_idx": sentence2_idxs,
            "token1_idx": token1_idxs,
            "token2_idx": token2_idxs,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
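# The parsers above store one variable-length LongTensor per example, so a
# collate step has to pad them into rectangular batches before they reach the
# model. A minimal sketch using torch.nn.utils.rnn.pad_sequence (an
# assumption: the repo's own data loader may handle padding differently).
from torch.nn.utils.rnn import pad_sequence


def pad_batch(token_ids, token_masks, token_segments, pad_id=0):
    """Pad lists of 1-D LongTensors to the batch's maximum length."""
    return (
        pad_sequence(token_ids, batch_first=True, padding_value=pad_id),
        pad_sequence(token_masks, batch_first=True, padding_value=0),
        pad_sequence(token_segments, batch_first=True, padding_value=0),
    )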