def _create_examples(self, lines: List[List[str]], type: str):
    """Creates examples for the training and dev sets."""
    examples = [
        InputExample(
            example_id=line[0],
            # in the swag dataset, the common beginning of each
            # choice is stored in "sent2".
            question="",
            contexts=[line[1], line[1], line[1], line[1], line[1]],
            endings=[line[2], line[3], line[4], line[5], line[6]],
            label=line[7],
        )
        for line in lines  # we skip the line with the column names
    ]

    return examples
def windowed_tokenization(example: InputExample,
                          label_map: dict,
                          max_window_length: int,
                          max_length: int,
                          stride: int,
                          no_answer_text: str,
                          tokenizer: PreTrainedTokenizer,
                          text_tokenizer: TextTokenizer,
                          window_fn: Callable = None) -> List[InputExample]:
    """Splits an over-long example into several overlapping-window examples."""
    # ToDo := A different number of windows per example will trigger an error
    # because of different sizes in the input features? Sequences should be
    # grouped by size and chopped/padded accordingly.
    # ToDo := no_answer_text is not used by now, no label corrected
    able_to_correct_label = no_answer_text is not None and no_answer_text != ""
    window_fn = window_fn if window_fn is not None else create_windows
    window_texts = window_fn(example.contexts[0], tokenizer, max_window_length,
                             stride)

    windowed_examples = []
    for win_idx, win_text in enumerate(window_texts):
        # Zero-pad the window index to an even number of digits
        # (two digits for up to 100 windows).
        str_win_idx = str(win_idx)
        if len(str_win_idx) % 2 != 0:
            str_win_idx = '0' + str_win_idx

        if able_to_correct_label and should_correct_label(
                win_text, example.endings, no_answer_text, text_tokenizer):
            label, endings = correct_label(win_text, example.endings.copy(),
                                           no_answer_text, text_tokenizer)
        else:
            label = label_map[example.label]
            endings = example.endings

        # maximum 100 windows
        example_id = int(str(example.example_id) + str_win_idx)
        windowed_examples.append(
            InputExample(
                example_id=example_id,
                question=example.question,
                contexts=[win_text] * len(endings),
                endings=endings,
                label=label,
            ))

    return windowed_examples
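# `create_windows` is used above as the default `window_fn` but is not defined in
# this excerpt. The sketch below is an assumption of its behaviour, not the
# repository's actual implementation: tokenize the context, slide a fixed-size
# token window over it with the given stride, and decode each window back to text.
def create_windows(text: str, tokenizer: PreTrainedTokenizer,
                   max_window_length: int, stride: int) -> List[str]:
    # Encode without special tokens; they are added later when the windowed
    # examples are converted to features.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    window_texts = []
    start = 0
    while start < len(token_ids):
        window_ids = token_ids[start:start + max_window_length]
        window_texts.append(tokenizer.decode(window_ids))
        if start + max_window_length >= len(token_ids):
            break
        start += stride
    return window_texts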
def _create_examples(self, lines: List[List[str]], type: str):
    """Creates examples for the training and dev sets."""
    if type == "train" and lines[0][-1] != "label":
        raise ValueError(
            "For training, the input file must contain a label column.")

    examples = [
        InputExample(
            example_id=line[2],
            # in the swag dataset, the common beginning of each
            # choice is stored in "sent2".
            question=line[5],
            contexts=[line[4], line[4], line[4], line[4]],
            endings=[line[7], line[8], line[9], line[10]],
            label=line[11],
        )
        for line in lines[1:]  # we skip the line with the column names
    ]

    return examples
def _create_examples(self, data, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for data_raw in data['data']:
        article = data_raw['article']
        # Encode ids by default: when batching inputs together with their ids,
        # string tensors are not allowed, so ensure example ids are always numeric.
        for i in range(len(data_raw["answers"])):
            example_id = self._encode_id(data_raw['id'], i)
            truth = str(ord(data_raw["answers"][i]) - ord("A"))
            question = data_raw["questions"][i]
            options = data_raw["options"][i]
            self.nof_labels = len(options)
            examples.append(
                InputExample(
                    example_id=example_id,
                    question=question,
                    contexts=[article] * len(options),
                    endings=options,
                    label=truth,
                ))

    return examples
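# `self._encode_id` is called above (and in the RACE processor below) but is not
# defined in this excerpt. Per the comment, ids must be purely numeric so they can
# be batched as tensors. The helper below is a hypothetical sketch under that
# assumption (hashing the raw string id and appending the two-digit answer index);
# the real implementation may encode ids differently.
def _encode_id(self, raw_id: str, answer_index: int) -> int:
    import hashlib  # local import to keep this sketch self-contained
    # Derive a stable numeric value from the string id, then append the
    # zero-padded answer index so several questions per article stay distinct.
    numeric_part = int(hashlib.md5(raw_id.encode("utf-8")).hexdigest()[:8], 16)
    return int(f"{numeric_part}{answer_index:02d}")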
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for data_raw in lines:
        article = data_raw["article"]
        for i in range(len(data_raw["answers"])):
            race_id = self._encode_id(data_raw['race_id'], i)
            truth = str(ord(data_raw["answers"][i]) - ord("A"))
            question = data_raw["questions"][i]
            options = data_raw["options"][i]
            examples.append(
                InputExample(
                    example_id=race_id,
                    question=question,
                    # repeating the article is not efficient but convenient
                    contexts=[article, article, article, article],
                    endings=[options[0], options[1], options[2], options[3]],
                    label=truth,
                ))

    return examples
def _create_examples(self, lines, type):
    """Creates examples for the training and dev sets."""

    # There are two styles of labels ("A"-"D" and "1"-"4"); normalize both to 0-3.
    def normalize(truth):
        if truth in "ABCD":
            return ord(truth) - ord("A")
        elif truth in "1234":
            return int(truth) - 1
        else:
            logger.info("truth ERROR! %s", str(truth))
            return None

    examples = []
    three_choice = 0
    four_choice = 0
    five_choice = 0
    other_choices = 0
    # we drop examples that have more or fewer than four choices
    for line in tqdm.tqdm(lines, desc="read arc data"):
        data_raw = json.loads(line.strip("\n"))
        if len(data_raw["question"]["choices"]) == 3:
            three_choice += 1
            continue
        elif len(data_raw["question"]["choices"]) == 5:
            five_choice += 1
            continue
        elif len(data_raw["question"]["choices"]) != 4:
            other_choices += 1
            continue
        four_choice += 1
        truth = str(normalize(data_raw["answerKey"]))
        assert truth != "None"
        question_choices = data_raw["question"]
        question = question_choices["stem"]
        example_id = data_raw["id"]
        options = question_choices["choices"]
        if len(options) == 4:
            examples.append(
                InputExample(
                    example_id=example_id,
                    question=question,
                    contexts=[
                        options[0]["para"].replace("_", ""),
                        options[1]["para"].replace("_", ""),
                        options[2]["para"].replace("_", ""),
                        options[3]["para"].replace("_", ""),
                    ],
                    endings=[
                        options[0]["text"], options[1]["text"],
                        options[2]["text"], options[3]["text"]
                    ],
                    label=truth,
                ))

    if type == "train":
        assert len(examples) > 1
        assert examples[0].label is not None
    logger.info("len examples: %s", str(len(examples)))
    logger.info("Three choices: %s", str(three_choice))
    logger.info("Five choices: %s", str(five_choice))
    logger.info("Other choices: %s", str(other_choices))
    logger.info("Four choices: %s", str(four_choice))

    return examples
def window_examples(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    enable_windowing: bool = False,
    stride: int = None,
    no_answer_text: str = None,
    window_fn: Callable = None,
) -> List[InputExample]:
    """Expands over-long examples into overlapping windowed examples."""
    if enable_windowing and stride is None:
        raise ValueError(
            'The windowing mechanism is enabled, but no "stride" was provided. '
            'Please provide one or disable the mechanism altogether with '
            '`enable_windowing=False`.')

    windowed_examples = []
    label_map = {label: i for i, label in enumerate(label_list)}
    text_tokenizer = TextTokenizer()
    # special tokens will be added by the tokenizer, remove them from the count
    windowing_max_length = max_length - 6

    for (ex_index, example) in tqdm.tqdm(enumerate(examples),
                                         desc="windowing examples"):
        if enable_windowing:
            trigger_windowing, max_ending_length = should_window(
                example=example,
                tokenizer=tokenizer,
                max_length=windowing_max_length,
                no_answer_text=no_answer_text,
            )
        else:
            trigger_windowing, max_ending_length = False, None

        if trigger_windowing:
            windowed_examples.extend(
                windowed_tokenization(example=example,
                                      label_map=label_map,
                                      max_window_length=windowing_max_length -
                                      max_ending_length,
                                      max_length=max_length,
                                      stride=stride,
                                      no_answer_text=no_answer_text,
                                      tokenizer=tokenizer,
                                      text_tokenizer=text_tokenizer,
                                      window_fn=window_fn))
        else:
            # Append a "00" window index so ids conform to the rest of the
            # windowed features
            example_id = example.example_id
            if enable_windowing:
                example_id = int(str(example_id) + '00')
            windowed_examples.append(
                InputExample(
                    example_id=example_id,
                    question=example.question,
                    contexts=example.contexts,
                    endings=example.endings,
                    label=example.label,
                ))

    return windowed_examples
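# `should_window` is consumed above but not defined in this excerpt. Judging from
# the call site, it is assumed to report (a) whether the example's context plus its
# longest ending exceeds the token budget and (b) how many tokens the longest
# ending occupies, so the window size can be shrunk by that amount. The signature
# is taken from the call site; the body is a sketch under those assumptions, not
# the actual helper.
from typing import Tuple  # likely already imported at module level with List/Callable

def should_window(example: InputExample,
                  tokenizer: PreTrainedTokenizer,
                  max_length: int,
                  no_answer_text: str = None) -> Tuple[bool, int]:
    endings = list(example.endings)
    if no_answer_text:
        # a no-answer option may be introduced during windowing, so budget for it too
        endings.append(no_answer_text)
    context_length = len(
        tokenizer.encode(example.contexts[0], add_special_tokens=False))
    max_ending_length = max(
        len(tokenizer.encode(ending, add_special_tokens=False))
        for ending in endings)
    return context_length + max_ending_length > max_length, max_ending_length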