def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a JSON-lines file of premise/choice records into examples.

    Every line must provide ``idx``, ``premise``, ``choice1``, ``choice2`` and
    ``question``; ``label`` is optional and is converted to ``str`` when present.

    For the ``train`` and ``unlabeled`` sets each example is additionally
    mirrored: the two choices are swapped and the label is flipped. Mirrored
    copies are appended after the originals and carry no ``idx``.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        The parsed examples, followed by their mirrored copies if applicable.
    """
    examples = []

    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)

            label = None
            if 'label' in record:
                label = str(record['label'])

            idx = record['idx']
            premise = record['premise']
            meta = {
                'choice1': record['choice1'],
                'choice2': record['choice2'],
                'question': record['question'],
            }
            examples.append(
                InputExample(guid=f"{set_type}-{idx}", text_a=premise, label=label, meta=meta, idx=idx)
            )

    if set_type in ('train', 'unlabeled'):
        # A label of "0" becomes "1"; anything else (including a missing label) becomes "0".
        mirror_examples = [
            InputExample(
                guid=ex.guid + 'm',
                text_a=ex.text_a,
                label="1" if ex.label == "0" else "0",
                meta={
                    'choice1': ex.meta['choice2'],
                    'choice2': ex.meta['choice1'],
                    'question': ex.meta['question'],
                },
            )
            for ex in examples
        ]
        examples += mirror_examples
        logger.info(f"Added {len(mirror_examples)} mirror examples, total size is {len(examples)}...")

    return examples
def _create_examples(self, path: str, set_type: str, hypothesis_name: str = "hypothesis",
                     premise_name: str = "premise") -> List[InputExample]:
    """Read a JSON-lines file of premise/hypothesis pairs into examples.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.
        hypothesis_name: Key under which each record stores the second text.
        premise_name: Key under which each record stores the first text.

    Returns:
        One example per line, in file order. ``label`` is ``None`` when the
        record has no ``label`` key.
    """
    examples = []

    with open(path, encoding='utf8') as f:
        for line_no, raw_line in enumerate(f):
            record = json.loads(raw_line)

            idx = record['idx']
            if isinstance(idx, str):
                # String ids are turned into ints; non-numeric ones fall back
                # to the zero-based line number.
                try:
                    idx = int(idx)
                except ValueError:
                    idx = line_no

            label = record.get('label')
            premise = record[premise_name]
            hypothesis = record[hypothesis_name]

            examples.append(
                InputExample(guid=f"{set_type}-{idx}", text_a=premise, text_b=hypothesis, label=label, idx=idx)
            )

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a JSON-lines file of passages with nested questions and answers.

    Each line holds one passage (``passage.text``) with a list of questions,
    and each question holds a list of candidate answers. One example is
    produced per (passage, question, answer) triple, with the passage as
    ``text_a``, the question as ``text_b`` and the answer text in ``meta``.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        All answer-level examples in file order. A summary of the example
        count, distinct question indices and label distribution is logged.
    """
    examples = []

    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)
            passage_idx = record['idx']
            passage = record['passage']
            text = passage['text']

            for question_json in passage['questions']:
                question = question_json["question"]
                question_idx = question_json['idx']

                for answer_json in question_json["answers"]:
                    label = None
                    if 'label' in answer_json:
                        label = str(answer_json["label"])
                    answer_idx = answer_json["idx"]

                    meta = {
                        'passage_idx': passage_idx,
                        'question_idx': question_idx,
                        'answer_idx': answer_idx,
                        'answer': answer_json["text"],
                    }
                    examples.append(
                        InputExample(
                            guid=f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}',
                            text_a=text,
                            text_b=question,
                            label=label,
                            meta=meta,
                            idx=[passage_idx, question_idx, answer_idx],
                        )
                    )

    question_indices = {ex.meta['question_idx'] for ex in examples}
    label_distribution = Counter(ex.label for ex in examples)
    logger.info(f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
                f"distribution {list(label_distribution.items())}")
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a JSON-lines file of span-annotated sentences into examples.

    Each line must provide ``idx``, ``text`` and a ``target`` object with
    ``span1_text``, ``span2_text``, ``span1_index`` and ``span2_index``;
    ``label`` is optional and is converted to ``str`` when present.

    The word indices are checked against the whitespace-tokenised text and
    shifted by one position where that makes them match. If the word at
    ``span2_index`` merely starts with ``span2_text`` it is split in two so
    that the span becomes a word of its own. The corrected text and indices
    are stored on the example.

    For the ``train`` set only examples whose label is ``'True'`` are kept.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        The parsed (and, for ``train``, filtered) examples in file order.

    Raises:
        AssertionError: If span2 cannot be aligned with the text.
    """
    examples = []

    with open(path, encoding='utf8') as f:
        for line in f:
            example_json = json.loads(line)
            idx = example_json['idx']
            label = str(example_json['label']) if 'label' in example_json else None
            guid = "%s-%s" % (set_type, idx)
            text_a = example_json['text']
            target = example_json['target']
            meta = {
                'span1_text': target['span1_text'],
                'span2_text': target['span2_text'],
                'span1_index': target['span1_index'],
                'span2_index': target['span2_index'],
            }

            # the indices in the dataset are wrong for some examples, so we manually fix them
            span1_index, span1_text = meta['span1_index'], meta['span1_text']
            span2_index, span2_text = meta['span2_index'], meta['span2_text']
            words_a = text_a.split()
            words_a_lower = text_a.lower().split()
            words_span1_text = span1_text.lower().split()
            span1_len = len(words_span1_text)

            if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                for offset in [-1, +1]:
                    if words_a_lower[span1_index + offset:span1_index + span1_len + offset] == words_span1_text:
                        span1_index += offset

            if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                logger.warning(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
                               f"'{words_span1_text}' at index {span1_index} for '{words_a}'")

            if words_a[span2_index] != span2_text:
                for offset in [-1, +1]:
                    neighbour = span2_index + offset
                    # Only probe positions inside the sentence: a negative index
                    # would silently wrap around to the last word, and an index
                    # past the end would raise IndexError before the prefix-based
                    # repair below gets a chance to run.
                    if 0 <= neighbour < len(words_a) and words_a[neighbour] == span2_text:
                        span2_index += offset

                if words_a[span2_index] != span2_text and words_a[span2_index].startswith(span2_text):
                    # e.g. the span is glued to trailing punctuation: split the
                    # word into the span itself and the remainder.
                    words_a = words_a[:span2_index] \
                        + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] \
                        + words_a[span2_index + 1:]

            assert words_a[span2_index] == span2_text, \
                f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"

            text_a = ' '.join(words_a)
            meta['span1_index'], meta['span2_index'] = span1_index, span2_index

            example = InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx)
            if set_type == 'train' and label != 'True':
                continue
            examples.append(example)

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Build examples from a JSON-lines file of premise/two-choice records.

    Required keys per line are ``idx``, ``premise``, ``choice1``, ``choice2``
    and ``question``. A ``label`` key, if present, is stored as a string.

    When ``set_type`` is ``"train"`` or ``"unlabeled"`` a mirrored copy of every
    example (choices swapped, label flipped, guid suffixed with ``"m"``, no
    ``idx``) is appended after the originals.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        The list of examples, including mirrored copies where applicable.
    """

    def _mirror(ex):
        # Swap the two choices; "0" flips to "1", every other label
        # (including None) becomes "0".
        flipped = "1" if ex.label == "0" else "0"
        swapped = {
            "choice1": ex.meta["choice2"],
            "choice2": ex.meta["choice1"],
            "question": ex.meta["question"],
        }
        return InputExample(guid=ex.guid + "m", text_a=ex.text_a, label=flipped, meta=swapped)

    examples = []
    with open(path, encoding="utf8") as f:
        for raw in f:
            entry = json.loads(raw)
            label = str(entry["label"]) if "label" in entry else None
            idx = entry["idx"]
            guid = f"{set_type}-{idx}"
            text_a = entry["premise"]
            meta = {key: entry[key] for key in ("choice1", "choice2", "question")}
            examples.append(InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx))

    if set_type == "train" or set_type == "unlabeled":
        mirror_examples = [_mirror(ex) for ex in examples]
        examples += mirror_examples
        logger.info(
            f"Added {len(mirror_examples)} mirror examples, total size is {len(examples)}..."
        )

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a two-column CSV file (label, body) into examples.

    In the body, the two-character sequence backslash + ``n`` and any remaining
    backslash are each replaced by a single space.

    Args:
        path: Path of the comma-separated file to read.
        set_type: Name of the split; used as the guid prefix together with the
            zero-based row number.

    Returns:
        One example per CSV row, in file order.

    Raises:
        ValueError: If a row does not have exactly two columns.
    """
    examples = []

    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(path, encoding='utf8') as f:
        reader = csv.reader(f, delimiter=',')
        for idx, row in enumerate(reader):
            label, body = row
            guid = "%s-%s" % (set_type, idx)
            text_a = body.replace('\\n', ' ').replace('\\', ' ')

            example = InputExample(guid=guid, text_a=text_a, label=label)
            examples.append(example)

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a JSON-lines file of passage/question pairs into examples.

    Each line must provide ``idx``, ``passage`` and ``question``; ``label`` is
    optional and is converted to ``str`` when present.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        One example per line, with the passage as ``text_a`` and the question
        as ``text_b``.
    """
    examples = []

    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)
            idx = record['idx']

            label = None
            if 'label' in record:
                label = str(record['label'])

            passage = record['passage']
            question = record['question']
            examples.append(
                InputExample(guid=f"{set_type}-{idx}", text_a=passage, text_b=question, label=label, idx=idx)
            )

    return examples
def _create_examples(lines: List[List[str]], set_type: str) -> List[InputExample]:
    """Convert pre-split rows into examples, skipping the first (header) row.

    For every remaining row, column 0 becomes part of the guid, columns 8 and
    9 become ``text_a`` and ``text_b``, and the last column becomes the label.

    Args:
        lines: Rows of string fields; the first row is ignored.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        One example per row after the first.
    """
    return [
        InputExample(guid=f"{set_type}-{row[0]}", text_a=row[8], text_b=row[9], label=row[-1])
        for position, row in enumerate(lines)
        if position > 0
    ]
def _create_examples(self, path, set_type, max_examples=-1, skip_first=0):
    """Creates examples for the training and dev sets.

    Each non-empty line of the file is split on the separator ``':->'`` and the
    resulting fields are picked via ``MyTaskDataProcessor.LABEL_COLUMN``,
    ``TEXT_A_COLUMN`` and ``TEXT_B_COLUMN`` (a negative ``TEXT_B_COLUMN`` means
    there is no second text).

    Args:
        path: Path of the file to read.
        set_type: Name of the split; used as the guid prefix together with the
            zero-based line number.
        max_examples: Accepted for interface compatibility; currently unused.
        skip_first: Accepted for interface compatibility; currently unused.

    Returns:
        One example per non-empty line, in file order.
    """
    # csv.reader only supports single-character delimiters (it raises TypeError
    # for ':->'), so the multi-character separator is split by hand.
    separator = ':->'
    examples = []

    with open(path, encoding='utf8') as f:
        for idx, line in enumerate(f):
            line = line.rstrip('\r\n')
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            row = line.split(separator)

            guid = "%s-%s" % (set_type, idx)
            label = row[MyTaskDataProcessor.LABEL_COLUMN]
            text_a = row[MyTaskDataProcessor.TEXT_A_COLUMN]
            text_b = row[MyTaskDataProcessor.TEXT_B_COLUMN] if MyTaskDataProcessor.TEXT_B_COLUMN >= 0 else None
            example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            examples.append(example)

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a four-column CSV file (label, title, body, answer) into examples.

    In every text field the two-character sequence backslash + ``n`` and any
    remaining backslash are replaced by a space. Title and body are joined with
    a single space to form ``text_a``; the answer becomes ``text_b``.

    Args:
        path: Path of the comma-separated file to read.
        set_type: Name of the split; used as the guid prefix together with the
            zero-based row number.

    Returns:
        One example per CSV row, in file order.
    """

    def _clean(field):
        # Literal "\n" sequences first, then any leftover backslashes.
        return field.replace('\\n', ' ').replace('\\', ' ')

    examples = []

    with open(path, encoding='utf8') as f:
        for row_no, row in enumerate(csv.reader(f, delimiter=',')):
            label, question_title, question_body, answer = row
            question = ' '.join([_clean(question_title), _clean(question_body)])
            examples.append(
                InputExample(guid=f"{set_type}-{row_no}", text_a=question, text_b=_clean(answer), label=label)
            )

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a JSON-lines file of sentence pairs sharing a word into examples.

    Each line must provide ``idx``, ``sentence1``, ``sentence2`` and ``word``.
    The label is ``"T"`` when the record has a truthy ``label`` and ``"F"``
    otherwise (including when ``label`` is absent).

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        One example per line, with the word stored in ``meta['word']``.
    """
    examples = []

    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)

            idx = record['idx']
            if isinstance(idx, str):
                idx = int(idx)

            if record.get('label'):
                label = "T"
            else:
                label = "F"

            first_sentence = record['sentence1']
            second_sentence = record['sentence2']
            meta = {'word': record['word']}
            examples.append(
                InputExample(
                    guid=f"{set_type}-{idx}",
                    text_a=first_sentence,
                    text_b=second_sentence,
                    label=label,
                    idx=idx,
                    meta=meta,
                )
            )

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a delimited table with pandas and turn each row into an example.

    The table must contain the columns ``prefix``, ``input_text`` and
    ``target_text``; all three are converted to ``str``. ``prefix`` becomes the
    label, ``input_text`` becomes ``text_a`` and ``target_text`` becomes
    ``text_b``. The DataFrame index value of each row is used as ``idx``.

    Args:
        path: Path of the file passed to ``pd.read_table``.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        One example per table row, in row order.
    """
    table = pd.read_table(path)
    examples = []

    for row_idx, record in table.iterrows():
        label = str(record['prefix'])
        source_text = str(record['input_text'])
        target_text = str(record['target_text'])
        examples.append(
            InputExample(
                guid=f"{set_type}-{row_idx}",
                text_a=source_text,
                text_b=target_text,
                label=label,
                idx=row_idx,
            )
        )

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a three-column CSV file (label, headline, body) into examples.

    Every backslash in the headline and in the body is replaced by a space.
    The headline becomes ``text_a`` and the body becomes ``text_b``.

    Args:
        path: Path of the comma-separated file to read.
        set_type: Name of the split; used as the guid prefix together with the
            zero-based row number.

    Returns:
        One example per CSV row, in file order.

    Raises:
        ValueError: If a row does not have exactly three columns.
    """
    examples = []

    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(path, encoding="utf8") as f:
        reader = csv.reader(f, delimiter=",")
        for idx, row in enumerate(reader):
            label, headline, body = row
            guid = "%s-%s" % (set_type, idx)
            text_a = headline.replace("\\", " ")
            text_b = body.replace("\\", " ")

            example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            examples.append(example)

    return examples
def _create_examples(self, path: str) -> List[InputExample]:
    """Read a JSON-lines file of question/comment pairs into examples.

    Each line must provide ``label``, ``id``, ``question``, ``comment`` and
    ``language``. When ``self.language`` is not ``None``, records written in a
    different language are skipped.

    Args:
        path: Path of the JSON-lines file to read.

    Returns:
        The retained examples in file order; the record ``id`` is used as guid.
    """
    examples = []

    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)
            label = record['label']
            guid = record['id']
            question = record['question']
            comment = record['comment']
            language = record['language']

            wanted = self.language is None or language == self.language
            if wanted:
                examples.append(InputExample(guid=guid, text_a=question, text_b=comment, label=label))

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Build examples from a JSON-lines file of passage/question records.

    Required keys per line are ``idx``, ``passage`` and ``question``. A
    ``label`` key, if present, is stored as a string; otherwise the label is
    ``None``.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        One example per line (passage as ``text_a``, question as ``text_b``).
    """

    def _to_example(entry):
        idx = entry["idx"]
        label = str(entry["label"]) if "label" in entry else None
        guid = f"{set_type}-{idx}"
        text_a = entry["passage"]
        text_b = entry["question"]
        return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx)

    with open(path, encoding="utf8") as f:
        return [_to_example(json.loads(raw)) for raw in f]
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Build examples from a JSON-lines file of sentence pairs plus a word.

    Required keys per line are ``idx``, ``sentence1``, ``sentence2`` and
    ``word``. String ids are converted to ``int``. A truthy ``label`` maps to
    ``"T"``; a falsy or missing one maps to ``"F"``.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        One example per line, with ``meta`` holding the word.
    """

    def _to_example(entry):
        idx = entry["idx"]
        if isinstance(idx, str):
            idx = int(idx)
        label = "T" if entry.get("label") else "F"
        guid = f"{set_type}-{idx}"
        text_a = entry["sentence1"]
        text_b = entry["sentence2"]
        meta = {"word": entry["word"]}
        return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx, meta=meta)

    with open(path, encoding="utf8") as f:
        return [_to_example(json.loads(raw)) for raw in f]
def _create_examples_unlabelled(self, path, set_type, max_examples=1, skip_first=0):
    """Creates examples for the unlabelled set.

    The file is parsed as colon-delimited CSV. Column 0 becomes ``text_a``;
    ``text_b`` is taken from ``MyTaskDataProcessor.TEXT_B_COLUMN`` unless that
    constant is negative, in which case it is ``None``. No label is set.

    Args:
        path: Path of the file to read.
        set_type: Name of the split; used as the guid prefix together with the
            zero-based row number.
        max_examples: Accepted but not used by this method.
        skip_first: Accepted but not used by this method.

    Returns:
        One unlabelled example per row, in file order.
    """
    examples = []

    with open(path, encoding="utf8") as f:
        for row_no, fields in enumerate(csv.reader(f, delimiter=":")):
            first_text = fields[0]
            if MyTaskDataProcessor.TEXT_B_COLUMN >= 0:
                second_text = fields[MyTaskDataProcessor.TEXT_B_COLUMN]
            else:
                second_text = None
            examples.append(InputExample(guid=f"{set_type}-{row_no}", text_a=first_text, text_b=second_text))

    return examples
def _create_examples(self, lines: List[List[str]], set_type: str) -> List[InputExample]:
    """Convert premise/hypothesis records into examples.

    Each record is read via the keys ``idx``, ``premise``, ``hypothesis`` and
    ``label``; the label value is used to index the result of
    ``self.get_labels()`` so that the example carries a string label.

    NOTE(review): records are indexed by string keys, so they appear to be
    mappings rather than the annotated ``List[str]`` -- confirm with the caller.
    NOTE(review): the first element of ``lines`` is skipped as if it were a
    header row; if ``lines`` only holds data records this drops one example --
    confirm with the caller.

    Args:
        lines: Sequence of records; the first one is ignored.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        One example per record after the first.
    """
    label_names = self.get_labels()
    examples = []

    for position, record in enumerate(lines):
        if position == 0:
            continue

        guid = f"{set_type}-{record['idx']}"
        premise = record['premise']
        hypothesis = record['hypothesis']
        # need to return string, hf datasets uses int
        label = label_names[record['label']]
        examples.append(InputExample(guid=guid, text_a=premise, text_b=hypothesis, label=label))

    return examples
def _create_examples(
        path,
        set_type,
        seed=42,
        max_train_candidates_per_question: int = 10) -> List[InputExample]:
    """Read a JSON-lines file of passages with entity-based queries.

    Each line provides ``idx``, a ``passage`` (``text`` plus ``entities`` given
    as inclusive ``start``/``end`` character offsets) and a list of ``qas``
    (each with ``query``, ``idx`` and optionally ``answers``).

    For the ``train`` set one example is created per distinct correct answer;
    its candidates are that answer followed by at most
    ``max_train_candidates_per_question - 1`` entities that are not answers,
    randomly selected when there are more. For every other set a single
    example per question is created that carries all entities as candidates
    and all distinct answers. All examples get the label ``"1"``.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.
        seed: Seed of the random generator used to pick candidates.
        max_train_candidates_per_question: Upper bound on the number of
            candidates (including the correct answer) per training example.

    Returns:
        The created examples. A summary of the example count, distinct
        question indices and label distribution is logged.
    """
    examples = []
    entity_shuffler = random.Random(seed)
    max_wrong_candidates = max_train_candidates_per_question - 1

    with open(path, encoding='utf8') as f:
        for raw_line in f:
            example_json = json.loads(raw_line)
            idx = example_json['idx']
            text = example_json['passage']['text']

            # Entity offsets refer to the unmodified passage, so they are
            # resolved before the text is rewritten below.
            entities = list({
                text[entity_json['start']:entity_json['end'] + 1]
                for entity_json in example_json['passage']['entities']
            })

            # we follow the GPT-3 paper wrt @highlight annotations
            text = text.replace("@highlight\n", "- ")

            for question_json in example_json['qas']:
                question = question_json['query']
                question_idx = question_json['idx']
                answers = list({answer_json['text'] for answer_json in question_json.get('answers', [])})

                if set_type != 'train':
                    # create just one example with *all* correct answers and *all* answer candidates
                    meta = {
                        'passage_idx': idx,
                        'question_idx': question_idx,
                        'candidates': entities,
                        'answers': answers,
                    }
                    examples.append(
                        InputExample(
                            guid=f'{set_type}-p{idx}-q{question_idx}',
                            text_a=text,
                            text_b=question,
                            label="1",
                            meta=meta,
                        )
                    )
                    continue

                # create a single example per *correct* answer
                for answer_idx, answer in enumerate(answers):
                    candidates = [ent for ent in entities if ent not in answers]
                    if len(candidates) > max_wrong_candidates:
                        entity_shuffler.shuffle(candidates)
                        candidates = candidates[:max_wrong_candidates]

                    meta = {
                        'passage_idx': idx,
                        'question_idx': question_idx,
                        'candidates': [answer] + candidates,
                        'answers': [answer],
                    }
                    examples.append(
                        InputExample(
                            guid=f'{set_type}-p{idx}-q{question_idx}-a{answer_idx}',
                            text_a=text,
                            text_b=question,
                            label="1",
                            meta=meta,
                            idx=[idx, question_idx, answer_idx],
                        )
                    )

    question_indices = {ex.meta['question_idx'] for ex in examples}
    label_distribution = Counter(ex.label for ex in examples)
    logger.info(
        f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
        f"distribution {list(label_distribution.items())}")
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Build examples from a JSON-lines file of span-annotated sentences.

    Required keys per line are ``idx``, ``text`` and ``target`` (holding
    ``span1_text``, ``span2_text``, ``span1_index``, ``span2_index``). A
    ``label`` key, if present, is stored as a string.

    Word indices that do not line up with the whitespace-tokenised text are
    shifted by one position when that fixes them. A word at ``span2_index``
    that only starts with ``span2_text`` is split so the span stands alone.
    The repaired text and indices are what end up on the example.

    For ``set_type == "train"`` only examples labelled ``"True"`` are returned.

    Args:
        path: Path of the JSON-lines file to read.
        set_type: Name of the split; used as the guid prefix.

    Returns:
        The parsed (and, for ``train``, filtered) examples in file order.

    Raises:
        AssertionError: If span2 cannot be aligned with the text.
    """
    examples = []

    with open(path, encoding="utf8") as f:
        for line in f:
            example_json = json.loads(line)
            idx = example_json["idx"]
            label = str(example_json["label"]) if "label" in example_json else None
            guid = "%s-%s" % (set_type, idx)
            text_a = example_json["text"]
            target = example_json["target"]
            meta = {
                "span1_text": target["span1_text"],
                "span2_text": target["span2_text"],
                "span1_index": target["span1_index"],
                "span2_index": target["span2_index"],
            }

            # the indices in the dataset are wrong for some examples, so we manually fix them
            span1_index, span1_text = meta["span1_index"], meta["span1_text"]
            span2_index, span2_text = meta["span2_index"], meta["span2_text"]
            words_a = text_a.split()
            words_a_lower = text_a.lower().split()
            words_span1_text = span1_text.lower().split()
            span1_len = len(words_span1_text)

            if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                for offset in [-1, +1]:
                    if words_a_lower[span1_index + offset:span1_index + span1_len + offset] == words_span1_text:
                        span1_index += offset

            if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                logger.warning(
                    f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
                    f"'{words_span1_text}' at index {span1_index} for '{words_a}'"
                )

            if words_a[span2_index] != span2_text:
                for offset in [-1, +1]:
                    neighbour = span2_index + offset
                    # Only probe positions inside the sentence: a negative index
                    # would silently wrap around to the last word, and an index
                    # past the end would raise IndexError before the prefix-based
                    # repair below gets a chance to run.
                    if 0 <= neighbour < len(words_a) and words_a[neighbour] == span2_text:
                        span2_index += offset

                if words_a[span2_index] != span2_text and words_a[span2_index].startswith(span2_text):
                    # e.g. the span is glued to trailing punctuation: split the
                    # word into the span itself and the remainder.
                    words_a = (
                        words_a[:span2_index]
                        + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]]
                        + words_a[span2_index + 1:]
                    )

            assert (
                words_a[span2_index] == span2_text
            ), f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"

            text_a = " ".join(words_a)
            meta["span1_index"], meta["span2_index"] = span1_index, span2_index

            example = InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx)
            if set_type == "train" and label != "True":
                continue
            examples.append(example)

    return examples