def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read COPA-style examples from a jsonl file.

    For the train and unlabeled splits, each example is additionally mirrored:
    the two choices are swapped and the label flipped, doubling the data.
    """
    examples = []
    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)
            idx = record['idx']
            # unlabeled splits carry no 'label' field
            label = str(record['label']) if 'label' in record else None
            examples.append(InputExample(
                guid=f"{set_type}-{idx}",
                text_a=record['premise'],
                label=label,
                meta={
                    'choice1': record['choice1'],
                    'choice2': record['choice2'],
                    'question': record['question'],
                },
                idx=idx,
            ))

    if set_type in ('train', 'unlabeled'):
        # A mirrored example swaps the two choices, so the correct choice index flips.
        mirrors = [
            InputExample(
                guid=ex.guid + 'm',
                text_a=ex.text_a,
                label="1" if ex.label == "0" else "0",
                meta={
                    'choice1': ex.meta['choice2'],
                    'choice2': ex.meta['choice1'],
                    'question': ex.meta['question'],
                },
            )
            for ex in examples
        ]
        examples += mirrors
        logger.info(f"Added {len(mirrors)} mirror examples, total size is {len(examples)}...")

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Build one InputExample per (passage, question, answer) triple from a
    MultiRC-style jsonl file, then log summary statistics."""
    examples = []
    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)
            passage_idx = record['idx']
            text = record['passage']['text']
            for question_json in record['passage']['questions']:
                question = question_json["question"]
                question_idx = question_json['idx']
                for answer_json in question_json["answers"]:
                    # unlabeled data has no 'label' field on the answer
                    label = str(answer_json["label"]) if 'label' in answer_json else None
                    answer_idx = answer_json["idx"]
                    meta = {
                        'passage_idx': passage_idx,
                        'question_idx': question_idx,
                        'answer_idx': answer_idx,
                        'answer': answer_json["text"],
                    }
                    examples.append(InputExample(
                        guid=f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}',
                        text_a=text,
                        text_b=question,
                        label=label,
                        meta=meta,
                        idx=[passage_idx, question_idx, answer_idx],
                    ))

    question_indices = list({example.meta['question_idx'] for example in examples})
    label_distribution = Counter(example.label for example in examples)
    logger.info(f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
                f"distribution {list(label_distribution.items())}")
    return examples
def get_parts(self, example: InputExample) -> FilledPattern:
    """Assemble the prompt parts and block flags for a COPA example.

    Only pattern_id 1 is supported: the two normalized choices come first,
    then the premise, the causal connective ('because' / 'so'), and a
    trainable 'the' token directly before the mask span.
    """
    premise = self.remove_final_punc(self.shortenable(example.text_a))
    choice1 = self.remove_final_punc(self.lowercase_first(example.meta['choice1']))
    choice2 = self.remove_final_punc(self.lowercase_first(example.meta['choice2']))

    question = example.meta['question']
    assert question in ['cause', 'effect']
    # persist the normalized choices so the verbalizer sees the same strings
    example.meta['choice1'], example.meta['choice2'] = choice1, choice2

    # the mask span must cover the longer verbalization of the two choices
    num_masks = max(len(get_verbalization_ids(c, self.wrapper.tokenizer, False))
                    for c in [choice1, choice2])

    if self.pattern_id != 1:
        raise ValueError("unknown pattern_ids.")

    if question == "cause":
        connective = 'because'
    elif question == "effect":
        connective = 'so'
    else:
        raise ValueError("currently not support the kind of questions.")

    string_list_a = [choice1, 'or', choice2, '?', premise, connective, 'the', self.mask * num_masks, '.']
    string_list_b = []
    # only the 'the' before the mask span is a trainable (block-flagged) token
    block_flag_a = [0, 0, 0, 0, 0, 0, 1, 0, 0]
    block_flag_b = []
    assert len(string_list_a) == len(block_flag_a)
    assert len(string_list_b) == len(block_flag_b)
    return string_list_a, string_list_b, block_flag_a, block_flag_b
def _create_examples(self,
                     path: str,
                     set_type: str,
                     hypothesis_name: str = "hypothesis",
                     premise_name: str = "premise") -> List[InputExample]:
    """Read premise/hypothesis pairs from a jsonl file.

    String indices are converted to ints when possible; if conversion fails,
    the zero-based line number is used as the index instead.
    """
    examples = []
    with open(path, encoding='utf8') as f:
        for line_idx, raw_line in enumerate(f):
            record = json.loads(raw_line)
            idx = record['idx']
            if isinstance(idx, str):
                try:
                    idx = int(idx)
                except ValueError:
                    idx = line_idx
            examples.append(InputExample(
                guid=f"{set_type}-{idx}",
                text_a=record[premise_name],
                text_b=record[hypothesis_name],
                label=record.get('label'),
                idx=idx,
            ))
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read WSC examples from a jsonl file, repairing wrong span indices.

    Some dataset entries point span1/span2 at the wrong word position; this
    nudges each index by +/-1 and, for span2, splits a fused token so that
    the indices line up with the whitespace-tokenized text. For the train
    split, only positive ('True') examples are kept.
    """
    examples = []
    with open(path, encoding='utf8') as f:
        for line in f:
            example_json = json.loads(line)
            idx = example_json['idx']
            label = str(example_json['label']) if 'label' in example_json else None
            guid = "%s-%s" % (set_type, idx)
            text_a = example_json['text']
            meta = {
                'span1_text': example_json['target']['span1_text'],
                'span2_text': example_json['target']['span2_text'],
                'span1_index': example_json['target']['span1_index'],
                'span2_index': example_json['target']['span2_index']
            }

            # the indices in the dataset are wrong for some examples, so we manually fix them
            span1_index, span1_text = meta['span1_index'], meta['span1_text']
            span2_index, span2_text = meta['span2_index'], meta['span2_text']
            words_a = text_a.split()
            # span1 is matched case-insensitively; span2 case-sensitively (see below)
            words_a_lower = text_a.lower().split()
            words_span1_text = span1_text.lower().split()
            span1_len = len(words_span1_text)

            # try shifting span1 by one position in either direction
            if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                for offset in [-1, +1]:
                    if words_a_lower[span1_index + offset:span1_index + span1_len + offset] == words_span1_text:
                        span1_index += offset

            if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                logger.warning(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
                               f"'{words_span1_text}' at index {span1_index} for '{words_a}'")

            # same +/-1 repair for span2 (a single word)
            if words_a[span2_index] != span2_text:
                for offset in [-1, +1]:
                    if words_a[span2_index + offset] == span2_text:
                        span2_index += offset

            # if the target word is fused with trailing characters (e.g. punctuation),
            # split the token in two so span2 points at an exact match
            if words_a[span2_index] != span2_text and words_a[span2_index].startswith(span2_text):
                words_a = words_a[:span2_index] \
                          + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] \
                          + words_a[span2_index + 1:]

            assert words_a[span2_index] == span2_text, \
                f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"

            # re-join so text_a reflects any token split performed above
            text_a = ' '.join(words_a)
            meta['span1_index'], meta['span2_index'] = span1_index, span2_index

            example = InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx)
            # training uses only positive examples; note the example is still built first
            if set_type == 'train' and label != 'True':
                continue
            examples.append(example)

    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Load COPA examples from *path*.

    For the train and unlabeled splits, one mirrored example is emitted per
    original: choices swapped, label inverted, guid suffixed with "m".
    """
    examples = []
    with open(path, encoding="utf8") as f:
        for raw_line in f:
            record = json.loads(raw_line)
            has_label = "label" in record
            example = InputExample(
                guid="%s-%s" % (set_type, record["idx"]),
                text_a=record["premise"],
                label=str(record["label"]) if has_label else None,
                meta={
                    "choice1": record["choice1"],
                    "choice2": record["choice2"],
                    "question": record["question"],
                },
                idx=record["idx"],
            )
            examples.append(example)

    if set_type in ("train", "unlabeled"):
        mirror_examples = []
        for source in examples:
            flipped_label = "1" if source.label == "0" else "0"
            swapped_meta = {
                "choice1": source.meta["choice2"],
                "choice2": source.meta["choice1"],
                "question": source.meta["question"],
            }
            mirror_examples.append(
                InputExample(guid=source.guid + "m",
                             text_a=source.text_a,
                             label=flipped_label,
                             meta=swapped_meta))
        examples += mirror_examples
        logger.info(
            f"Added {len(mirror_examples)} mirror examples, total size is {len(examples)}..."
        )
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read (label, body) rows from a comma-separated file, collapsing
    escaped newlines and backslashes in the body into spaces."""
    examples = []
    with open(path) as f:
        for idx, (label, body) in enumerate(csv.reader(f, delimiter=',')):
            cleaned_body = body.replace('\\n', ' ').replace('\\', ' ')
            examples.append(InputExample(guid="%s-%s" % (set_type, idx),
                                         text_a=cleaned_body,
                                         label=label))
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Build passage/question pair examples (BoolQ layout) from a jsonl file."""
    examples = []
    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)
            idx = record['idx']
            examples.append(InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=record['passage'],
                text_b=record['question'],
                # unlabeled splits carry no 'label' field
                label=str(record['label']) if 'label' in record else None,
                idx=idx,
            ))
    return examples
def _create_examples(self, path, set_type, max_examples=-1, skip_first=0):
    """Creates examples for the training and dev sets.

    Each line of the file holds the columns separated by the multi-character
    token ':->'; the column positions come from MyTaskDataProcessor.

    NOTE: ``max_examples`` and ``skip_first`` are currently unused and kept
    only for interface compatibility with the other processors.
    """
    examples = []
    with open(path) as f:
        # BUG FIX: csv.reader only accepts a single-character delimiter, so
        # csv.reader(f, delimiter=':->') raised TypeError at runtime. Split
        # each line on the multi-character separator manually instead.
        for idx, line in enumerate(f):
            row = line.rstrip('\r\n').split(':->')
            guid = "%s-%s" % (set_type, idx)
            label = row[MyTaskDataProcessor.LABEL_COLUMN]
            text_a = row[MyTaskDataProcessor.TEXT_A_COLUMN]
            text_b = row[MyTaskDataProcessor.TEXT_B_COLUMN] if MyTaskDataProcessor.TEXT_B_COLUMN >= 0 else None
            example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            examples.append(example)
    return examples
def _create_examples(lines: List[List[str]], set_type: str) -> List[InputExample]:
    """Convert TSV rows (GLUE/MNLI layout) into InputExamples.

    Row 0 is the header. Columns: 0 = pair id, 8 = premise, 9 = hypothesis,
    last = gold label.
    """
    examples = []
    for row in lines[1:]:  # skip the header row
        examples.append(InputExample(
            guid="%s-%s" % (set_type, row[0]),
            text_a=row[8],
            text_b=row[9],
            label=row[-1],
        ))
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read WiC-style sentence pairs from a jsonl file.

    A truthy 'label' field maps to "T", anything else (including a missing
    field) to "F"; string indices are converted to ints.
    """
    examples = []
    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)
            idx = record['idx']
            if isinstance(idx, str):
                idx = int(idx)
            examples.append(InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=record['sentence1'],
                text_b=record['sentence2'],
                label="T" if record.get('label') else "F",
                idx=idx,
                meta={'word': record['word']},
            ))
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read (label, question title, question body, answer) rows from a CSV
    file; title and body are joined into text_a, the answer becomes text_b."""

    def _clean(text: str) -> str:
        # collapse escaped newlines and stray backslashes into spaces
        return text.replace('\\n', ' ').replace('\\', ' ')

    examples = []
    with open(path, encoding='utf8') as f:
        for idx, row in enumerate(csv.reader(f, delimiter=',')):
            label, question_title, question_body, answer = row
            examples.append(InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=' '.join([_clean(question_title), _clean(question_body)]),
                text_b=_clean(answer),
                label=label,
            ))
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read a tab-separated table with pandas: 'prefix' becomes the label,
    'input_text' text_a and 'target_text' text_b."""
    examples = []
    frame = pd.read_table(path)
    for idx, row in frame.iterrows():
        examples.append(InputExample(
            guid="%s-%s" % (set_type, idx),
            text_a=str(row['input_text']),
            text_b=str(row['target_text']),
            label=str(row['prefix']),
            idx=idx,
        ))
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read (label, headline, body) rows from a CSV file, replacing
    backslashes in both text fields with spaces."""
    examples = []
    with open(path) as f:
        for idx, (label, headline, body) in enumerate(csv.reader(f, delimiter=",")):
            examples.append(InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=headline.replace("\\", " "),
                text_b=body.replace("\\", " "),
                label=label,
            ))
    return examples
def _create_examples(self, path: str) -> List[InputExample]:
    """Read question/comment pairs from a jsonl file.

    When self.language is set, examples in any other language are skipped;
    when it is None, everything is kept.
    """
    examples = []
    with open(path, encoding='utf8') as f:
        for raw_line in f:
            record = json.loads(raw_line)
            if self.language is not None and record['language'] != self.language:
                continue
            examples.append(InputExample(
                guid=record['id'],
                text_a=record['question'],
                text_b=record['comment'],
                label=record['label'],
            ))
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Create passage/question examples (BoolQ layout) from a jsonl file."""
    examples = []
    with open(path, encoding="utf8") as f:
        for raw_line in f:
            record = json.loads(raw_line)
            idx = record["idx"]
            # unlabeled splits carry no 'label' field
            label = None
            if "label" in record:
                label = str(record["label"])
            guid = "%s-%s" % (set_type, idx)
            example = InputExample(guid=guid,
                                   text_a=record["passage"],
                                   text_b=record["question"],
                                   label=label,
                                   idx=idx)
            examples.append(example)
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Create WiC sentence-pair examples; truthy labels map to "T",
    everything else (including a missing field) to "F"."""
    examples = []
    with open(path, encoding="utf8") as f:
        for raw_line in f:
            record = json.loads(raw_line)
            idx = record["idx"]
            idx = int(idx) if isinstance(idx, str) else idx
            example = InputExample(guid="%s-%s" % (set_type, idx),
                                   text_a=record["sentence1"],
                                   text_b=record["sentence2"],
                                   label="T" if record.get("label") else "F",
                                   idx=idx,
                                   meta={"word": record["word"]})
            examples.append(example)
    return examples
def _create_examples_unlabelled(self, path, set_type, max_examples=1, skip_first=0):
    """Creates examples for the unlabelled set.

    Rows are colon-separated; column 0 is text_a and the configured column
    (if any) is text_b. No labels are attached.

    NOTE: ``max_examples`` and ``skip_first`` are currently unused and kept
    only for interface compatibility.
    """
    examples = []
    with open(path, encoding="utf8") as f:
        for idx, row in enumerate(csv.reader(f, delimiter=":")):
            text_b = None
            if MyTaskDataProcessor.TEXT_B_COLUMN >= 0:
                text_b = row[MyTaskDataProcessor.TEXT_B_COLUMN]
            examples.append(InputExample(guid="%s-%s" % (set_type, idx),
                                         text_a=row[0],
                                         text_b=text_b))
    return examples
def _create_examples(self, lines: List[dict], set_type: str) -> List[InputExample]:
    """Convert NLI rows into InputExamples.

    Rows are dict-like (e.g. HuggingFace datasets rows) with 'idx',
    'premise', 'hypothesis' and an integer 'label'; the label is mapped to
    its string form via self.get_labels(). The previous annotation
    ``List[List[str]]`` contradicted the ``line['idx']`` indexing below.

    NOTE(review): the first row is skipped, which matches TSV files with a
    header but would drop a real example for header-less dict rows —
    confirm against the callers before changing.
    """
    examples = []
    id_to_labels = self.get_labels()
    for i, line in enumerate(lines):
        if i == 0:
            continue
        guid = f"{set_type}-{line['idx']}"
        # need to return string labels; hf datasets stores them as ints
        label = id_to_labels[line['label']]
        examples.append(InputExample(guid=guid,
                                     text_a=line['premise'],
                                     text_b=line['hypothesis'],
                                     label=label))
    return examples
def get_parts(self, example: InputExample) -> FilledPattern:
    """Build the prompt parts for a COPA example.

    Pattern 0 quotes the two choices, pattern 1 shows them bare; either way
    the premise is followed by 'because' (cause questions) or ', so' (effect
    questions) and a mask span long enough for the longer choice.
    """
    premise = self.remove_final_punc(self.shortenable(example.text_a))
    choice1 = self.remove_final_punc(self.lowercase_first(example.meta['choice1']))
    choice2 = self.remove_final_punc(self.lowercase_first(example.meta['choice2']))

    question = example.meta['question']
    assert question in ['cause', 'effect']
    # store the normalized choices back so downstream code sees the same strings
    example.meta['choice1'], example.meta['choice2'] = choice1, choice2

    num_masks = max(len(get_verbalization_ids(c, self.wrapper.tokenizer, False))
                    for c in [choice1, choice2])
    joiner = 'because' if question == 'cause' else ', so'
    mask_span = self.mask * num_masks

    if self.pattern_id == 0:
        return ['"', choice1, '" or "', choice2, '"?', premise, joiner, mask_span, '.'], []
    elif self.pattern_id == 1:
        return [choice1, 'or', choice2, '?', premise, joiner, mask_span, '.'], []
def get_parts(self, example: InputExample) -> FilledPattern:
    """Build the COPA prompt: both choices, the premise, and a mask span
    joined by "because" (cause) or ", so" (effect). Pattern 0 quotes the
    choices, pattern 1 does not."""
    premise = self.remove_final_punc(self.shortenable(example.text_a))
    choice1 = self.remove_final_punc(self.lowercase_first(example.meta["choice1"]))
    choice2 = self.remove_final_punc(self.lowercase_first(example.meta["choice2"]))

    question = example.meta["question"]
    assert question in ["cause", "effect"]
    # persist the normalized choices for downstream consumers
    example.meta["choice1"], example.meta["choice2"] = choice1, choice2

    # mask span must cover the longer verbalization of the two choices
    num_masks = max(
        len(get_verbalization_ids(c, self.wrapper.tokenizer, False))
        for c in [choice1, choice2])
    connective = "because" if question == "cause" else ", so"
    mask_span = self.mask * num_masks

    if self.pattern_id == 0:
        parts = ['"', choice1, '" or "', choice2, '"?', premise, connective, mask_span, "."]
        return parts, []
    if self.pattern_id == 1:
        parts = [choice1, "or", choice2, "?", premise, connective, mask_span, "."]
        return parts, []
def _create_examples(
        path,
        set_type,
        seed=42,
        max_train_candidates_per_question: int = 10) -> List[InputExample]:
    """Read ReCoRD examples from a jsonl file.

    For training, one example is created per *correct* answer, paired with up
    to ``max_train_candidates_per_question - 1`` randomly sampled wrong
    candidates; for other splits, a single example carries all candidates and
    all correct answers. Candidate sampling uses a local RNG seeded with
    ``seed`` so example creation is reproducible.
    """
    examples = []
    entity_shuffler = random.Random(seed)
    with open(path, encoding='utf8') as f:
        # NOTE: the loop variable idx is immediately shadowed by the
        # passage's own 'idx' field below; the enumerate counter is unused.
        for idx, line in enumerate(f):
            example_json = json.loads(line)
            idx = example_json['idx']
            text = example_json['passage']['text']

            # collect the unique entity surface forms from their char spans
            entities = set()
            for entity_json in example_json['passage']['entities']:
                start = entity_json['start']
                end = entity_json['end']
                entity = text[start:end + 1]
                entities.add(entity)
            entities = list(entities)

            text = text.replace(
                "@highlight\n", "- "
            )  # we follow the GPT-3 paper wrt @highlight annotations

            questions = example_json['qas']
            for question_json in questions:
                question = question_json['query']
                question_idx = question_json['idx']

                # unique correct answers ('answers' may be absent for test data)
                answers = set()
                for answer_json in question_json.get('answers', []):
                    answer = answer_json['text']
                    answers.add(answer)
                answers = list(answers)

                if set_type == 'train':
                    # create a single example per *correct* answer
                    for answer_idx, answer in enumerate(answers):
                        # wrong candidates are entities that are not correct answers
                        candidates = [
                            ent for ent in entities if ent not in answers
                        ]
                        if len(candidates) > max_train_candidates_per_question - 1:
                            entity_shuffler.shuffle(candidates)
                            candidates = candidates[:max_train_candidates_per_question - 1]

                        guid = f'{set_type}-p{idx}-q{question_idx}-a{answer_idx}'
                        meta = {
                            'passage_idx': idx,
                            'question_idx': question_idx,
                            'candidates': [answer] + candidates,
                            'answers': [answer]
                        }
                        ex_idx = [idx, question_idx, answer_idx]
                        example = InputExample(guid=guid,
                                               text_a=text,
                                               text_b=question,
                                               label="1",
                                               meta=meta,
                                               idx=ex_idx)
                        examples.append(example)
                else:
                    # create just one example with *all* correct answers and *all* answer candidates
                    guid = f'{set_type}-p{idx}-q{question_idx}'
                    meta = {
                        'passage_idx': idx,
                        'question_idx': question_idx,
                        'candidates': entities,
                        'answers': answers
                    }
                    example = InputExample(guid=guid,
                                           text_a=text,
                                           text_b=question,
                                           label="1",
                                           meta=meta)
                    examples.append(example)

    question_indices = list(
        set(example.meta['question_idx'] for example in examples))
    label_distribution = Counter(example.label for example in examples)
    logger.info(
        f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
        f"distribution {list(label_distribution.items())}")
    return examples
def _create_examples(path: str, set_type: str) -> List[InputExample]:
    """Read WSC examples from a jsonl file, repairing wrong span indices.

    Some dataset entries point span1/span2 at the wrong word position; this
    nudges each index by +/-1 and, for span2, splits a fused token so the
    indices line up with the whitespace-tokenized text. For the train split,
    only positive ("True") examples are kept.
    """
    examples = []
    with open(path, encoding="utf8") as f:
        for line in f:
            example_json = json.loads(line)
            idx = example_json["idx"]
            label = str(
                example_json["label"]) if "label" in example_json else None
            guid = "%s-%s" % (set_type, idx)
            text_a = example_json["text"]
            meta = {
                "span1_text": example_json["target"]["span1_text"],
                "span2_text": example_json["target"]["span2_text"],
                "span1_index": example_json["target"]["span1_index"],
                "span2_index": example_json["target"]["span2_index"],
            }

            # the indices in the dataset are wrong for some examples, so we manually fix them
            span1_index, span1_text = meta["span1_index"], meta[
                "span1_text"]
            span2_index, span2_text = meta["span2_index"], meta[
                "span2_text"]
            words_a = text_a.split()
            # span1 is matched case-insensitively; span2 case-sensitively (see below)
            words_a_lower = text_a.lower().split()
            words_span1_text = span1_text.lower().split()
            span1_len = len(words_span1_text)

            # try shifting span1 by one position in either direction
            if words_a_lower[span1_index:span1_index +
                             span1_len] != words_span1_text:
                for offset in [-1, +1]:
                    if words_a_lower[span1_index + offset:span1_index +
                                     span1_len + offset] == words_span1_text:
                        span1_index += offset

            if words_a_lower[span1_index:span1_index +
                             span1_len] != words_span1_text:
                logger.warning(
                    f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
                    f"'{words_span1_text}' at index {span1_index} for '{words_a}'"
                )

            # same +/-1 repair for span2 (a single word)
            if words_a[span2_index] != span2_text:
                for offset in [-1, +1]:
                    if words_a[span2_index + offset] == span2_text:
                        span2_index += offset

            # if the target word is fused with trailing characters (e.g.
            # punctuation), split the token so span2 points at an exact match
            if words_a[span2_index] != span2_text and words_a[
                    span2_index].startswith(span2_text):
                words_a = (words_a[:span2_index] + [
                    words_a[span2_index][:len(span2_text)],
                    words_a[span2_index][len(span2_text):]
                ] + words_a[span2_index + 1:])

            assert (
                words_a[span2_index] == span2_text
            ), f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"

            # re-join so text_a reflects any token split performed above
            text_a = " ".join(words_a)
            meta["span1_index"], meta[
                "span2_index"] = span1_index, span2_index

            example = InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx)
            # training uses only positive examples; the example is still built first
            if set_type == "train" and label != "True":
                continue
            examples.append(example)

    return examples
def generate_ipet_train_sets(
    train_data: List[InputExample],
    unlabeled_data: List[InputExample],
    labels: List[str],
    logits_dir: str,
    output_dir: str,
    reduction: str,
    num_new_examples: int,
    logits_percentage: float,
    n_most_likely: int = -1,
    seed: int = 42,
    local_rank=-1,
):
    """
    Generate training sets for the next generation of iPET models.

    :param train_data: the training examples
    :param unlabeled_data: the unlabeled examples
    :param labels: the list of all possible labels
    :param logits_dir: the directory that contains the predictions of all models in the current generation for the
           unlabeled data.
    :param output_dir: the output directory
    :param reduction: the strategy for merging logits, either 'mean' or 'wmean'. For 'mean', all models contribute
           equally, for 'wmean', each model's contribution is proportional to its accuracy on the training set before
           training.
    :param num_new_examples: the number of new examples to create
    :param logits_percentage: the percentage of models to use for annotating training sets for the next generation
    :param n_most_likely: If >0, in the first generation the n_most_likely examples per label are chosen even
                          if their predicted label is different
    :param seed: the random seed to use
    :param local_rank: distributed-training rank; output files are only written on rank -1 or 0
    """
    # each subdirectory of logits_dir holds one model's results/logits
    subdirs = next(os.walk(logits_dir))[1]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger.info("Found the following {} subdirectories: {}".format(
        len(subdirs), subdirs))

    if train_data:
        # preserve the label distribution of the original training set
        train_examples_per_label = [
            sum(1 for ex in train_data if ex.label == label)
            for label in labels
        ]
        multiplier = num_new_examples / len(train_data)
        examples_per_label = [
            int(epl * multiplier) for epl in train_examples_per_label
        ]
        logger.info(
            f"Example distribution in the original dataset: {train_examples_per_label}"
        )
    else:
        # no training data: split the new examples evenly across labels
        examples_per_label = eq_div(num_new_examples, len(labels))

    logger.info(
        f"Target distribution for the new dataset: {examples_per_label}")

    # clear any stale annotations before re-labeling the unlabeled data
    for example in unlabeled_data:
        example.label, example.logits = None, None

    logits_lists = {}

    rng = random.Random(seed)
    rng_np = np.random.RandomState(seed)

    # load each model's logits (and, for 'wmean', its pre-training train-set score)
    for subdir in subdirs:
        results_file = os.path.join(logits_dir, subdir, "results.txt")
        logits_file = os.path.join(logits_dir, subdir, "logits.txt")
        logits = []

        if not os.path.exists(results_file) or not os.path.exists(logits_file):
            logger.warning(
                f"Skipping subdir '{subdir}' because 'results.txt' or 'logits.txt' not found"
            )
            continue

        if reduction == "mean":
            # equal weighting: every model gets the same score
            result_train = 1
        else:
            with open(results_file, "r") as fh:
                results = ast.literal_eval(fh.read())
                result_train = results["train_set_before_training"]

        with open(logits_file, "r") as fh:
            # one whitespace-separated row of per-label logits per example
            for line in fh.read().splitlines():
                example_logits = [float(x) for x in line.split()]
                logits.append(example_logits)

        logger.info("File {}: Score = {}, #Logits = {}, #Labels = {}".format(
            results_file, result_train, len(logits), len(logits[0])))

        loglist = LogitsList(score=result_train, logits=logits)
        logits_lists[subdir] = loglist

    # build each model's next-generation train set from the *other* models' logits
    for subdir in subdirs:
        other_logits_lists = [
            ll for sd, ll in logits_lists.items() if sd != subdir
        ]
        subdir_train_set = generate_ipet_train_set(
            other_logits_lists,
            labels=labels,
            original_data=unlabeled_data,
            examples_per_label=examples_per_label,
            logits_percentage=logits_percentage,
            reduction=reduction,
            n_most_likely=n_most_likely,
            rng=rng,
            rng_np=rng_np,
        )

        if local_rank in [-1, 0]:
            InputExample.save_examples(
                subdir_train_set,
                os.path.join(output_dir, subdir + "-train.bin"))
def train_pet_ensemble(
    model_config: WrapperConfig,
    train_config: TrainConfig,
    eval_config: EvalConfig,
    pattern_ids: List[Union[str, int]],
    output_dir: str,
    ipet_data_dir: str = None,
    repetitions: int = 3,
    train_data: List[InputExample] = None,
    unlabeled_data: List[InputExample] = None,
    dev_data: List[InputExample] = None,
    test_data: List[InputExample] = None,
    do_train: bool = True,
    do_eval: bool = True,
    save_unlabeled_logits: bool = False,
    seed: int = 42,
    overwrite_dir: bool = False,
    save_model=False,
    local_rank=-1,
):
    """
    Train and evaluate an ensemble of PET models without knowledge distillation.

    :param model_config: the model configuration to use
    :param train_config: the training configuration to use
    :param eval_config: the evaluation configuration to use
    :param pattern_ids: the ids of all PVPs to use
    :param output_dir: the output directory
    :param ipet_data_dir: optional directory containing additional training data for iPET
    :param repetitions: the number of training repetitions
    :param train_data: the training examples to use
    :param unlabeled_data: the unlabeled examples to use
    :param dev_data: the evaluation examples to use
    :param test_data: the test examples to use (evaluated alongside dev_data)
    :param do_train: whether to perform training
    :param do_eval: whether to perform evaluation
    :param save_unlabeled_logits: whether logits for unlabeled examples should be saved in a file ``logits.txt``.
           This is required for both iPET and knowledge distillation.
    :param seed: the random seed to use
    :param overwrite_dir: if True, retrain even when the per-pattern output directory already exists
    :param save_model: if False, trained model weights (*.bin) are deleted after the run
    :param local_rank: distributed-training rank; files are only written on rank -1 or 0
    :return: the overall results returned by ``_write_results`` when evaluating, otherwise None
    """
    # results[split][metric][pattern_id] -> list of scores over repetitions
    results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    set_seed(seed)

    for pattern_id in pattern_ids:
        for iteration in range(repetitions):

            model_config.pattern_id = pattern_id
            results_dict = {}

            shots = 0 if train_data is None else len(train_data)
            pattern_iter_output_dir = "{}/{}shots-{}-i{}-seed{}".format(
                output_dir, shots, pattern_name(pattern_id), iteration, seed)

            # skip finished runs unless overwriting is requested
            if os.path.exists(pattern_iter_output_dir) and not overwrite_dir:
                logger.warning(
                    f"Path {pattern_iter_output_dir} already exists, skipping it..."
                )
                continue

            if not os.path.exists(pattern_iter_output_dir) and local_rank in [
                    -1, 0
            ]:
                os.makedirs(pattern_iter_output_dir)

            wrapper = init_model(model_config)

            # Training
            if do_train:
                if ipet_data_dir:
                    # extra training data produced by the previous iPET generation
                    p = os.path.join(
                        ipet_data_dir,
                        "{}-i{}-train.bin".format(pattern_name(pattern_id),
                                                  iteration))
                    ipet_train_data = InputExample.load_examples(p)
                    for example in ipet_train_data:
                        example.logits = None
                else:
                    ipet_train_data = None

                results_dict.update(
                    train_single_model(
                        wrapper,
                        train_data,
                        train_config,
                        pattern_iter_output_dir,
                        dev_data,
                        eval_config,
                        ipet_train_data=ipet_train_data,
                        unlabeled_data=unlabeled_data,
                        return_train_set_results=False,
                        local_rank=local_rank,
                    ))

                with open(os.path.join(pattern_iter_output_dir, "results.txt"),
                          "w") as fh:
                    fh.write(str(results_dict))

                # only the main process persists configs
                if local_rank in [-1, 0]:
                    logger.info("Saving trained model at {}...".format(
                        pattern_iter_output_dir))
                    train_config.save(
                        os.path.join(pattern_iter_output_dir,
                                     "train_config.json"))
                    eval_config.save(
                        os.path.join(pattern_iter_output_dir,
                                     "eval_config.json"))
                    logger.info("Saving complete")

                    if save_unlabeled_logits:
                        # needed later for iPET / distillation
                        logits = evaluate(wrapper,
                                          unlabeled_data,
                                          eval_config,
                                          local_rank=local_rank)["logits"]
                        save_logits(
                            os.path.join(pattern_iter_output_dir,
                                         "logits.txt"), logits)

                if not do_eval:
                    # free GPU memory before the next repetition
                    wrapper.model = None
                    wrapper = None
                    torch.cuda.empty_cache()

            # Evaluation
            if do_eval:
                logger.info("Starting evaluation...")

                try:
                    # prefer the best checkpoint written during training
                    wrapper = TransformerModelWrapper.from_pretrained(
                        pattern_iter_output_dir)
                except OSError:
                    warnings.warn(
                        "No model found saved, proceeding with current model instead of best"
                    )
                    pass

                for split, eval_data in {
                        "dev": dev_data,
                        "test": test_data
                }.items():
                    if eval_data is None:
                        continue

                    eval_result = evaluate(wrapper,
                                           eval_data,
                                           eval_config,
                                           priming_data=train_data,
                                           local_rank=local_rank)

                    if local_rank in [-1, 0]:
                        save_predictions(
                            os.path.join(pattern_iter_output_dir,
                                         "predictions.jsonl"), wrapper,
                            eval_result)
                        save_logits(
                            os.path.join(pattern_iter_output_dir,
                                         "eval_logits.txt"),
                            eval_result["logits"])

                    scores = eval_result["scores"]
                    logger.info(
                        "--- {} result (pattern_id={}, iteration={}) ---".
                        format(split, pattern_id, iteration))
                    logger.info(scores)

                    results_dict[f"{split}_set_after_training"] = scores
                    with open(
                            os.path.join(pattern_iter_output_dir,
                                         "results.json"), "w") as fh:
                        json.dump(results_dict, fh)

                    for metric, value in scores.items():
                        results[split][metric][pattern_id].append(value)

                # free GPU memory before the next repetition
                wrapper.model = None
                wrapper = None
                torch.cuda.empty_cache()

    if do_eval:
        logger.info("=== OVERALL RESULTS ===")
        results_to_log = _write_results(
            os.path.join(output_dir, "result_test.txt"), results)
    else:
        logger.info("=== ENSEMBLE TRAINING COMPLETE ===")
        results_to_log = None

    # NOTE(review): this cleanup references pattern_iter_output_dir after the
    # loops, so only the *last* pattern/iteration directory is cleaned — and it
    # raises NameError if every directory was skipped. Confirm intent.
    if do_train and not save_model:
        outputs = os.listdir(pattern_iter_output_dir)
        for item in outputs:
            if item.endswith(".bin"):
                os.remove(os.path.join(pattern_iter_output_dir, item))

    return results_to_log
def train_pet_ensemble(model_config: WrapperConfig,
                       train_config: TrainConfig,
                       eval_config: EvalConfig,
                       pattern_ids: List[int],
                       output_dir: str,
                       ipet_data_dir: str = None,
                       repetitions: int = 3,
                       train_data: List[InputExample] = None,
                       unlabeled_data: List[InputExample] = None,
                       eval_data: List[InputExample] = None,
                       do_train: bool = True,
                       do_eval: bool = True,
                       save_unlabeled_logits: bool = False,
                       seed: int = 42):
    """
    Train and evaluate an ensemble of PET models without knowledge distillation.

    :param model_config: the model configuration to use
    :param train_config: the training configuration to use
    :param eval_config: the evaluation configuration to use
    :param pattern_ids: the ids of all PVPs to use
    :param output_dir: the output directory
    :param ipet_data_dir: optional directory containing additional training data for iPET
    :param repetitions: the number of training repetitions
    :param train_data: the training examples to use
    :param unlabeled_data: the unlabeled examples to use
    :param eval_data: the evaluation examples to use
    :param do_train: whether to perform training
    :param do_eval: whether to perform evaluation
    :param save_unlabeled_logits: whether logits for unlabeled examples should be saved in a file ``logits.txt``.
           This is required for both iPET and knowledge distillation.
    :param seed: the random seed to use
    """
    # results[metric][pattern_id] -> list of scores over repetitions
    results = defaultdict(lambda: defaultdict(list))
    set_seed(seed)

    for pattern_id in pattern_ids:
        for iteration in range(repetitions):

            model_config.pattern_id = pattern_id
            results_dict = {}

            pattern_iter_output_dir = "{}/p{}-i{}".format(
                output_dir, pattern_id, iteration)

            # finished runs are never overwritten by this variant
            if os.path.exists(pattern_iter_output_dir):
                logger.warning(
                    f"Path {pattern_iter_output_dir} already exists, skipping it..."
                )
                continue

            if not os.path.exists(pattern_iter_output_dir):
                os.makedirs(pattern_iter_output_dir)

            wrapper = init_model(model_config)

            # Training
            if do_train:
                if ipet_data_dir:
                    # extra training data produced by the previous iPET generation
                    p = os.path.join(
                        ipet_data_dir,
                        'p{}-i{}-train.bin'.format(pattern_id, iteration))
                    ipet_train_data = InputExample.load_examples(p)
                    for example in ipet_train_data:
                        example.logits = None
                else:
                    ipet_train_data = None

                results_dict.update(
                    train_single_model(wrapper,
                                       train_data,
                                       train_config,
                                       eval_config,
                                       ipet_train_data=ipet_train_data,
                                       unlabeled_data=unlabeled_data))

                with open(os.path.join(pattern_iter_output_dir, 'results.txt'),
                          'w') as fh:
                    fh.write(str(results_dict))

                logger.info("Saving trained model at {}...".format(
                    pattern_iter_output_dir))
                wrapper.save(pattern_iter_output_dir)
                train_config.save(
                    os.path.join(pattern_iter_output_dir, 'train_config.json'))
                eval_config.save(
                    os.path.join(pattern_iter_output_dir, 'eval_config.json'))
                logger.info("Saving complete")

                if save_unlabeled_logits:
                    # needed later for iPET / distillation
                    logits = evaluate(wrapper, unlabeled_data,
                                      eval_config)['logits']
                    save_logits(
                        os.path.join(pattern_iter_output_dir, 'logits.txt'),
                        logits)

                if not do_eval:
                    # free GPU memory before the next repetition
                    wrapper.model = None
                    wrapper = None
                    torch.cuda.empty_cache()

            # Evaluation
            if do_eval:
                logger.info("Starting evaluation...")
                # reload from disk when training above released the wrapper
                if not wrapper:
                    wrapper = TransformerModelWrapper.from_pretrained(
                        pattern_iter_output_dir)

                eval_result = evaluate(wrapper,
                                       eval_data,
                                       eval_config,
                                       priming_data=train_data)

                save_predictions(
                    os.path.join(pattern_iter_output_dir, 'predictions.jsonl'),
                    wrapper, eval_result)
                save_logits(
                    os.path.join(pattern_iter_output_dir, 'eval_logits.txt'),
                    eval_result['logits'])

                scores = eval_result['scores']
                logger.info(
                    "--- RESULT (pattern_id={}, iteration={}) ---".format(
                        pattern_id, iteration))
                logger.info(scores)

                results_dict['test_set_after_training'] = scores
                with open(
                        os.path.join(pattern_iter_output_dir, 'results.json'),
                        'w') as fh:
                    json.dump(results_dict, fh)

                for metric, value in scores.items():
                    results[metric][pattern_id].append(value)

                # free GPU memory before the next repetition
                wrapper.model = None
                wrapper = None
                torch.cuda.empty_cache()

    if do_eval:
        logger.info("=== OVERALL RESULTS ===")
        _write_results(os.path.join(output_dir, 'result_test.txt'), results)
    else:
        logger.info("=== ENSEMBLE TRAINING COMPLETE ===")
def main():
    """Entry point for automatic verbalizer search.

    Parses command-line arguments, loads (a subset of) the task's training
    data, computes MLM logits for every requested pattern and then searches
    for the best verbalizer token(s) per label. The resulting verbalizers are
    printed and written to ``<output_dir>/verbalizers.json``.
    """
    parser = argparse.ArgumentParser()

    # required parameters
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory. The verbalizers are written to a file 'verbalizer.json' in this directory.",
    )
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the data files for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="The model type",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " + ", ".join(PROCESSORS.keys()),
    )

    # verbalizer search hyperparameters
    parser.add_argument(
        "--normalize",
        action="store_true",
        help="Whether to normalize the loss as proposed in the paper. It is recommended to set this to 'true'.",
    )
    parser.add_argument(
        "--combine_patterns",
        action="store_true",
        help="If set to true, a single joint verbalizer is searched for all patterns",
    )
    parser.add_argument(
        "--num_candidates",
        default=1000,
        type=int,
        help="The number of candidate tokens to consider as verbalizers (see Section 4.1 of the paper)",
    )
    parser.add_argument(
        "--words_per_label",
        default=10,
        type=int,
        help="The number of verbalizer tokens to assign to each label",
    )
    parser.add_argument(
        "--score_fct",
        default="llr",
        choices=["llr", "ce", "random"],
        help="The function used to score verbalizers. Choices are: the log-likelihood ratio loss proposed in the paper "
        "('llr'), cross-entropy loss ('ce') and 'random', which assigns random tokens to each label.",
    )

    # other optional parameters
    parser.add_argument(
        "--train_examples",
        default=50,
        type=int,
        help="The total number of train examples to use, where -1 equals all examples.",
    )
    parser.add_argument(
        "--pattern_ids",
        default=[0],
        type=int,
        nargs="+",
        help="The ids of the PVPs to be used",
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--words_file",
        default=None,
        type=str,
        help="Path to a file containing (unlabeled) texts from the task's domain. This text is used to compute "
        "verbalization candidates by selecting the most frequent words.",
    )
    parser.add_argument(
        "--max_words",
        default=10000,
        type=int,
        help="Only the 10,000 tokens that occur most frequently in the task’s unlabeled data (see --words_file) are "
        "considered as verbalization candidates",
    )
    parser.add_argument(
        "--additional_input_examples",
        type=str,
        help="An optional path to an additional set of input examples (e.g., obtained using iPET)",
    )
    parser.add_argument("--seed", default=42, type=int, help="random seed for initialization")

    args = parser.parse_args()
    random.seed(args.seed)

    # race-safe: no exists()-then-makedirs window
    os.makedirs(args.output_dir, exist_ok=True)

    with open(os.path.join(args.output_dir, "config.txt"), "w", encoding="utf8") as fh:
        json.dump(args.__dict__, fh, indent=2)

    # setup gpu/cpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    # prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in PROCESSORS:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = PROCESSORS[args.task_name]()
    args.label_list = processor.get_labels()

    # fixed settings expected by the wrapper (verbalizer search always uses MLM)
    args.cache_dir = ""
    args.do_lower_case = False
    args.verbalizer_file = None
    args.wrapper_type = "mlm"

    # get training data
    train_examples_per_label = (eq_div(args.train_examples, len(args.label_list))
                                if args.train_examples != -1 else -1)
    train_data = load_examples(
        args.task_name,
        args.data_dir,
        set_type=TRAIN_SET,
        num_examples_per_label=train_examples_per_label,
    )
    if args.additional_input_examples:
        additional_data = InputExample.load_examples(args.additional_input_examples)
        train_data += additional_data
        # BUGFIX: original message was missing the space between "total" and "training"
        logger.info(
            f"Loaded {len(additional_data)} additional examples from {args.additional_input_examples}, total "
            f"training set size is now {len(train_data)}")

    # one binary indicator vector per label: expected[label][i] == 1 iff example i has that label
    expected = {
        label: np.array([1 if x.label == label else 0 for x in train_data])
        for label in args.label_list
    }

    if args.words_file:
        with open(args.words_file, "r", encoding="utf8") as fh:
            word_counts = Counter(fh.read().split())
    else:
        word_counts = None

    tokenizer_class = MODEL_CLASSES[args.model_type]["tokenizer"]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    word2idx = get_word_to_id_map(tokenizer, word_counts=word_counts, max_words=args.max_words)

    logits = []

    for pattern_id in args.pattern_ids:
        logger.info(f"Processing examples with pattern id {pattern_id}...")
        args.pattern_id = pattern_id

        config = WrapperConfig(
            model_type=args.model_type,
            model_name_or_path=args.model_name_or_path,
            wrapper_type="mlm",
            task_name=args.task_name,
            max_seq_length=args.max_seq_length,
            label_list=args.label_list,
            pattern_id=args.pattern_id,
        )

        wrapper = TransformerModelWrapper(config)
        wrapper.model.to(device)

        # modify all patterns so that they return a single text segment instead of two segments.
        # BUGFIX: bind the original get_parts as a default argument instead of closing over the
        # loop-scoped variable (late-binding pitfall); also call it only once per example.
        def _single_segment_get_parts(example, _orig_get_parts=wrapper.preprocessor.pvp.get_parts):
            parts = _orig_get_parts(example)
            return parts[0] + parts[1], []

        wrapper.preprocessor.pvp.get_parts = _single_segment_get_parts
        wrapper.preprocessor.pvp.convert_mlm_logits_to_cls_logits = lambda mask, x, _=None: x[mask >= 0]

        pattern_logits = wrapper.eval(
            train_data,
            device,
            per_gpu_eval_batch_size=args.per_gpu_eval_batch_size,
            n_gpu=args.n_gpu,
        )["logits"]
        # shift each row so its maximum is 0 (numerical stabilization)
        pattern_logits = pattern_logits - np.expand_dims(np.max(pattern_logits, axis=1), axis=1)
        logits.append(pattern_logits)

    logger.info("Starting verbalizer search...")

    if args.combine_patterns:
        # search once over the logits of all patterns, reuse the result for every pattern
        avs = AutomaticVerbalizerSearch(word2idx, args.label_list, logits, expected)
        verbalizer = avs.find_verbalizer(
            num_candidates=args.num_candidates,
            words_per_label=args.words_per_label,
            normalize=args.normalize,
            score_fct=args.score_fct,
        )
        verbalizers = {pattern_id: verbalizer for pattern_id in args.pattern_ids}
    else:
        # search separately for each pattern
        verbalizers = {}
        for idx, pattern_id in enumerate(args.pattern_ids):
            avs = AutomaticVerbalizerSearch(word2idx, args.label_list, [logits[idx]], expected)
            verbalizers[pattern_id] = avs.find_verbalizer(
                num_candidates=args.num_candidates,
                words_per_label=args.words_per_label,
                normalize=args.normalize,
                score_fct=args.score_fct,
            )

    print(json.dumps(verbalizers, indent=2))
    logger.info("Verbalizer search complete, writing output...")

    with open(os.path.join(args.output_dir, "verbalizers.json"), "w", encoding="utf8") as fh:
        json.dump(verbalizers, fh, indent=2)

    logger.info("Done")