def _run_corpus(self, corpus: Corpus) -> pd.DataFrame:
    """Score a corpus (and its counterfactual column, when present).

    Returns a DataFrame with the model scores; when ``ignore_unk``
    filters out every sentence, an empty frame with the expected
    columns is returned instead.
    """

    def _selection(column: str):
        # Causal LMs are scored on the final token of the sentence;
        # masked LMs on the position of the mask token in `column`.
        if self.model.is_causal:
            return final_token(column)
        return only_mask_token(self.tokenizer.mask_token, column)

    if self.ignore_unk:
        sen_ids = self._create_non_unk_sen_ids(corpus)
        corpus = corpus.slice(sen_ids)
        if len(corpus) == 0:
            return pd.DataFrame(columns=["scores", "counter_scores"])

    activations = self._calc_final_hidden(corpus, _selection("sen"))

    counter_activations = None
    if "counter_sen" in corpus.fields:
        # Point the corpus at the counterfactual column before extracting.
        corpus.sen_column = "counter_sen"
        counter_activations = self._calc_final_hidden(
            corpus, _selection("counter_sen")
        )

    return self._calc_scores(
        corpus,
        activations,
        counter_activations=counter_activations,
    )
def initialize(self, path: str, header: Optional[List[str]] = None) -> SyntaxEvalCorpora:
    """Create a Corpus for each task file found at ``path``.

    Parameters
    ----------
    path : str
        Path to a single corpus file, or to a directory containing
        multiple corpus files.
    header : List[str], optional
        Column names of the corpus. Defaults to
        ``["sen", "token", "counter_token"]``. Must contain ``sen`` and
        ``token``, plus either ``counter_sen`` or ``counter_token``.

    Returns
    -------
    corpora : Dict[str, Corpus]
        Dictionary mapping a task name (the file name up to its first
        ``.``) to a Corpus.

    Raises
    ------
    FileNotFoundError
        If ``path`` is neither an existing file nor a directory.
    """
    if header is None:
        header = ["sen", "token", "counter_token"]

    assert "sen" in header
    assert "token" in header
    assert "counter_sen" in header or "counter_token" in header

    # Collect the corpus files once so the per-file logic is not duplicated
    # across the directory and single-file branches.
    if os.path.isdir(path):
        file_paths = glob.glob(os.path.join(path, "*"))
    elif os.path.isfile(path):
        file_paths = [path]
    else:
        raise FileNotFoundError("Path to task is not found")

    corpora = {}
    for file_path in file_paths:
        corpus = Corpus.create(file_path, header=header, tokenizer=self.tokenizer)
        # os.path.basename instead of `split("/")` so Windows path
        # separators are handled as well; keep the original first-dot
        # truncation for the task name.
        task_name = os.path.basename(file_path).split(".")[0]
        corpora[task_name] = corpus

    return corpora
def _add_output_classes(corpus: Corpus) -> None:
    """Attach the pronoun pair ("he" / "she") to every example.

    Registers `token` and `counter_token` as raw, non-target fields and
    fills them with the same pronoun pair for each sentence.
    """
    for field_name in ("token", "counter_token"):
        field = RawField()
        field.is_target = False
        corpus.fields[field_name] = field

    for ex in corpus:
        ex.token = "he"
        ex.counter_token = "she"
def _add_output_classes(corpus: Corpus) -> None:
    """
    Attach the target token pair to every example in the corpus.

    NOTE(review): the previous docstring said "correct and incorrect
    verb", but the code assigns the pronoun lists ``["he"]`` /
    ``["she"]`` — confirm which is intended.
    """
    # Raw (untokenized) fields that are metadata, not prediction targets.
    corpus.fields["token"] = RawField()
    corpus.fields["wrong_token"] = RawField()
    corpus.fields["token"].is_target = False
    corpus.fields["wrong_token"].is_target = False

    for ex in corpus:
        # Tokens are wrapped in single-element lists here, unlike the
        # plain-string variant of this helper elsewhere in the codebase.
        setattr(ex, "token", ["he"])
        setattr(ex, "wrong_token", ["she"])
def _create_init_states_from_corpus(
    self,
    init_states_corpus: str,
    tokenizer: PreTrainedTokenizer,
    save_init_states_to: Optional[str] = None,
) -> ActivationDict:
    """Extract initial model states by running the model over a corpus.

    Parameters
    ----------
    init_states_corpus : str
        Path to the corpus whose final-token activations will serve as
        the model's initial states.
    tokenizer : PreTrainedTokenizer
        Tokenizer used to create the corpus. Must not be None.
    save_init_states_to : str, optional
        Directory to which the extracted activations are written.

    Returns
    -------
    init_states : ActivationDict
        The ``hx``/``cx`` activations of the final sentence token, for
        every layer of the model.

    Raises
    ------
    ValueError
        If ``tokenizer`` is None.
    """
    # A bare `assert` is stripped under `python -O`; validate explicitly.
    if tokenizer is None:
        raise ValueError(
            "Tokenizer must be provided when creating init states from corpus"
        )

    corpus: Corpus = Corpus.create(init_states_corpus, tokenizer=tokenizer)

    # Both the hidden (hx) and cell (cx) state of every layer are extracted.
    activation_names: ActivationNames = [
        (layer, name) for layer in range(self.num_layers) for name in ["hx", "cx"]
    ]

    extractor = Extractor(
        self,
        corpus,
        activation_names,
        activations_dir=save_init_states_to,
        selection_func=final_sen_token,
    )

    init_states = extractor.extract().activation_dict

    return init_states
def initialize(self, path: str, subtasks: Optional[List[str]] = None) -> SyntaxEvalCorpora:
    """Create a Corpus per licensing environment from the raw corpus data.

    NOTE(review): this method preprocesses data via ``preproc_warstadt``
    (the Warstadt et al. NPI data); the earlier docstring referenced
    Marvin & Linzen (2018), which looks like a copy-paste leftover —
    confirm the correct citation.

    Parameters
    ----------
    path : str
        Path to the raw corpus data passed to ``preproc_warstadt``.
    subtasks : List[str], optional
        The environments that will be tested. If not provided this will
        default to the full set of environments (``ENVS``).

    Returns
    -------
    corpora : Dict[str, Corpus]
        Dictionary mapping an environment name to a Corpus.
    """
    subtasks: List[str] = subtasks or ENVS

    corpora: SyntaxEvalCorpora = {}

    orig_corpus = preproc_warstadt(path)

    for env in subtasks:
        raw_corpus = create_downstream_corpus(orig_corpus, envs=[env])

        # The first line of the raw corpus is its tab-separated header.
        header = raw_corpus[0].split("\t")
        tokenize_columns = ["sen", "counter_sen"]
        fields = Corpus.create_fields(
            header, tokenize_columns=tokenize_columns, tokenizer=self.tokenizer
        )
        examples = [
            Example.fromlist(line.split("\t"), fields) for line in raw_corpus[1:]
        ]
        corpus = Corpus(examples, fields)

        corpora[env] = corpus

    return corpora
def _create_corpus(self, path: str, condition_slice: slice) -> Corpus:
    """Attach the correct and incorrect verb form to each sentence.

    Sentence pairs occupy consecutive lines of the raw corpus: the even
    line holds the correct form, the odd line the incorrect one, each
    ending in the verb.
    """
    raw_corpus = Corpus.create_raw_corpus(path)

    for idx in range(0, len(raw_corpus), 2):
        correct_words = raw_corpus[idx][0].split()
        incorrect_words = raw_corpus[idx + 1][0].split()
        # Strip the verb off the first sentence and keep both verb forms.
        raw_corpus[idx] = [
            " ".join(correct_words[:-1]),
            correct_words[-1],
            incorrect_words[-1],
        ]

    # Keep only the rewritten (even) lines, restricted to this condition.
    raw_corpus = raw_corpus[::2][condition_slice]

    fields = Corpus.create_fields(
        ["sen", "token", "counter_token"], tokenizer=self.tokenizer
    )
    examples = Corpus.create_examples(raw_corpus, fields)

    return Corpus(examples, fields)
def _create_corpus(
    self,
    items: List[RawItem],
    verb_inflections: Dict[str, str],
    items_per_subtask: Optional[int],
) -> Corpus:
    """Convert raw items into a Corpus, dropping items without an example.

    When ``items_per_subtask`` is given, the corpus is truncated to that
    many examples.
    """
    fields = Corpus.create_fields(
        ["sen", "token", "counter_token"], tokenizer=self.tokenizer
    )

    # _item_to_example may yield None for unusable items; skip those.
    examples: List[Example] = []
    for item in items:
        example = self._item_to_example(item, fields, verb_inflections)
        if example:
            examples.append(example)

    if items_per_subtask is not None:
        examples = examples[:items_per_subtask]

    return Corpus(examples, fields)
def _initialize_subtask(self, subtask: str, corpus_path: str) -> Dict[str, Corpus]:
    """Load a pickled subtask and build one Corpus per condition."""
    corpus_dict: Dict[str, List[Sequence[str]]] = load_pickle(corpus_path)

    # NPI subtasks pair a sentence with a counterfactual sentence; the
    # other subtasks pair two candidate tokens instead.
    if "npi" in subtask:
        header = ["sen", "counter_sen", "token"]
        tokenize_columns = ["sen", "counter_sen"]
    else:
        header = ["sen", "token", "counter_token"]
        tokenize_columns = ["sen"]

    fields = Corpus.create_fields(
        header, tokenize_columns=tokenize_columns, tokenizer=self.tokenizer
    )

    subtask_corpora: Dict[str, Corpus] = {
        condition: Corpus(self._create_examples(subtask, sens, fields), fields)
        for condition, sens in corpus_dict.items()
    }

    return subtask_corpora
def initialize(self, path: str, subtasks: Optional[List[str]] = None) -> SyntaxEvalCorpora:
    """Create a Corpus for every subtask/condition pair.

    NOTE(review): the return value maps subtask -> condition -> Corpus
    (a nested dict built via ``setdefault``), which looks at odds with a
    flat ``SyntaxEvalCorpora`` annotation — confirm the alias covers
    nested corpora.

    Parameters
    ----------
    path : str
        Path to the directory containing the ``{subtask}_{condition}.tsv``
        corpus files.
    subtasks : List[str], optional
        The subtasks that will be tested. If not provided this will
        default to ``["stereo", "unamb"]``.

    Returns
    -------
    corpora : Dict[str, Dict[str, Corpus]]
        Dictionary mapping a subtask to a condition to a Corpus.
    """
    subtasks = subtasks or ["stereo", "unamb"]

    corpora: SyntaxEvalCorpora = {}

    for subtask in subtasks:
        # Conditions presumably encode the gender pattern of each pair
        # (e.g. "FM" = female/male) — confirm against the data files.
        for condition in ["FF", "FM", "MF", "MM"]:
            corpus = Corpus.create(
                os.path.join(path, f"{subtask}_{condition}.tsv"),
                header_from_first_line=True,
                tokenizer=self.tokenizer,
            )
            self._add_output_classes(corpus)

            corpora.setdefault(subtask, {})[condition] = corpus

    return corpora
if not os.path.isdir(results_dir): os.mkdir(results_dir) results = {mn: {} for mn in MODEL_NAMES} for mn in MODEL_NAMES: config_dict["model"]["state_dict"] = os.path.join( model_dir, f"{mn}/40.pt") config_dict["tokenizer"]["path"] = os.path.join( model_dir, f"{mn}/vocab.txt") config_dict["probe"]["save_dir"] = os.path.join(results_dir, mn) envs = [env for env in ENVS if env == mn[:len(env)]] or ENVS tokenizer: PreTrainedTokenizer = create_tokenizer( **config_dict["tokenizer"]) corpus: Corpus = Corpus.create(tokenizer=tokenizer, **config_dict["corpus"]) model: LanguageModel = import_model(**config_dict["model"]) set_init_states(model, tokenizer=tokenizer, **config_dict["init_states"]) if len(envs) == len(ENVS): print(f"Probing {mn} on", envs) results[mn]["probe"] = monotonicity_probe(model, corpus, config_dict["probe"], suppress_print=False) print(results[mn]["probe"]) if "no_npi" not in mn: results[mn]["median_rank"] = median_ranks(