Esempio n. 1
0
    def _run_corpus(self, corpus: Corpus) -> pd.DataFrame:
        """Compute the score DataFrame for a single corpus.

        The selection function is ``final_token`` when
        ``self.model.is_causal`` is truthy and ``only_mask_token`` (using
        the tokenizer's mask token) otherwise. When the corpus has a
        ``"counter_sen"`` field, a second set of activations is computed
        for that column and passed on to ``_calc_scores``.

        Parameters
        ----------
        corpus : Corpus
            Corpus to evaluate. Note that ``corpus.sen_column`` is set to
            ``"counter_sen"`` (and left that way) when the corpus has
            such a field.

        Returns
        -------
        scores_df : pd.DataFrame
            Result of ``_calc_scores``; an empty frame with the columns
            ``scores`` and ``counter_scores`` if ``ignore_unk`` filtering
            leaves no sentences.
        """

        def selection_func_for(column: str):
            # Same branching for the regular and the counter sentence column.
            if self.model.is_causal:
                return final_token(column)
            return only_mask_token(self.tokenizer.mask_token, column)

        sen_selection_func = selection_func_for("sen")

        if self.ignore_unk:
            # Restrict the corpus to the ids returned by the non-unk filter.
            corpus = corpus.slice(self._create_non_unk_sen_ids(corpus))
            if len(corpus) == 0:
                return pd.DataFrame(columns=["scores", "counter_scores"])

        activations = self._calc_final_hidden(corpus, sen_selection_func)

        counter_activations = None
        if "counter_sen" in corpus.fields:
            counter_selection_func = selection_func_for("counter_sen")

            # Switch the active sentence column before the second pass.
            corpus.sen_column = "counter_sen"
            counter_activations = self._calc_final_hidden(
                corpus, counter_selection_func)

        return self._calc_scores(
            corpus,
            activations,
            counter_activations=counter_activations,
        )
Esempio n. 2
0
    def initialize(self,
                   path: str,
                   header: Optional[List[str]] = None) -> SyntaxEvalCorpora:
        """Create one corpus per task file found at ``path``.

        Parameters
        ----------
        path : str
            Either a directory, in which case every entry matched by
            ``<path>/*`` is loaded, or a single corpus file.
        header : List[str], optional
            Column names passed to ``Corpus.create``. Defaults to
            ``["sen", "token", "counter_token"]``. Must contain ``sen``,
            ``token`` and one of ``counter_sen`` / ``counter_token``.

        Returns
        -------
        corpora : Dict[str, Corpus]
            Maps a task name (the file name up to its first ``.``) to the
            corpus created from that file.

        Raises
        ------
        FileNotFoundError
            If ``path`` is neither an existing directory nor a file.
        AssertionError
            If ``header`` lacks one of the required columns.
        """
        if header is None:
            header = ["sen", "token", "counter_token"]

        assert "sen" in header, "header must contain 'sen'"
        assert "token" in header, "header must contain 'token'"
        assert "counter_sen" in header or "counter_token" in header, (
            "header must contain 'counter_sen' or 'counter_token'")

        if os.path.isdir(path):
            # glob returns entries in arbitrary order; sort so the resulting
            # dictionary has a reproducible task order.
            files = sorted(glob.glob(os.path.join(path, "*")))
        elif os.path.isfile(path):
            files = [path]
        else:
            raise FileNotFoundError("Path to task is not found")

        corpora = {}

        for file in files:
            # os.path.basename honours the platform's path separator, unlike
            # splitting on "/" which keeps the directory part on Windows.
            task_name = os.path.basename(file).split(".")[0]
            corpora[task_name] = Corpus.create(file,
                                               header=header,
                                               tokenizer=self.tokenizer)

        return corpora
Esempio n. 3
0
    def _add_output_classes(corpus: Corpus) -> None:
        """Attach fixed pronoun targets to every example of the corpus.

        Registers the non-target raw fields ``token`` and
        ``counter_token`` on the corpus, then sets ``token`` to ``"he"``
        and ``counter_token`` to ``"she"`` on each example. The corpus is
        modified in place.
        """
        for field_name in ("token", "counter_token"):
            corpus.fields[field_name] = RawField()
            corpus.fields[field_name].is_target = False

        for example in corpus:
            example.token = "he"
            example.counter_token = "she"
Esempio n. 4
0
    def _add_output_classes(corpus: Corpus) -> None:
        """Attach fixed pronoun targets, as lists, to every example.

        Registers the non-target raw fields ``token`` and ``wrong_token``
        on the corpus, then sets ``token`` to ``["he"]`` and
        ``wrong_token`` to ``["she"]`` on each example. The corpus is
        modified in place.
        """
        for field_name in ("token", "wrong_token"):
            corpus.fields[field_name] = RawField()
            corpus.fields[field_name].is_target = False

        for example in corpus:
            # A fresh list per example, so examples never share one object.
            example.token = ["he"]
            example.wrong_token = ["she"]
Esempio n. 5
0
    def _create_init_states_from_corpus(
        self,
        init_states_corpus: str,
        tokenizer: PreTrainedTokenizer,
        save_init_states_to: Optional[str] = None,
    ) -> ActivationDict:
        """Extract initial states by running this model over a corpus.

        Parameters
        ----------
        init_states_corpus : str
            Passed to ``Corpus.create`` to build the corpus.
        tokenizer : PreTrainedTokenizer
            Tokenizer used to create the corpus; must not be ``None``.
        save_init_states_to : str, optional
            Forwarded to the ``Extractor`` as ``activations_dir``.

        Returns
        -------
        init_states : ActivationDict
            The ``activation_dict`` of the extraction result, covering
            the ``hx`` and ``cx`` activations of every layer.
        """
        assert (
            tokenizer is not None
        ), "Tokenizer must be provided when creating init states from corpus"

        init_corpus: Corpus = Corpus.create(init_states_corpus,
                                            tokenizer=tokenizer)

        # Layer-major order: (0, "hx"), (0, "cx"), (1, "hx"), ...
        activation_names: ActivationNames = []
        for layer in range(self.num_layers):
            activation_names.append((layer, "hx"))
            activation_names.append((layer, "cx"))

        extractor = Extractor(
            self,
            init_corpus,
            activation_names,
            activations_dir=save_init_states_to,
            selection_func=final_sen_token,
        )

        return extractor.extract().activation_dict
Esempio n. 6
0
    def initialize(self,
                   path: str,
                   subtasks: Optional[List[str]] = None) -> SyntaxEvalCorpora:
        """Build one corpus per subtask from the preprocessed data at ``path``.

        The data is preprocessed once with ``preproc_warstadt``; for every
        subtask a tab-separated downstream corpus is created whose first
        line is the header and whose remaining lines are the examples.

        NOTE(review): an earlier version of this docstring referenced
        Marvin & Linzen (2018), but the code preprocesses with
        ``preproc_warstadt`` -- confirm which dataset is expected.

        Parameters
        ----------
        path : str
            Passed to ``preproc_warstadt``.
        subtasks : List[str], optional
            The downstream tasks that will be tested. If not provided
            (or empty) this defaults to ``ENVS``.

        Returns
        -------
        corpora : Dict[str, Corpus]
            Dictionary mapping a subtask to a Corpus.
        """
        if not subtasks:
            subtasks = ENVS

        orig_corpus = preproc_warstadt(path)

        corpora: SyntaxEvalCorpora = {}

        for env in subtasks:
            raw_lines = create_downstream_corpus(orig_corpus, envs=[env])

            # First line holds the column names, the rest are the items.
            fields = Corpus.create_fields(
                raw_lines[0].split("\t"),
                tokenize_columns=["sen", "counter_sen"],
                tokenizer=self.tokenizer,
            )

            examples = []
            for line in raw_lines[1:]:
                examples.append(Example.fromlist(line.split("\t"), fields))

            corpora[env] = Corpus(examples, fields)

        return corpora
Esempio n. 7
0
    def _create_corpus(self, path: str, condition_slice: slice) -> Corpus:
        """Create a corpus of (sen, token, counter_token) items.

        The raw corpus is read in consecutive pairs of lines. The last
        word of the first line becomes ``token``, the last word of the
        second line becomes ``counter_token``, and the first line without
        its last word becomes ``sen``.

        Parameters
        ----------
        path : str
            Passed to ``Corpus.create_raw_corpus``.
        condition_slice : slice
            Applied to the list of paired items to select the ones that
            end up in the corpus.

        Returns
        -------
        corpus : Corpus
            Corpus built from the selected items.
        """
        raw_lines = Corpus.create_raw_corpus(path)

        paired_items = []
        for first_idx in range(0, len(raw_lines), 2):
            first_words = raw_lines[first_idx][0].split()
            token = first_words[-1]
            counter_token = raw_lines[first_idx + 1][0].split()[-1]
            sen = " ".join(first_words[:-1])
            paired_items.append([sen, token, counter_token])

        selected_items = paired_items[condition_slice]

        fields = Corpus.create_fields(
            ["sen", "token", "counter_token"], tokenizer=self.tokenizer
        )

        return Corpus(Corpus.create_examples(selected_items, fields), fields)
Esempio n. 8
0
    def _create_corpus(
        self,
        items: List[RawItem],
        verb_inflections: Dict[str, str],
        items_per_subtask: Optional[int],
    ) -> Corpus:
        """Turn raw items into a corpus of (sen, token, counter_token).

        Parameters
        ----------
        items : List[RawItem]
            Items converted one by one with ``_item_to_example``.
        verb_inflections : Dict[str, str]
            Forwarded to ``_item_to_example``.
        items_per_subtask : int, optional
            If not ``None``, only the first ``items_per_subtask`` kept
            examples are used.

        Returns
        -------
        corpus : Corpus
            Corpus holding the kept examples.
        """
        fields = Corpus.create_fields(
            ["sen", "token", "counter_token"], tokenizer=self.tokenizer
        )

        examples: List[Example] = []
        for item in items:
            example = self._item_to_example(item, fields, verb_inflections)
            # Falsy conversion results (e.g. ``None``) are dropped.
            if example:
                examples.append(example)

        if items_per_subtask is not None:
            examples = examples[:items_per_subtask]

        return Corpus(examples, fields)
Esempio n. 9
0
    def _initialize_subtask(self, subtask: str, corpus_path: str) -> Dict[str, Corpus]:
        """Load a pickled subtask and build one corpus per condition.

        Parameters
        ----------
        subtask : str
            Name of the subtask. Names containing ``"npi"`` use the
            columns ``sen``, ``counter_sen``, ``token``; all others use
            ``sen``, ``token``, ``counter_token``.
        corpus_path : str
            Passed to ``load_pickle``; the result is iterated as a
            mapping from condition to its sentences.

        Returns
        -------
        subtask_corpora : Dict[str, Corpus]
            Maps each condition to the corpus built from its sentences.
        """
        corpus_dict: Dict[str, List[Sequence[str]]] = load_pickle(corpus_path)

        is_npi = "npi" in subtask
        header = (
            ["sen", "counter_sen", "token"]
            if is_npi
            else ["sen", "token", "counter_token"]
        )
        tokenize_columns = ["sen", "counter_sen"] if is_npi else ["sen"]

        # All conditions of a subtask share the same fields.
        fields = Corpus.create_fields(
            header, tokenize_columns=tokenize_columns, tokenizer=self.tokenizer
        )

        return {
            condition: Corpus(self._create_examples(subtask, sens, fields), fields)
            for condition, sens in corpus_dict.items()
        }
Esempio n. 10
0
    def initialize(self,
                   path: str,
                   subtasks: Optional[List[str]] = None) -> SyntaxEvalCorpora:
        """Load the corpora of every subtask and condition.

        For each subtask and each of the conditions ``FF``, ``FM``,
        ``MF`` and ``MM`` the file ``<path>/<subtask>_<condition>.tsv``
        is loaded (header taken from its first line) and passed through
        ``_add_output_classes``.

        Parameters
        ----------
        path : str
            Directory containing the ``<subtask>_<condition>.tsv`` files.
        subtasks : List[str], optional
            The subtasks that will be loaded. If not provided (or empty)
            this defaults to ``["stereo", "unamb"]``.

        Returns
        -------
        corpora : Dict[str, Dict[str, Corpus]]
            Dictionary mapping a subtask to a dictionary that maps each
            condition to its Corpus.
        """
        if not subtasks:
            subtasks = ["stereo", "unamb"]

        corpora: SyntaxEvalCorpora = {}

        for subtask in subtasks:
            condition_corpora = corpora.setdefault(subtask, {})

            for condition in ("FF", "FM", "MF", "MM"):
                corpus_file = os.path.join(path, f"{subtask}_{condition}.tsv")
                corpus = Corpus.create(
                    corpus_file,
                    header_from_first_line=True,
                    tokenizer=self.tokenizer,
                )

                self._add_output_classes(corpus)

                condition_corpora[condition] = corpus

        return corpora
Esempio n. 11
0
    if not os.path.isdir(results_dir):
        os.mkdir(results_dir)

    results = {mn: {} for mn in MODEL_NAMES}

    for mn in MODEL_NAMES:
        config_dict["model"]["state_dict"] = os.path.join(
            model_dir, f"{mn}/40.pt")
        config_dict["tokenizer"]["path"] = os.path.join(
            model_dir, f"{mn}/vocab.txt")
        config_dict["probe"]["save_dir"] = os.path.join(results_dir, mn)
        envs = [env for env in ENVS if env == mn[:len(env)]] or ENVS

        tokenizer: PreTrainedTokenizer = create_tokenizer(
            **config_dict["tokenizer"])
        corpus: Corpus = Corpus.create(tokenizer=tokenizer,
                                       **config_dict["corpus"])
        model: LanguageModel = import_model(**config_dict["model"])
        set_init_states(model,
                        tokenizer=tokenizer,
                        **config_dict["init_states"])

        if len(envs) == len(ENVS):
            print(f"Probing {mn} on", envs)
            results[mn]["probe"] = monotonicity_probe(model,
                                                      corpus,
                                                      config_dict["probe"],
                                                      suppress_print=False)
            print(results[mn]["probe"])

        if "no_npi" not in mn:
            results[mn]["median_rank"] = median_ranks(