def initialize(
    self, path: str, subtasks: Optional[List[str]] = None
) -> SyntaxEvalCorpora:
    """Performs the initialization for the tasks of
    Marvin & Linzen (2018)

    Arxiv link: https://arxiv.org/pdf/1808.09031.pdf

    Repo: https://github.com/BeckyMarvin/LM_syneval

    Parameters
    ----------
    path : str
        Path to directory containing the Marvin datasets that can be
        found in the github repo.
    subtasks : List[str], optional
        The downstream tasks that will be tested. If not provided this
        will default to the full set of conditions.

    Returns
    -------
    corpora : Dict[str, Corpus]
        Dictionary mapping a subtask to a Corpus.
    """
    subtasks: List[str] = subtasks or ENVS

    corpora: SyntaxEvalCorpora = {}

    orig_corpus = preproc_warstadt(path)

    for env in subtasks:
        raw_corpus = create_downstream_corpus(orig_corpus, envs=[env])

        header = raw_corpus[0].split("\t")
        tokenize_columns = ["sen", "counter_sen"]
        fields = Corpus.create_fields(
            header, tokenize_columns=tokenize_columns, tokenizer=self.tokenizer
        )
        examples = [
            Example.fromlist(line.split("\t"), fields) for line in raw_corpus[1:]
        ]

        corpus = Corpus(examples, fields)
        corpora[env] = corpus

    return corpora
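
# A minimal usage sketch (hypothetical: `task` stands for an instance of the
# class this method belongs to, the data path is illustrative, and it is
# assumed that Corpus exposes its examples via an `examples` attribute):
#
#     corpora = task.initialize("path/to/data")
#     for subtask, corpus in corpora.items():
#         print(subtask, len(corpus.examples))
#
# Leaving `subtasks` unset evaluates the full set of conditions, as described
# in the docstring above.
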
def _create_corpus(self, path: str, condition_slice: slice) -> Corpus:
    """Attach the correct and incorrect verb form to each sentence
    in the corpus.
    """
    raw_corpus = Corpus.create_raw_corpus(path)

    # Sentences come in consecutive pairs: the sentence with the correct verb
    # form is followed by the same sentence with the incorrect form. Strip the
    # final verb off the correct sentence and store both verb forms alongside
    # the remaining sentence prefix.
    for idx in range(0, len(raw_corpus), 2):
        token = raw_corpus[idx][0].split()[-1]
        counter_token = raw_corpus[idx + 1][0].split()[-1]
        sen = " ".join(raw_corpus[idx][0].split()[:-1])
        raw_corpus[idx] = [sen, token, counter_token]

    # Keep only the merged (even-indexed) items, restricted to the requested
    # condition slice.
    raw_corpus = raw_corpus[::2][condition_slice]

    fields = Corpus.create_fields(
        ["sen", "token", "counter_token"], tokenizer=self.tokenizer
    )
    examples = Corpus.create_examples(raw_corpus, fields)

    return Corpus(examples, fields)
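
# A small, self-contained sketch of the pairing logic above, under the
# assumption that the raw corpus alternates a grammatical sentence with its
# ungrammatical counterpart (the sentences below are made up for illustration):
correct = "the boy near the cars greets"
incorrect = "the boy near the cars greet"

sen = " ".join(correct.split()[:-1])    # "the boy near the cars"
token = correct.split()[-1]             # "greets" (correct verb form)
counter_token = incorrect.split()[-1]   # "greet" (incorrect verb form)

assert [sen, token, counter_token] == ["the boy near the cars", "greets", "greet"]
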
def _create_corpus(
    self,
    items: List[RawItem],
    verb_inflections: Dict[str, str],
    items_per_subtask: Optional[int],
) -> Corpus:
    header = ["sen", "token", "counter_token"]
    fields = Corpus.create_fields(header, tokenizer=self.tokenizer)

    # Convert each raw item into an Example; items that cannot be converted
    # yield None and are filtered out afterwards.
    maybe_examples: List[Optional[Example]] = [
        self._item_to_example(item, fields, verb_inflections) for item in items
    ]
    examples: List[Example] = list(filter(None, maybe_examples))

    # Optionally cap the number of items per subtask.
    if items_per_subtask is not None:
        examples = examples[:items_per_subtask]

    corpus = Corpus(examples, fields)

    return corpus
def _initialize_subtask(self, subtask: str, corpus_path: str) -> Dict[str, Corpus]:
    corpus_dict: Dict[str, List[Sequence[str]]] = load_pickle(corpus_path)

    # NPI subtasks compare a full sentence against a counter sentence, so both
    # sentence columns are tokenized; the remaining subtasks compare a correct
    # token against a counter token within a single sentence.
    if "npi" in subtask:
        header = ["sen", "counter_sen", "token"]
        tokenize_columns = ["sen", "counter_sen"]
    else:
        header = ["sen", "token", "counter_token"]
        tokenize_columns = ["sen"]

    fields = Corpus.create_fields(
        header, tokenize_columns=tokenize_columns, tokenizer=self.tokenizer
    )

    # Create a separate Corpus for each condition of the subtask.
    subtask_corpora: Dict[str, Corpus] = {}

    for condition, sens in corpus_dict.items():
        examples = self._create_examples(subtask, sens, fields)

        corpus = Corpus(examples, fields)
        subtask_corpora[condition] = corpus

    return subtask_corpora