def _create_init_states_from_pickle(self, pickle_path: str) -> ActivationDict:
    init_states: ActivationDict = load_pickle(pickle_path)
    self._validate_init_states_from_pickle(init_states)
    return init_states

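# `load_pickle` is used throughout these snippets but not defined here.
# A minimal sketch of what it plausibly does, assuming the `use_dill`
# flag switches deserialization to dill (which, unlike plain pickle, can
# handle lambdas such as a SelectionFunc). The real helper may differ.
import pickle
from typing import Any

def load_pickle(path: str, use_dill: bool = False) -> Any:
    with open(path, "rb") as f:
        if use_dill:
            import dill

            return dill.load(f)
        return pickle.load(f)
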
def selection_func(self) -> SelectionFunc:
    # Lazily load the selection function and cache it for later access.
    if self._selection_func is None:
        selection_func_path = os.path.join(self.activations_dir, "selection_func.dill")
        self._selection_func = load_pickle(selection_func_path, use_dill=True)
    return self._selection_func

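# Sketch (an assumption, not taken from this module) of how such a
# selection_func.dill file could be produced at extraction time: the
# selection function is dumped with dill because plain pickle cannot
# serialize lambdas or closures. The helper name is hypothetical.
import os
import dill

def dump_selection_func(selection_func, activations_dir: str) -> None:
    with open(os.path.join(activations_dir, "selection_func.dill"), "wb") as f:
        dill.dump(selection_func, f)
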
def _create_init_states_from_pickle(model: RecurrentLM, pickle_path: str) -> ActivationDict:
    init_states: ActivationDict = load_pickle(pickle_path)
    _validate_init_states_from_pickle(model, init_states)
    return init_states

def set_init_states(
    self,
    pickle_path: Optional[str] = None,
    corpus_path: Optional[str] = None,
    save_init_states_to: Optional[str] = None,
    vocab_path: Optional[str] = None,
) -> None:
    """
    Set up the initial LM states. If no path is provided, zero-valued
    initial states will be used.

    Note that the loaded init states should provide tensors for `hx`
    and `cx` in all layers of the LM.

    Note that `pickle_path` takes precedence over `corpus_path` in case
    both are provided.

    Arguments
    ---------
    pickle_path : str, optional
        Path to pickled file with initial LSTM states. If not provided,
        zero-valued init states will be created.
    corpus_path : str, optional
        Path to a corpus of which the final hidden state will be used
        as initial states.
    save_init_states_to : str, optional
        Path to which the newly computed init states will be saved. If
        not provided, these states won't be dumped.
    vocab_path : str, optional
        Path to the model vocabulary, which should be a file containing
        a vocab entry on each line. Must be provided when creating the
        init states from a corpus.
    """
    if pickle_path is not None:
        print("Loading extracted init states from file")
        init_states: ActivationTensors = load_pickle(pickle_path)
        self._validate(init_states)
    elif corpus_path is not None:
        assert (
            vocab_path is not None
        ), "Vocab path must be provided when creating init states from corpus"
        print("Creating init states from provided corpus")
        init_states = self._create_init_states_from_corpus(
            corpus_path, vocab_path, save_init_states_to
        )
    else:
        init_states = self.create_zero_state()

    self.init_states = init_states

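# A minimal sketch of the zero-valued fallback, assuming init states map
# (layer, activation name) pairs to tensors, in line with the docstring's
# requirement of `hx` and `cx` tensors for every layer. The key layout
# and shapes are assumptions, not this module's actual API.
import torch

def create_zero_state_sketch(num_layers: int, hidden_size: int):
    return {
        (layer, name): torch.zeros(1, hidden_size)
        for layer in range(num_layers)
        for name in ("hx", "cx")
    }
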
def _initialize_subtask(self, subtask: str, corpus_path: str) -> Dict[str, Corpus]:
    corpus_dict: Dict[str, List[Sequence[str]]] = load_pickle(corpus_path)

    if "npi" in subtask:
        header = ["sen", "counter_sen", "token"]
        tokenize_columns = ["sen", "counter_sen"]
    else:
        header = ["sen", "token", "counter_token"]
        tokenize_columns = ["sen"]

    fields = Corpus.create_fields(
        header, tokenize_columns=tokenize_columns, tokenizer=self.tokenizer
    )

    subtask_corpora: Dict[str, Corpus] = {}

    for condition, sens in corpus_dict.items():
        examples = self._create_examples(subtask, sens, fields)
        corpus = Corpus(examples, fields)
        subtask_corpora[condition] = corpus

    return subtask_corpora

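# Sketch of the pickle layout this method appears to expect, inferred
# from the Dict[str, List[Sequence[str]]] annotation: each condition
# maps to a list of value tuples matching `header` (here the non-npi
# header). Condition names and sentences are invented placeholders, not
# actual dataset entries.
example_corpus_dict = {
    "sing": [("The author", "laughs", "laugh")],
    "plur": [("The authors", "laugh", "laughs")],
}
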
def activation_ranges(self) -> ActivationRanges:
    # Lazily load the activation ranges and cache them for later access.
    if self._activation_ranges is None:
        ranges_path = os.path.join(self.activations_dir, "activation_ranges.pickle")
        self._activation_ranges = load_pickle(ranges_path)
    return self._activation_ranges

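# Hedged usage sketch: `ActivationRanges` is not defined in these
# snippets; it is assumed here to map a sentence index to the (start,
# stop) slice of that sentence within the concatenated activations. The
# helper name is hypothetical.
def slice_sentence_activations(activations, activation_ranges, sen_id: int):
    start, stop = activation_ranges[sen_id]
    return activations[start:stop]
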
def marvin_init(
    vocab_path: str,
    path: str,
    tasks: Optional[List[str]] = None,
    device: str = "cpu",
    **kwargs: Any,
) -> Dict[str, Dict[str, Any]]:
    """
    Performs the initialization for the tasks of Marvin & Linzen (2018).

    Arxiv link: https://arxiv.org/pdf/1808.09031.pdf

    Repo: https://github.com/BeckyMarvin/LM_syneval

    Parameters
    ----------
    vocab_path : str
        Path to the vocabulary file of the language model.
    path : str
        Path to the directory containing the Marvin datasets that can
        be found in the GitHub repo.
    tasks : List[str], optional
        The downstream tasks that will be tested. If not provided this
        will default to the full set of tasks.
    device : str, optional
        Torch device name on which the model will be run. Defaults to cpu.

    Returns
    -------
    init_dict : Dict[str, Dict[str, Any]]
        Dictionary containing the initial task setup.
    """
    all_paths = glob.glob(f"{path}/*.pickle")
    # `p` avoids shadowing the `path` parameter inside the comprehension.
    all_tasks = [p.split("/")[-1].split(".")[0] for p in all_paths]
    task2path = dict(zip(all_tasks, all_paths))

    tasks = tasks or all_tasks

    init_dict: Dict[str, Dict[str, Any]] = {}

    for task in tasks:
        corpus_dict = load_pickle(task2path[task])

        corpora: Dict[str, Corpus] = {}
        iterators: Dict[str, BucketIterator] = {}

        if "npi" in task:
            fields = [
                ("sen", Field(batch_first=True, include_lengths=True)),
                ("wsen", Field(batch_first=True, include_lengths=True)),
                ("postfix", RawField()),
                ("idx", RawField()),
            ]
            fields[2][1].is_target = False
            fields[3][1].is_target = False
        else:
            fields = [
                ("sen", Field(batch_first=True, include_lengths=True)),
                ("postfix", RawField()),
                ("idx", RawField()),
            ]
            fields[1][1].is_target = False
            fields[2][1].is_target = False

        for condition, sens in corpus_dict.items():
            examples = create_examples(task, sens, fields, condition[:4].lower())

            corpus = Dataset(examples, fields)
            attach_vocab(corpus, vocab_path)

            if "npi" in task:
                batch_size = min(len(sens), 20)
                attach_vocab(corpus, vocab_path, sen_column="wsen")
            else:
                batch_size = len(sens)

            corpora[condition] = corpus
            iterators[condition] = create_iterator(
                corpus, batch_size=batch_size, device=device, sort=True
            )

        init_dict[task] = {"corpora": corpora, "iterators": iterators}

    return init_dict

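# Hypothetical call; the paths are placeholders and the task name is
# only an example of the pickle file stems expected in `path`. The
# returned dict maps each task to its per-condition corpora and batch
# iterators.
init_dict = marvin_init(
    vocab_path="vocab.txt",
    path="marvin_data",
    tasks=["simple_agrmt"],
)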