Example 1
    def _create_init_states_from_pickle(self,
                                        pickle_path: str) -> ActivationDict:
        init_states: ActivationDict = load_pickle(pickle_path)

        self._validate_init_states_from_pickle(init_states)

        return init_states
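The helper `load_pickle` itself is not shown in any of these examples. A minimal sketch consistent with how it is called above (plain pickle by default, dill when `use_dill=True`, as in Example 2) could look as follows; the actual implementation in the source repo may differ.

import pickle
from typing import Any


def load_pickle(path: str, use_dill: bool = False) -> Any:
    """Unpickle the object stored at `path`, optionally via dill."""
    with open(path, "rb") as f:
        if use_dill:
            # dill handles objects (lambdas, closures) that the stdlib
            # pickle module refuses to serialize.
            import dill

            return dill.load(f)
        return pickle.load(f)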
Example 2
    def selection_func(self) -> SelectionFunc:
        # Lazily load the selection function the first time it is
        # accessed, then cache it on the instance. dill is needed here
        # because plain pickle cannot serialize lambdas or closures.
        if self._selection_func is None:
            selection_func_path = os.path.join(self.activations_dir,
                                               "selection_func.dill")
            self._selection_func = load_pickle(selection_func_path,
                                               use_dill=True)
        return self._selection_func
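A small round-trip demonstration of why dill is used: a selection function is an arbitrary callable, which the stdlib pickle module cannot serialize. The selection-function signature below is made up for illustration and is not taken from the source.

import dill

# Hypothetical selection function: keep only the last token of a sentence.
selection_func = lambda sen_id, pos, item: pos == len(item.sen) - 1

with open("selection_func.dill", "wb") as f:
    dill.dump(selection_func, f)

with open("selection_func.dill", "rb") as f:
    restored = dill.load(f)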
Example 3
def _create_init_states_from_pickle(model: RecurrentLM,
                                    pickle_path: str) -> ActivationDict:
    init_states: ActivationDict = load_pickle(pickle_path)

    _validate_init_states_from_pickle(model, init_states)

    return init_states
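The validation step is never shown in these examples. Purely as a sketch, assuming `ActivationDict` maps `(layer, name)` tuples to tensors and that `RecurrentLM` exposes a `num_layers` attribute and a per-layer size lookup `nhid(layer)` (all of which are assumptions, not confirmed by the source), the validator could check that `hx` and `cx` are present with the right sizes:

def _validate_init_states_from_pickle(model: RecurrentLM,
                                      init_states: ActivationDict) -> None:
    # Assumed interface: model.num_layers and model.nhid(layer) are
    # hypothetical names used for illustration only.
    for layer in range(model.num_layers):
        for name in ("hx", "cx"):
            assert (layer, name) in init_states, (
                f"Init states are missing `{name}` for layer {layer}"
            )
            size = init_states[layer, name].size(-1)
            assert size == model.nhid(layer), (
                f"Init `{name}` of layer {layer} has size {size}, "
                f"expected {model.nhid(layer)}"
            )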
Example 4
    def set_init_states(
        self,
        pickle_path: Optional[str] = None,
        corpus_path: Optional[str] = None,
        save_init_states_to: Optional[str] = None,
        vocab_path: Optional[str] = None,
    ) -> None:
        """ Set up the initial LM states.

        If no path is provided, zero-valued init states will be used.
        Note that the loaded init states should provide tensors for
        `hx` and `cx` in all layers of the LM.

        Note that `pickle_path` takes precedence over `corpus_path`
        in case both are provided.

        Arguments
        ---------
        pickle_path : str, optional
            Path to pickled file with initial lstm states. If not
            provided zero-valued init states will be created.
        corpus_path : str, optional
            Path to corpus of which the final hidden state will be used
            as initial states.
        save_init_states_to : str, optional
            Path to which the newly computed init_states will be saved.
            If not provided these states won't be dumped.
        vocab_path : str, optional
            Path to the model vocabulary, which should be a file with
            one vocab entry per line. Must be provided when creating
            the init states from a corpus.

        The resulting init states are stored in `self.init_states`.
        """
        if pickle_path is not None:
            print("Loading extracted init states from file")
            init_states: ActivationTensors = load_pickle(pickle_path)
            self._validate(init_states)
        elif corpus_path is not None:
            assert (
                vocab_path is not None
            ), "Vocab path must be provided when creating init states from corpus"
            print("Creating init states from provided corpus")
            init_states = self._create_init_states_from_corpus(
                corpus_path, vocab_path, save_init_states_to)
        else:
            init_states = self.create_zero_state()

        self.init_states = init_states
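To make the precedence rules concrete, here is a hypothetical call; `model` stands in for whatever object owns this method, and all paths are placeholders.

# pickle_path wins even though corpus_path is also provided.
model.set_init_states(
    pickle_path="init_states.pickle",
    corpus_path="init_corpus.txt",  # ignored: pickle_path takes precedence
    vocab_path="vocab.txt",
)

# With no arguments at all, zero-valued init states are created.
model.set_init_states()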
Example 5
    def _initialize_subtask(self, subtask: str, corpus_path: str) -> Dict[str, Corpus]:
        corpus_dict: Dict[str, List[Sequence[str]]] = load_pickle(corpus_path)

        if "npi" in subtask:
            header = ["sen", "counter_sen", "token"]
            tokenize_columns = ["sen", "counter_sen"]
        else:
            header = ["sen", "token", "counter_token"]
            tokenize_columns = ["sen"]

        fields = Corpus.create_fields(
            header, tokenize_columns=tokenize_columns, tokenizer=self.tokenizer
        )
        subtask_corpora: Dict[str, Corpus] = {}

        for condition, sens in corpus_dict.items():
            examples = self._create_examples(subtask, sens, fields)

            corpus = Corpus(examples, fields)

            subtask_corpora[condition] = corpus

        return subtask_corpora
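The shape of the pickled input follows from the type annotation: a mapping from condition names to lists of example tuples, in the column order of the header chosen above. An illustrative (made-up) corpus_dict for a non-NPI subtask, following the ["sen", "token", "counter_token"] header, would be:

corpus_dict = {
    "singular": [
        ("the author laughs", "laughs", "laugh"),
    ],
    "plural": [
        ("the authors laugh", "laugh", "laughs"),
    ],
}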
Example 6
    def activation_ranges(self) -> ActivationRanges:
        # Lazily load the activation ranges the first time they are
        # accessed, then cache them on the instance.
        if self._activation_ranges is None:
            ranges_path = os.path.join(self.activations_dir,
                                       "activation_ranges.pickle")
            self._activation_ranges = load_pickle(ranges_path)
        return self._activation_ranges
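These examples never show what an `ActivationRanges` object contains. Assuming, based on the name alone, that it is a sequence of `(start, stop)` index pairs into one flat activations tensor, one pair per extracted sentence, per-sentence activations could be recovered as below; `reader` and `activations` are placeholder names.

sentence_activations = [
    activations[start:stop] for start, stop in reader.activation_ranges
]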
Example 7
def marvin_init(
    vocab_path: str,
    path: str,
    tasks: Optional[List[str]] = None,
    device: str = "cpu",
    **kwargs: Any,
) -> Dict[str, Dict[str, Any]]:
    """ Performs the initialization for the tasks of
    Marvin & Linzen (2018)

    Arxiv link: https://arxiv.org/pdf/1808.09031.pdf
    Repo: https://github.com/BeckyMarvin/LM_syneval

    Parameters
    ----------
    vocab_path : str
        Path to vocabulary file of the Language Model.
    path : str
        Path to directory containing the Marvin datasets that can be
        found in the github repo.
    tasks : List[str], optional
        The downstream tasks that will be tested. If not provided this
        will default to the full set of tasks found in `path`.
    device : str, optional
        Torch device name on which model will be run. Defaults to cpu.

    Returns
    -------
    init_dict : Dict[str, Dict[str, Any]]
        Dictionary containing the initial task setup.
    """
    all_paths = glob.glob(f"{path}/*.pickle")
    # Rename the comprehension variable so it does not shadow `path`.
    all_tasks = [task_path.split("/")[-1].split(".")[0]
                 for task_path in all_paths]
    task2path = dict(zip(all_tasks, all_paths))

    tasks = tasks or all_tasks

    init_dict: Dict[str, Dict[str, Any]] = {}

    for task in tasks:
        corpus_dict = load_pickle(task2path[task])

        corpora: Dict[str, Corpus] = {}
        iterators: Dict[str, BucketIterator] = {}
        if "npi" in task:
            fields = [
                ("sen", Field(batch_first=True, include_lengths=True)),
                ("wsen", Field(batch_first=True, include_lengths=True)),
                ("postfix", RawField()),
                ("idx", RawField()),
            ]
            fields[2][1].is_target = False
            fields[3][1].is_target = False
        else:
            fields = [
                ("sen", Field(batch_first=True, include_lengths=True)),
                ("postfix", RawField()),
                ("idx", RawField()),
            ]
            fields[1][1].is_target = False
            fields[2][1].is_target = False

        for condition, sens in corpus_dict.items():
            examples = create_examples(task, sens, fields,
                                       condition[:4].lower())
            corpus = Dataset(examples, fields)
            attach_vocab(corpus, vocab_path)
            if "npi" in task:
                batch_size = min(len(sens), 20)
                attach_vocab(corpus, vocab_path, sen_column="wsen")
            else:
                batch_size = len(sens)
            corpora[condition] = corpus
            iterators[condition] = create_iterator(corpus,
                                                   batch_size=batch_size,
                                                   device=device,
                                                   sort=True)

        init_dict[task] = {"corpora": corpora, "iterators": iterators}

    return init_dict
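A hypothetical end-to-end call, with placeholder paths (the task name "subj_rel" is illustrative, not taken from the source):

init_dict = marvin_init(
    vocab_path="vocab.txt",
    path="marvin_data",    # directory containing the *.pickle task files
    tasks=["subj_rel"],    # optional; defaults to every task found in path
)

for task, setup in init_dict.items():
    for condition, iterator in setup["iterators"].items():
        for batch in iterator:
            # `include_lengths=True` makes batch.sen a (tensor, lengths) pair.
            sens, lengths = batch.sen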