Example #1
0
def simple_extract(
    model: LanguageModel,
    corpus: Corpus,
    activation_names: ActivationNames,
    activations_dir: Optional[str] = None,
    batch_size: int = BATCH_SIZE,
    selection_func: SelectionFunc = return_all,
    sen_column: Optional[str] = None,
) -> Tuple[ActivationReader, RemoveCallback]:
    """Extract activations for ``corpus`` and return them with a cleanup hook.

    Thin convenience wrapper around :class:`Extractor`.

    Parameters
    ----------
    model : LanguageModel
        Language model that inherits from LanguageModel.
    corpus : Corpus
        Corpus containing the sentences to extract activations for.
    activation_names : ActivationNames
        List of ``(layer, activation_name)`` tuples.
    activations_dir : str, optional
        Directory the activations are written to. When omitted, the
        activations are only returned in memory and nothing is written
        to disk.
    batch_size : int, optional
        Number of sentences per forward pass. Larger batches speed up
        extraction at the cost of RAM. Defaults to ``BATCH_SIZE``.
    selection_func : SelectionFunc
        Predicate deciding whether a token's activations are extracted.
        Defaults to ``return_all``.
    sen_column : str, optional
        Corpus column to tokenize and extract. Falls back to
        ``corpus.sen_column`` when not provided.

    Returns
    -------
    activation_reader : ActivationReader
        Reader over the freshly extracted activations.
    remove_activations : RemoveCallback
        Zero-argument callback that deletes the on-disk activations
        (a no-op when ``activations_dir`` was not provided).
    """
    # Falsy (None/empty) sen_column falls back to the corpus default.
    column = sen_column or corpus.sen_column

    extractor = Extractor(
        model,
        corpus,
        activation_names,
        activations_dir=activations_dir,
        selection_func=selection_func,
        batch_size=batch_size,
        sen_column=column,
    )

    reader = extractor.extract()

    def remove_activations() -> None:
        # Nothing was written to disk when no directory was given.
        if activations_dir is None:
            return
        shutil.rmtree(activations_dir)

    return reader, remove_activations
    def _create_init_states_from_corpus(
        self,
        init_states_corpus: str,
        tokenizer: PreTrainedTokenizer,
        save_init_states_to: Optional[str] = None,
    ) -> ActivationDict:
        """Extract initial model states from a corpus.

        Runs extraction over ``init_states_corpus`` for the hidden
        (``hx``) and cell (``cx``) activations of every layer, keeping
        only the final token of each sentence.

        Parameters
        ----------
        init_states_corpus : str
            Path to the corpus the init states are created from.
        tokenizer : PreTrainedTokenizer
            Tokenizer used to create the corpus. Must not be ``None``.
        save_init_states_to : str, optional
            Directory the extracted activations are written to. When
            omitted the states are only kept in memory.

        Returns
        -------
        ActivationDict
            The extracted final-token activations.

        Raises
        ------
        ValueError
            If ``tokenizer`` is ``None``.
        """
        # Explicit check instead of `assert`: asserts are stripped when
        # Python runs with -O, which would defer the failure to a more
        # confusing point inside Corpus.create.
        if tokenizer is None:
            raise ValueError(
                "Tokenizer must be provided when creating init states from corpus"
            )

        corpus: Corpus = Corpus.create(init_states_corpus, tokenizer=tokenizer)

        # Both hidden and cell states, for every layer of the model.
        activation_names: ActivationNames = [
            (layer, name) for layer in range(self.num_layers)
            for name in ["hx", "cx"]
        ]

        extractor = Extractor(
            self,
            corpus,
            activation_names,
            activations_dir=save_init_states_to,
            # Only the state at each sentence's final token is kept.
            selection_func=final_sen_token,
        )
        init_states = extractor.extract().activation_dict

        return init_states
    def setUpClass(cls) -> None:
        """Build the shared test fixture: a 3-sentence corpus written to
        disk, a mock language model that replays precomputed dummy
        activations, and an Extractor wired to both.

        NOTE(review): no ``@classmethod`` decorator is visible directly
        above this ``def`` in this view — confirm it is present, as
        ``unittest`` requires it for ``setUpClass``.
        """
        # Create directory if necessary
        if not os.path.exists(ACTIVATIONS_DIR):
            os.makedirs(ACTIVATIONS_DIR)

        # Tab-separated columns per line: sentence, per-token labels, quality.
        # NOTE(review): the continuation lines carry this file's leading
        # indentation inside the literal — presumably stripped downstream
        # by `import_corpus`; verify.
        test_corpus = """The ripe taste improves .\t0 0 1 0 0\tdelicious
        The hog crawled .\t0 1 0 0\thairy
        Move the vat .\t0 0 1 0\tok"""

        corpus_path = os.path.join(ACTIVATIONS_DIR, "corpus.tsv")
        with open(corpus_path, "w") as f:
            f.write(test_corpus)

        # Columns are named explicitly; the vocab is built from the corpus
        # itself so every token is in-vocabulary.
        cls.corpus = import_corpus(corpus_path,
                                   header=["sen", "labels", "quality"],
                                   vocab_from_corpus=True)
        cls.iterator = create_iterator(cls.corpus, batch_size=1)

        # Mock the activations the model produces
        cls.all_words = list(
            itertools.chain(*[item.sen for item in cls.corpus]))
        cls.all_tokens = [cls.corpus.vocab.stoi[w] for w in cls.all_words]
        cls.all_labels = torch.tensor(
            [label for batch in cls.iterator for label in batch.labels])

        # One dummy activation tensor per sentence; `identifier_value` is
        # advanced by sentence length so every token position across the
        # corpus gets a distinguishable value.
        test_sentence_activations = []
        identifier_value = 0
        for example in cls.corpus:
            test_sentence_activations.append(
                create_sentence_dummy_activations(len(example.sen),
                                                  ACTIVATION_DIM,
                                                  identifier_value))
            identifier_value += len(example.sen)

        # Concatenate along the token axis into one corpus-wide tensor.
        cls.all_activations = torch.cat(test_sentence_activations)

        # Prepare Mock Model
        cls.model = MockLanguageModel(
            num_layers=1,
            hidden_size=ACTIVATION_DIM,
            all_tokens=cls.all_tokens,
            all_activations=cls.all_activations,
        )
        cls.model.set_init_states()

        # Init extractor
        cls.extractor = Extractor(cls.model,
                                  cls.corpus,
                                  ACTIVATION_NAMES,
                                  activations_dir=ACTIVATIONS_DIR)
Example #4
0
from diagnnose.config.config_dict import create_config_dict
from diagnnose.corpus import Corpus
from diagnnose.extract import Extractor
from diagnnose.models import LanguageModel, import_model, set_init_states
from diagnnose.tokenizer.create import create_tokenizer

if __name__ == "__main__":
    # Assemble the run configuration (CLI args / config files).
    config_dict = create_config_dict()

    # Tokenizer and model are created independently of each other.
    tokenizer = create_tokenizer(**config_dict["tokenizer"])
    model: LanguageModel = import_model(**config_dict["model"])

    corpus: Corpus = Corpus.create(
        tokenizer=tokenizer,
        **config_dict["corpus"],
    )

    # Use the model's default initial states.
    set_init_states(model, tokenizer=tokenizer, use_default=True)

    extractor = Extractor(model, corpus, **config_dict["extract"])
    a_reader = extractor.extract()
Example #5
0
from diagnnose.config.config_dict import create_config_dict
from diagnnose.corpus import Corpus
from diagnnose.extract import Extractor
from diagnnose.models import LanguageModel
from diagnnose.models.import_model import import_model
from diagnnose.tokenizer.create import create_tokenizer

if __name__ == "__main__":
    # Assemble the run configuration (CLI args / config files).
    config_dict = create_config_dict()

    # The model is imported from the full config; the tokenizer from its
    # own section.
    tokenizer = create_tokenizer(**config_dict["tokenizer"])
    model: LanguageModel = import_model(config_dict)

    corpus: Corpus = Corpus.create(
        tokenizer=tokenizer,
        **config_dict["corpus"],
    )

    extractor = Extractor(model, corpus, **config_dict["activations"])
    a_reader = extractor.extract()