def simple_extract(
    model: LanguageModel,
    corpus: Corpus,
    activation_names: ActivationNames,
    activations_dir: Optional[str] = None,
    batch_size: int = BATCH_SIZE,
    selection_func: SelectionFunc = return_all,
    sen_column: Optional[str] = None,
) -> Tuple[ActivationReader, RemoveCallback]:
    """Basic extraction method.

    Parameters
    ----------
    model : LanguageModel
        Language model that inherits from LanguageModel.
    corpus : Corpus
        Corpus containing sentences to be extracted.
    activation_names : List[Tuple[int, str]]
        List of (layer, activation_name) tuples.
    activations_dir : str, optional
        Directory to which activations will be written. If not provided
        the `extract()` method will only return the activations without
        writing them to disk.
    batch_size : int, optional
        Amount of sentences processed per forward step. Higher batch
        size increases extraction speed, but should be done accordingly
        to the amount of available RAM. Defaults to ``BATCH_SIZE``.
    selection_func : SelectionFunc
        Function which determines if activations for a token should be
        extracted or not.
    sen_column : str, optional
        Corpus column that will be tokenized and extracted. Defaults to
        ``corpus.sen_column``.

    Returns
    -------
    activation_reader : ActivationReader
        ActivationReader for the activations that have been extracted.
    remove_activations : RemoveCallback
        Callback function that can be executed at the end of a procedure
        that depends on the extracted activations. Removes all the
        activations that have been extracted. Takes no arguments.
    """
    extractor = Extractor(
        model,
        corpus,
        activation_names,
        activations_dir=activations_dir,
        selection_func=selection_func,
        batch_size=batch_size,
        # Explicit None check: an (unusual) empty-string column name is
        # passed through instead of being silently replaced.
        sen_column=sen_column if sen_column is not None else corpus.sen_column,
    )

    activation_reader = extractor.extract()

    def remove_activations() -> None:
        # No-op when activations were never written to disk.
        if activations_dir is not None:
            shutil.rmtree(activations_dir)

    return activation_reader, remove_activations
def _create_init_states_from_corpus(
    self,
    init_states_corpus: str,
    tokenizer: PreTrainedTokenizer,
    save_init_states_to: Optional[str] = None,
) -> ActivationDict:
    """Extract initial hidden/cell states by running the model on a corpus.

    Extracts the ``hx`` and ``cx`` activations of every layer, selected
    by ``final_sen_token``.

    Parameters
    ----------
    init_states_corpus : str
        Path to the corpus from which the init states are created.
    tokenizer : PreTrainedTokenizer
        Tokenizer used to create the corpus; must not be ``None``.
    save_init_states_to : str, optional
        Directory to which the extracted activations are written. If not
        provided the activations are not saved to disk.

    Returns
    -------
    init_states : ActivationDict
        The ``activation_dict`` of the extracted activations.

    Raises
    ------
    ValueError
        If no tokenizer is provided.
    """
    # An ``assert`` would be stripped under ``python -O``; raise explicitly
    # so the validation always runs.
    if tokenizer is None:
        raise ValueError(
            "Tokenizer must be provided when creating init states from corpus"
        )

    corpus: Corpus = Corpus.create(init_states_corpus, tokenizer=tokenizer)

    # Both hidden (hx) and cell (cx) states, for every layer of the model.
    activation_names: ActivationNames = [
        (layer, name) for layer in range(self.num_layers) for name in ["hx", "cx"]
    ]

    extractor = Extractor(
        self,
        corpus,
        activation_names,
        activations_dir=save_init_states_to,
        selection_func=final_sen_token,
    )
    init_states = extractor.extract().activation_dict

    return init_states
def setUpClass(cls) -> None:
    """Build a tiny corpus, a mock model and an Extractor for the tests."""
    # Ensure the target directory for activations exists.
    os.makedirs(ACTIVATIONS_DIR, exist_ok=True)

    raw_corpus = """The ripe taste improves .\t0 0 1 0 0\tdelicious
The hog crawled .\t0 1 0 0\thairy
Move the vat .\t0 0 1 0\tok"""
    path = os.path.join(ACTIVATIONS_DIR, "corpus.tsv")
    with open(path, "w") as corpus_file:
        corpus_file.write(raw_corpus)

    cls.corpus = import_corpus(
        path, header=["sen", "labels", "quality"], vocab_from_corpus=True
    )
    cls.iterator = create_iterator(cls.corpus, batch_size=1)

    # Flatten the corpus into token/label sequences the mock model will use.
    cls.all_words = [w for item in cls.corpus for w in item.sen]
    cls.all_tokens = [cls.corpus.vocab.stoi[w] for w in cls.all_words]
    cls.all_labels = torch.tensor(
        [label for batch in cls.iterator for label in batch.labels]
    )

    # Dummy activations: each sentence gets a block whose identifier values
    # continue where the previous sentence left off.
    dummy_activations = []
    offset = 0
    for item in cls.corpus:
        sen_len = len(item.sen)
        dummy_activations.append(
            create_sentence_dummy_activations(sen_len, ACTIVATION_DIM, offset)
        )
        offset += sen_len
    cls.all_activations = torch.cat(dummy_activations)

    # Mock model that replays the prepared tokens/activations.
    cls.model = MockLanguageModel(
        num_layers=1,
        hidden_size=ACTIVATION_DIM,
        all_tokens=cls.all_tokens,
        all_activations=cls.all_activations,
    )
    cls.model.set_init_states()

    # Extractor under test.
    cls.extractor = Extractor(
        cls.model, cls.corpus, ACTIVATION_NAMES, activations_dir=ACTIVATIONS_DIR
    )
from diagnnose.config.config_dict import create_config_dict
from diagnnose.corpus import Corpus
from diagnnose.extract import Extractor
from diagnnose.models import LanguageModel, import_model, set_init_states
from diagnnose.tokenizer.create import create_tokenizer


def main() -> None:
    """Run activation extraction as configured via the config dict."""
    config_dict = create_config_dict()

    tokenizer = create_tokenizer(**config_dict["tokenizer"])
    corpus: Corpus = Corpus.create(tokenizer=tokenizer, **config_dict["corpus"])

    model: LanguageModel = import_model(**config_dict["model"])
    set_init_states(model, tokenizer=tokenizer, use_default=True)

    extractor = Extractor(model, corpus, **config_dict["extract"])
    a_reader = extractor.extract()


if __name__ == "__main__":
    main()
from diagnnose.config.config_dict import create_config_dict
from diagnnose.corpus import Corpus
from diagnnose.extract import Extractor
from diagnnose.models import LanguageModel
from diagnnose.models.import_model import import_model
from diagnnose.tokenizer.create import create_tokenizer


def main() -> None:
    """Run activation extraction as configured via the config dict."""
    config_dict = create_config_dict()

    tokenizer = create_tokenizer(**config_dict["tokenizer"])
    corpus: Corpus = Corpus.create(tokenizer=tokenizer, **config_dict["corpus"])
    model: LanguageModel = import_model(config_dict)

    extractor = Extractor(model, corpus, **config_dict["activations"])
    a_reader = extractor.extract()


if __name__ == "__main__":
    main()