def simple_extract(
    model: LanguageModel,
    activations_dir: str,
    corpus: Corpus,
    activation_names: ActivationNames,
    selection_func: SelectFunc = lambda sen_id, pos, item: True,
) -> None:
    """ Run a plain, non-dynamic extraction over ``corpus``.

    An ``Extractor`` is built from ``model``, ``corpus``,
    ``activations_dir`` and ``activation_names`` and its ``extract``
    method is invoked once with the module-level ``BATCH_SIZE`` and
    dynamic dumping switched off.

    Parameters
    ----------
    model : LanguageModel
        Model handed to the ``Extractor``.
    activations_dir : str
        Directory argument forwarded to the ``Extractor``.
    corpus : Corpus
        Corpus handed to the ``Extractor``.
    activation_names : ActivationNames
        Activation names forwarded to the ``Extractor``.
    selection_func : SelectFunc, optional
        Callable taking ``(sen_id, pos, item)``; forwarded unchanged to
        ``Extractor.extract``. The default returns ``True`` for every
        input.
    """
    extract_options = {
        "batch_size": BATCH_SIZE,
        "dynamic_dumping": False,
        "selection_func": selection_func,
    }
    Extractor(model, corpus, activations_dir, activation_names).extract(
        **extract_options
    )
def _create_init_states_from_corpus(
    self,
    init_states_corpus: str,
    vocab_path: str,
    save_init_states_to: Optional[str],
) -> ActivationTensors:
    """ Derive initial states by running an extraction over a corpus.

    The corpus is imported from ``init_states_corpus`` using
    ``vocab_path``, ``self.init_states`` is reset to the result of
    ``self.create_zero_state()``, and an ``Extractor`` built around
    this model is asked to create the average eos activations.

    Parameters
    ----------
    init_states_corpus : str
        Passed to ``import_corpus`` as the corpus to load.
    vocab_path : str
        Passed to ``import_corpus`` as ``vocab_path``.
    save_init_states_to : str, optional
        Forwarded to the ``Extractor`` as its third argument. When it
        is ``None`` the extraction is run with
        ``only_return_avg_eos=True``.

    Returns
    -------
    init_states : ActivationTensors
        The value returned by ``Extractor.extract``; asserted to be
        not ``None``.
    """
    loaded_corpus: Corpus = import_corpus(init_states_corpus, vocab_path=vocab_path)

    # Start from the zero state before the extraction is run.
    self.init_states = self.create_zero_state()

    # Without a save location the extractor is told to only return the result.
    return_only = save_init_states_to is None
    avg_eos_states = Extractor(self, loaded_corpus, save_init_states_to).extract(
        create_avg_eos=True,
        only_return_avg_eos=return_only,
    )

    assert avg_eos_states is not None

    return avg_eos_states
def setUpClass(cls) -> None:
    """ Build the shared fixtures: corpus file, mock model and extractor.

    Writes a three-sentence tab-separated corpus to
    ``ACTIVATIONS_DIR/corpus.txt``, imports it, creates dummy
    activations for every token, and wires a ``MockLanguageModel`` and
    an ``Extractor`` around them. All results are stored on ``cls``.

    NOTE(review): unittest calls ``setUpClass`` on the class, so a
    ``@classmethod`` decorator is expected directly above this
    definition; it is not visible here, confirm it is present.
    """
    # Create directory if necessary; exist_ok avoids the check-then-create race.
    os.makedirs(ACTIVATIONS_DIR, exist_ok=True)

    # One example per line, each with three tab-separated fields matching the
    # three header columns passed to import_corpus below.
    test_corpus = "\n".join((
        "The ripe taste improves .\t0 0 1 0 0\tdelicious",
        "The hog crawled .\t0 1 0 0\thairy",
        "Move the vat .\t0 0 1 0\tok",
    ))

    corpus_path = os.path.join(ACTIVATIONS_DIR, "corpus.txt")
    with open(corpus_path, "w") as f:
        f.write(test_corpus)

    cls.corpus = import_corpus(
        corpus_path,
        header=["sen", "labels", "quality"],
        vocab_from_corpus=True,
    )
    cls.examples = cls.corpus.examples
    cls.iterator = create_iterator(cls.corpus, batch_size=1)

    # Mock the activations the model produces
    cls.all_words = list(
        itertools.chain.from_iterable(item.sen for item in cls.corpus)
    )
    cls.all_tokens = [cls.corpus.vocab.stoi[w] for w in cls.all_words]
    cls.all_labels = cls._merge_labels(
        [example.labels for example in cls.corpus]
    )

    # The identifier passed for each sentence is the number of tokens in
    # all preceding sentences.
    test_sentence_activations = []
    identifier_value = 0
    for example in cls.corpus:
        test_sentence_activations.append(
            create_sentence_dummy_activations(
                len(example.sen), ACTIVATION_DIM, identifier_value
            )
        )
        identifier_value += len(example.sen)

    cls.all_activations = torch.cat(test_sentence_activations)

    # Prepare Mock Model
    cls.model = MockLanguageModel(
        num_layers=1,
        hidden_size=ACTIVATION_DIM,
        all_tokens=cls.all_tokens,
        all_activations=cls.all_activations,
    )
    cls.model.set_init_states()

    # Init extractor
    cls.extractor = Extractor(
        cls.model,
        cls.corpus,
        activations_dir=ACTIVATIONS_DIR,
        activation_names=ACTIVATION_NAMES,
    )
    cls.extractor.activation_names = ACTIVATION_NAMES
def setUpClass(cls) -> None:
    """ Build the shared fixtures: mock sentences, mock model and extractor.

    Creates three ``MagicMock`` sentences carrying ``sen``, ``labels``
    and ``misc_info`` attributes, indexes them in a dict that serves as
    the corpus, generates dummy activations for every token, and wires
    a ``MockLanguageModel`` and an ``Extractor`` around them. All
    results are stored on ``cls``.

    NOTE(review): unittest calls ``setUpClass`` on the class, so a
    ``@classmethod`` decorator is expected directly above this
    definition; it is not visible here, confirm it is present.
    """
    # Create directory if necessary; exist_ok avoids the check-then-create race.
    os.makedirs(ACTIVATIONS_DIR, exist_ok=True)

    # Prepare Mock sentences: (tokens, labels, quality) per sentence.
    sentence_specs = [
        (["The", "ripe", "taste", "improves", "."], [0, 0, 1, 0, 0], "delicious"),
        (["The", "hog", "crawled", "."], [0, 1, 0, 0], "hairy"),
        (["Move", "the", "vat", "."], [0, 0, 1, 0], "ok"),
    ]
    cls.test_sentences = []
    for words, labels, quality in sentence_specs:
        sentence = MagicMock()
        sentence.sen = words
        sentence.labels = labels
        sentence.misc_info = {"quality": quality}
        cls.test_sentences.append(sentence)

    # The corpus maps a running sentence index to its mock sentence.
    cls.corpus = dict(enumerate(cls.test_sentences))

    # Mock the activations the model produces
    cls.all_tokens = list(
        itertools.chain.from_iterable(
            sentence.sen for sentence in cls.test_sentences
        )
    )
    cls.all_labels = cls._merge_labels(
        [sentence.labels for sentence in cls.corpus.values()]
    )

    # The identifier passed for each sentence is the number of tokens in
    # all preceding sentences.
    cls.test_sentence_activations = []
    identifier_value = 0
    for sentence in cls.corpus.values():
        cls.test_sentence_activations.append(
            create_sentence_dummy_activations(
                len(sentence.sen), ACTIVATION_DIM, identifier_value
            )
        )
        identifier_value += len(sentence.sen)

    cls.all_activations = torch.cat(cls.test_sentence_activations)

    # Prepare Mock Model
    cls.model = MockLanguageModel(
        num_layers=1,
        hidden_size=ACTIVATION_DIM,
        all_tokens=cls.all_tokens,
        all_activations=cls.all_activations,
    )

    # Init extractor
    cls.extractor = Extractor(
        cls.model,
        cls.corpus,
        ACTIVATION_NAMES,
        output_dir=ACTIVATIONS_DIR,
    )
from diagnnose.config.arg_parser import create_arg_parser
from diagnnose.config.setup import ConfigSetup
from diagnnose.corpora.import_corpus import import_corpus_from_path
from diagnnose.extractors.base_extractor import Extractor
from diagnnose.models.import_model import import_model_from_json
from diagnnose.models.language_model import LanguageModel
from diagnnose.typedefs.corpus import Corpus

# Script entry point: read the configuration, load model and corpus, and run
# an extraction with the configured options.
if __name__ == '__main__':
    # Names of the argument groups this script consumes.
    arg_groups = {'model', 'activations', 'corpus', 'extract'}
    arg_parser, required_args = create_arg_parser(arg_groups)

    # config_dict is indexed below by the group names listed above.
    config_setup = ConfigSetup(arg_parser, required_args, arg_groups)
    config_dict = config_setup.config_dict

    # The model receives its config section as one argument, whereas the
    # corpus section is unpacked into keyword arguments.
    model: LanguageModel = import_model_from_json(config_dict['model'])
    corpus: Corpus = import_corpus_from_path(**config_dict['corpus'])

    # The 'activations' section configures the extractor itself; the
    # 'extract' section configures the extraction run.
    extractor = Extractor(model, corpus, **config_dict['activations'])
    extractor.extract(**config_dict['extract'])
""" Select activations only when they occur on the subject's position. """ return pos == sentence.misc_info["subj_pos"] def pos_4_selection_func(pos: int, token: str, sentence: LabeledSentence): """ Select activations only on position 4. """ return pos == 4 if __name__ == "__main__": required_args = {'model', 'vocab', 'lm_module', 'corpus_path', 'activation_names', 'output_dir'} arg_groups = { 'model': {'model', 'vocab', 'lm_module', 'device'}, 'corpus': {'corpus_path'}, 'init_extract': {'activation_names', 'output_dir', 'init_lstm_states_path'}, 'extract': {'cutoff', 'print_every'}, } argparser = init_argparser() config_object = ConfigSetup(argparser, required_args, arg_groups) config_dict = config_object.config_dict model: LanguageModel = import_model_from_json(**config_dict['model']) corpus: LabeledCorpus = convert_to_labeled_corpus(**config_dict['corpus']) extractor = Extractor(model, corpus, **config_dict['init_extract']) extractor.extract(**config_dict['extract'], selection_func=pos_4_selection_func) # In case you want to extract average eos activations as well, uncomment this line # extractor.extract_average_eos_activations(print_every=config_dict['extract']['print_every'])
from diagnnose.config.arg_parser import create_arg_parser
from diagnnose.config.setup import create_config_dict
from diagnnose.corpus.import_corpus import import_corpus
from diagnnose.extractors.base_extractor import Extractor
from diagnnose.models.import_model import import_model
from diagnnose.models.lm import LanguageModel
from diagnnose.typedefs.corpus import Corpus
from diagnnose.vocab import get_vocab_from_config

# Script entry point: read the configuration, load model and corpus, and run
# an extraction with the configured options.
if __name__ == "__main__":
    # Names of the argument groups this script consumes.
    arg_groups = {
        "model",
        "activations",
        "corpus",
        "extract",
        "init_states",
        "vocab",
    }
    arg_parser, required_args = create_arg_parser(arg_groups)
    config_dict = create_config_dict(arg_parser, required_args, arg_groups)

    # The model importer receives the full configuration dictionary.
    model: LanguageModel = import_model(config_dict)

    # The vocab path is resolved from the full config first, then combined
    # with the unpacked "corpus" section.
    vocab_path = get_vocab_from_config(config_dict)
    corpus: Corpus = import_corpus(vocab_path=vocab_path, **config_dict["corpus"])

    # The "activations" section configures the extractor itself; the
    # "extract" section configures the extraction run.
    extractor = Extractor(model, corpus, **config_dict["activations"])
    extractor.extract(**config_dict["extract"])