def __init__(
    self,
    activations_dir: str,
    corpus: Corpus,
    test_activations_dir: Optional[str] = None,
    test_corpus: Optional[Corpus] = None,
    selection_func: SelectFunc = lambda sen_id, pos, example: True,
) -> None:
    """
    Set up the activation readers and labels for train and test data.

    Parameters
    ----------
    activations_dir : str
        Directory handed to the ``ActivationReader`` for the train
        activations.
    corpus : Corpus
        Corpus from which the train labels are created.
    test_activations_dir : str, optional
        Directory handed to a second ``ActivationReader`` for the
        test activations. When omitted, ``test_activation_reader``
        and ``test_labels`` are both set to None.
    test_corpus : Corpus, optional
        Corpus from which the test labels are created. Required
        when `test_activations_dir` is given.
    selection_func : SelectFunc, optional
        Forwarded to ``create_labels_from_corpus`` for both corpora.
        The default accepts every item.
    """
    assert corpus is not None, "`corpus`should be provided!"

    self.train_labels = create_labels_from_corpus(
        corpus, selection_func=selection_func
    )

    if test_activations_dir is None:
        # No dedicated test set was supplied.
        self.test_activation_reader = None
        self.test_labels = None
    else:
        self.test_activation_reader = ActivationReader(test_activations_dir)
        assert test_corpus is not None, "`test_corpus` should be provided!"
        self.test_labels = create_labels_from_corpus(
            test_corpus, selection_func=selection_func
        )

    self.activation_reader = ActivationReader(activations_dir)
    # Number of items as reported by the train reader's ``__len__``.
    self.data_len = len(self.activation_reader)
def setUpClass(cls) -> None:
    """
    Prepare the shared fixture for this test class.

    Ensures ``ACTIVATIONS_DIR`` exists, dumps dummy activations into it
    and stores an ``ActivationReader`` for that directory on the class
    as ``cls.activation_reader``.
    """
    # Create directory if necessary. ``exist_ok=True`` avoids the
    # check-then-create race of ``os.path.exists`` + ``os.makedirs``.
    os.makedirs(ACTIVATIONS_DIR, exist_ok=True)

    create_and_dump_dummy_activations(
        num_sentences=NUM_TEST_SENTENCES,
        activations_dim=ACTIVATIONS_DIM,
        max_tokens=5,
        activations_dir=ACTIVATIONS_DIR,
        activations_name=ACTIVATIONS_NAME,
        num_classes=2,
    )

    cls.activation_reader = ActivationReader(activations_dir=ACTIVATIONS_DIR)
def __init__(self,
             model: LanguageModel,
             decomposer: str,
             activations_dir: str,
             decoder: Optional[str] = None,
             init_lstm_states_path: Optional[str] = None) -> None:
    """
    Resolve the decomposer class and load everything it needs.

    Parameters
    ----------
    model : LanguageModel
        Model stored on the instance and passed to ``InitStates``.
    decomposer : str
        CamelCased name of the decomposer class. The class is looked
        up in ``diagnnose.decompositions.<snake_case_name>``.
    activations_dir : str
        Directory handed to the ``ActivationReader``.
    decoder : str, optional
        Forwarded to ``self._read_decoder``, whose two results are
        stored as ``decoder_w`` and ``decoder_b``.
    init_lstm_states_path : str, optional
        Forwarded to ``InitStates``.
    """
    # The module name is assumed to be the snake_case variant of the
    # CamelCased class name; the class is imported from its string name.
    # Taken from: https://stackoverflow.com/a/30941292
    snake_cased = camel2snake(decomposer)
    decomposer_module = import_module(
        f'diagnnose.decompositions.{snake_cased}')
    self.decomposer_constructor: Type[BaseDecomposer] = getattr(
        decomposer_module, decomposer)

    self.activation_reader = ActivationReader(
        activations_dir, store_multiple_activations=True)
    self.model = model

    decoder_w, decoder_b = self._read_decoder(decoder)
    self.decoder_w = decoder_w
    self.decoder_b = decoder_b

    init_states = InitStates(model, init_lstm_states_path)
    self.init_cell_state: FullActivationDict = init_states.create()
def _create_activation_reader(
    self,
    model: Optional[LanguageModel],
    corpus: Corpus,
    activations_dir: str,
    selection_func: Optional[SelectionFunc],
) -> ActivationReader:
    """
    Return an ``ActivationReader`` with ``cat_activations`` enabled.

    Existing activations in `activations_dir` are opened directly.
    When `activations_dir` is None or ``self.create_new_activations``
    is set, activations are extracted first with ``simple_extract``.

    Parameters
    ----------
    model : LanguageModel, optional
        Model to extract from; must not be None when extracting.
    corpus : Corpus
        Corpus passed to ``simple_extract``.
    activations_dir : str
        Directory to read from, or to pass on to ``simple_extract``.
    selection_func : SelectionFunc, optional
        Selection function for the extraction; ``return_all`` is used
        when this is falsy.

    Returns
    -------
    activation_reader : ActivationReader
        Reader whose ``cat_activations`` is True.
    """
    reuse_existing = (
        activations_dir is not None and not self.create_new_activations
    )
    if reuse_existing:
        return ActivationReader(activations_dir, cat_activations=True)

    assert model is not None
    extracted_reader, _ = simple_extract(
        model,
        corpus,
        self.activation_names,
        activations_dir=activations_dir,
        selection_func=selection_func or return_all,
    )
    extracted_reader.cat_activations = True

    return extracted_reader
def __init__(
    self,
    model: LanguageModel,
    activations_dir: str,
    create_new_activations: bool = False,
    corpus: Optional[Corpus] = None,
    sen_ids: ActivationIndex = slice(None, None),
    decomposer: str = "ContextualDecomposer",
) -> None:
    """
    Resolve the decomposer class and open the activations directory.

    Parameters
    ----------
    model : LanguageModel
        Model stored on the instance.
    activations_dir : str
        Directory handed to the ``ActivationReader``; also the target
        of the extraction when `create_new_activations` is set.
    create_new_activations : bool, optional
        If True, ``self._extract_activations`` is called before the
        reader is created. Defaults to False.
    corpus : Corpus, optional
        Corpus used for the extraction. Required when
        `create_new_activations` is True.
    sen_ids : ActivationIndex, optional
        Passed to ``self._extract_activations``. Defaults to a full
        slice.
    decomposer : str, optional
        Name of the class looked up in ``diagnnose.decompositions``.
    """
    self.model = model

    decompositions = import_module(f"diagnnose.decompositions")
    decomposer_cls = getattr(decompositions, decomposer)
    self.decomposer_constructor: Type[BaseDecomposer] = decomposer_cls

    if create_new_activations:
        assert (
            corpus is not None
        ), "Corpus should be provided if no activations_dir is passed."
        self._extract_activations(activations_dir, sen_ids, corpus)

    self.activation_reader = ActivationReader(
        activations_dir, store_multiple_activations=True
    )
def lakretz_init(
    vocab_path: str,
    path: str,
    task_activations: Optional[Dict[str, str]] = None,
    tasks: Optional[List[str]] = None,
    device: str = "cpu",
) -> Dict[str, Dict[str, Any]]:
    """
    Initializes the tasks described in Lakretz et al. (2019)

    Arxiv link: https://arxiv.org/pdf/1903.07435.pdf
    Repo: https://github.com/FAIRNS/Number_and_syntax_units_in_LSTM_LMs

    Parameters
    ----------
    vocab_path : str
        Path to the vocabulary file, forwarded to ``import_corpus``.
    path : str
        Directory containing one ``<task>.txt`` dataset per task.
    task_activations : Dict[str, str], optional
        Maps task names to directories with extracted activations.
        Tasks without an entry get ``None`` as their activation reader.
    tasks : List[str], optional
        Tasks to set up. Defaults to every key of
        ``lakretz_descriptions``.
    device : str, optional
        Device name forwarded to ``create_iterator``. Defaults to cpu.

    Returns
    -------
    init_dict : Dict[str, Dict[str, Any]]
        Maps each task to a dict with the keys ``activation_reader``,
        ``corpus`` and ``iterator``.
    """
    activation_dirs = task_activations or {}
    selected_tasks = (
        list(lakretz_descriptions.keys()) if tasks is None else tasks
    )

    init_dict: Dict[str, Dict[str, Any]] = {}

    for task in selected_tasks:
        assert task in lakretz_descriptions, f"Provided task {task} is not recognised!"

        reader_dir = activation_dirs.get(task)
        if reader_dir is None:
            activation_reader = None
        else:
            activation_reader = ActivationReader(reader_dir)

        # Each batch holds twice the task's ``items_per_class``.
        batch_size = lakretz_descriptions[task]["items_per_class"] * 2

        corpus = import_corpus(
            os.path.join(path, f"{task}.txt"),
            header=["sen", "type", "correct", "idx"],
            vocab_path=vocab_path,
        )
        iterator = create_iterator(corpus, batch_size=batch_size, device=device)

        init_dict[task] = {
            "activation_reader": activation_reader,
            "corpus": corpus,
            "iterator": iterator,
        }

    return init_dict
def main(config):
    """
    Show t-SNE projections of layer-1 ``hx`` test activations.

    The projection is restricted to the test sequences whose genre is
    'Responsory verse' and is drawn for sequence positions 0 and 13.
    Points are labelled with the sequence's label plus one.

    Parameters
    ----------
    config
        Object with a ``name`` attribute. The name selects the
        ``output/<name>/`` directory and must contain one of
        'raw'/'neume'/'syllable' and one of 'pitch'/'interval'. A
        sequence length of 20 is used if the name contains '_20_',
        otherwise 30.

    Raises
    ------
    ValueError
        If the representation or the note type cannot be derived from
        ``config.name``.
    """
    out_dir = 'output/' + config.name

    # NOTE(review): the classifier and the train reader are not used
    # below. The loads are kept so a missing artefact still fails here;
    # drop them if that is not needed.
    classifier = joblib.load(out_dir + '/classifiers/hx_l1.joblib')
    train_reader = ActivationReader(out_dir + '/activations/train')
    test_reader = ActivationReader(out_dir + '/activations/test')

    # The last matching keyword wins, as with a sequence of plain ifs.
    representation = None
    for candidate in ('raw', 'neume', 'syllable'):
        if candidate in config.name:
            representation = candidate
    notes = None
    for candidate in ('pitch', 'interval'):
        if candidate in config.name:
            notes = candidate
    if representation is None or notes is None:
        raise ValueError(
            'Could not derive representation/notes from config name '
            + repr(config.name))

    seq_length = 20 if '_20_' in config.name else 30

    input_prefix = ('data/inputs/' + notes + '_' + str(seq_length) + '_'
                    + representation)
    test_corpus = import_corpus_from_path(
        input_prefix + '_mode_corpus_test.txt', ['sen', 'labels'])
    test_genre_corpus = import_corpus_from_path(
        input_prefix + '_string_genre_corpus_test.txt', ['sen', 'labels'])

    hx_1_test = test_reader.read_activations((1, 'hx'))

    # One label and one genre per sequence of ``seq_length`` activations.
    num_sequences = hx_1_test.shape[0] // seq_length
    test_labels = np.zeros(num_sequences)
    test_genres = np.zeros(num_sequences)
    for i in range(num_sequences):
        # NOTE(review): position 19 is hard-coded; presumably the label
        # is the same at every position of a sequence. Confirm.
        test_labels[i] = test_corpus[i].labels[19]
        test_genres[i] = test_genre_corpus[i].labels[19]
    test_genres = test_genres.astype(int)
    test_labels = test_labels.astype(int)

    with open(input_prefix + '_genre_vocab.txt', 'rb') as vf:
        genre_vocab = [line.decode('utf-8').strip() for line in vf]

    most_common_genres = Counter(test_genres).most_common(3)
    resp_verse = genre_vocab.index('Responsory verse')

    genre_ind_dict = {}
    for genre in most_common_genres:
        print(genre_vocab[genre[0]], genre)
        genre_ind_dict[genre[0]] = np.where(test_genres == genre[0])

    # Sequence indices of the 'Responsory verse' items. This raises a
    # KeyError if that genre is not among the three most common ones.
    resp_verse_rows = genre_ind_dict[resp_verse]

    for i in [0, 13]:
        # Stride by the real sequence length so row k belongs to
        # sequence k. A fixed stride of 20 misaligns activations and
        # labels when ``seq_length`` is 30.
        activation_test = hx_1_test[i::seq_length][resp_verse_rows]
        x_emb = TSNE(n_components=2, verbose=2).fit_transform(activation_test)
        show_tsne_plot(x_emb, test_labels[resp_verse_rows] + 1)
class DataLoader:
    """
    Loads extracted activations with their labels and serves them as a
    train/test split.

    Parameters
    ----------
    activations_dir : str
        Directory containing the extracted activations.
    corpus : Corpus
        Corpus containing the labels for each sentence.
    test_activations_dir : str, optional
        Directory containing the extracted test activations. If not
        provided the train activation set will be split and partially
        used as test set.
    test_corpus : Corpus, optional
        Corpus containing the test labels for each sentence. Must be
        provided if `test_activations_dir` is provided.
    selection_func : SelectFunc, optional
        Selection function that determines whether a corpus item should
        be taken into account. If such a function has been used during
        extraction, make sure to pass it along here as well.
    """

    def __init__(
        self,
        activations_dir: str,
        corpus: Corpus,
        test_activations_dir: Optional[str] = None,
        test_corpus: Optional[Corpus] = None,
        selection_func: SelectFunc = lambda sen_id, pos, example: True,
    ) -> None:
        assert corpus is not None, "`corpus`should be provided!"

        self.train_labels = create_labels_from_corpus(
            corpus, selection_func=selection_func
        )

        if test_activations_dir is None:
            # No dedicated test set: `create_data_split` splits the
            # train data instead.
            self.test_activation_reader = None
            self.test_labels = None
        else:
            self.test_activation_reader = ActivationReader(test_activations_dir)
            assert test_corpus is not None, "`test_corpus` should be provided!"
            self.test_labels = create_labels_from_corpus(
                test_corpus, selection_func=selection_func
            )

        self.activation_reader = ActivationReader(activations_dir)
        self.data_len = len(self.activation_reader)

    def create_data_split(
        self,
        activation_name: ActivationName,
        data_subset_size: int = -1,
        train_test_split: float = 0.9,
    ) -> DataDict:
        """
        Creates a shuffled train/test data split of activations.

        Parameters
        ----------
        activation_name : ActivationName
            (layer, name) tuple indicating the activations to be read in
        data_subset_size : int, optional
            Number of items to use. Defaults to -1, indicating the
            entire data set. Indices are drawn from
            ``range(data_subset_size)``, so a subset consists of the
            first `data_subset_size` items in shuffled order.
        train_test_split : float
            Fraction of the shuffled items that goes to the train set;
            the remainder becomes the test set. Ignored when separate
            test activations were provided. Defaults to 0.9.

        Returns
        -------
        data_dict : DataDict
            Dict with the keys ``train_x``, ``train_y``, ``test_x`` and
            ``test_y``.
        """
        use_full_set = data_subset_size == -1
        if not use_full_set:
            assert (
                0 < data_subset_size <= self.data_len
            ), "Size of subset can't be bigger than the full data set."

        all_activations = self.activation_reader.read_activations(activation_name)

        # Shuffle activations and labels with one shared permutation.
        data_size = self.data_len if use_full_set else data_subset_size
        indices = np.random.choice(range(data_size), data_size, replace=False)
        train_x = all_activations[indices]
        train_y = self.train_labels[indices]

        if self.test_activation_reader is None:
            # Cut the tail off the shuffled train data as test set.
            split = int(data_size * train_test_split)
            test_x, test_y = train_x[split:], train_y[split:]
            train_x, train_y = train_x[:split], train_y[:split]
        else:
            test_x = self.test_activation_reader.read_activations(activation_name)
            test_y = self.test_labels

        return {
            "train_x": train_x,
            "train_y": train_y,
            "test_x": test_x,
            "test_y": test_y,
        }
def main(config):
    """
    Plot per-position classifier accuracy on the test set, per genre.

    For every sequence position the stored classifier predicts a label
    from the layer-1 ``hx`` test activations at that position. Accuracy
    is computed separately for each of the (up to) 25 most common test
    genres and plotted against position. The 'Antiphon' and
    'Responsory verse' curves are also written to
    ``output/<name>/scores.json``. The figure is saved as
    ``output/<name>/eval_class.jpg`` if the user answers 'y'.

    Parameters
    ----------
    config
        Object with a ``name`` attribute. The name selects the
        ``output/<name>/`` directory and must contain one of
        'raw'/'neume'/'syllable' and one of 'pitch'/'interval'. A
        sequence length of 20 is used if the name contains '_20_',
        otherwise 30. If it contains 'embedding', the
        ``_string_mode_corpus_*`` files are read instead of
        ``_mode_corpus_*``.

    Raises
    ------
    ValueError
        If the representation or the note type cannot be derived from
        ``config.name``.
    """
    out_dir = 'output/' + config.name

    classifier = joblib.load(out_dir + '/classifiers/hx_l1.joblib')
    # NOTE(review): the train reader and train corpus are loaded but
    # not used below. The loads are kept so a missing artefact still
    # fails here; drop them if that is not needed.
    train_reader = ActivationReader(out_dir + '/activations/train')
    test_reader = ActivationReader(out_dir + '/activations/test')

    # The last matching keyword wins, as with a sequence of plain ifs.
    representation = None
    for candidate in ('raw', 'neume', 'syllable'):
        if candidate in config.name:
            representation = candidate
    notes = None
    for candidate in ('pitch', 'interval'):
        if candidate in config.name:
            notes = candidate
    if representation is None or notes is None:
        raise ValueError(
            'Could not derive representation/notes from config name '
            + repr(config.name))

    seq_length = 20 if '_20_' in config.name else 30

    input_prefix = ('data/inputs/' + notes + '_' + str(seq_length) + '_'
                    + representation)
    mode_tag = '_string_mode' if 'embedding' in config.name else '_mode'

    train_corpus = import_corpus_from_path(
        input_prefix + mode_tag + '_corpus_train.txt', ['sen', 'labels'])
    test_corpus = import_corpus_from_path(
        input_prefix + mode_tag + '_corpus_test.txt', ['sen', 'labels'])
    # The genre corpus is needed for every configuration. Loading it
    # only for 'embedding' runs left it unbound for all other runs.
    test_genre_corpus = import_corpus_from_path(
        input_prefix + '_string_genre_corpus_test.txt', ['sen', 'labels'])

    hx_1_test = test_reader.read_activations((1, 'hx'))

    # One label and one genre per sequence of ``seq_length`` activations.
    num_sequences = hx_1_test.shape[0] // seq_length
    test_labels = np.zeros(num_sequences)
    test_genres = np.zeros(num_sequences)
    for i in range(num_sequences):
        # NOTE(review): position 19 is hard-coded; presumably the label
        # is the same at every position of a sequence. Confirm.
        test_labels[i] = test_corpus[i].labels[19]
        test_genres[i] = test_genre_corpus[i].labels[19]
    test_genres = test_genres.astype(int)
    test_labels = test_labels.astype(int)

    with open(input_prefix + '_genre_vocab.txt', 'rb') as vf:
        genre_vocab = [line.decode('utf-8').strip() for line in vf]

    most_common_genres = Counter(test_genres).most_common(25)
    resp_verse = genre_vocab.index('Responsory verse')
    antiphon = genre_vocab.index('Antiphon')

    # Genre id -> indices of the test sequences with that genre.
    genre_ind_dict = {}
    for genre in most_common_genres:
        print(genre_vocab[genre[0]], genre)
        genre_ind_dict[genre[0]] = np.where(test_genres == genre[0])

    test_scores_genre = {
        genre: np.zeros(seq_length) for genre in genre_ind_dict
    }
    for i in range(seq_length):
        # Every ``seq_length``-th row starting at ``i`` is position
        # ``i`` of each sequence.
        test_pred = classifier.predict(hx_1_test[i::seq_length])
        for genre, rows in genre_ind_dict.items():
            test_scores_genre[genre][i] = (
                test_pred[rows] == test_labels[rows]).mean()

    # Raises a KeyError if either genre is not among the most common.
    d = {
        'antiphon': test_scores_genre[antiphon].tolist(),
        'resp_verse': test_scores_genre[resp_verse].tolist()
    }
    with open(out_dir + '/scores.json', 'w') as f:
        json.dump(d, f)

    positions = range(1, seq_length + 1)
    for genre, scores in test_scores_genre.items():
        plt.plot(positions, scores, label=genre_vocab[genre])
    plt.legend()
    plt.xticks(list(positions))
    plt.ylim(0, 1)

    # Grab the figure before ``show`` so it can still be saved after.
    fig = plt.gcf()
    plt.show()
    ans = input('Do you want to save this figure? y/n\n')
    if ans == 'y':
        fig.savefig(out_dir + '/eval_class.jpg')
    print('done')