Example #1
    def __init__(
        self,
        activations_dir: str,
        corpus: Corpus,
        test_activations_dir: Optional[str] = None,
        test_corpus: Optional[Corpus] = None,
        selection_func: SelectFunc = lambda sen_id, pos, example: True,
    ) -> None:
        assert corpus is not None, "`corpus` should be provided!"

        self.train_labels = create_labels_from_corpus(
            corpus, selection_func=selection_func)

        if test_activations_dir is not None:
            self.test_activation_reader = ActivationReader(
                test_activations_dir)
            assert test_corpus is not None, "`test_corpus` should be provided!"
            self.test_labels = create_labels_from_corpus(
                test_corpus, selection_func=selection_func)
        else:
            self.test_activation_reader = None
            self.test_labels = None

        self.activation_reader = ActivationReader(activations_dir)
        self.data_len = len(self.activation_reader)
Example #2
    @classmethod
    def setUpClass(cls) -> None:
        # Create directory if necessary
        if not os.path.exists(ACTIVATIONS_DIR):
            os.makedirs(ACTIVATIONS_DIR)

        create_and_dump_dummy_activations(num_sentences=NUM_TEST_SENTENCES,
                                          activations_dim=ACTIVATIONS_DIM,
                                          max_tokens=5,
                                          activations_dir=ACTIVATIONS_DIR,
                                          activations_name=ACTIVATIONS_NAME,
                                          num_classes=2)
        cls.activation_reader = ActivationReader(
            activations_dir=ACTIVATIONS_DIR)
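A minimal test-method sketch that could follow this fixture; the `(0, "hx")` activation name is an assumption (the `(layer, name)` tuple format is taken from Examples #7 and #9 in this listing), as is the exact shape the dummy activations are dumped in:

    def test_read_activations_shape(self) -> None:
        # Hypothetical check: read the dummy activations dumped above and
        # verify their dimensionality matches the fixture's ACTIVATIONS_DIM.
        activations = self.activation_reader.read_activations((0, "hx"))
        self.assertEqual(activations.shape[1], ACTIVATIONS_DIM)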
Example #3
    def __init__(self,
                 model: LanguageModel,
                 decomposer: str,
                 activations_dir: str,
                 decoder: Optional[str] = None,
                 init_lstm_states_path: Optional[str] = None) -> None:

        # Import the Decomposer class from its string name; assumes the module
        # name is the snake_case variant of the CamelCased Decomposer class.
        # Taken from: https://stackoverflow.com/a/30941292
        module_name = camel2snake(decomposer)
        module = import_module(f'diagnnose.decompositions.{module_name}')
        self.decomposer_constructor: Type[BaseDecomposer] = getattr(
            module, decomposer)

        self.activation_reader = ActivationReader(
            activations_dir, store_multiple_activations=True)
        self.model = model

        self.decoder_w, self.decoder_b = self._read_decoder(decoder)

        self.init_cell_state: FullActivationDict = InitStates(
            model, init_lstm_states_path).create()
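A minimal sketch of the name mapping this constructor depends on; it is an assumption that diagnnose's `camel2snake` helper behaves like this common regex approach:

import re

def camel2snake(name: str) -> str:
    # Assumed behavior: insert "_" before each interior capital, then
    # lowercase, e.g. "ContextualDecomposer" -> "contextual_decomposer",
    # so getattr(import_module(...), decomposer) can find the class.
    return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()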
Example #4
    def _create_activation_reader(
        self,
        model: Optional[LanguageModel],
        corpus: Corpus,
        activations_dir: Optional[str],
        selection_func: Optional[SelectionFunc],
    ) -> ActivationReader:
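        # Re-extract activations when none exist on disk yet, or when fresh
        # ones are explicitly requested; otherwise read the stored ones.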
        if activations_dir is None or self.create_new_activations:
            assert model is not None

            activation_reader, _ = simple_extract(
                model,
                corpus,
                self.activation_names,
                activations_dir=activations_dir,
                selection_func=selection_func or return_all,
            )
            activation_reader.cat_activations = True
        else:
            activation_reader = ActivationReader(activations_dir, cat_activations=True)

        return activation_reader
Example #5
    def __init__(
        self,
        model: LanguageModel,
        activations_dir: str,
        create_new_activations: bool = False,
        corpus: Optional[Corpus] = None,
        sen_ids: ActivationIndex = slice(None, None),
        decomposer: str = "ContextualDecomposer",
    ) -> None:
        self.model = model

        module = import_module("diagnnose.decompositions")
        self.decomposer_constructor: Type[BaseDecomposer] = getattr(
            module, decomposer)

        if create_new_activations:
            assert (
                corpus is not None
            ), "Corpus should be provided if no activations_dir is passed."
            self._extract_activations(activations_dir, sen_ids, corpus)
        self.activation_reader = ActivationReader(
            activations_dir, store_multiple_activations=True)
Example #6
def lakretz_init(
    vocab_path: str,
    path: str,
    task_activations: Optional[Dict[str, str]] = None,
    tasks: Optional[List[str]] = None,
    device: str = "cpu",
) -> Dict[str, Dict[str, Any]]:
    """ Initializes the tasks described in Lakretz et al. (2019)

    Arxiv link: https://arxiv.org/pdf/1903.07435.pdf
    Repo: https://github.com/FAIRNS/Number_and_syntax_units_in_LSTM_LMs

    Parameters
    ----------
    vocab_path : str
        Path to vocabulary file of the Language Model.
    path : str
        Path to directory containing the datasets that can be found
        in the github repo.
    task_activations : Dict[str, str], optional
        Dictionary mapping task names to directories to which the
        Lakretz task embeddings have been extracted. If a task is
        missing from the dictionary, its activations will be created
        when the task is run.
    tasks : List[str], optional
        The downstream tasks that will be tested. If not provided this
        will default to the full set of conditions.
    device : str, optional
        Torch device name on which model will be run. Defaults to cpu.

    Returns
    -------
    init_dict : Dict[str, Dict[str, Any]]
        Dictionary containing the initial task setup, mapping each task
        to its required fields.
    """

    task_activations = task_activations or {}

    if tasks is None:
        tasks = list(lakretz_descriptions.keys())

    init_dict: Dict[str, Dict[str, Any]] = {}

    for task in tasks:
        assert task in lakretz_descriptions, f"Provided task {task} is not recognised!"

        activation_dir = task_activations.get(task, None)
        activation_reader = (ActivationReader(activation_dir)
                             if activation_dir is not None else None)

        task_specs = lakretz_descriptions[task]
        items_per_class = task_specs["items_per_class"]

        corpus = import_corpus(
            os.path.join(path, f"{task}.txt"),
            header=["sen", "type", "correct", "idx"],
            vocab_path=vocab_path,
        )

        iterator = create_iterator(corpus,
                                   batch_size=(items_per_class * 2),
                                   device=device)

        init_dict[task] = {
            "activation_reader": activation_reader,
            "corpus": corpus,
            "iterator": iterator,
        }

    return init_dict
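A hedged usage sketch based only on the signature and docstring above; the paths are placeholders and "nounpp" is a hypothetical task name (the real names live in lakretz_descriptions):

init_dict = lakretz_init(
    vocab_path="vocab.txt",   # placeholder vocabulary file
    path="lakretz_data",      # placeholder directory with the repo's datasets
    tasks=["nounpp"],         # hypothetical task name
)
for task, setup in init_dict.items():
    # No task_activations were passed, so activation_reader is None here
    # and the activations would be created during the task itself.
    reader = setup["activation_reader"]
    corpus, iterator = setup["corpus"], setup["iterator"]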
Example #7
def main(config):

    classifier = joblib.load('output/' + config.name +
                             '/classifiers/hx_l1.joblib')
    train_reader = ActivationReader('output/' + config.name +
                                    '/activations/train')
    test_reader = ActivationReader('output/' + config.name +
                                   '/activations/test')

    if 'raw' in config.name:
        representation = 'raw'
    if 'neume' in config.name:
        representation = 'neume'
    if 'syllable' in config.name:
        representation = 'syllable'

    if 'pitch' in config.name:
        notes = 'pitch'
    if 'interval' in config.name:
        notes = 'interval'

    if '_20_' in config.name:
        seq_length = 20
    else:
        seq_length = 30

    # train_corpus = import_corpus_from_path('data/inputs/' + notes + '_20_' + representation + '_mode_corpus_train.txt', ['sen', 'labels'])
    test_corpus = import_corpus_from_path(
        'data/inputs/' + notes + '_' + str(seq_length) + '_' + representation +
        '_mode_corpus_test.txt', ['sen', 'labels'])
    test_genre_corpus = import_corpus_from_path(
        'data/inputs/' + notes + '_' + str(seq_length) + '_' + representation +
        '_string_genre_corpus_test.txt', ['sen', 'labels'])

    ##############################################################
    ##############################################################

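    # read_activations returns the activations flattened over all sequences:
    # shape (num_sequences * seq_length, hidden_size). Dividing by seq_length
    # below therefore recovers the number of test sequences.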
    hx_1_test = test_reader.read_activations((1, 'hx'))
    test_labels = np.zeros(int(hx_1_test.shape[0] / seq_length))
    test_genres = np.zeros(int(hx_1_test.shape[0] / seq_length))
    for i in range(len(test_labels)):
        # Use the label at the final position of each sequence.
        test_labels[i] = test_corpus[i].labels[seq_length - 1]
        test_genres[i] = test_genre_corpus[i].labels[seq_length - 1]

    test_genres = test_genres.astype(int)
    test_labels = test_labels.astype(int)

    with open(
            'data/inputs/' + notes + '_' + str(seq_length) + '_' +
            representation + '_genre_vocab.txt', 'rb') as vf:
        vocab_lines = vf.readlines()
        vocab_lines = [line.decode('utf-8') for line in vocab_lines]

    count = Counter(test_genres)
    most_common_genres = count.most_common(3)
    genre_vocab = [w.strip() for w in vocab_lines]

    resp_verse = genre_vocab.index('Responsory verse')
    antiphon = genre_vocab.index('Antiphon')

    genre_ind_dict = {}
    for genre in most_common_genres:
        print(genre_vocab[genre[0]], genre)
        genre_ind_dict[genre[0]] = np.where(np.array(test_genres) == genre[0])

    ct = 1

    embs = {}

    for i in [0, 13]:
        activation_test = hx_1_test[i::seq_length]
        activation_test = activation_test[genre_ind_dict[resp_verse]]
        x_emb = TSNE(n_components=2, verbose=2).fit_transform(activation_test)
        show_tsne_plot(x_emb, test_labels[genre_ind_dict[resp_verse]] + 1)
Example #8
class DataLoader:
    """ Reads in pickled activations that have been extracted.

    Parameters
    ----------
    activations_dir : str
        Directory containing the extracted activations.
    corpus : Corpus
        Corpus containing the labels for each sentence.
    test_activations_dir : str, optional
        Directory containing the extracted test activations. If not
        provided, the train activation set will be split and part of it
        used as the test set.
    test_corpus : Corpus, optional
        Corpus containing the test labels for each sentence. Must be
        provided if `test_activations_dir` is provided.
    selection_func : SelectFunc, optional
        Selection function that determines whether a corpus item should
        be taken into account. If such a function has been used during
        extraction, make sure to pass it along here as well.
    """
    def __init__(
        self,
        activations_dir: str,
        corpus: Corpus,
        test_activations_dir: Optional[str] = None,
        test_corpus: Optional[Corpus] = None,
        selection_func: SelectFunc = lambda sen_id, pos, example: True,
    ) -> None:
        assert corpus is not None, "`corpus` should be provided!"

        self.train_labels = create_labels_from_corpus(
            corpus, selection_func=selection_func)

        if test_activations_dir is not None:
            self.test_activation_reader = ActivationReader(
                test_activations_dir)
            assert test_corpus is not None, "`test_corpus` should be provided!"
            self.test_labels = create_labels_from_corpus(
                test_corpus, selection_func=selection_func)
        else:
            self.test_activation_reader = None
            self.test_labels = None

        self.activation_reader = ActivationReader(activations_dir)
        self.data_len = len(self.activation_reader)

    def create_data_split(
        self,
        activation_name: ActivationName,
        data_subset_size: int = -1,
        train_test_split: float = 0.9,
    ) -> DataDict:
        """ Creates train/test data split of activations

        Parameters
        ----------
        activation_name : ActivationName
            (layer, name) tuple indicating the activations to be read in
        data_subset_size : int, optional
            Subset size of data to train on. Defaults to -1, indicating
            the entire data set.
        train_test_split : float, optional
            Fraction of the data assigned to the train set. If separate
            test activations are provided this split won't be used.
            Defaults to 0.9.
        """

        if data_subset_size != -1:
            assert (0 < data_subset_size <= self.data_len
                    ), "Size of subset can't be bigger than the full data set."

        train_activations = self.activation_reader.read_activations(
            activation_name)

        # Shuffle activations
        data_size = self.data_len if data_subset_size == -1 else data_subset_size
        indices = np.random.choice(range(data_size), data_size, replace=False)
        train_activations = train_activations[indices]
        train_labels = self.train_labels[indices]

        if self.test_activation_reader is not None:
            test_activations = self.test_activation_reader.read_activations(
                activation_name)
            test_labels = self.test_labels
        else:
            split = int(data_size * train_test_split)

            test_activations = train_activations[split:]
            test_labels = train_labels[split:]
            train_activations = train_activations[:split]
            train_labels = train_labels[:split]

        return {
            "train_x": train_activations,
            "train_y": train_labels,
            "test_x": test_activations,
            "test_y": test_labels,
        }
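A minimal usage sketch for the class above, assuming a corpus and extracted activations already exist; the directory path and the (0, "hx") activation name are placeholders:

loader = DataLoader("activations/train", corpus)  # corpus: a previously imported Corpus
data = loader.create_data_split((0, "hx"))        # (layer, name) tuple, per the docstring
train_x, train_y = data["train_x"], data["train_y"]
test_x, test_y = data["test_x"], data["test_y"]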
Example #9
def main(config):

    classifier = joblib.load('output/' + config.name +
                             '/classifiers/hx_l1.joblib')
    train_reader = ActivationReader('output/' + config.name +
                                    '/activations/train')
    test_reader = ActivationReader('output/' + config.name +
                                   '/activations/test')

    if 'raw' in config.name:
        representation = 'raw'
    if 'neume' in config.name:
        representation = 'neume'
    if 'syllable' in config.name:
        representation = 'syllable'

    if 'pitch' in config.name:
        notes = 'pitch'
    if 'interval' in config.name:
        notes = 'interval'

    if '_20_' in config.name:
        seq_length = 20
    else:
        seq_length = 30

    if 'embedding' in config.name:
        train_corpus = import_corpus_from_path(
            'data/inputs/' + notes + '_' + str(seq_length) + '_' +
            representation + '_string_mode_corpus_train.txt',
            ['sen', 'labels'])
        test_corpus = import_corpus_from_path(
            'data/inputs/' + notes + '_' + str(seq_length) + '_' +
            representation + '_string_mode_corpus_test.txt', ['sen', 'labels'])
        test_genre_corpus = import_corpus_from_path(
            'data/inputs/' + notes + '_' + str(seq_length) + '_' +
            representation + '_string_genre_corpus_test.txt',
            ['sen', 'labels'])
    else:
        train_corpus = import_corpus_from_path(
            'data/inputs/' + notes + '_' + str(seq_length) + '_' +
            representation + '_mode_corpus_train.txt', ['sen', 'labels'])
        test_corpus = import_corpus_from_path(
            'data/inputs/' + notes + '_' + str(seq_length) + '_' +
            representation + '_mode_corpus_test.txt', ['sen', 'labels'])

    # hx_1_train = train_reader.read_activations((1,'hx'))
    # train_labels = np.zeros(int(hx_1_train.shape[0]/seq_length))
    # for i in range(len(train_labels)):
    #     train_labels[i] = train_corpus[i].labels[19]

    # train_scores = np.zeros(seq_length)
    # train_stds = np.zeros(seq_length)

    # for i in range(seq_length):
    #     activation_train = hx_1_train[i::seq_length]
    #     train_pred = classifier.predict(activation_train)
    #     train_scores[i] = (train_pred == train_labels).mean()

    # plt.plot(range(1,seq_length+1), train_scores, c='Red')

    ##############################################################
    ##############################################################

    hx_1_test = test_reader.read_activations((1, 'hx'))
    test_labels = np.zeros(int(hx_1_test.shape[0] / seq_length))
    test_genres = np.zeros(int(hx_1_test.shape[0] / seq_length))
    for i in range(len(test_labels)):
        # Use the label at the final position of each sequence.
        test_labels[i] = test_corpus[i].labels[seq_length - 1]
        test_genres[i] = test_genre_corpus[i].labels[seq_length - 1]

    test_genres = test_genres.astype(int)
    test_labels = test_labels.astype(int)

    with open(
            'data/inputs/' + notes + '_' + str(seq_length) + '_' +
            representation + '_genre_vocab.txt', 'rb') as vf:
        vocab_lines = vf.readlines()
        vocab_lines = [line.decode('utf-8') for line in vocab_lines]

    count = Counter(test_genres)
    most_common_genres = count.most_common(25)
    genre_vocab = [w.strip() for w in vocab_lines]

    resp_verse = genre_vocab.index('Responsory verse')
    antiphon = genre_vocab.index('Antiphon')

    genre_ind_dict = {}
    for genre in most_common_genres:
        print(genre_vocab[genre[0]], genre)
        genre_ind_dict[genre[0]] = np.where(np.array(test_genres) == genre[0])

    test_scores = np.zeros(seq_length)
    test_stds = np.zeros(seq_length)

    test_scores_genre = {}

    for genre in genre_ind_dict:
        test_scores_genre[genre] = np.zeros(seq_length)

    preds = []

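    # Probe each sequence position separately: hx_1_test[i::seq_length]
    # gathers the hidden state at position i of every test sequence, so
    # accuracy can be tracked per genre as the sequence unfolds.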
    for i in range(seq_length):
        activation_test = hx_1_test[i::seq_length]
        test_pred = classifier.predict(activation_test)
        preds.append(test_pred)
        for genre in genre_ind_dict:
            tp = test_pred[genre_ind_dict[genre]]
            tl = test_labels[genre_ind_dict[genre]]
            test_scores_genre[genre][i] = (tp == tl).mean()
        # test_scores[i] = (test_pred == test_labels).mean()

    d = {
        'antiphon': test_scores_genre[antiphon].tolist(),
        'resp_verse': test_scores_genre[resp_verse].tolist()
    }

    with open('output/' + config.name + '/scores.json', 'w') as f:
        json.dump(d, f)

    # plt.plot(range(1,seq_length+1), test_scores, label='All genres')
    for genre in genre_ind_dict:
        plt.plot(range(1, seq_length + 1),
                 test_scores_genre[genre],
                 label=genre_vocab[genre])
    # plt.plot(range(1,21), test_scores+test_stds, c='Blue')
    # plt.plot(range(1,21), test_scores-test_stds, c='Blue')
    plt.legend()
    plt.xticks(list(range(1, seq_length + 1)))
    plt.ylim(0, 1)
    fig = plt.gcf()
    plt.show()
    ans = input('Do you want to save this figure? y/n\n')
    if ans == 'y':
        fig.savefig('output/' + config.name + '/eval_class.jpg')

    print('done')