Example 1
    def test_decomposition_json(self):
        "Test decomposition of all layers into separate files for json"
        output_file = f"{self.tmpdir.name}/somename.json"
        actual_output_files = [
            f"{self.tmpdir.name}/somename-layer{layer_idx}.json"
            for layer_idx in range(self.num_layers)
        ]
        writer = ActivationsWriter.get_writer(output_file,
                                              decompose_layers=True)

        for s_idx in range(len(self.sentences)):
            writer.write_activations(
                s_idx,
                self.sentences[s_idx].split(" "),
                self.expected_activations[s_idx],
            )

        writer.close()

        for layer_idx, output_file in enumerate(actual_output_files):
            saved_activations, num_layers = loader.load_activations(
                output_file)
            # Decomposed files should only have 1 layer each
            self.assertEqual(1, num_layers)

            # Check saved activations
            for sentence_idx, sentence_activations in enumerate(
                    saved_activations):
                curr_saved_activations = torch.FloatTensor(
                    sentence_activations)
                curr_expected_activations = self.expected_activations[
                    sentence_idx][layer_idx, :, :]
                self.assertTrue(
                    torch.allclose(curr_saved_activations,
                                   curr_expected_activations))
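
Since each decomposed file holds exactly one layer, the per-layer files can be stacked back into the full [layers x tokens x hidden] tensor the test compares against. A minimal sketch, assuming the same single-argument loader.load_activations API used in the test; the import path and the recombine_layers helper are assumptions, not part of the library:

import torch
import neurox.data.loader as loader  # assumed import path for the loader used above

def recombine_layers(layer_files, sentence_idx):
    """Stack one sentence's activations from per-layer files into a
    single [num_layers x num_tokens x hidden] tensor."""
    per_layer = []
    for path in layer_files:
        activations, num_layers = loader.load_activations(path)
        assert num_layers == 1  # each decomposed file stores a single layer
        per_layer.append(torch.FloatTensor(activations[sentence_idx]))
    return torch.stack(per_layer, dim=0)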
Example 2
    def test_filter_layers_json(self):
        "Test layer filtering for json"
        output_file = f"{self.tmpdir.name}/somename.json"
        writer = ActivationsWriter.get_writer(output_file,
                                              filter_layers=",".join(
                                                  map(str,
                                                      self.filter_layers)))

        for s_idx in range(len(self.sentences)):
            writer.write_activations(
                s_idx,
                self.sentences[s_idx].split(" "),
                self.expected_activations[s_idx],
            )

        writer.close()

        saved_activations, num_layers = loader.load_activations(output_file)
        self.assertEqual(len(self.filter_layers), num_layers)

        # Check saved activations
        for sentence_idx, sentence_activations in enumerate(saved_activations):
            curr_saved_activations = torch.FloatTensor(
                sentence_activations.reshape((
                    self.expected_activations[sentence_idx].shape[1],
                    len(self.filter_layers),
                    -1,
                )).swapaxes(0, 1))
            curr_expected_activations = self.expected_activations[
                sentence_idx][self.filter_layers, :, :]
            self.assertTrue(
                torch.allclose(curr_saved_activations,
                               curr_expected_activations))
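
The reshape and swapaxes in the check above undo how the writer flattens each sentence: one row per token, with the kept layers concatenated along the hidden dimension. A toy round-trip sketch with hypothetical sizes, independent of the test fixtures:

import torch

num_layers, num_tokens, hidden = 3, 5, 4  # hypothetical sizes
original = torch.randn(num_layers, num_tokens, hidden)

# Writer-style flattening: one row per token, layers concatenated along hidden
flat = original.swapaxes(0, 1).reshape(num_tokens, num_layers * hidden)

# Test-style recovery: back to [num_layers, num_tokens, hidden]
recovered = flat.reshape(num_tokens, num_layers, hidden).swapaxes(0, 1)
assert torch.equal(recovered, original)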
Example 3
    def test_binary_data_wrapper(self, mock_create_binary_data):
        mock_create_binary_data.return_value = (
            self.test_sentences,
            self.test_sentences,
            self.activations,
        )

        annotate.annotate_data(
            f"{self.tmpdir.name}/gold.word",
            f"{self.tmpdir.name}/gold.hdf5",
            {"test"},
            f"{self.tmpdir.name}/test",
        )

        with open(f"{self.tmpdir.name}/test.word") as fp:
            for line_idx, line in enumerate(fp):
                self.assertEqual(self.test_sentences[line_idx], line.strip())

        # Load and check activations as well
        test_activations, test_num_layers = data_loader.load_activations(
            f"{self.tmpdir.name}/test.hdf5")
        self.assertEqual(self.num_layers, test_num_layers)

        gold_activations = [
            a.reshape((a.shape[1], -1)) for a in self.activations
        ]
        for act_idx, act in enumerate(test_activations):
            self.assertTrue(
                torch.allclose(gold_activations[act_idx],
                               torch.FloatTensor(act)))
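
The extra mock_create_binary_data parameter implies a unittest.mock patch decorator that the snippet does not capture. A hedged sketch of the likely setup; the class name and the patch target's module path are assumptions:

import unittest
from unittest.mock import patch

class AnnotateDataTest(unittest.TestCase):  # hypothetical class name
    # Assumed patch target: the _create_binary_data helper in the module under test
    @patch("neurox.data.annotate._create_binary_data")
    def test_binary_data_wrapper(self, mock_create_binary_data):
        ...  # body as in the snippet above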
Example 4
def annotate_data(
    source_path,
    activations_path,
    binary_filter,
    output_prefix,
    output_type="hdf5",
    decompose_layers=False,
    filter_layers=None,
):
    """
    Given a set of sentences, per word activations, a binary_filter and output_prefix, creates binary data and save it to the disk.
    A binary filter can be a set of words, a regex object or a function

    Parameters
    ----------
    source_path : text file with one sentence per line
    activations: list
        A list of sentence-wise activations
    binary_filter: a set of words or a regex object or a function
    output_prefix: prefix of the output files that will be saved as the output of this script

    Returns
    -------
    Saves a word file, a binary label file and their activations

    Example
    -------
    annotate_data(source_path, activations_path, re.compile(r'^\w\w$')) select words of two characters only as a positive class
    annotate_data(source_path, activations_path, {'is', 'can'}) select occrrences of 'is' and 'can' as a positive class
    """

    activations, num_layers = data_loader.load_activations(activations_path)

    # Pass source_path in place of a labels path; binary labels are generated later
    tokens = data_loader.load_data(source_path,
                                   source_path,
                                   activations,
                                   max_sent_l=512)

    words, labels, activations = _create_binary_data(tokens, activations,
                                                     binary_filter)
    activations = [
        np.swapaxes(a.reshape((a.shape[1], num_layers, -1)), 0, 1)
        for a in activations
    ]
    data_utils.save_files(
        words,
        labels,
        activations,
        output_prefix,
        output_type,
        decompose_layers,
        filter_layers,
    )
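
A hedged usage sketch based on the docstring's own examples; the paths below are hypothetical. Per the wrapper test above, the outputs are written next to the prefix as <output_prefix>.word, a label file, and <output_prefix>.hdf5:

import re

annotate_data(
    "data/sentences.txt",     # hypothetical path: one sentence per line
    "data/activations.hdf5",  # hypothetical path: per-word activations for those sentences
    re.compile(r"^\w\w$"),    # binary filter: words of exactly two characters
    "data/two_char_words",    # hypothetical output prefix
)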
Example 5
    def test_decomposition_and_filter_layers_hdf5(self):
        "Test decomposition of specific layers into separate files for hdf5"
        output_file = f"{self.tmpdir.name}/somename.hdf5"
        actual_output_files = [
            f"{self.tmpdir.name}/somename-layer{layer_idx}.hdf5"
            for layer_idx in self.filter_layers
        ]
        writer = ActivationsWriter.get_writer(
            output_file,
            decompose_layers=True,
            filter_layers=",".join(map(str, self.filter_layers)),
        )

        for s_idx in range(len(self.sentences)):
            writer.write_activations(
                s_idx,
                self.sentences[s_idx].split(" "),
                self.expected_activations[s_idx],
            )

        writer.close()

        for layer_idx, output_file in enumerate(actual_output_files):
            saved_activations, num_layers = loader.load_activations(
                output_file)
            # Decomposed files should only have 1 layer each
            self.assertEqual(1, num_layers)

            # Check saved activations
            for sentence_idx, sentence_activations in enumerate(
                    saved_activations):
                curr_saved_activations = torch.FloatTensor(
                    sentence_activations)
                curr_expected_activations = self.expected_activations[
                    sentence_idx][self.filter_layers[layer_idx], :, :]
                self.assertTrue(
                    torch.equal(curr_saved_activations,
                                curr_expected_activations))
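
Taken together, the writer tests cover the combinations of decompose_layers and filter_layers. A short sketch of the resulting files, following the -layer{N} naming convention the tests assert on; "base.hdf5" is a hypothetical path:

# decompose_layers=False, filter_layers=None   -> base.hdf5 containing every layer
# decompose_layers=False, filter_layers="1,3"  -> base.hdf5 containing layers 1 and 3 only
# decompose_layers=True,  filter_layers=None   -> base-layer0.hdf5, base-layer1.hdf5, ...
# decompose_layers=True,  filter_layers="1,3"  -> base-layer1.hdf5 and base-layer3.hdf5
writer = ActivationsWriter.get_writer(
    "base.hdf5",
    decompose_layers=True,
    filter_layers="1,3",
)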