    def test_creating_buffered_context_check_labels_shape(self):
        # Three label samples but only two samples in each context list:
        # mismatched lengths should raise a ValueError.
        with self.assertRaises(ValueError):
            BufferedPathContext.create_from_lists(
                ([[], [], []], ConvertParameters(0, False, {})),
                {
                    FROM_TOKEN: ([[], []], ConvertParameters(0, False, {})),
                    PATH_TYPES: ([[], []], ConvertParameters(0, False, {})),
                    TO_TOKEN: ([[], []], ConvertParameters(0, False, {})),
                },
            )
def _convert_raw_buffer(convert_args: Tuple[List[str], PreprocessingConfig, Vocabulary, str, int]):
    # Convert one buffer of raw path-context lines into id matrices,
    # record its statistics in the description file, and dump it to disk.
    lines, config, vocab, output_folder, buffer_id = convert_args
    labels, from_tokens, path_types, to_tokens = [], [], [], []
    for line in lines:
        label, *path_contexts = line.split()
        label = _parse_token(label, config.split_target)
        labels.append([vocab.label_to_id.get(_l, vocab.label_to_id[UNK]) for _l in label])
        converted_context = [_convert_path_context_to_ids(config.split_names, pc, vocab) for pc in path_contexts]
        from_tokens.append([cc[0] for cc in converted_context])
        path_types.append([cc[1] for cc in converted_context])
        to_tokens.append([cc[2] for cc in converted_context])

    bpc = BufferedPathContext.create_from_lists(
        (labels, ConvertParameters(config.max_target_parts, config.wrap_target, vocab.label_to_id)),
        {
            FROM_TOKEN: (from_tokens, ConvertParameters(config.max_name_parts, config.wrap_name, vocab.token_to_id)),
            PATH_TYPES: (path_types, ConvertParameters(config.max_path_length, config.wrap_path, vocab.type_to_id)),
            TO_TOKEN: (to_tokens, ConvertParameters(config.max_name_parts, config.wrap_name, vocab.token_to_id)),
        },
    )

    with open(path.join(output_folder, DESCRIPTION_FILE), "a") as desc_file:
        n_samples = len(bpc.contexts_per_label)
        n_paths = sum(bpc.contexts_per_label)
        desc_file.write(f"{buffer_id},{BUFFERED_PATH_TEMPLATE.format(buffer_id)},{n_samples},{n_paths}\n")
    bpc.dump(path.join(output_folder, BUFFERED_PATH_TEMPLATE.format(buffer_id)))
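
# The buffer id argument suggests that _convert_raw_buffer is invoked once per chunk of
# the raw file. A minimal driver sketch, assuming a standard multiprocessing Pool; the
# helper name convert_holdout_sketch, the chunking logic, and the buffer_size default are
# illustrative assumptions, not part of the original code.
from itertools import islice
from multiprocessing import Pool, cpu_count


def convert_holdout_sketch(raw_path, config, vocab, output_folder, buffer_size=10000):
    def iter_buffers(raw_file):
        # Yield successive chunks of at most buffer_size raw lines.
        while True:
            lines = list(islice(raw_file, buffer_size))
            if not lines:
                return
            yield lines

    with open(raw_path) as raw_file, Pool(cpu_count()) as pool:
        tasks = [
            (lines, config, vocab, output_folder, buffer_id)
            for buffer_id, lines in enumerate(iter_buffers(raw_file))
        ]
        pool.map(_convert_raw_buffer, tasks)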
    def _prepare_buffer(self, file_idx: int) -> None:
        # Load the buffered path contexts stored in the file with the given index
        # and reset the (optionally shuffled) sampling order.
        assert file_idx < len(self._buffered_files_paths)
        self._cur_buffered_path_context = BufferedPathContext.load(self._buffered_files_paths[file_idx])
        self._order = numpy.arange(len(self._cur_buffered_path_context))
        if self.shuffle:
            self._order = numpy.random.permutation(self._order)
        self._cur_sample_idx = 0
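
    # Hypothetical usage sketch of _prepare_buffer from the same dataset class:
    # iterate over samples of the current buffer and move to the next buffered file
    # once it is exhausted. The attribute _cur_file_idx and this __next__ method are
    # assumptions for illustration, not the original implementation.
    def __next__(self):
        if self._cur_sample_idx >= len(self._order):
            self._cur_file_idx += 1
            if self._cur_file_idx >= len(self._buffered_files_paths):
                raise StopIteration
            self._prepare_buffer(self._cur_file_idx)
        sample = self._cur_buffered_path_context[self._order[self._cur_sample_idx]]
        self._cur_sample_idx += 1
        return sample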
    def test__convert_list_to_numpy_array_long(self):
        # The sequence is longer than max_len, so after the SOS wrap it is
        # truncated and no EOS token fits.
        values = [[3, 4, 5, 6, 7, 8, 9, 10]]
        to_id = {SOS: 0, EOS: 1, PAD: 2}
        true_result = numpy.array([[0], [3], [4], [5], [6], [7]])
        numpy.testing.assert_equal(
            true_result, BufferedPathContext._list_to_numpy_array(values, len(values), 5, True, to_id)
        )

    def test__convert_list_to_numpy_no_wrap(self):
        # Without wrapping there is no SOS/EOS; the sequence is padded to max_len.
        values = [[3, 4, 5]]
        to_id = {SOS: 0, EOS: 1, PAD: 2}
        true_result = numpy.array([[3], [4], [5], [2], [2]])
        numpy.testing.assert_equal(
            true_result, BufferedPathContext._list_to_numpy_array(values, len(values), 5, False, to_id)
        )

    def test__convert_list_to_numpy_array_short(self):
        # A short sequence gets SOS, its values, EOS, and PAD up to max_len + 1 rows.
        values = [[3]]
        to_id = {SOS: 0, EOS: 1, PAD: 2}
        true_result = numpy.array([[0], [3], [1], [2], [2], [2]])
        numpy.testing.assert_equal(
            true_result, BufferedPathContext._list_to_numpy_array(values, len(values), 5, True, to_id)
        )
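
# From the three tests above, BufferedPathContext._list_to_numpy_array appears to build
# an array with one column per input sequence and max_len rows (plus one extra row when
# wrapping): wrapping prepends SOS, appends EOS if it still fits, and PAD fills the rest,
# while over-long sequences are truncated to max_len values. A standalone sketch with
# that behaviour (the function name and signature below are illustrative assumptions,
# not the library implementation):
def _list_to_numpy_array_sketch(values, total_size, max_len, is_wrapped, to_id):
    n_rows = max_len + int(is_wrapped)
    result = numpy.full((n_rows, total_size), to_id[PAD], dtype=numpy.int64)
    for col, sample in enumerate(values):
        row = 0
        if is_wrapped:
            result[row, col] = to_id[SOS]
            row += 1
        for value in sample[:max_len]:
            result[row, col] = value
            row += 1
        if is_wrapped and row < n_rows:
            result[row, col] = to_id[EOS]
    return result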
    def test_forward(self):
        config = EncoderConfig(self._hidden_size, self._hidden_size, True, 0.5, 1, 0.5)

        buffered_path_contexts = BufferedPathContext.load(self._test_data_path)
        batch = PathContextBatch([buffered_path_contexts[i] for i in range(self._batch_size)])
        token_vocab_size = max(batch.context[FROM_TOKEN].max().item(), batch.context[TO_TOKEN].max().item())
        type_vocab_size = batch.context[PATH_TYPES].max().item()

        # +1 so the largest token/type id in the batch still fits into the embedding tables.
        model = PathEncoder(config, self._hidden_size, token_vocab_size + 1, 0, type_vocab_size + 1, 0)

        out = model(batch.context)
        number_of_paths = sum(batch.contexts_per_label)
        self.assertTupleEqual((number_of_paths, self._hidden_size), out.shape)
    def test_creating_standard_path_context(self):
        token_to_id = {SOS: 0, EOS: 1, PAD: 2}
        type_to_id = {SOS: 1, EOS: 2, PAD: 0}
        label_to_id = {SOS: 2, EOS: 0, PAD: 1}
        labels = [[4], [], [4, 5, 6]]
        from_tokens = [
            [[4], [5, 6]],
            [[], [], []],
            [[6, 5, 4]],
        ]
        path_types = [
            [[4, 5], [6]],
            [[], [], []],
            [[6, 5, 4]],
        ]
        to_tokens = [
            [[6], [4, 5]],
            [[], [], []],
            [[4, 6, 4]],
        ]

        buffered_path_context = BufferedPathContext.create_from_lists(
            (labels, ConvertParameters(3, True, label_to_id)),
            {
                FROM_TOKEN: (from_tokens, ConvertParameters(3, False, token_to_id)),
                PATH_TYPES: (path_types, ConvertParameters(3, True, type_to_id)),
                TO_TOKEN: (to_tokens, ConvertParameters(3, False, token_to_id)),
            },
        )

        # Expected matrices: one column per label sample (or per path context),
        # max_len rows plus an extra SOS row when wrapping is enabled.
        true_labels = numpy.array([[2, 2, 2], [4, 0, 4], [0, 1, 5], [1, 1, 6]])
        true_from_tokens = numpy.array([[4, 5, 2, 2, 2, 6], [2, 6, 2, 2, 2, 5], [2, 2, 2, 2, 2, 4]])
        true_path_types = numpy.array([[1, 1, 1, 1, 1, 1], [4, 6, 2, 2, 2, 6], [5, 2, 0, 0, 0, 5], [2, 0, 0, 0, 0, 4]])
        true_to_tokens = numpy.array([[6, 4, 2, 2, 2, 4], [2, 5, 2, 2, 2, 6], [2, 2, 2, 2, 2, 4]])

        self.assertListEqual([2, 3, 1], buffered_path_context.contexts_per_label)
        numpy.testing.assert_array_equal(true_labels, buffered_path_context.labels)
        numpy.testing.assert_array_equal(true_from_tokens, buffered_path_context.contexts[FROM_TOKEN])
        numpy.testing.assert_array_equal(true_path_types, buffered_path_context.contexts[PATH_TYPES])
        numpy.testing.assert_array_equal(true_to_tokens, buffered_path_context.contexts[TO_TOKEN])
    def test_forward(self):
        config = DecoderConfig(self._hidden_size, self._hidden_size, 1, 0.5, 1)

        model = PathDecoder(config, self._out_size, 0, 0)

        buffered_path_contexts = BufferedPathContext.load(self._test_data_path)

        batch = PathContextBatch([
            buffered_path_contexts[i]
            for i in range(len(buffered_path_contexts))
        ])
        number_of_paths = sum(batch.contexts_per_label)
        fake_encoder_input = torch.rand(number_of_paths, self._hidden_size)

        output = model(fake_encoder_input, batch.contexts_per_label, self._target_length)

        self.assertTupleEqual((self._target_length, len(batch.contexts_per_label), self._out_size), output.shape)