Example 1
def test_identity_filter():
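    # With min_characters=0, a pair whose source and target are identical is
    # filtered out; raising min_characters above the sentence length disables
    # the filter for that pair.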
    config = [
        {
            "op": "identity_filter",
            "min_characters": 0,
        },
    ]

    assert _is_filtered(config,
                        tu.TranslationUnit("Hello world!", "Hello world!"))
    assert not _is_filtered(config,
                            tu.TranslationUnit("Hello world!", "Hello world"))
    config[0]["min_characters"] = 20
    assert not _is_filtered(config,
                            tu.TranslationUnit("Hello world!", "Hello world!"))
Example 2
    def _get_translation_units(self, files):
        src_file = files["source"]
        tgt_file = files.get("target")
        annotations = {
            key: f for key, f in files.items() if key not in ("source", "target")
        }
        for i in range(self._file.lines_count):
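            # Read one line from every file first so that all files stay in
            # sync even when this entry is not sampled and gets skipped below.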
            src_line = src_file.readline()
            tgt_line = tgt_file.readline() if tgt_file else None
            annot_lines = {}
            for key, annot_file in annotations.items():
                annot_lines[key] = annot_file.readline()

            num_samples = self._file.random_sample.get(i, 0)
            if num_samples == 0:
                continue

            src_line = src_line.strip()
            if tgt_line:
                tgt_line = tgt_line.strip()
            for key, line in annot_lines.items():
                annot_lines[key] = line.strip()

            while num_samples > 0:
                yield tu.TranslationUnit(
                    source=src_line, target=tgt_line, annotations=annot_lines
                )
                num_samples -= 1
Example 3
def test_align_perplexity_percent_threshold(lower, upper, log_probs,
                                            expected_log_probs):
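    # Every unit gets the same forward/backward alignment log probability; the
    # percent_threshold bounds then determine which units survive the filter.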
    if expected_log_probs is None:
        expected_log_probs = log_probs
    tu_list = []
    tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=True)
    for log_prob in log_probs:
        single_tu = tu.TranslationUnit("a b c",
                                       "a b c",
                                       source_tokenizer=tokenizer,
                                       target_tokenizer=tokenizer)
        single_tu.set_alignment(
            _MockAligner(forward_log_prob=log_prob,
                         backward_log_prob=log_prob))
        tu_list.append(single_tu)

    config = {
        "source":
        "en",
        "target":
        "fr",
        "preprocess": [{
            "op": "align_perplexity_filter",
            "percent_threshold": {
                "lower": lower,
                "upper": upper,
            }
        }]
    }

    tu_list = _run_pipeline(config, prepoperator.ProcessType.TRAINING, tu_list)
    assert len(tu_list) == len(expected_log_probs)
    for single_tu, log_prob in zip(tu_list, expected_log_probs):
        assert single_tu.alignment_log_probs[0][0] == log_prob
Example 4
def test_tokenization_with_lang():
    tokenization_config = {
        "mode": "aggressive",
        "case_markup": True,
        "soft_case_regions": True,
    }
    config = {
        "source":
        "el",
        "target":
        "en",
        "preprocess": [{
            "op": "tokenization",
            "source": tokenization_config,
            "target": tokenization_config,
        }],
    }

    example = tu.TranslationUnit("ΣΙΓΜΑ ΤΕΛΙΚΟΣ")
    pipeline = prepoperator.Pipeline(config,
                                     prepoperator.ProcessType.INFERENCE)
    tu_list, _ = pipeline(([example], {}))

    assert tu_list[0].src_tok.tokens[0] == [
        "⦅mrk_begin_case_region_U⦆",
        "σιγμα",
        "τελικος",
        "⦅mrk_end_case_region_U⦆",
    ]
Example 5
def test_parentheses_filter(src, tgt, filtered, expected):
    config = [
        {
            "op": "tokenization",
            "source": {
                "mode": "conservative",
                "joiner_annotate": True
            },
            "target": {
                "mode": "conservative",
                "joiner_annotate": True
            },
        },
        {
            "op": "parentheses",
            "side": "both",
            "type": [["(", ")"], ["<", ">"]]
        },
    ]

    TU = tu.TranslationUnit(src, tgt)
    assert filtered == _is_filtered(config, TU)
    if not filtered:
        result_src = TU.src_detok
        result_tgt = TU.tgt_detok
        if expected[0] is None:
            assert src == result_src
        else:
            assert expected[0] == result_src
        if expected[1] is None:
            assert tgt == result_tgt
        else:
            assert expected[1] == result_tgt
Example 6
    def __call__(self):
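        # Yield batches of TranslationUnits. With several input files
        # (postprocess), lines are grouped per metadata entry; with a single
        # file (preprocess), source lines are streamed one by one.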
        files = [utils.open_file(path) for path in self._files]

        try:
            tu_list = []

            # Postprocess.
            if len(self._files) > 1:
                for meta in self._metadata:

                    # TODO : prefix, features
                    num_parts = len(meta)
                    src_lines = [
                        next(files[0]).strip().split()
                        for _ in range(num_parts)
                    ]
                    tgt_lines = [
                        next(files[1]).strip().split()
                        for _ in range(num_parts)
                    ]

                    tu_list.append(
                        tu.TranslationUnit(
                            source=src_lines,
                            target=tgt_lines,
                            metadata=meta,
                            source_tokenizer=self._source_tokenizer,
                            target_tokenizer=self._target_tokenizer))

                    if len(tu_list) == self._batch_size:
                        yield tu_list, {}
                        tu_list = []

            # Preprocess.
            else:
                for line in files[0]:
                    tu_list.append(tu.TranslationUnit(source=line))
                    if len(tu_list) == self._batch_size:
                        yield tu_list, {}
                        tu_list = []

            if tu_list:
                yield tu_list, {}
        finally:
            for f in files:
                f.close()
Example 7
def test_tokenization_with_inference_config(tmpdir):
    config = {
        "source":
        "en",
        "target":
        "de",
        "preprocess": [
            {
                "op": "tokenization",
                "source": {
                    "mode": "aggressive",
                },
                "target": {
                    "mode": "aggressive",
                },
            },
        ],
    }

    process_type = prepoperator.ProcessType.INFERENCE
    example = tu.TranslationUnit("2,000", "2,000")

    pipeline = prepoperator.Pipeline(config, process_type)

    tu_list, _ = pipeline(([example], {}))

    assert tu_list[0].src_tok.tokens[0] == ["2", ",", "000"]
    assert tu_list[0].tgt_tok.tokens[0] == ["2", ",", "000"]

    config["inference"] = {
        "overrides": {
            "tokenization_1": {
                "source": {
                    "mode": "none"
                }
            }
        }
    }
    pipeline = prepoperator.Pipeline(config, process_type)

    example = tu.TranslationUnit("2,000", "2,000")
    tu_list, _ = pipeline(([example], {}))

    assert tu_list[0].src_tok.tokens[0] == ["2,000"]
    assert tu_list[0].tgt_tok.tokens[0] == ["2", ",", "000"]
Example 8
def test_tokenization_with_vocabulary_restriction(tmpdir):
    sp_model_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "corpus",
        "resources",
        "subword",
        "en_de.sp",
    )
    config = {
        "source":
        "en",
        "target":
        "de",
        "preprocess": [
            {
                "op": "tokenization",
                "source": {
                    "mode": "none",
                    "sp_model_path": sp_model_path,
                    "restrict_subword_vocabulary": True,
                },
                "target": {
                    "mode": "none",
                    "sp_model_path": sp_model_path,
                },
            },
        ],
    }

    process_type = prepoperator.ProcessType.INFERENCE
    example = tu.TranslationUnit("World", "World")

    with pytest.raises(ValueError, match="restrict_subword_vocabulary"):
        pipeline = prepoperator.Pipeline(config, process_type)

    vocab_path = str(tmpdir.join("vocab.txt"))
    with open(vocab_path, "w") as vocab_file:
        vocab_file.write("# Comment\n")
        vocab_file.write("▁Wor 0.0224656\n")
    config.update({
        "vocabulary": {
            "source": {
                "path": vocab_path,
            },
            "target": {
                "path": vocab_path,
            },
        },
    })

    pipeline = prepoperator.Pipeline(config, process_type)
    tu_list, _ = pipeline(([example], {}))

    assert tu_list[0].src_tok.tokens[0] == ["▁Wor", "l", "d"]
    assert tu_list[0].tgt_tok.tokens[0] == ["▁World"]
Example 9
def _run_pipeline(config, process_type, tu_list):
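    # Test helper: normalize the input into a list of TranslationUnits and
    # wrap a bare operator list into a minimal pipeline configuration.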
    if isinstance(tu_list, str):
        tu_list = tu.TranslationUnit(tu_list)
    if not isinstance(tu_list, list):
        tu_list = [tu_list]
    if isinstance(config, list):
        config = {
            "source": "xx",
            "target": "yy",
            "preprocess": config,
        }
    pipeline = prepoperator.Pipeline(config, process_type)
    tu_list, _ = pipeline((tu_list, {}))
    return tu_list
Example 10
def test_align_perplexity_hard_threshold(lower, upper, src_length, tgt_length,
                                         fwd_log_prob, bwd_log_prob, filtered):
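    # Build a single unit with the given token lengths and alignment log
    # probabilities; `filtered` is the expected decision for these
    # hard_threshold bounds.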
    config = [{
        "op": "align_perplexity_filter",
        "hard_threshold": {
            "lower": lower,
            "upper": upper,
        }
    }]

    tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=True)
    single_tu = tu.TranslationUnit(" ".join(str(i) for i in range(src_length)),
                                   " ".join(str(i) for i in range(tgt_length)),
                                   source_tokenizer=tokenizer,
                                   target_tokenizer=tokenizer)
    single_tu.set_alignment(
        _MockAligner(forward_log_prob=fwd_log_prob,
                     backward_log_prob=bwd_log_prob))
    assert filtered == _is_filtered(config, single_tu)
Example 11
    def _get_translation_units(self, files):
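        # Build one multi-part TranslationUnit per metadata entry; when a
        # target score type is configured, the extracted score replaces the
        # metadata.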
        source_file = files["source"]
        target_file = files["target"]
        for meta in self._metadata:
            # TODO : features
            num_parts = len(meta)
            src_lines = [next(source_file).strip().split(" ") for _ in range(num_parts)]
            tgt_lines = [next(target_file).strip().split(" ") for _ in range(num_parts)]

            if self._target_score_type is not None:
                score = _extract_score(tgt_lines, self._target_score_type)
                meta = [{"score": score}]

            yield tu.TranslationUnit(
                source=src_lines,
                target=tgt_lines,
                metadata=meta,
                source_tokenizer=self._source_tokenizer,
                target_tokenizer=self._target_tokenizer,
            )
Example 12
def test_length_filter(filter_config, filtered):
    filter_config["op"] = "length_filter"
    config = [
        {
            "op": "tokenization",
            "source": {
                "mode": "conservative",
                "joiner_annotate": True
            },
            "target": {
                "mode": "conservative",
                "joiner_annotate": True
            },
        },
        filter_config,
    ]

    source = "Hello world!"
    target = "Bonjour le monde !"
    assert filtered == _is_filtered(config, tu.TranslationUnit(source, target))
Example 13
def test_length_filter_empty_target():
    config = [
        {
            "op": "tokenization",
            "source": {
                "mode": "conservative",
                "joiner_annotate": True
            },
            "target": {
                "mode": "conservative",
                "joiner_annotate": True
            },
        },
        {
            "op": "length_filter",
            "min_words_ratio": 0.7,
            "max_words_ratio": 2,
        },
    ]
    source = "Hello"
    target = ""
    assert _is_filtered(config, tu.TranslationUnit(source, target))
Example 14
        def _get_samples():
            for i in range(self._file.lines_count):
                src_line = src_file.readline()
                tgt_line = tgt_file.readline()
                annot_lines = {}
                for key, annot_file in annotations.items():
                    annot_lines[key] = annot_file.readline()

                num_samples = self._file.random_sample.get(i, 0)
                if num_samples == 0:
                    continue

                src_line = src_line.strip()
                tgt_line = tgt_line.strip()
                for key, line in annot_lines.items():
                    annot_lines[key] = line.strip()

                while num_samples > 0:
                    yield tu.TranslationUnit(source=src_line,
                                             target=tgt_line,
                                             annotations=annot_lines)
                    num_samples -= 1
Example 15
    def _get_translation_units(self, files):
        source_file = files["source"]
        # When no target file is provided, pair every source line with None.
        target_file = files.get("target", itertools.repeat(None))
        for source, target in zip(source_file, target_file):
            yield tu.TranslationUnit(source=source, target=target)