Example #1
0
    def _count_lines(root, base_name, annotations):
        """Count the lines of a source/target pair and of its annotation files.

        Args:
            root: Directory holding the source and target files.
            base_name: File name without the language suffix.
            annotations: Mapping from annotation name to the directory in
                which the annotation files for ``base_name`` are looked up.

        Returns:
            A tuple ``(files, num_lines)``. ``files`` stores the first value
            returned by ``utils.count_lines`` under ``"src"``, ``"tgt"`` (when
            the target was checked and found) and, per annotation, under
            ``files["annotations"][name]`` or
            ``files["annotations"]["name:suffix"]`` for suffixed annotation
            files. ``num_lines`` is the source line count, or 0 when the
            target is missing or any file is not aligned with the source.
        """
        file_path = os.path.join(root, base_name)
        files = {}
        logger.debug("Processing %s", file_path)

        src_path = file_path + "." + src_suffix
        tgt_path = file_path + "." + tgt_suffix

        # Check all directions are present and aligned, open files
        src_file, src_lines = utils.count_lines(src_path)
        files["src"] = src_file

        if src_file and src_lines:
            # TODO V2 : multiple sources and targets
            tgt_file, tgt_lines = utils.count_lines(tgt_path)
            if tgt_file is None:
                logger.warning(
                    'Target file %s does not exist. The source file %s will be ignored in sampling.',
                    tgt_path,
                    src_path,
                )
                return files, 0
            files["tgt"] = tgt_file
            if src_lines != tgt_lines:
                logger.warning(
                    'Target file %s (%d lines) is not aligned with source file '
                    '%s (%d lines). Files will be ignored in sampling.',
                    tgt_path,
                    tgt_lines,
                    src_path,
                    src_lines,
                )
                return files, 0

        files["annotations"] = {}
        for key, annot_path in annotations.items():
            annot_base_path = os.path.join(annot_path, base_name)
            # An annotation may come as a plain file, a source-side file
            # and/or a target-side file; each variant that exists is recorded.
            for suffix in ["", src_suffix, tgt_suffix]:
                if suffix:
                    annot_file_path = annot_base_path + "." + suffix
                    # Derive the key from the annotation name on every
                    # iteration: rebinding ``key`` itself would chain the
                    # suffixes ("name:src:tgt") when both files exist.
                    annot_key = key + ":" + suffix
                else:
                    annot_file_path = annot_base_path
                    annot_key = key
                annot_file, annot_lines = utils.count_lines(annot_file_path)
                if not annot_file:
                    continue
                files["annotations"][annot_key] = annot_file
                if src_lines != annot_lines:
                    # Report the actual files that were compared.
                    logger.warning(
                        'Annotation file %s (%d lines) is not aligned with source '
                        'file %s (%d lines). Files will be ignored in sampling.',
                        annot_file_path,
                        annot_lines,
                        src_path,
                        src_lines,
                    )
                    return files, 0

        return files, src_lines
Example #2
0
def test_preprocess_gzip_file(tmpdir):
    """Process a pseudo corpus generated with the ``en.gz`` suffix.

    The file returned by ``InferenceProcessor.process_file`` must be named
    ``input.en.tok`` and contain as many lines as the generated input.
    """
    expected_lines = 10
    corpus_path = generate_pseudo_corpus(tmpdir, expected_lines, "input", "en.gz")

    result_path, _ = InferenceProcessor(config_base).process_file(corpus_path)

    result_name = os.path.basename(result_path)
    assert result_name == "input.en.tok"

    counted_lines = utils.count_lines(result_path)[1]
    assert counted_lines == expected_lines
Example #3
0
def test_postprocess_multipart_file_loader(tmpdir):
    """Postprocess 8-line source/target pseudo corpora with multipart meta.

    The meta has four entries whose sub-lists hold 3, 2, 1 and 2 items
    (8 in total); the output must be named ``input.de.detok`` and have
    4 lines.
    NOTE(review): presumably each meta entry groups the parts of one
    original line -- confirm against the file loader.
    """
    line_count = 8
    source_path = generate_pseudo_corpus(tmpdir, line_count, "input", "en")
    target_path = generate_pseudo_corpus(tmpdir, line_count, "input", "de")
    postprocessor = InferenceProcessor(config_base, postprocess=True)

    parts_per_entry = (3, 2, 1, 2)
    meta = [[None] * parts for parts in parts_per_entry]

    detok_path = postprocessor.process_file(source_path, target_path, meta)

    assert os.path.basename(detok_path) == "input.de.detok"
    assert utils.count_lines(detok_path)[1] == 4