def _count_lines(root, base_name, annotations):
    """Open a corpus and verify that all its sides are aligned.

    Looks up ``<root>/<base_name>.<src_suffix>`` and the matching target
    file, plus any annotation files (language-agnostic, or suffixed per
    side), and checks they all contain the same number of lines.

    Args:
        root: Directory containing the corpus files.
        base_name: Corpus file name without the language suffix.
        annotations: Dict mapping an annotation key to the directory that
            holds the annotation files for this corpus.

    Returns:
        A ``(files, line_count)`` tuple. ``files`` maps ``"src"``,
        ``"tgt"`` and ``"annotations"`` to whatever ``utils.count_lines``
        returned for each path. ``line_count`` is 0 when a file is missing
        or misaligned, so the caller skips the corpus in sampling.
        NOTE(review): when the source file itself is absent, this returns
        ``src_lines`` as produced by ``utils.count_lines`` (presumably
        ``None``/0) — callers appear to rely on that; left unchanged.
    """
    file_path = os.path.join(root, base_name)
    files = {}
    logger.debug("Processing %s", file_path)
    src_path = file_path + "." + src_suffix
    tgt_path = file_path + "." + tgt_suffix
    # Check all directions are present and aligned, open files.
    src_file, src_lines = utils.count_lines(src_path)
    files["src"] = src_file
    if src_file and src_lines:
        # TODO V2 : multiple sources and targets
        tgt_file, tgt_lines = utils.count_lines(tgt_path)
        if tgt_file is None:
            logger.warning(
                'Target file %s does not exist. The source file %s will be ignored in sampling.',
                tgt_path,
                src_path,
            )
            return files, 0
        files["tgt"] = tgt_file
        if src_lines != tgt_lines:
            logger.warning(
                'Target file %s (%d lines) is not aligned with source file '
                '%s (%d lines). Files will be ignored in sampling.',
                tgt_path,
                tgt_lines,
                src_path,
                src_lines,
            )
            return files, 0
        files["annotations"] = {}
        for key, annot_path in annotations.items():
            # An annotation may be language-agnostic (no suffix) or specific
            # to the source or target side.
            for suffix in ["", src_suffix, tgt_suffix]:
                annot_file_path = os.path.join(annot_path, base_name)
                if suffix:
                    annot_file_path += "." + suffix
                annot_file, annot_lines = utils.count_lines(annot_file_path)
                if not annot_file:
                    continue
                # Bug fix: bind a fresh name instead of rebinding the loop
                # variable `key`, otherwise suffixes accumulate across
                # iterations (e.g. "key:src" then "key:src:tgt").
                annot_key = key + ":" + suffix if suffix else key
                files["annotations"][annot_key] = annot_file
                if src_lines != annot_lines:
                    logger.warning(
                        'Annotation file %s (%d lines) is not aligned with source '
                        'file %s (%d lines). Files will be ignored in sampling.',
                        annot_path,
                        annot_lines,
                        # Bug fix: was `file_path + src_suffix`, which drops
                        # the "." separator and logs a wrong path.
                        src_path,
                        src_lines,
                    )
                    return files, 0
    return files, src_lines
def test_preprocess_gzip_file(tmpdir):
    """A gzip-compressed corpus is tokenized to one output line per input line."""
    expected_lines = 10
    corpus_path = generate_pseudo_corpus(tmpdir, expected_lines, "input", "en.gz")
    inference_processor = InferenceProcessor(config_base)
    result_path, _ = inference_processor.process_file(corpus_path)
    assert os.path.basename(result_path) == "input.en.tok"
    _, line_count = utils.count_lines(result_path)
    assert line_count == expected_lines
def test_postprocess_multipart_file_loader(tmpdir):
    """Postprocessing merges multi-part entries: one output line per metadata group."""
    corpus_lines = 8
    source_path = generate_pseudo_corpus(tmpdir, corpus_lines, "input", "en")
    target_path = generate_pseudo_corpus(tmpdir, corpus_lines, "input", "de")
    postprocessor = InferenceProcessor(config_base, postprocess=True)
    # Four groups covering the 8 input lines (3 + 2 + 1 + 2).
    metadata = [[None] * group_size for group_size in (3, 2, 1, 2)]
    result_path = postprocessor.process_file(source_path, target_path, metadata)
    assert os.path.basename(result_path) == "input.de.detok"
    assert utils.count_lines(result_path)[1] == 4