Code example #1
File: test_utils.py  Project: ye-kyaw-thu/sockeye
def test_smart_open_without_suffix():
    with TemporaryDirectory() as temp:
        fname = os.path.join(temp, 'test')
        _touch_file(fname, compressed=True, empty=False)
        with utils.smart_open(fname) as fin:
            assert len(fin.readlines()) == 10
        _touch_file(fname, compressed=False, empty=False)
        with utils.smart_open(fname) as fin:
            assert len(fin.readlines()) == 10
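The test above writes a gzip-compressed file and then a plain-text file under the same suffix-less name, and expects utils.smart_open to read 10 lines from both. A minimal sketch of that behaviour, sniffing the gzip magic bytes instead of trusting the filename (an assumption for illustration, not sockeye's actual implementation):

import gzip


def smart_open_sketch(fname, mode="rt", encoding="utf-8"):
    # Peek at the first two bytes: gzip files start with 0x1f 0x8b.
    with open(fname, "rb") as probe:
        magic = probe.read(2)
    if magic == b"\x1f\x8b":
        return gzip.open(fname, mode, encoding=encoding)
    return open(fname, mode, encoding=encoding)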
Code example #2
File: output_handler.py  Project: bricksdont/sockeye
def get_output_handler(output_type: str,
                       output_fname: Optional[str] = None) -> 'OutputHandler':
    """

    :param output_type: Type of output handler.
    :param output_fname: Output filename. If none sys.stdout is used.
    :raises: ValueError for unknown output_type.
    :return: Output handler.
    """
    output_stream = sys.stdout if output_fname is None else smart_open(
        output_fname, mode='w')
    if output_type == C.OUTPUT_HANDLER_TRANSLATION:
        return StringOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_SCORE:
        return ScoreOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_PAIR_WITH_SCORE:
        return PairWithScoreOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_TRANSLATION_WITH_SCORE:
        return StringWithScoreOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_BENCHMARK:
        return BenchmarkOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_JSON:
        return JSONOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_TRANSLATION_WITH_FACTORS:
        return FactoredStringOutputHandler(output_stream)
    else:
        raise ValueError("unknown output type")
Code example #3
def merge_spm(input_fname: str, output_fname: str):
    with utils.smart_open(input_fname, "r") as inp, \
            open(output_fname, "w", encoding="utf-8") as out:
        for line in inp:
            # Drop intra-token spaces, turn SentencePiece's '\u2581' word
            # markers back into spaces, and strip the resulting leading space.
            sentence = line.replace(' ', '').replace('\u2581', ' ').lstrip()
            out.write(sentence)
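To make the two replacements concrete, here is the same logic applied to one hand-made SentencePiece line (the sample string is illustrative only; U+2581 is the marker SentencePiece uses for word boundaries):

line = "\u2581Hello \u2581wor ld !\n"
sentence = line.replace(' ', '').replace('\u2581', ' ').lstrip()
print(repr(sentence))  # 'Hello world!\n'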
Code example #4
def make_inputs(
    input_file: Optional[str],
    translator: inference.Translator,
    input_is_json: bool,
    input_factors: Optional[List[str]] = None
) -> Generator[inference.TranslatorInput, None, None]:
    """
    Generates TranslatorInput instances from input. If input_file is None, reads from stdin; if the model uses
    more than one source factor, factors are expected to be attached to each token, separated by '|'.
    If input_file is given, reads from that file; one additional factor file per extra source factor must then
    be passed via input_factors.

    :param input_file: The source file (possibly None).
    :param translator: Translator that will translate each line of input.
    :param input_is_json: Whether the input is in json format.
    :param input_factors: Source factor files.
    :return: TranslatorInput objects.
    """
    if input_file is None:
        check_condition(
            input_factors is None,
            "Translating from STDIN, not expecting any factor files.")
        for sentence_id, line in enumerate(sys.stdin, 1):
            if input_is_json:
                yield inference.make_input_from_json_string(
                    sentence_id=sentence_id,
                    json_string=line,
                    translator=translator)
            else:
                yield inference.make_input_from_factored_string(
                    sentence_id=sentence_id,
                    factored_string=line,
                    translator=translator)
    else:
        input_factors = [] if input_factors is None else input_factors
        inputs = [input_file] + input_factors
        if not input_is_json:
            check_condition(
                translator.num_source_factors == len(inputs),
                "Model(s) require %d factors, but %d given (through --input and --input-factors)."
                % (translator.num_source_factors, len(inputs)))
        with ExitStack() as exit_stack:
            streams = [exit_stack.enter_context(smart_open(i)) for i in inputs]
            for sentence_id, input_strings in enumerate(zip(*streams), 1):
                if input_is_json:
                    yield inference.make_input_from_json_string(
                        sentence_id=sentence_id,
                        json_string=input_strings[0],
                        translator=translator)
                else:
                    yield inference.make_input_from_multiple_strings(
                        sentence_id=sentence_id, strings=list(input_strings))
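The ExitStack/zip pattern above is what lets the generator walk the source file and every factor file line by line in lock-step. A stand-alone sketch of that pattern, with placeholder file names:

from contextlib import ExitStack

files = ["source.txt", "factors.pos.txt"]  # placeholders: source plus one factor file
with ExitStack() as stack:
    streams = [stack.enter_context(open(f, encoding="utf-8")) for f in files]
    for sentence_id, lines in enumerate(zip(*streams), 1):
        # One line from each stream; together they describe the same sentence.
        print(sentence_id, [line.rstrip("\n") for line in lines])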
Code example #5
File: third_party.py  Project: lagka/sockeye
def merge_bpe(input_fname: str, output_fname: str):
    """
    Merge byte-pair encoded sub-words.

    :param input_fname: Path of byte-pair encoded input file, plain text or
                        gzipped.
    :param output_fname: Path of tokenized output file, plain text.
    """
    with utils.smart_open(input_fname, "r") as inp, open(output_fname, "w", encoding="utf-8") as out:
        for line in inp:
            # Merge on special markers and strip stray markers (end of line)
            merged = line.replace(SUBWORD_SPECIAL + " ", "").replace(SUBWORD_SPECIAL, "")
            out.write(merged)
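A small illustration of the two replacements, assuming the common subword-nmt marker "@@" for SUBWORD_SPECIAL (the real constant is defined elsewhere in the project):

SUBWORD_SPECIAL = "@@"  # assumption for illustration
line = "un@@ believ@@ able results@@\n"
merged = line.replace(SUBWORD_SPECIAL + " ", "").replace(SUBWORD_SPECIAL, "")
print(repr(merged))  # 'unbelievable results\n'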