Example #1
def read_and_translate(translator: sockeye.inference.Translator, output_handler: sockeye.output_handler.OutputHandler,
                       chunk_size: int, source: Optional[str] = None) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param source: Optional path to a file to translate line by line; if None, stdin is read instead.
    """
    source_data = sys.stdin if source is None else sockeye.data_io.smart_open(source)

    logger.info("Translating...")

    total_time, total_lines = 0.0, 0
    for chunk in grouper(source_data, chunk_size):
        chunk_time = translate(output_handler, chunk, translator, total_lines)
        total_lines += len(chunk)
        total_time += chunk_time

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / translator.batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
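All of the variants here iterate over grouper(source_data, chunk_size). Sockeye defines that helper elsewhere in the codebase; as a rough sketch of the chunking idiom it implements (the actual implementation may differ), something like the following would work:

import itertools
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def grouper(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
    # Yield consecutive lists of up to `size` items until the input is exhausted.
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk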
Example #2
def read_and_translate(translator: sockeye.inference.Translator,
                       output_handler: sockeye.output_handler.OutputHandler,
                       chunk_size: Optional[int],
                       source: Optional[str] = None) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param source: Optional path to a file to translate line by line; if None, stdin is read instead.
    """
    if source is None:
        source_data = sys.stdin
    elif "scp" in source:
        source_data = data_io.read_content(source, "scp")
    elif "lab" in source:
        source_data = data_io.read_content(source, "lab")
    else:
        source_data = sockeye.data_io.smart_open(source)

    batch_size = translator.batch_size
    if chunk_size is None:
        if translator.batch_size == 1:
            # No batching, therefore there is no need to read segments in chunks.
            chunk_size = C.CHUNK_SIZE_NO_BATCHING
        else:
            # Get a constant number of batches per call to Translator.translate.
            chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
    else:
        if chunk_size < translator.batch_size:
            logger.warning(
                "You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                "a degregation of translation speed. Consider choosing a larger chunk size."
                % (chunk_size, batch_size))

    logger.info("Translating...")

    total_time, total_lines = 0.0, 0
    cnt = 0
    for chunk in grouper(source_data, chunk_size):
        # Pass the running chunk counter through on every tenth chunk and 0
        # otherwise, presumably as a progress marker for translate().
        p = cnt if cnt % 10 == 0 else 0
        cnt += 1
        chunk_time = translate(output_handler, chunk, translator, total_lines, p)
        total_lines += len(chunk)
        total_time += chunk_time

    if total_lines != 0:
        logger.info(
            "Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
            total_lines, ceil(total_lines / batch_size), total_time,
            total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
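Examples #1 and #2 fall back to sockeye.data_io.smart_open for plain file paths. Its implementation is not shown here; a plausible sketch, assuming it is a gzip-aware open with a stdin fallback (the real helper may behave differently), is:

import gzip
import sys
from typing import IO, Optional

def smart_open(path: Optional[str] = None, mode: str = "rt") -> IO:
    # Hypothetical stand-in for sockeye.data_io.smart_open: read stdin when no
    # path is given, and decompress transparently when the path looks gzipped.
    if path is None:
        return sys.stdin
    if path.endswith(".gz"):
        return gzip.open(path, mode)
    return open(path, mode)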
Example #3
def read_and_translate(translator: inference.Translator,
                       output_handler: OutputHandler,
                       chunk_size: Optional[int],
                       input_file: Optional[str] = None,
                       input_factors: Optional[List[str]] = None,
                       input_is_json: bool = False,
                       num_translations: int = 3) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param input_file: Optional path to a file to translate line by line; if None, stdin is read instead.
    :param input_factors: Optional list of paths to files that contain source factors.
    :param input_is_json: Whether the input is in JSON format.
    :param num_translations: Number of translations to request from the translator.
    """
    batch_size = translator.batch_size
    if chunk_size is None:
        if translator.batch_size == 1:
            # No batching, therefore there is no need to read segments in chunks.
            chunk_size = C.CHUNK_SIZE_NO_BATCHING
        else:
            # Get a constant number of batches per call to Translator.translate.
            chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
    else:
        if chunk_size < translator.batch_size:
            logger.warning(
                "You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                "a reduction in translation speed. Consider choosing a larger chunk size."
                % (chunk_size, batch_size))

    logger.info("Translating...")

    total_time, total_lines = 0.0, 0
    for chunk in grouper(make_inputs(input_file, translator, input_is_json,
                                     input_factors),
                         size=chunk_size):
        chunk_time = translate(output_handler, chunk, translator,
                               num_translations)
        total_lines += len(chunk)
        total_time += chunk_time
        break  # NOTE: this variant stops after the first chunk, so at most chunk_size lines are translated.

    if total_lines != 0:
        logger.info(
            "Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
            total_lines, ceil(total_lines / batch_size), total_time,
            total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
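Examples #2 and #3 (and the two that follow) share the same chunk-size heuristic: with no explicit chunk size, read one segment at a time when batching is off, otherwise read a fixed number of batches per call to Translator.translate. Isolated as a standalone function it looks like this; note the constant values below are placeholders, not Sockeye's actual C.CHUNK_SIZE_NO_BATCHING and C.CHUNK_SIZE_PER_BATCH_SEGMENT:

import logging

logger = logging.getLogger(__name__)

# Placeholder values; the real constants live in sockeye.constants.
CHUNK_SIZE_NO_BATCHING = 1
CHUNK_SIZE_PER_BATCH_SEGMENT = 10

def resolve_chunk_size(chunk_size, batch_size):
    # With no explicit chunk size, derive one from the batch size.
    if chunk_size is None:
        if batch_size == 1:
            return CHUNK_SIZE_NO_BATCHING
        return CHUNK_SIZE_PER_BATCH_SEGMENT * batch_size
    # Warn when the requested chunk size cannot even fill one batch.
    if chunk_size < batch_size:
        logger.warning("Chunk size (%d) is smaller than batch size (%d); "
                       "translation will be slower.", chunk_size, batch_size)
    return chunk_size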
Example #4
def read_and_translate(translator: inference.Translator,
                       output_handler: OutputHandler,
                       chunk_size: Optional[int],
                       input_file: Optional[str] = None,
                       input_factors: Optional[List[str]] = None,
                       input_is_json: bool = False) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param input_file: Optional path to a file to translate line by line; if None, stdin is read instead.
    :param input_factors: Optional list of paths to files that contain source factors.
    :param input_is_json: Whether the input is in JSON format.
    """
    batch_size = translator.batch_size
    if chunk_size is None:
        if translator.batch_size == 1:
            # No batching, therefore there is no need to read segments in chunks.
            chunk_size = C.CHUNK_SIZE_NO_BATCHING
        else:
            # Get a constant number of batches per call to Translator.translate.
            chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
    else:
        if chunk_size < translator.batch_size:
            logger.warning("You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                           "a reduction in translation speed. Consider choosing a larger chunk size." % (chunk_size,
                                                                                                         batch_size))

    logger.info("Translating...")

    total_time, total_lines = 0.0, 0
    for chunk in grouper(make_inputs(input_file, translator, input_is_json, input_factors), size=chunk_size):
        chunk_time = translate(output_handler, chunk, translator)
        total_lines += len(chunk)
        total_time += chunk_time

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
Example #5
def read_and_translate(translator: inference.Translator,
                       output_handler: OutputHandler,
                       chunk_size: Optional[int],
                       input_file: Optional[str] = None,
                       input_factors: Optional[List[str]] = None,
                       dynamic_batch_mode_enabled: bool = False,
                       input_is_json: bool = False) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param input_file: Optional path to a file to translate line by line; if None, stdin is read instead.
    :param input_factors: Optional list of paths to files that contain source factors.
    :param dynamic_batch_mode_enabled: If True, translate in dynamically sized batches rather than fixed-size chunks.
    :param input_is_json: Whether the input is in JSON format.
    """
    batch_size = translator.batch_size

    logger.info("Translating...")

    total_time, total_lines = 0.0, 0
    # Dynamic batching is only allowed when reading JSON inputs from stdin.
    if dynamic_batch_mode_enabled and input_file is None and input_is_json:
        logger.info(
            "Dynamic batch mode enabled, translating in batches as delivered..."
        )
        for translation_in in make_input_lists():
            # If the input exceeds batch_size, split it into batch_size-sized slices.
            max_batches = [
                translation_in[i:i + batch_size]
                for i in range(0, len(translation_in), batch_size)
            ]
            for max_batch in max_batches:
                translate_time = translate(
                    output_handler=output_handler,
                    dynamic_batch_mode_enabled=dynamic_batch_mode_enabled,
                    trans_inputs=max_batch,
                    translator=translator)
                total_lines += len(max_batch)
                total_time += translate_time
    else:
        if chunk_size is None:
            if translator.batch_size == 1:
                # No batching, therefore there is no need to read segments in chunks.
                chunk_size = C.CHUNK_SIZE_NO_BATCHING
            else:
                # Get a constant number of batches per call to Translator.translate.
                chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
        else:
            if chunk_size < translator.batch_size:
                logger.warning(
                    "You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                    "a reduction in translation speed. Consider choosing a larger chunk size."
                    % (chunk_size, batch_size))
        for chunk in grouper(make_inputs(input_file, translator, input_is_json,
                                         input_factors),
                             size=chunk_size):
            chunk_time = translate(
                output_handler=output_handler,
                dynamic_batch_mode_enabled=dynamic_batch_mode_enabled,
                trans_inputs=chunk,
                translator=translator)
            total_lines += len(chunk)
            total_time += chunk_time

    if total_lines != 0:
        logger.info(
            "Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
            total_lines, ceil(total_lines / batch_size), total_time,
            total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")