Example #1
import json
import os

import apache_beam as beam
from apache_beam.io import ReadFromText, ReadFromTFRecord, WriteToText
from apache_beam.io.filesystem import CompressionTypes

# batch_prediction, FILE_LIST_SEPARATOR, OUTPUT_RESULTS_FILES_BASENAME_ and
# OUTPUT_ERRORS_FILES_BASENAME_ are defined in the surrounding module and are
# not part of this excerpt.


def run(p, args, aggregator_dict, cloud_logger=None):
    """Run the pipeline with the given args and Dataflow pipeline options."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()

    # Create one PCollection per input file pattern, then flatten them into a
    # single PCollection. Duplicate patterns must be removed because the
    # pattern string is used to build a unique label for each read PTransform.
    readers = []
    for pattern in list(
            set(args.input_file_patterns.split(FILE_LIST_SEPARATOR))):
        # Set up the reader.
        #
        # TODO(user): Perhaps simplify the batch prediction code by using
        # CompressionTypes.AUTO.
        if input_file_format.startswith("tfrecord"):
            if input_file_format == "tfrecord_gzip":
                compression_type = CompressionTypes.GZIP
            else:
                assert input_file_format == "tfrecord"
                compression_type = CompressionTypes.UNCOMPRESSED
            reader = "READ_TFRECORD_FILES_%s" % pattern >> ReadFromTFRecord(
                pattern, compression_type=compression_type)

        else:
            assert input_file_format == "text"
            reader = "READ_TEXT_FILES_%s" % pattern >> ReadFromText(pattern)

        # Put the pcollections into a list and flatten later.
        readers.append(p | reader)

    # Set up the whole pipeline.
    results, errors = (readers
                       | beam.Flatten()
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           cloud_logger=cloud_logger))

    # Convert predictions to JSON and then write to output files.
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             os.path.join(args.output_location,
                          OUTPUT_RESULTS_FILES_BASENAME_)))
    # Write prediction error counts to output files.
    _ = (
        errors
        | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
        | "WRITE_ERRORS" >> WriteToText(
            os.path.join(args.output_location, OUTPUT_ERRORS_FILES_BASENAME_)))

    return p.run()
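A minimal sketch of how a run() function like the one above might be driven. The flag names below simply mirror the attributes the function reads (model_dir, input_file_patterns, input_file_format, batch_size, output_location) and are hypothetical, as is passing an empty aggregator_dict; the real module defines its own flags and aggregators.

import argparse

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", required=True)
    parser.add_argument("--input_file_patterns", required=True)
    parser.add_argument("--input_file_format", default="text")
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--output_location", required=True)
    args, pipeline_args = parser.parse_known_args(argv)

    # Any remaining flags (e.g. --runner, --project) become Beam options.
    options = PipelineOptions(pipeline_args)
    pipeline = beam.Pipeline(options=options)

    # An empty aggregator_dict is only a placeholder here.
    result = run(pipeline, args, aggregator_dict={})
    result.wait_until_finish()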
Example #2
def run(p, args, aggregator_dict):
    """Run the pipeline with the given args and Dataflow pipeline options."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()
    input_file_patterns = args.input_file_patterns

    # Set up the reader.
    if input_file_format == "text":
        reader = p | "READ_TEXT_FILES" >> ReadFromMultiFilesText(
            input_file_patterns)
    elif input_file_format == "tfrecord":
        reader = p | "READ_TF_FILES" >> ReadFromMultiFilesTFRecord(
            input_file_patterns)
    elif input_file_format == "tfrecord_gzip":
        reader = p | "READ_TFGZIP_FILES" >> ReadFromMultiFilesTFRecordGZip(
            input_file_patterns)
    else:
        raise ValueError(
            "Unsupported input file format: %s" % input_file_format)

    # Set up the whole pipeline.
    results, errors = (reader
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           tags=args.tags,
                           signature_name=args.signature_name,
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           user_project_id=args.user_project_id,
                           user_job_id=args.user_job_id,
                           framework=args.framework))

    # Convert predictions to JSON and then write to output files.
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             args.output_result_prefix))
    # Write prediction error counts to output files.
    _ = (errors
         | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
         | "WRITE_ERRORS" >> WriteToText(args.output_error_prefix))

    return p.run()
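ReadFromMultiFilesText, ReadFromMultiFilesTFRecord and ReadFromMultiFilesTFRecordGZip are custom transforms defined in the surrounding batch-prediction module, not in apache_beam itself. A rough sketch of how the text variant could be built as a composite PTransform, reusing the split-and-flatten approach of Example #1; the separator and de-duplication details are assumptions, and the real implementation may differ:

import apache_beam as beam
from apache_beam.io import ReadFromText


class ReadFromMultiFilesText(beam.PTransform):
    """Reads text lines from a separator-delimited list of file patterns."""

    def __init__(self, file_patterns, separator="|"):
        super().__init__()
        # De-duplicate and sort so each pattern yields exactly one labeled read.
        self._patterns = sorted(set(file_patterns.split(separator)))

    def expand(self, pbegin):
        # Apply one labeled read per pattern, then flatten the results.
        reads = [
            pbegin | "READ_%s" % pattern >> ReadFromText(pattern)
            for pattern in self._patterns
        ]
        return reads | beam.Flatten()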
Example #3
def run(p, args, aggregator_dict):
    """Run the pipeline with the given args and Dataflow pipeline options."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()
    input_file_patterns = args.input_file_patterns

    # Set up the reader.
    if input_file_format == "json":
        reader = p | "READ_TEXT_FILES" >> ReadFromMultiFilesText(
            input_file_patterns)
    elif input_file_format == "tfrecord":
        reader = p | "READ_TF_FILES" >> ReadFromMultiFilesTFRecord(
            input_file_patterns)
    elif input_file_format == "tfrecord_gzip":
        reader = p | "READ_TFGZIP_FILES" >> ReadFromMultiFilesTFRecordGZip(
            input_file_patterns)
    else:
        raise ValueError(
            "Unsupported input file format: %s" % input_file_format)

    # Set up the whole pipeline.
    results, errors = (reader
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           tags=args.tags,
                           signature_name=args.signature_name,
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           user_project_id=args.user_project_id,
                           user_job_id=args.user_job_id,
                           framework=args.framework))

    output_file_format = args.output_file_format.lower()
    # Convert predictions to target format and then write to output files.
    if output_file_format == "json":
        _ = (results
             | "TO_JSON" >> beam.Map(json.dumps)
             | "WRITE_PREDICTION_RESULTS" >> WriteToText(
                 args.output_result_prefix))
    elif output_file_format == "csv":
        fields = (
            results
            | "SAMPLE_SINGLE_ELEMENT" >> Sample.FixedSizeGlobally(1)
            | "GET_KEYS" >> beam.Map(
                # entry could be None if no inputs were valid
                lambda entry: entry[0].keys() if entry else []))
        _ = (fields
             | "KEYS_TO_CSV" >> beam.Map(keys_to_csv)
             | "WRITE_KEYS" >> WriteToText(args.output_result_prefix,
                                           file_name_suffix="_header.csv",
                                           shard_name_template=""))
        _ = (results
             | "VALUES_TO_CSV" >> beam.Map(values_to_csv,
                                           beam.pvalue.AsSingleton(fields))
             | "WRITE_PREDICTION_RESULTS" >> WriteToText(
                 args.output_result_prefix,
                 file_name_suffix=".csv",
                 append_trailing_newlines=False))
    # Write prediction error counts to output files.
    _ = (errors
         | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
         | "WRITE_ERRORS" >> WriteToText(args.output_error_prefix))

    return p.run()
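keys_to_csv and values_to_csv are helper functions defined elsewhere in the module. A plausible sketch of their behavior using the stdlib csv writer, assuming each prediction is a flat dict and the header keys arrive as the sampled key list computed above; the real helpers may order, quote or fill missing fields differently:

import csv
import io


def keys_to_csv(keys):
    """Encodes the prediction field names as a single CSV header line."""
    output = io.StringIO()
    csv.writer(output).writerow(list(keys))
    return output.getvalue()


def values_to_csv(entry, keys):
    """Encodes one prediction dict as a CSV row ordered by the header keys."""
    output = io.StringIO()
    csv.writer(output).writerow([entry.get(key, "") for key in keys])
    return output.getvalue()

Because csv.writer already terminates each row it emits, the WRITE_PREDICTION_RESULTS step in Example #3 writes with append_trailing_newlines=False.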