import json
import os

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import ReadFromTFRecord
from apache_beam.io import WriteToText
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.transforms.combiners import Sample

# The batch_prediction module, FILE_LIST_SEPARATOR, the ReadFromMultiFiles*
# transforms, and the OUTPUT_*_FILES_BASENAME_ constants used below are assumed
# to be defined elsewhere in the surrounding package; they are not reproduced
# in this listing.


def run(p, args, aggregator_dict, cloud_logger=None):
  """Run the pipeline with the given args and Dataflow pipeline options."""
  # Create a PCollection for the model directory.
  model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

  input_file_format = args.input_file_format.lower()

  # Create one PCollection per input file or file pattern, then flatten them
  # into a single PCollection. Duplicate patterns must be removed because the
  # file name is used to create unique labels for the PTransforms.
  readers = []
  for pattern in list(
      set(args.input_file_patterns.split(FILE_LIST_SEPARATOR))):
    # Set up the reader.
    #
    # TODO(user): Perhaps simplify the batch prediction code by using
    # CompressionTypes.AUTO.
    if input_file_format.startswith("tfrecord"):
      if input_file_format == "tfrecord_gzip":
        compression_type = CompressionTypes.GZIP
      else:
        assert input_file_format == "tfrecord"
        compression_type = CompressionTypes.UNCOMPRESSED
      reader = "READ_TFRECORD_FILES_%s" % pattern >> ReadFromTFRecord(
          pattern, compression_type=compression_type)
    else:
      assert input_file_format == "text"
      reader = "READ_TEXT_FILES_%s" % pattern >> ReadFromText(pattern)
    # Put the PCollections into a list and flatten them later.
    readers.append(p | reader)

  # Set up the rest of the pipeline.
  results, errors = (readers
                     | beam.Flatten()
                     | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                         beam.pvalue.AsSingleton(model_dir),
                         batch_size=args.batch_size,
                         aggregator_dict=aggregator_dict,
                         cloud_logger=cloud_logger))

  # Convert predictions to JSON and then write them to the output files.
  _ = (results
       | "TO_JSON" >> beam.Map(json.dumps)
       | "WRITE_PREDICTION_RESULTS" >> WriteToText(
           os.path.join(args.output_location,
                        OUTPUT_RESULTS_FILES_BASENAME_)))

  # Write prediction error counts to the output files.
  _ = (errors
       | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
       | "WRITE_ERRORS" >> WriteToText(
           os.path.join(args.output_location,
                        OUTPUT_ERRORS_FILES_BASENAME_)))

  return p.run()
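# A minimal, hypothetical driver for the version of run() above, shown only to
# illustrate how the args namespace maps onto the pipeline. The flag names
# mirror the attributes the function reads (model_dir, input_file_patterns,
# input_file_format, batch_size, output_location); the function name, defaults,
# and runner wiring here are assumptions, not part of the original module.
import argparse

from apache_beam.options.pipeline_options import PipelineOptions


def _example_main():
  parser = argparse.ArgumentParser()
  parser.add_argument("--model_dir", required=True)
  parser.add_argument("--input_file_patterns", required=True)
  parser.add_argument("--input_file_format", default="text")
  parser.add_argument("--batch_size", type=int, default=64)
  parser.add_argument("--output_location", required=True)
  args, pipeline_args = parser.parse_known_args()

  # Any remaining command-line flags configure the Beam runner
  # (e.g. --runner=DataflowRunner).
  pipeline = beam.Pipeline(options=PipelineOptions(pipeline_args))
  # An empty aggregator_dict keeps the sketch self-contained; a real caller
  # would pass the counters it wants BatchPredict to update.
  result = run(pipeline, args, aggregator_dict={})
  result.wait_until_finish()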
def run(p, args, aggregator_dict):
  """Run the pipeline with the given args and Dataflow pipeline options."""
  # Create a PCollection for the model directory.
  model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

  input_file_format = args.input_file_format.lower()
  input_file_patterns = args.input_file_patterns

  # Set up the reader.
  if input_file_format == "text":
    reader = p | "READ_TEXT_FILES" >> ReadFromMultiFilesText(
        input_file_patterns)
  elif input_file_format == "tfrecord":
    reader = p | "READ_TF_FILES" >> ReadFromMultiFilesTFRecord(
        input_file_patterns)
  elif input_file_format == "tfrecord_gzip":
    reader = p | "READ_TFGZIP_FILES" >> ReadFromMultiFilesTFRecordGZip(
        input_file_patterns)

  # Set up the rest of the pipeline.
  results, errors = (reader
                     | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                         beam.pvalue.AsSingleton(model_dir),
                         tags=args.tags,
                         signature_name=args.signature_name,
                         batch_size=args.batch_size,
                         aggregator_dict=aggregator_dict,
                         user_project_id=args.user_project_id,
                         user_job_id=args.user_job_id,
                         framework=args.framework))

  # Convert predictions to JSON and then write them to the output files.
  _ = (results
       | "TO_JSON" >> beam.Map(json.dumps)
       | "WRITE_PREDICTION_RESULTS" >> WriteToText(args.output_result_prefix))

  # Write prediction error counts to the output files.
  _ = (errors
       | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
       | "WRITE_ERRORS" >> WriteToText(args.output_error_prefix))

  return p.run()
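# ReadFromMultiFilesText, ReadFromMultiFilesTFRecord, and
# ReadFromMultiFilesTFRecordGZip are used above but defined elsewhere in the
# package. A rough sketch of the text variant, assuming it splits the pattern
# list on FILE_LIST_SEPARATOR and flattens one read per unique pattern; the
# real transforms may differ.
class _ReadFromMultiFilesTextSketch(beam.PTransform):
  """Reads several text file patterns into a single flattened PCollection."""

  def __init__(self, file_patterns):
    super().__init__()
    self._patterns = set(file_patterns.split(FILE_LIST_SEPARATOR))

  def expand(self, pbegin):
    # Label each read with its pattern so the step names stay unique.
    reads = [
        pbegin.pipeline | "read_%s" % pattern >> ReadFromText(pattern)
        for pattern in self._patterns
    ]
    return reads | beam.Flatten()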
def run(p, args, aggregator_dict):
  """Run the pipeline with the given args and Dataflow pipeline options."""
  # Create a PCollection for the model directory.
  model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

  input_file_format = args.input_file_format.lower()
  input_file_patterns = args.input_file_patterns

  # Set up the reader.
  if input_file_format == "json":
    reader = p | "READ_TEXT_FILES" >> ReadFromMultiFilesText(
        input_file_patterns)
  elif input_file_format == "tfrecord":
    reader = p | "READ_TF_FILES" >> ReadFromMultiFilesTFRecord(
        input_file_patterns)
  elif input_file_format == "tfrecord_gzip":
    reader = p | "READ_TFGZIP_FILES" >> ReadFromMultiFilesTFRecordGZip(
        input_file_patterns)

  # Set up the rest of the pipeline.
  results, errors = (reader
                     | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                         beam.pvalue.AsSingleton(model_dir),
                         tags=args.tags,
                         signature_name=args.signature_name,
                         batch_size=args.batch_size,
                         aggregator_dict=aggregator_dict,
                         user_project_id=args.user_project_id,
                         user_job_id=args.user_job_id,
                         framework=args.framework))

  output_file_format = args.output_file_format.lower()
  # Convert predictions to the target format and then write them to the
  # output files.
  if output_file_format == "json":
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             args.output_result_prefix))
  elif output_file_format == "csv":
    fields = (
        results
        | "SAMPLE_SINGLE_ELEMENT" >> Sample.FixedSizeGlobally(1)
        | "GET_KEYS" >> beam.Map(
            # entry could be None if no inputs were valid.
            lambda entry: entry[0].keys() if entry else []))
    _ = (fields
         | "KEYS_TO_CSV" >> beam.Map(keys_to_csv)
         | "WRITE_KEYS" >> WriteToText(
             args.output_result_prefix,
             file_name_suffix="_header.csv",
             shard_name_template=""))
    _ = (results
         | "VALUES_TO_CSV" >> beam.Map(values_to_csv,
                                       beam.pvalue.AsSingleton(fields))
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             args.output_result_prefix,
             file_name_suffix=".csv",
             append_trailing_newlines=False))

  # Write prediction error counts to the output files.
  _ = (errors
       | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
       | "WRITE_ERRORS" >> WriteToText(args.output_error_prefix))

  return p.run()
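# keys_to_csv and values_to_csv are referenced by the CSV branch above but are
# not part of this listing. A plausible sketch, assuming keys_to_csv renders
# the sampled field names as a header line and values_to_csv renders one
# prediction dict per row in that same key order; the real helpers may handle
# ordering and quoting differently.
import csv
import io


def keys_to_csv(keys):
  """Formats the prediction field names as a single CSV header line."""
  buf = io.StringIO()
  csv.writer(buf).writerow(keys)
  # WriteToText appends its own newline to the header file, so strip ours.
  return buf.getvalue().rstrip("\r\n")


def values_to_csv(entry, keys):
  """Formats one prediction dict as a CSV row, following the sampled key order."""
  buf = io.StringIO()
  csv.writer(buf).writerow([entry.get(key) for key in keys])
  # Keep the trailing newline from the csv writer: the results file is written
  # with append_trailing_newlines=False.
  return buf.getvalue()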