project = beam_options.view_as(GoogleCloudOptions).project

# [START batch_or_stream]
if args.verb == 'batch':
  data_files_pattern = os.path.join(args.inputs_dir, '*.sdf')
  results_prefix = os.path.join(args.outputs_dir, 'part')
  source = beam.io.Read(pubchem.ParseSDF(data_files_pattern))
  sink = beam.io.WriteToText(results_prefix)

elif args.verb == 'stream':
  if not project:
    parser.print_usage()
    print('error: argument --project is required for streaming')
    sys.exit(1)

  beam_options.view_as(StandardOptions).streaming = True
  source = beam.io.ReadFromPubSub(
      topic='projects/{}/topics/{}'.format(project, args.inputs_topic))
  sink = beam.io.WriteStringsToPubSub(
      topic='projects/{}/topics/{}'.format(project, args.outputs_topic))
  # [END batch_or_stream]

else:
  parser.print_usage()
  sys.exit(1)

# [START call_run]
run(args.model_dir,
    pubchem.SimpleFeatureExtraction(source),
    sink,
    beam_options)
# [END call_run]
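# --- A hedged sketch (not taken from the sample itself) of how a run() helper like the
# --- one called above could wire feature extraction into a prediction pipeline. The
# --- Predict DoFn and the JSON formatting step are assumptions for illustration; only
# --- the feature-extraction -> inference -> sink plumbing is shown.

import json

import apache_beam as beam


class Predict(beam.DoFn):  # hypothetical inference DoFn
  def __init__(self, model_dir):
    self.model_dir = model_dir

  def process(self, inputs):
    # A real implementation would lazily load the model from self.model_dir
    # and yield {'input': ..., 'prediction': ...} dictionaries.
    yield inputs


def run(model_dir, feature_extraction, sink, beam_options=None):
  with beam.Pipeline(options=beam_options) as p:
    _ = (p
         | 'Feature extraction' >> feature_extraction
         | 'Predict' >> beam.ParDo(Predict(model_dir))
         | 'Format as JSON' >> beam.Map(json.dumps)
         | 'Write predictions' >> sink)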
if __name__ == '__main__':
  """Main function"""
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--work-dir',
                      type=str,
                      default=os.path.join(
                          tempfile.gettempdir(), 'cloudml-samples', 'molecules'),
                      help='Directory for staging and working files. '
                           'This can be a Google Cloud Storage path.')
  args, pipeline_args = parser.parse_known_args()

  beam_options = PipelineOptions(pipeline_args)
  beam_options.view_as(SetupOptions).save_main_session = True

  data_files_pattern = os.path.join(args.work_dir, 'data', '*.sdf')
  preprocess_data = run(
      pubchem.INPUT_SCHEMA,
      pubchem.LABELS,
      # [START dataflow_molecules_feature_extraction_transform]
      pubchem.SimpleFeatureExtraction(
          beam.io.Read(pubchem.ParseSDF(data_files_pattern))),
      # [END dataflow_molecules_feature_extraction_transform]
      feature_scaling=pubchem.normalize_inputs,
      beam_options=beam_options,
      work_dir=args.work_dir)

  dump(preprocess_data, os.path.join(args.work_dir, 'PreprocessData'))
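# --- A hedged sketch (assumption) of what a feature_scaling callback such as
# --- pubchem.normalize_inputs might look like with tf.Transform: a preprocessing
# --- function that rescales each numeric feature to the [0, 1] range. Scaling every
# --- key uniformly is illustrative; the real sample may only scale selected features.

import tensorflow_transform as tft


def normalize_inputs(inputs):
  """tf.Transform preprocessing_fn that scales every feature to [0, 1]."""
  return {key: tft.scale_to_0_1(value) for key, value in inputs.items()}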
  # [END dataflow_molecules_write_tfrecords]

  return PreprocessData(
      input_feature_spec,
      labels,
      train_dataset_prefix + '*',
      eval_dataset_prefix + '*')


if __name__ == '__main__':
  """Main function"""
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--work-dir',
                      required=True,
                      help='Directory for staging and working files. '
                           'This can be a Google Cloud Storage path.')
  args, pipeline_args = parser.parse_known_args()

  data_files_pattern = os.path.join(args.work_dir, 'data', '*.sdf')
  beam_options = PipelineOptions(pipeline_args, save_main_session=True)
  preprocess_data = run(
      pubchem.FEATURE_SPEC,
      pubchem.LABELS,
      # [START dataflow_molecules_feature_extraction_transform]
      pubchem.SimpleFeatureExtraction(pubchem.ParseSDF(data_files_pattern)),
      # [END dataflow_molecules_feature_extraction_transform]
      feature_scaling=pubchem.normalize_inputs,
      beam_options=beam_options,
      work_dir=args.work_dir)

  dump(preprocess_data, os.path.join(args.work_dir, 'PreprocessData'))
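# --- A hedged sketch (assumption, not the sample's actual definitions) of the
# --- PreprocessData record and the dump() helper used above: a small container
# --- describing what preprocessing produced, serialized so the training step can
# --- read it back later. Field names and the file naming are illustrative, and a
# --- Cloud Storage-aware file API (e.g. tf.io.gfile) would be needed for gs://
# --- paths; plain open() keeps the sketch minimal.

import pickle
from collections import namedtuple

PreprocessData = namedtuple(
    'PreprocessData',
    ['input_feature_spec', 'labels', 'train_files_pattern', 'eval_files_pattern'])


def dump(data, path_prefix):
  """Serializes the PreprocessData record next to the preprocessed datasets."""
  with open(path_prefix + '.pkl', 'wb') as f:  # hypothetical file naming
    pickle.dump(data, f)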
if args.verb == 'batch':
  data_files_pattern = os.path.join(args.inputs_dir, '*.sdf')
  results_prefix = os.path.join(args.outputs_dir, 'part')
  source = pubchem.ParseSDF(data_files_pattern)
  sink = beam.io.WriteToText(results_prefix)

elif args.verb == 'stream':
  if not project:
    parser.print_usage()
    print('error: argument --project is required for streaming')
    sys.exit(1)

  beam_options.view_as(StandardOptions).streaming = True
  source = beam.io.ReadFromPubSub(topic='projects/{}/topics/{}'.format(
      project, args.inputs_topic))
  sink = beam.io.WriteToPubSub(topic='projects/{}/topics/{}'.format(
      project, args.outputs_topic))
  # [END dataflow_molecules_batch_or_stream]

else:
  parser.print_usage()
  sys.exit(1)

# [START dataflow_molecules_call_run]
run(
    args.model_dir,
    pubchem.SimpleFeatureExtraction(source),
    sink,
    beam_options)
# [END dataflow_molecules_call_run]
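# --- A hedged sketch (assumption) of the argparse setup the snippet above relies on:
# --- args.verb selects 'batch' or 'stream' via subcommands, with per-mode inputs and
# --- outputs arguments. Exact flag names, defaults, and help text are illustrative.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model-dir', required=True,
                    help='Path to the exported model to use for predictions.')
subparsers = parser.add_subparsers(dest='verb')

batch_verb = subparsers.add_parser('batch')
batch_verb.add_argument('--inputs-dir', required=True,
                        help='Directory containing the input *.sdf files.')
batch_verb.add_argument('--outputs-dir', required=True,
                        help='Directory for the prediction text shards.')

stream_verb = subparsers.add_parser('stream')
stream_verb.add_argument('--inputs-topic', default='molecules-inputs',
                         help='Pub/Sub topic to read molecules from.')
stream_verb.add_argument('--outputs-topic', default='molecules-predictions',
                         help='Pub/Sub topic to publish predictions to.')

args, pipeline_args = parser.parse_known_args()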