stream_verb.add_argument(
    '--outputs-topic',
    default='molecules-predictions',
    help='PubSub topic to publish predictions.')

args, pipeline_args = parser.parse_known_args()

beam_options = PipelineOptions(pipeline_args)
beam_options.view_as(SetupOptions).save_main_session = True
project = beam_options.view_as(GoogleCloudOptions).project

# [START dataflow_molecules_batch_or_stream]
if args.verb == 'batch':
  # Batch mode: read SDF files from a directory and write text shards.
  data_files_pattern = os.path.join(args.inputs_dir, '*.sdf')
  results_prefix = os.path.join(args.outputs_dir, 'part')
  source = pubchem.ParseSDF(data_files_pattern)
  sink = beam.io.WriteToText(results_prefix)

elif args.verb == 'stream':
  # Streaming mode: read molecules from one PubSub topic and publish
  # predictions to another. A project is required to resolve topic paths.
  if not project:
    parser.print_usage()
    print('error: argument --project is required for streaming')
    sys.exit(1)

  beam_options.view_as(StandardOptions).streaming = True
  source = beam.io.ReadFromPubSub(
      topic='projects/{}/topics/{}'.format(project, args.inputs_topic))
  sink = beam.io.WriteToPubSub(
      topic='projects/{}/topics/{}'.format(project, args.outputs_topic))
# [END dataflow_molecules_batch_or_stream]
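For orientation, here is a minimal sketch of how the selected source and sink could be wired together downstream. This is not the sample's actual run function: run_inference is a hypothetical stand-in for the real prediction step, and the extra encode step reflects that beam.io.WriteToPubSub expects bytes while beam.io.WriteToText expects strings.

# Sketch only: assumes `source`, `sink`, `args`, and `beam_options` are the
# objects defined above; `run_inference` is a hypothetical placeholder.
import json

import apache_beam as beam


def run_inference(molecule):
  # Hypothetical placeholder for the sample's real prediction transform.
  return {'molecule': str(molecule)[:70], 'prediction': 0.0}


with beam.Pipeline(options=beam_options) as p:
  predictions = (p
                 | 'Read molecules' >> source
                 | 'Predict' >> beam.Map(run_inference)
                 | 'Serialize' >> beam.Map(json.dumps))
  if args.verb == 'stream':
    # WriteToPubSub expects bytes, so encode the JSON strings first.
    predictions = predictions | 'Encode' >> beam.Map(
        lambda s: s.encode('utf-8'))
  _ = predictions | 'Write predictions' >> sink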
  # [END dataflow_molecules_write_tfrecords]

  return PreprocessData(
      input_feature_spec,
      labels,
      train_dataset_prefix + '*',
      eval_dataset_prefix + '*')


if __name__ == '__main__':
  # Main entry point: parse arguments, run preprocessing, persist the result.
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument(
      '--work-dir',
      required=True,
      help='Directory for staging and working files. '
           'This can be a Google Cloud Storage path.')
  args, pipeline_args = parser.parse_known_args()

  data_files_pattern = os.path.join(args.work_dir, 'data', '*.sdf')
  beam_options = PipelineOptions(pipeline_args, save_main_session=True)

  preprocess_data = run(
      pubchem.FEATURE_SPEC,
      pubchem.LABELS,
      # [START dataflow_molecules_feature_extraction_transform]
      pubchem.SimpleFeatureExtraction(pubchem.ParseSDF(data_files_pattern)),
      # [END dataflow_molecules_feature_extraction_transform]
      feature_scaling=pubchem.normalize_inputs,
      beam_options=beam_options,
      work_dir=args.work_dir)

  dump(preprocess_data, os.path.join(args.work_dir, 'PreprocessData'))
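The dumped PreprocessData object is consumed later by the training step. As a rough illustration only, assuming dump pickles the object to a local file (the sample's real helper may differ and also supports Cloud Storage paths), it could be read back like this:

# Sketch under stated assumptions: `dump` wrote a pickle file, and the
# work directory is a local path rather than a gs:// URL.
import os
import pickle


def load_preprocess_data(work_dir):
  # Read back the PreprocessData object written by the preprocessing job.
  with open(os.path.join(work_dir, 'PreprocessData'), 'rb') as f:
    return pickle.load(f)


preprocess_data = load_preprocess_data('/tmp/molecules-working-dir')
# Field names are illustrative; they mirror the PreprocessData
# constructor arguments shown above.
print(preprocess_data.labels)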
    help='PubSub topic to publish molecules.')
parser.add_argument(
    '--inputs-dir',
    required=True,
    help='Input directory where SDF data files are read from. '
         'This can be a Google Cloud Storage path.')
args, pipeline_args = parser.parse_known_args()

beam_options = PipelineOptions(
    pipeline_args,
    save_main_session=True,
    streaming=True,
)

project = beam_options.view_as(GoogleCloudOptions).project
if not project:
  parser.print_usage()
  print('error: argument --project is required')
  sys.exit(1)

data_files_pattern = os.path.join(args.inputs_dir, '*.sdf')
topic_path = 'projects/{}/topics/{}'.format(project, args.topic)

with beam.Pipeline(options=beam_options) as p:
  _ = (p
       | 'Read SDF files' >> pubchem.ParseSDF(data_files_pattern)
       # Log a truncated preview of each molecule before publishing;
       # `print(...) or elem` prints and passes the element through unchanged.
       | 'Print element' >> beam.Map(
           lambda elem: print(str(elem)[:70] + '...') or elem)
       | 'Publish molecules' >> beam.io.WriteToPubSub(topic=topic_path))
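To sanity-check the publisher, a small smoke test (not part of the sample) can subscribe to the same topic and print what arrives. Note that beam.io.ReadFromPubSub yields bytes payloads by default; the topic path below is a placeholder.

# Hypothetical smoke test: read messages back from the topic and print a
# truncated preview of each payload.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(streaming=True)
topic_path = 'projects/my-project/topics/molecules-inputs'  # placeholder

with beam.Pipeline(options=options) as p:
  _ = (p
       | 'Read molecules' >> beam.io.ReadFromPubSub(topic=topic_path)
       | 'Preview' >> beam.Map(lambda msg: print(msg[:70])))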