Example #1
    stream_verb.add_argument('--outputs-topic',
                             type=str,
                             default='molecules-predictions',
                             help='PubSub topic to publish predictions.')

    args, pipeline_args = parser.parse_known_args()

    beam_options = PipelineOptions(pipeline_args)
    beam_options.view_as(SetupOptions).save_main_session = True

    project = beam_options.view_as(GoogleCloudOptions).project

    # [START batch_or_stream]
    if args.verb == 'batch':
        data_files_pattern = os.path.join(args.inputs_dir, '*.sdf')
        results_prefix = os.path.join(args.outputs_dir, 'part')
        source = beam.io.Read(pubchem.ParseSDF(data_files_pattern))
        sink = beam.io.WriteToText(results_prefix)

    elif args.verb == 'stream':
        if not project:
            parser.print_usage()
            print('error: argument --project is required for streaming')
            sys.exit(1)

        beam_options.view_as(StandardOptions).streaming = True
        source = beam.io.ReadFromPubSub(
            topic='projects/{}/topics/{}'.format(project, args.inputs_topic))
        sink = beam.io.WriteStringsToPubSub(
            topic='projects/{}/topics/{}'.format(project, args.outputs_topic))
        # [END batch_or_stream]
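In the full sample, the source and sink selected above feed a single pipeline that is identical for both modes. A minimal sketch of that wiring, with a hypothetical stand-in for the sample's real prediction transform:

import apache_beam as beam

def predict(molecule):
    # Hypothetical stand-in for the sample's model-inference step.
    return molecule

with beam.Pipeline(options=beam_options) as p:
    _ = (p
         | 'Read molecules' >> source       # batch: SDF files / stream: Pub/Sub
         | 'Predict' >> beam.Map(predict)   # inference logic elided here
         | 'Write predictions' >> sink)     # batch: text files / stream: Pub/Sub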
Example #2
    stream_verb.add_argument('--outputs-topic',
                             default='molecules-predictions',
                             help='PubSub topic to publish predictions.')

    args, pipeline_args = parser.parse_known_args()

    beam_options = PipelineOptions(pipeline_args)
    beam_options.view_as(SetupOptions).save_main_session = True

    project = beam_options.view_as(GoogleCloudOptions).project

    # [START dataflow_molecules_batch_or_stream]
    if args.verb == 'batch':
        data_files_pattern = os.path.join(args.inputs_dir, '*.sdf')
        results_prefix = os.path.join(args.outputs_dir, 'part')
        source = pubchem.ParseSDF(data_files_pattern)
        sink = beam.io.WriteToText(results_prefix)

    elif args.verb == 'stream':
        if not project:
            parser.print_usage()
            print('error: argument --project is required for streaming')
            sys.exit(1)

        beam_options.view_as(StandardOptions).streaming = True
        source = beam.io.ReadFromPubSub(
            topic='projects/{}/topics/{}'.format(project, args.inputs_topic))
        sink = beam.io.WriteToPubSub(
            topic='projects/{}/topics/{}'.format(project, args.outputs_topic))
        # [END dataflow_molecules_batch_or_stream]
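The substantive change from Example #1 is the sink: WriteToPubSub replaced the deprecated WriteStringsToPubSub and publishes bytes rather than str, so string predictions need an encode step before the sink. A minimal sketch of that step (the sample strings here are hypothetical):

import apache_beam as beam

# WriteToPubSub publishes raw bytes, so str elements are encoded first.
with beam.Pipeline(options=beam_options) as p:
    _ = (p
         | 'Sample predictions' >> beam.Create(['C2H6O 0.87', 'C8H10N4O2 0.42'])
         | 'Encode' >> beam.Map(lambda s: s.encode('utf-8'))
         | 'Publish' >> sink)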
Example #3
        # [END dataflow_molecules_write_tfrecords]

    return PreprocessData(input_feature_spec, labels,
                          train_dataset_prefix + '*',
                          eval_dataset_prefix + '*')


if __name__ == '__main__':
    """Main function"""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--work-dir',
                        required=True,
                        help='Directory for staging and working files. '
                        'This can be a Google Cloud Storage path.')
    args, pipeline_args = parser.parse_known_args()

    data_files_pattern = os.path.join(args.work_dir, 'data', '*.sdf')
    beam_options = PipelineOptions(pipeline_args, save_main_session=True)
    preprocess_data = run(
        pubchem.FEATURE_SPEC,
        pubchem.LABELS,
        # [START dataflow_molecules_feature_extraction_transform]
        pubchem.SimpleFeatureExtraction(pubchem.ParseSDF(data_files_pattern)),
        # [END dataflow_molecules_feature_extraction_transform]
        feature_scaling=pubchem.normalize_inputs,
        beam_options=beam_options,
        work_dir=args.work_dir)

    dump(preprocess_data, os.path.join(args.work_dir, 'PreprocessData'))
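For context, PreprocessData and dump are defined elsewhere in the sample. A plausible minimal sketch, inferred from how they are called above (the field names and the dill serialization are assumptions, not the sample's exact code):

import collections
import dill

# Assumed shape: the record returned by run() above, inferred from its
# constructor call.
PreprocessData = collections.namedtuple(
    'PreprocessData',
    ['input_feature_spec', 'labels',
     'train_files_pattern', 'eval_files_pattern'])

def dump(obj, filename):
    # Hypothetical serializer; the real sample may use a GCS-aware file API
    # (e.g. tf.io.gfile) instead of plain open() for Cloud Storage paths.
    with open(filename, 'wb') as f:
        dill.dump(obj, f)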
Example #4
    parser.add_argument('--topic',
                        help='PubSub topic to publish molecules.')

    parser.add_argument(
        '--inputs-dir',
        required=True,
        help='Input directory where SDF data files are read from. '
        'This can be a Google Cloud Storage path.')

    args, pipeline_args = parser.parse_known_args()

    beam_options = PipelineOptions(
        pipeline_args,
        save_main_session=True,
        streaming=True,
    )

    project = beam_options.view_as(GoogleCloudOptions).project
    if not project:
        parser.print_usage()
        print('error: argument --project is required')
        sys.exit(1)

    data_files_pattern = os.path.join(args.inputs_dir, '*.sdf')
    topic_path = 'projects/{}/topics/{}'.format(project, args.topic)
    with beam.Pipeline(options=beam_options) as p:
        _ = (p
             | 'Read SDF files' >> pubchem.ParseSDF(data_files_pattern)
             | 'Print element' >>
             beam.Map(lambda elem: print(str(elem)[:70] + '...') or elem)
             | 'Publish molecules' >> beam.io.WriteToPubSub(topic=topic_path))
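One caveat: beam.io.WriteToPubSub publishes bytes, while pubchem.ParseSDF presumably yields parsed molecule objects, so an explicit serialization step normally precedes the sink. A hedged sketch of such a step (JSON is an assumption, not necessarily the sample's wire format):

import json

import apache_beam as beam

def to_pubsub_message(elem):
    # Assumption: parsed molecules are JSON-serializable; Pub/Sub message
    # bodies must be bytes.
    return json.dumps(elem).encode('utf-8')

# Hypothetical variant of the pipeline body above with an explicit
# serialization step before the Pub/Sub sink.
with beam.Pipeline(options=beam_options) as p:
    _ = (p
         | 'Read SDF files' >> pubchem.ParseSDF(data_files_pattern)
         | 'Serialize' >> beam.Map(to_pubsub_message)
         | 'Publish molecules' >> beam.io.WriteToPubSub(topic=topic_path))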