Example 1
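This fragment, apparently from the Molecules sample's prediction script, selects the pipeline's source and sink based on a 'batch' or 'stream' verb: batch mode reads SDF files from a directory and writes text shards, while streaming mode requires --project, enables StandardOptions.streaming, and wires Pub/Sub topics at both ends. It uses the older beam.io.WriteStringsToPubSub API, which accepted unicode strings (compare Example 4).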
    project = beam_options.view_as(GoogleCloudOptions).project

    # [START batch_or_stream]
    if args.verb == 'batch':
        data_files_pattern = os.path.join(args.inputs_dir, '*.sdf')
        results_prefix = os.path.join(args.outputs_dir, 'part')
        source = beam.io.Read(pubchem.ParseSDF(data_files_pattern))
        sink = beam.io.WriteToText(results_prefix)

    elif args.verb == 'stream':
        if not project:
            parser.print_usage()
            print('error: argument --project is required for streaming',
                  file=sys.stderr)
            sys.exit(1)

        beam_options.view_as(StandardOptions).streaming = True
        source = beam.io.ReadFromPubSub(
            topic='projects/{}/topics/{}'.format(project, args.inputs_topic))
        sink = beam.io.WriteStringsToPubSub(
            topic='projects/{}/topics/{}'.format(project, args.outputs_topic))
        # [END batch_or_stream]

    else:
        parser.print_usage()
        sys.exit(1)

    # [START call_run]
    run(args.model_dir, pubchem.SimpleFeatureExtraction(source), sink,
        beam_options)
    # [END call_run]
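
For context, here is a minimal sketch of a run() function compatible with the call above, assuming it only chains the feature-extraction transform (which wraps the source) into the sink; the sample's real run() presumably also uses model_dir to load the model and make predictions, so this is illustrative only:

import apache_beam as beam

def run(model_dir, feature_extraction, sink, beam_options):
    # Hypothetical skeleton: apply the transform that wraps the source,
    # then write the resulting elements to the chosen sink.
    with beam.Pipeline(options=beam_options) as p:
        _ = (p
             | 'Feature extraction' >> feature_extraction
             | 'Write results' >> sink)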
Example 2
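This entry point builds the preprocessing run: --work-dir defaults to a temporary directory (and can be a Cloud Storage path), save_main_session is enabled so that functions pickled and shipped to workers can see module-level imports, and the extracted features are handed to run() for scaling and dataset generation. It assumes argparse, os, tempfile, apache_beam, and the sample's pubchem helpers, run(), and dump() are in scope.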
if __name__ == '__main__':
    """Main function"""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--work-dir',
                        type=str,
                        default=os.path.join(tempfile.gettempdir(),
                                             'cloudml-samples', 'molecules'),
                        help='Directory for staging and working files. '
                        'This can be a Google Cloud Storage path.')

    args, pipeline_args = parser.parse_known_args()

    beam_options = PipelineOptions(pipeline_args)
    beam_options.view_as(SetupOptions).save_main_session = True

    data_files_pattern = os.path.join(args.work_dir, 'data', '*.sdf')
    preprocess_data = run(
        pubchem.INPUT_SCHEMA,
        pubchem.LABELS,
        # [START dataflow_molecules_feature_extraction_transform]
        pubchem.SimpleFeatureExtraction(
            beam.io.Read(pubchem.ParseSDF(data_files_pattern))),
        # [END dataflow_molecules_feature_extraction_transform]
        feature_scaling=pubchem.normalize_inputs,
        beam_options=beam_options,
        work_dir=args.work_dir)

    dump(preprocess_data, os.path.join(args.work_dir, 'PreprocessData'))
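
A minimal sketch of what the dump() helper called above might look like, assuming it simply serializes the PreprocessData record into the work directory (the actual sample may use a different serialization and file API; the names here match only the call site):

import pickle

def dump(data, path):
    # Hypothetical helper: persist the preprocessing results so a separate
    # training step can load the dataset prefixes and feature spec.
    with open(path, 'wb') as f:
        pickle.dump(data, f)

Plain open() only handles local paths; if --work-dir points at Cloud Storage, a file API such as tf.io.gfile.GFile would be needed instead.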
Example 3
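A later variant of the same entry point: --work-dir is now required instead of defaulted, save_main_session is passed directly to PipelineOptions, the schema comes from pubchem.FEATURE_SPEC rather than INPUT_SCHEMA, and ParseSDF is composed without an explicit beam.io.Read wrapper. The first lines show the tail of the run() function it calls, which returns a PreprocessData record pointing at the train and eval dataset shards.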
        # [END dataflow_molecules_write_tfrecords]

    return PreprocessData(input_feature_spec, labels,
                          train_dataset_prefix + '*',
                          eval_dataset_prefix + '*')


if __name__ == '__main__':
    """Main function"""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--work-dir',
                        required=True,
                        help='Directory for staging and working files. '
                        'This can be a Google Cloud Storage path.')
    args, pipeline_args = parser.parse_known_args()

    data_files_pattern = os.path.join(args.work_dir, 'data', '*.sdf')
    beam_options = PipelineOptions(pipeline_args, save_main_session=True)
    preprocess_data = run(
        pubchem.FEATURE_SPEC,
        pubchem.LABELS,
        # [START dataflow_molecules_feature_extraction_transform]
        pubchem.SimpleFeatureExtraction(pubchem.ParseSDF(data_files_pattern)),
        # [END dataflow_molecules_feature_extraction_transform]
        feature_scaling=pubchem.normalize_inputs,
        beam_options=beam_options,
        work_dir=args.work_dir)

    dump(preprocess_data, os.path.join(args.work_dir, 'PreprocessData'))
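
Judging from the return statement at the top of this example, PreprocessData bundles four values. One plausible definition, inferred purely from that call site (the field names are assumptions):

from collections import namedtuple

# Hypothetical shape of the record returned by run() above.
PreprocessData = namedtuple('PreprocessData', [
    'input_feature_spec',   # schema of the extracted features
    'labels',               # label names
    'train_files_pattern',  # glob matching the training dataset shards
    'eval_files_pattern',   # glob matching the evaluation dataset shards
])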
Example 4
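The same batch-or-stream selection as Example 1, from a newer revision of the script: the Pub/Sub sink is beam.io.WriteToPubSub rather than the deprecated WriteStringsToPubSub, the region tags carry a dataflow_molecules_ prefix, and batch mode passes the ParseSDF transform directly as the source instead of wrapping it in beam.io.Read.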
  if args.verb == 'batch':
    data_files_pattern = os.path.join(args.inputs_dir, '*.sdf')
    results_prefix = os.path.join(args.outputs_dir, 'part')
    source = pubchem.ParseSDF(data_files_pattern)
    sink = beam.io.WriteToText(results_prefix)

  elif args.verb == 'stream':
    if not project:
      parser.print_usage()
      print('error: argument --project is required for streaming',
            file=sys.stderr)
      sys.exit(1)

    beam_options.view_as(StandardOptions).streaming = True
    source = beam.io.ReadFromPubSub(topic='projects/{}/topics/{}'.format(
        project, args.inputs_topic))
    sink = beam.io.WriteToPubSub(topic='projects/{}/topics/{}'.format(
        project, args.outputs_topic))
    # [END dataflow_molecules_batch_or_stream]

  else:
    parser.print_usage()
    sys.exit(1)

  # [START dataflow_molecules_call_run]
  run(
      args.model_dir,
      pubchem.SimpleFeatureExtraction(source),
      sink,
      beam_options)
  # [END dataflow_molecules_call_run]
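
Note that beam.io.WriteToPubSub publishes bytes payloads by default, whereas the WriteStringsToPubSub sink in Example 1 accepted unicode strings. If the upstream transform emits strings, an explicit encode step is needed before this sink; for example (predictions stands in for the pipeline's output PCollection):

encoded = predictions | 'EncodeForPubSub' >> beam.Map(lambda s: s.encode('utf-8'))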