def expand(self, features):
    return (features
            | 'Write to %s' % self._file_path_prefix.replace('/', '_') >>
            tfrecordio.WriteToTFRecord(
                file_path_prefix=self._file_path_prefix,
                file_name_suffix=self._file_name_suffix,
                shard_name_template=self._shard_name_template,
                coder=mlcoders.FeatureVectorOrExampleCoder(),
                compression_type=self._compression_type))
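For context, an expand method like the one above typically lives inside a composite beam.PTransform that bundles the write configuration. The sketch below only illustrates that pattern: the class name WriteFeatures, its constructor arguments, and the plain BytesCoder (standing in for mlcoders.FeatureVectorOrExampleCoder, which is not shown here) are assumptions, not the original code.

import apache_beam as beam
from apache_beam.coders import coders
from apache_beam.io import tfrecordio
from apache_beam.io.filesystem import CompressionTypes


class WriteFeatures(beam.PTransform):
    """Hypothetical composite transform wrapping WriteToTFRecord."""

    def __init__(self, file_path_prefix, file_name_suffix='.tfrecord.gz',
                 shard_name_template=None,
                 compression_type=CompressionTypes.AUTO):
        super().__init__()
        self._file_path_prefix = file_path_prefix
        self._file_name_suffix = file_name_suffix
        self._shard_name_template = shard_name_template
        self._compression_type = compression_type

    def expand(self, features):
        return (features
                | 'Write to %s' % self._file_path_prefix.replace('/', '_') >>
                tfrecordio.WriteToTFRecord(
                    file_path_prefix=self._file_path_prefix,
                    file_name_suffix=self._file_name_suffix,
                    shard_name_template=self._shard_name_template,
                    # Stand-in for the example's custom coder.
                    coder=coders.BytesCoder(),
                    compression_type=self._compression_type))


# Applied like any other sink:
#   _ = serialized_records | WriteFeatures('/tmp/features/part')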
Example #2
def run(known_args, pipeline_args):
    network = MinimalNetwork()

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(SetupOptions).extra_packages = [ml.sdk_location]
    pipeline_options.view_as(
        WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    pipeline_options.view_as(GoogleCloudOptions).staging_location = \
        known_args.staging
    pipeline_options.view_as(GoogleCloudOptions).temp_location = os.path.join(
        known_args.staging, 'tmp')
    pipeline_options.view_as(GoogleCloudOptions).job_name = str(
        network).replace('_', '').lower()

    beam.coders.registry.register_coder(tf.train.Example, ExampleProtoCoder)
    p = beam.Pipeline(options=pipeline_options)

    # Read Example data
    def parse_example(example):
        # TODO: add actual implementation
        yield example

    network_input = (p
                     | 'readExamples' >> beam.io.ReadFromText(known_args.input)
                     | 'processExamples' >> beam.FlatMap(parse_example))

    examples = network_input | 'encodeExamples' >> beam.Map(network.preprocess)

    # Write the serialized compressed protocol buffers to Cloud Storage.
    _ = examples | 'writeExamples' >> tfrecordio.WriteToTFRecord(
        file_path_prefix=os.path.join(known_args.output, 'examples'),
        compression_type=fileio.CompressionTypes.GZIP,
        coder=ExampleProtoCoder(),
        file_name_suffix='.tfrecord.gz')

    # Actually run the pipeline (all operations above are deferred).
    p.run()
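The run(known_args, pipeline_args) function above expects a parsed argument namespace with at least input, staging, and output attributes. A minimal entry point compatible with those accesses might look like the sketch below; the flag names and help strings are assumptions inferred from the attribute reads, not taken from the original source.

import argparse
import logging


def main(argv=None):
    parser = argparse.ArgumentParser()
    # Flag names are guesses based on known_args.input / .staging / .output above.
    parser.add_argument('--input', required=True,
                        help='Input text file(s) of raw examples.')
    parser.add_argument('--output', required=True,
                        help='Directory for the TFRecord output.')
    parser.add_argument('--staging', required=True,
                        help='Directory used for staging and temp files.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    run(known_args, pipeline_args)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()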
def run(argv=None):
    """Runs the variant preprocess pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
    parser = argparse.ArgumentParser()
    parser.add_argument('--output',
                        required=True,
                        help='Output directory to which to write results.')
    parser.add_argument(
        '--data',
        required=True,
        help='Jinja file holding the query for the sample data.')
    parser.add_argument(
        '--metadata',
        required=True,
        help='Jinja file holding the query for the sample metadata.')
    parser.add_argument('--hethom_words',
                        dest='add_hethom',
                        action='store_true',
                        help='Add variant heterozygous/homozygous "word". '
                        'Defaults to true.')
    parser.add_argument(
        '--no_hethom_words',
        dest='add_hethom',
        action='store_false',
        help='Do not add variant heterozygous/homozygous "word". '
        'Defaults to true.')
    parser.set_defaults(add_hethom=True)
    parser.add_argument(
        '--bin_size', type=int,
        help='If set, group variants into bins of this size via '
        'BinnedFeatureEncoder; otherwise the plain FeatureEncoder is used.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    output_dir = os.path.join(
        known_args.output,
        datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(SetupOptions).extra_packages = [ml.sdk_location]
    pipeline_options.view_as(
        WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    pipeline_options.view_as(
        GoogleCloudOptions).staging_location = os.path.join(
            output_dir, 'tmp', 'staging')
    pipeline_options.view_as(GoogleCloudOptions).temp_location = os.path.join(
        output_dir, 'tmp')
    pipeline_options.view_as(
        GoogleCloudOptions).job_name = 'preprocess-variant-features-%s' % (
            datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

    with open(known_args.metadata, 'r') as metadata_file:
        metadata_query = str(
            Template(metadata_file.read()).render(METADATA_QUERY_REPLACEMENTS))
    logging.info('metadata query : %s', metadata_query)

    with open(known_args.data, 'r') as data_file:
        data_query = str(
            Template(data_file.read()).render(DATA_QUERY_REPLACEMENTS))
    logging.info('data query : %s', data_query)

    beam.coders.registry.register_coder(tf.train.Example, ExampleProtoCoder)
    p = beam.Pipeline(options=pipeline_options)

    # Gather our sample metadata into a python dictionary.
    samples_metadata = (
        p
        | 'readSampleMetadata' >> beam.io.Read(
            beam.io.BigQuerySource(query=metadata_query,
                                   use_standard_sql=True))
        | 'tableToDictionary' >> beam.CombineGlobally(
            util.TableToDictCombineFn(key_column=FeatureEncoder.KEY_COLUMN)))

    # Read the table rows into a PCollection.
    rows = p | 'readVariants' >> beam.io.Read(
        beam.io.BigQuerySource(query=data_query, use_standard_sql=True))

    feature_encoder = FeatureEncoder(add_hethom=known_args.add_hethom)
    if known_args.bin_size is not None:
        feature_encoder = BinnedFeatureEncoder(
            add_hethom=known_args.add_hethom, bin_size=known_args.bin_size)

    # Convert the data into TensorFlow Example Protocol Buffers.
    examples = variants_to_examples(rows,
                                    samples_metadata,
                                    feature_encoder=feature_encoder)

    # Write the serialized compressed protocol buffers to Cloud Storage.
    _ = examples | 'writeExamples' >> tfrecordio.WriteToTFRecord(
        file_path_prefix=os.path.join(output_dir, 'examples'),
        compression_type=fileio.CompressionTypes.GZIP,
        coder=ExampleProtoCoder(),
        file_name_suffix='.tfrecord.gz')

    # Actually run the pipeline (all operations above are deferred).
    p.run()
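Once the pipeline completes, the compressed shards written by WriteToTFRecord can be consumed outside Beam. A minimal sketch using the modern tf.data API follows; the glob pattern is a placeholder, and it assumes the records are serialized tf.train.Example protos, as the ExampleProtoCoder above implies.

import tensorflow as tf

# GZIP must be stated explicitly; TFRecordDataset does not infer it from '.gz'.
files = tf.io.gfile.glob('gs://my-bucket/output/*/examples*.tfrecord.gz')  # placeholder
dataset = tf.data.TFRecordDataset(files, compression_type='GZIP')

for raw_record in dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)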