    def test_sample_to_example(self):
        """Checks the text-format Example produced for a single sample."""
        expected = """features {
  feature {
    key: "gender"
    value {
      int64_list {
        value: 1
      }
    }
  }
  feature {
    key: "gender_string"
    value {
      bytes_list {
        value: "female"
      }
    }
  }
  feature {
    key: "population"
    value {
      int64_list {
        value: -1
      }
    }
  }
  feature {
    key: "population_string"
    value {
      bytes_list {
        value: "some pop not in the training labels"
      }
    }
  }
  feature {
    key: "sample_name"
    value {
      bytes_list {
        value: "sample1"
      }
    }
  }
  feature {
    key: "super_population"
    value {
      int64_list {
        value: 4
      }
    }
  }
  feature {
    key: "super_population_string"
    value {
      bytes_list {
        value: "SAS"
      }
    }
  }
  feature {
    key: "variants_9"
    value {
      int64_list {
        value: -5153783975271321865
      }
    }
  }
}
"""
        variants_to_features_fn = variant_encoder.build_variants_to_features(
            variant_to_feature_name_fn=(
                variant_encoder.variant_to_contig_feature_name),
            variant_to_words_fn=variant_encoder.build_variant_to_words(
                add_hethom=False))

        sample_to_example = encoder.build_sample_to_example(
            metadata_to_features_fn=(
                metadata_encoder.metadata_to_ancestry_features),
            variants_to_features_fn=variants_to_features_fn)
        self.assertEqual(
            expected,
            str(
                sample_to_example(SAMPLE_ID, [HETEROZYGOUS_VARIANT_CALL],
                                  SAMPLE_METADATA)))


def run(argv=None):
    """Runs the variant preprocess pipeline.

    Args:
      argv: Pipeline options as a list of arguments.
    """
    pipeline_options = PipelineOptions(flags=argv)
    preprocess_options = pipeline_options.view_as(PreprocessOptions)
    cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    output_dir = os.path.join(
        preprocess_options.output,
        datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(
        WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
    cloud_options.temp_location = os.path.join(output_dir, 'tmp')
    cloud_options.job_name = 'preprocess-variant-features-%s' % (
        datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

    with open(preprocess_options.metadata, 'r') as metadata_file:
        metadata_query = str(
            Template(metadata_file.read()).render(METADATA_QUERY_REPLACEMENTS))
    logging.info('metadata query : %s', metadata_query)

    with open(preprocess_options.input, 'r') as data_file:
        data_query = str(
            Template(data_file.read()).render(DATA_QUERY_REPLACEMENTS))
    logging.info('data query : %s', data_query)

    # Assemble the strategies to be used to convert the raw data to features.
    variant_to_feature_name_fn = variant_encoder.variant_to_contig_feature_name
    if preprocess_options.bin_size is not None:
        variant_to_feature_name_fn = (
            variant_encoder.build_variant_to_binned_feature_name(
                bin_size=preprocess_options.bin_size))

    variants_to_features_fn = variant_encoder.build_variants_to_features(
        variant_to_feature_name_fn=variant_to_feature_name_fn,
        variant_to_words_fn=variant_encoder.build_variant_to_words(
            add_hethom=preprocess_options.add_hethom))

    sample_to_example_fn = encoder.build_sample_to_example(
        metadata_to_features_fn=metadata_encoder.metadata_to_ancestry_features,
        variants_to_features_fn=variants_to_features_fn)

    with beam.Pipeline(options=pipeline_options) as p:
        # Gather our sample metadata into a python dictionary.
        samples_metadata = (
            p
            | 'ReadSampleMetadata' >> beam.io.Read(
                beam.io.BigQuerySource(query=metadata_query,
                                       use_standard_sql=True))
            | 'TableToDictionary' >> beam.CombineGlobally(
                util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

        # Read the table rows into a PCollection.
        rows = p | 'ReadVariants' >> beam.io.Read(
            beam.io.BigQuerySource(query=data_query, use_standard_sql=True))

        # Convert the data into TensorFlow Example Protocol Buffers.
        examples = variants_to_examples(
            rows, samples_metadata, sample_to_example_fn=sample_to_example_fn)

        # Write the serialized compressed protocol buffers to Cloud Storage.
        _ = (examples
             | 'EncodeExamples' >>
             beam.Map(lambda example: example.SerializeToString())
             | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
                 file_path_prefix=os.path.join(output_dir, 'examples'),
                 compression_type=CompressionTypes.GZIP,
                 file_name_suffix='.tfrecord.gz'))
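

# Entry-point sketch (an assumption, not part of the original excerpt): when
# the module is run as a script, command-line flags fall through to
# PipelineOptions via run()'s default argv=None.
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()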