# NOTE: the import paths below are assumptions based on the kubeflow/code-search
# repository layout; adjust them to match the actual package structure.
import logging

import apache_beam as beam

import code_search.dataflow.cli.arguments as arguments
import code_search.dataflow.do_fns.dict_to_csv as dict_to_csv
import code_search.dataflow.transforms.bigquery as bigquery
import code_search.dataflow.transforms.function_embeddings as func_embed
import code_search.dataflow.transforms.github_bigquery as gh_bq


def create_function_embeddings(argv=None):
    """Creates Batch Prediction Pipeline using trained model.

  At a high level, this pipeline does the following things:
    - Read the Processed Github Dataset from BigQuery
    - Encode the functions using T2T problem
    - Get function embeddings using `kubeflow_batch_predict.dataflow.batch_prediction`
    - All results are stored in a BigQuery dataset (`args.function_embeddings_table`)
    - See `transforms.github_dataset.GithubBatchPredict` for details of tables created
    - Additionally, store CSV of docstring, original functions and other metadata for
      reverse index lookup during search engine queries.

  NOTE: The number of output file shards have been fixed (at 100) to avoid a large
  number of output files, making it manageable.
  """
    pipeline_opts = arguments.prepare_pipeline_opts(argv)
    args = pipeline_opts._visible_options  # pylint: disable=protected-access
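    # Beam's PipelineOptions keeps the parsed flags in the private
    # `_visible_options` namespace; reading it here exposes the custom
    # arguments (tables, model paths, etc.) as plain attributes.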

    pipeline = beam.Pipeline(options=pipeline_opts)

    token_pairs_query = gh_bq.ReadTransformedGithubDatasetQuery(
        args.token_pairs_table)
    token_pairs_source = beam.io.BigQuerySource(
        query=token_pairs_query.query_string, use_standard_sql=True)
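    # The source runs the query with BigQuery's standard SQL dialect
    # (use_standard_sql=True) and yields one dict per transformed row of
    # docstring/function token pairs.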
    embeddings = (
        pipeline
        | "Read Transformed Github Dataset" >> beam.io.Read(token_pairs_source)
        | "Compute Function Embeddings" >> func_embed.FunctionEmbeddings(
            args.problem, args.data_dir, args.saved_model_dir))

    function_embeddings_schema = bigquery.BigQuerySchema([
        ('nwo', 'STRING'), ('path', 'STRING'), ('function_name', 'STRING'),
        ('lineno', 'STRING'), ('original_function', 'STRING'),
        ('function_embedding', 'STRING')
    ])

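    # CREATE_IF_NEEDED creates the destination table on the first run, while
    # WRITE_EMPTY makes the job fail if the table already contains rows,
    # guarding against mixing embeddings from separate runs.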
    (embeddings  # pylint: disable=expression-not-assigned
     | "Save Function Embeddings" >> beam.io.WriteToBigQuery(
         table=args.function_embeddings_table,
         schema=function_embeddings_schema.table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

    (embeddings  # pylint: disable=expression-not-assigned
     | "Format for CSV Write" >> beam.ParDo(
         dict_to_csv.DictToCSVString([
             'nwo', 'path', 'function_name', 'lineno', 'original_function',
             'function_embedding'
         ]))
     | "Write Embeddings to CSV" >> beam.io.WriteToText(
         '{}/func-index'.format(args.output_dir),
         file_name_suffix='.csv',
         num_shards=100))

    result = pipeline.run()
    logging.info("Submitted Dataflow job: %s", result)
    # TODO(jlewi): Doesn't Dataflow define a default option for this?
    if args.wait_until_finished:
        result.wait_until_finish()
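

# A minimal entry-point sketch (assumed; the original excerpt does not show
# how the module is invoked):
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    create_function_embeddings()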