def create_function_embeddings(argv=None):
  """Build and submit the batch-prediction (function-embedding) pipeline.

  The pipeline, at a high level:
    - Reads the processed GitHub dataset from BigQuery
      (query over ``args.token_pairs_table``).
    - Encodes each function with the T2T problem and computes embeddings
      via `kubeflow_batch_predict.dataflow.batch_prediction`.
    - Writes all rows to the BigQuery table
      ``args.function_embeddings_table`` (see
      `transforms.github_dataset.GithubBatchPredict` for tables created).
    - Additionally writes a CSV copy (docstring, original function and
      other metadata) used as a reverse index during search-engine
      queries.

  NOTE: The CSV shard count is pinned at 100 so the number of output
  files stays manageable.

  NOTE(review): this file appears to contain two definitions named
  ``create_function_embeddings``; the later definition shadows this one
  at import time — confirm which version is intended.
  """
  opts = arguments.prepare_pipeline_opts(argv)
  args = opts._visible_options  # pylint: disable=protected-access

  pipeline = beam.Pipeline(options=opts)

  # Column names shared by the BigQuery schema and the CSV formatter.
  columns = [
      'nwo', 'path', 'function_name', 'lineno', 'original_function',
      'function_embedding',
  ]
  embeddings_schema = bigquery.BigQuerySchema(
      [(name, 'STRING') for name in columns])

  # Source: standard-SQL query over the transformed token-pairs table.
  read_query = gh_bq.ReadTransformedGithubDatasetQuery(
      args.token_pairs_table)
  bq_source = beam.io.BigQuerySource(
      query=read_query.query_string, use_standard_sql=True)

  embeddings = (
      pipeline
      | "Read Transformed Github Dataset" >> beam.io.Read(bq_source)
      | "Compute Function Embeddings" >> func_embed.FunctionEmbeddings(
          args.problem, args.data_dir, args.saved_model_dir))

  # Sink 1: persist every embedding row to BigQuery.
  (embeddings  # pylint: disable=expression-not-assigned
   | "Save Function Embeddings" >> beam.io.WriteToBigQuery(
       table=args.function_embeddings_table,
       schema=embeddings_schema.table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  # Sink 2: CSV files for reverse-index lookup at query time.
  (embeddings  # pylint: disable=expression-not-assigned
   | "Format for CSV Write" >> beam.ParDo(
       dict_to_csv.DictToCSVString(list(columns)))
   | "Write Embeddings to CSV" >> beam.io.WriteToText(
       '{}/func-index'.format(args.output_dir),
       file_name_suffix='.csv',
       num_shards=100))

  result = pipeline.run()
  logging.info("Submitted Dataflow job: %s", result)
  # TODO(jlewi): Doesn't dataflow define a default option.
  if args.wait_until_finished:
    result.wait_until_finish()
def create_function_embeddings(argv=None):
  """Build and submit the batch-prediction (function-embedding) pipeline.

  The pipeline, at a high level:
    - Reads the processed GitHub dataset from BigQuery
      (``args.target_dataset`` in project ``args.project``).
    - Encodes each function with the T2T problem and computes embeddings
      via `kubeflow_batch_predict.dataflow.batch_prediction`; results are
      stored back into the ``args.target_dataset`` BigQuery dataset (see
      `transforms.github_dataset.GithubBatchPredict` for tables created).
    - Additionally writes a CSV copy (docstring, original function and
      other metadata) used as a reverse index during search-engine
      queries.

  NOTE: The CSV shard count is pinned at 100 so the number of output
  files stays manageable.

  NOTE(review): the CSV index is written under ``args.data_dir`` here,
  while the other ``create_function_embeddings`` variant in this file
  uses ``args.output_dir`` — confirm which location is intended. This
  definition also shares a name with (and may shadow or be shadowed by)
  that other variant.
  """
  opts = arguments.prepare_pipeline_opts(argv)
  args = opts._visible_options  # pylint: disable=protected-access

  pipeline = beam.Pipeline(options=opts)

  # Read transformed rows and attach an embedding to each function.
  embedded_rows = (
      pipeline
      | "Read Transformed Github Dataset" >>
        gh_bq.ReadTransformedGithubDataset(
            args.project, dataset=args.target_dataset)
      | "Compute Function Embeddings" >> func_embed.FunctionEmbeddings(
          args.project, args.target_dataset, args.problem, args.data_dir,
          args.saved_model_dir))

  # Columns emitted into each CSV row of the reverse index.
  csv_columns = [
      'nwo', 'path', 'function_name', 'lineno', 'original_function',
      'function_embedding',
  ]
  (embedded_rows  # pylint: disable=expression-not-assigned
   | "Format for CSV Write" >> beam.ParDo(
       dict_to_csv.DictToCSVString(csv_columns))
   | "Write Embeddings to CSV" >> beam.io.WriteToText(
       '{}/func-index'.format(args.data_dir),
       file_name_suffix='.csv',
       num_shards=100))

  result = pipeline.run()
  logging.info("Submitted Dataflow job: %s", result)
  if args.wait_until_finished:
    result.wait_until_finish()