import json
import logging

import apache_beam as beam

# Project-local pipeline modules referenced below. The exact package paths
# are assumed here for illustration and may differ in the actual repository
# layout.
import code_search.dataflow.cli.arguments as arguments
import code_search.dataflow.do_fns.dict_to_csv as dict_to_csv
import code_search.dataflow.transforms.function_embeddings as func_embed
import code_search.dataflow.transforms.github_bigquery as gh_bq
import code_search.dataflow.transforms.github_dataset as github_dataset
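
# What follows is a minimal sketch, not the real implementation: the
# `JsonCoder` used with `beam.io.ReadFromText` below is assumed to decode one
# JSON object per input line (and encode dicts back to JSON lines). The
# actual coder presumably lives elsewhere in this package.
class JsonCoder(beam.coders.Coder):
  """Beam coder that (de)serializes newline-delimited JSON records."""

  def encode(self, value):
    # Serialize a dict to a UTF-8 encoded JSON line.
    return json.dumps(value).encode('utf-8')

  def decode(self, encoded):
    # Parse one line of JSON into a Python dict.
    return json.loads(encoded)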
def create_function_embeddings(argv=None):
  """Creates a batch prediction pipeline using the trained model.

  At a high level, this pipeline does the following things:
    - Read the processed GitHub dataset from BigQuery
    - Encode the functions using the T2T problem
    - Get function embeddings using
      `kubeflow_batch_predict.dataflow.batch_prediction`
    - All results are stored in a BigQuery dataset (`args.target_dataset`)
    - See `transforms.github_dataset.GithubBatchPredict` for details of the
      tables created
    - Additionally, store a CSV of the docstring, original function and
      other metadata for reverse-index lookup during search engine queries

  NOTE: The number of output file shards has been fixed (at 100) to keep
  the number of output files manageable.
  """
  pipeline_opts = arguments.prepare_pipeline_opts(argv)
  args = pipeline_opts._visible_options  # pylint: disable=protected-access

  pipeline = beam.Pipeline(options=pipeline_opts)

  embeddings = (
      pipeline
      | "Read Transformed Github Dataset" >>
      gh_bq.ReadTransformedGithubDataset(
          args.project, dataset=args.target_dataset)
      | "Compute Function Embeddings" >> func_embed.FunctionEmbeddings(
          args.project, args.target_dataset, args.problem, args.data_dir,
          args.saved_model_dir))

  (embeddings  # pylint: disable=expression-not-assigned
   | "Format for CSV Write" >> beam.ParDo(
       dict_to_csv.DictToCSVString([
           'nwo', 'path', 'function_name', 'lineno', 'original_function',
           'function_embedding',
       ]))
   | "Write Embeddings to CSV" >> beam.io.WriteToText(
       '{}/func-index'.format(args.data_dir),
       file_name_suffix='.csv',
       num_shards=100))

  result = pipeline.run()
  logging.info("Submitted Dataflow job: %s", result)
  if args.wait_until_finished:
    result.wait_until_finish()
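
# For orientation, a minimal sketch of what a DoFn like
# `dict_to_csv.DictToCSVString` is assumed to do: select the named columns
# from each element dict and emit a single CSV-encoded line. The class name
# and behavior here are assumptions; the real DoFn lives in the
# `dict_to_csv` module.
class DictToCSVStringSketch(beam.DoFn):
  """Formats a dict element as one CSV line containing `columns`."""

  def __init__(self, columns):
    super(DictToCSVStringSketch, self).__init__()
    self.columns = columns

  def process(self, element):
    # Imports inside `process` keep the DoFn easily picklable by the runner.
    import csv
    import io

    buf = io.StringIO()
    csv.writer(buf).writerow([element.get(col, '') for col in self.columns])
    yield buf.getvalue().rstrip('\r\n')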
def preprocess_github_dataset(argv=None):
  """Apache Beam pipeline for pre-processing the GitHub dataset.

  At a high level, this pipeline does the following things:
    - Read GitHub Python files from BigQuery (from a query, an entire
      table via `--github_table`, or JSON files via `--github_files`)
    - If the GitHub Python files have already been processed, use the
      pre-processed table instead (using the flag `--pre-transformed`)
    - Tokenize files into pairs of function definitions and docstrings
    - See `transforms.github_dataset.TransformGithubDataset` for details
      of the tables created
    - Additionally, store pairs of docstring and function tokens in a CSV
      file for training

  NOTE: The number of output file shards has been fixed (at 100) to keep
  the number of output files manageable.
  """
  pipeline_opts = arguments.prepare_pipeline_opts(argv)
  args = pipeline_opts._visible_options  # pylint: disable=protected-access

  pipeline = beam.Pipeline(options=pipeline_opts)

  if args.pre_transformed:
    token_pairs = (
        pipeline
        | "Read Transformed Github Dataset" >>
        gh_bq.ReadTransformedGithubDataset(
            args.project, dataset=args.target_dataset))
  else:
    if args.github_files:
      # Read the raw GitHub data from newline-delimited JSON files.
      logging.info("Will read the GitHub data from %s", args.github_files)
      input_records = (
          pipeline
          | "Read Github Dataset" >> beam.io.ReadFromText(
              args.github_files, coder=JsonCoder()))
    elif args.github_table:
      # Read the entire BigQuery table rather than issuing a query.
      logging.info("Will read the entire table %s", args.github_table)
      source = beam.io.BigQuerySource(table=args.github_table)
      input_records = (
          pipeline
          | "Read Github Dataset" >> beam.io.Read(source))
    else:
      # Use only a query over the data.
      logging.info("Reading data using a query.")
      input_records = (
          pipeline
          | "Read Github Dataset" >> gh_bq.ReadGithubDataset(args.project))

    token_pairs = (
        input_records
        | "Transform Github Dataset" >>
        github_dataset.TransformGithubDataset(
            args.token_pairs_table, args.failed_tokenize_table))

  (token_pairs  # pylint: disable=expression-not-assigned
   | "Format for CSV Write" >> beam.ParDo(
       dict_to_csv.DictToCSVString(['docstring_tokens', 'function_tokens']))
   | "Write CSV" >> beam.io.WriteToText(
       '{}/func-doc-pairs'.format(args.data_dir),
       file_name_suffix='.csv',
       num_shards=100))

  result = pipeline.run()
  logging.info("Submitted Dataflow job: %s", result)
  if args.wait_until_finished:
    result.wait_until_finish()

  return result
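
# Minimal usage sketch: both pipelines are driven entirely by command-line
# flags parsed by `arguments.prepare_pipeline_opts` (the attributes accessed
# above, e.g. `--project`, `--target_dataset`, `--data_dir`). Running the
# pre-processing pipeline as the default entrypoint is an assumption made
# here for illustration.
if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  preprocess_github_dataset()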