def publish(udf_dirs, project_id, dependency_dir, gcs_bucket, gcs_path, public):
    """Publish UDFs in the provided directory.

    Pushes JS dependencies to GCS first (when a dependency directory
    exists), then publishes every UDF found in ``udf_dirs`` as a
    persistent UDF, publishing each UDF's dependencies before the UDF
    itself and skipping anything whose filepath is listed in SKIP.
    """
    client = bigquery.Client(project_id)

    if dependency_dir and os.path.exists(dependency_dir):
        push_dependencies_to_gcs(gcs_bucket, gcs_path, dependency_dir, project_id)

    raw_udfs = read_udf_dirs(*udf_dirs)

    # set gives O(1) "already published" checks; a list scan was O(n) per dep
    published_udfs = set()

    for raw_udf in raw_udfs:
        # get all dependencies for UDF and publish as persistent UDF
        udfs_to_publish = accumulate_dependencies([], raw_udfs, raw_udf)
        udfs_to_publish.append(raw_udf)

        for dep in udfs_to_publish:
            if dep not in published_udfs and raw_udfs[dep].filepath not in SKIP:
                publish_udf(
                    raw_udfs[dep],
                    client,
                    project_id,
                    gcs_bucket,
                    gcs_path,
                    raw_udfs.keys(),
                    public,
                )
                published_udfs.add(dep)
def validate(project_dirs):
    """Validate UDF docs.

    Dry-runs every SQL example found in EXAMPLE_DIR directories under the
    given project directories; prints an error and exits with status 1 if
    any example fails its dry run.
    """
    # parse UDFs
    parsed_udfs = read_udf_dirs(*project_dirs)

    is_valid = True

    for project_dir in project_dirs:
        if os.path.isdir(project_dir):
            for root, dirs, files in os.walk(project_dir):
                if os.path.basename(root) == EXAMPLE_DIR:
                    for file in files:
                        dry_run_sql = sql_for_dry_run(
                            os.path.join(root, file), parsed_udfs, project_dir
                        )

                        # store sql in a temporary file for the dry run;
                        # TemporaryDirectory cleans up after itself, whereas
                        # the previous mkdtemp() leaked one directory per file
                        with tempfile.TemporaryDirectory() as tmp:
                            tmp_dir = Path(tmp) / Path(root)
                            tmp_dir.mkdir(parents=True, exist_ok=True)
                            tmp_example_file = tmp_dir / file
                            tmp_example_file.write_text(dry_run_sql)

                            if not DryRun(str(tmp_example_file)).is_valid():
                                is_valid = False

    if not is_valid:
        print("Invalid examples.")
        sys.exit(1)
def test_read_udf_dirs(self):
    """Parsing the test UDF directory yields the expected RawUdf entries."""
    udf_dir = TEST_DIR / "data" / "udf"
    raw_udfs = parse_udf.read_udf_dirs(udf_dir)

    assert len(raw_udfs.keys()) == 5
    assert "udf.test_shift_28_bits_one_day" in raw_udfs
    assert "udf.test_safe_crc32_uuid" in raw_udfs
    assert "udf.test_safe_sample_id" in raw_udfs
    # NOTE: removed an always-true bare-string assertion
    # (`assert "udf.test_shift_28_bits_one_day"`); the membership check
    # above already covers that key.
    assert (
        raw_udfs["udf.test_shift_28_bits_one_day"].name
        == "udf.test_shift_28_bits_one_day"
    )
    # isinstance is the idiomatic type check (was `type(...) == ...`)
    assert isinstance(
        raw_udfs["udf.test_shift_28_bits_one_day"], parse_udf.RawUdf
    )
def test_accumulate_dependencies(self):
    """Dependency accumulation includes the UDF itself plus transitive deps."""
    udf_dir = TEST_DIR / "data" / "udf"
    raw_udfs = parse_udf.read_udf_dirs(udf_dir)

    # a UDF with a dependency: both names must appear in the result
    with_dep = parse_udf.accumulate_dependencies(
        [], raw_udfs, "udf.test_shift_28_bits_one_day"
    )
    assert "udf.test_shift_28_bits_one_day" in with_dep
    assert "udf.test_bitmask_lowest_28" in with_dep

    # a leaf UDF with no dependencies: only itself appears
    leaf_only = parse_udf.accumulate_dependencies(
        [], raw_udfs, "udf.test_bitmask_lowest_28"
    )
    assert "udf.test_bitmask_lowest_28" in leaf_only
def test_udf_tests_sql(self):
    """udf_tests_sql inlines temp-function definitions for dependencies."""
    udf_dir = TEST_DIR / "data" / "udf"
    raw_udfs = parse_udf.read_udf_dirs(udf_dir)

    # UDF with tests: generated SQL defines temp versions of the UDF and its dep
    shift_udf = parse_udf.RawUdf.from_file(
        udf_dir / "test_shift_28_bits_one_day" / "udf.sql"
    )
    sql = parse_udf.udf_tests_sql(shift_udf, raw_udfs)[0]
    assert "CREATE TEMP FUNCTION udf_test_shift_28_bits_one_day" in sql
    assert "CREATE TEMP FUNCTION udf_test_bitmask_lowest_28" in sql

    # UDF without tests: no SQL is generated
    bitmask_udf = parse_udf.RawUdf.from_file(
        udf_dir / "test_bitmask_lowest_28" / "udf.sql"
    )
    assert parse_udf.udf_tests_sql(bitmask_udf, raw_udfs) == []
def test_udf_usage_definitions(self):
    """udf_usage_definitions returns definitions for every referenced UDF."""
    udf_dir = TEST_DIR / "data" / "udf"
    raw_udfs = parse_udf.read_udf_dirs(udf_dir)

    text = "SELECT udf.test_bitmask_lowest_28(0), udf.test_safe_sample_id('')"
    definitions = parse_udf.udf_usage_definitions(text, raw_udfs)

    assert len(definitions) == 11

    expected_bitmask = (
        "CREATE OR REPLACE FUNCTION udf.test_bitmask_lowest_28()"
        " AS (\n 0x0FFFFFFF\n);"
    )
    assert expected_bitmask in definitions

    expected_sample_id = (
        "CREATE OR REPLACE FUNCTION udf.test_safe_sample_id(client_id STRING) AS"
        " (\n MOD(udf.test_safe_crc32_uuid(CAST(client_id AS BYTES)), 100)\n);"
    )
    assert expected_sample_id in definitions
def test_persistent_udf_as_temp(self):
    """persistent_udf_as_temp rewrites persistent UDF references to temp ones."""
    udf_dir = TEST_DIR / "data" / "udf"
    raw_udfs = parse_udf.read_udf_dirs(udf_dir)

    # the raw test SQL contains no temp-function definitions yet
    test_sql = parse_udf.RawUdf.from_file(
        udf_dir / "test_shift_28_bits_one_day" / "udf.sql"
    ).tests[0]
    assert "CREATE TEMP FUNCTION" not in test_sql
    assert "CREATE TEMP FUNCTION udf_test_bitmask_lowest_28" not in test_sql

    # conversion injects temp definitions for the UDF and its dependency
    converted = parse_udf.persistent_udf_as_temp(test_sql, raw_udfs)
    assert "CREATE TEMP FUNCTION udf_test_shift_28_bits_one_day" in converted
    assert "CREATE TEMP FUNCTION udf_test_bitmask_lowest_28" in converted

    # mozfun references are stripped of their project prefix
    text = "SELECT udf.test_bitmask_lowest_28(23), mozfun.hist.range('{}')"
    converted = parse_udf.persistent_udf_as_temp(text, raw_udfs)
    assert "CREATE TEMP FUNCTION udf_test_bitmask_lowest_28" in converted
    assert "hist.range" in converted
    assert "mozfun.hist.range" not in converted
# Union of the BigQuery query-parameter types used by generated tests.
QueryParameter = Union[
    bigquery.ArrayQueryParameter,
    bigquery.ScalarQueryParameter,
    bigquery.StructQueryParameter,
]

# Map fixture-file extensions to the BigQuery load source format used to
# ingest them; both backup_info and export_metadata load as datastore backups.
TABLE_EXTENSIONS = {
    "ndjson": bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    "csv": bigquery.SourceFormat.CSV,
    "backup_info": bigquery.SourceFormat.DATASTORE_BACKUP,
    "export_metadata": bigquery.SourceFormat.DATASTORE_BACKUP,
    "avro": bigquery.SourceFormat.AVRO,
    "parquet": bigquery.SourceFormat.PARQUET,
    "orc": bigquery.SourceFormat.ORC,
}

# Parse all UDFs at import time so generated tests can reference them.
# NOTE(review): read_udf_dirs() is called with no arguments here, unlike the
# explicit-dirs calls elsewhere — presumably it falls back to default
# directories; confirm against parse_udf.
raw_udfs = parse_udf.read_udf_dirs()


@dataclass
class Table:
    """Define info needed to create a table for a generated test."""

    name: str
    source_format: str
    # a tuple means read via `load(*source_path)` and format as source_format
    # a string means source_path is already in source_format
    source_path: Union[str, Tuple[str, str]]
    # post_init fields
    schema: Optional[List[bigquery.SchemaField]] = None

    # body continues past this chunk (truncated in the visible source)
    def __post_init__(self):
def udfs():
    """Return the parsed UDFs and test assertions from the standard dirs."""
    udf_dirs = ("tests/assert", "udf", "udf_js")
    return read_udf_dirs(*udf_dirs)