def publish(udf_dirs, project_id, dependency_dir, gcs_bucket, gcs_path,
            public):
    """Publish the UDFs found in ``udf_dirs`` along with their dependencies.

    When ``dependency_dir`` is set and exists on disk it is first pushed to
    GCS. Then, for every parsed UDF, the names returned by
    ``accumulate_dependencies`` plus the UDF's own name are handed to
    ``publish_udf``. Each name is published at most once, and UDFs whose
    ``filepath`` is listed in ``SKIP`` are left out.
    """
    client = bigquery.Client(project_id)

    if dependency_dir and os.path.exists(dependency_dir):
        push_dependencies_to_gcs(
            gcs_bucket, gcs_path, dependency_dir, project_id
        )

    raw_udfs = read_udf_dirs(*udf_dirs)
    already_published = set()

    for udf_name in raw_udfs:
        # everything this UDF needs, followed by the UDF itself
        candidates = accumulate_dependencies([], raw_udfs, udf_name)
        candidates.append(udf_name)

        for name in candidates:
            if name in already_published:
                continue

            udf = raw_udfs[name]
            if udf.filepath in SKIP:
                continue

            publish_udf(
                udf,
                client,
                project_id,
                gcs_bucket,
                gcs_path,
                raw_udfs.keys(),
                public,
            )
            already_published.add(name)
def validate(project_dirs):
    """Validate UDF docs.

    Walks every directory in ``project_dirs``. For each file located in a
    directory named ``EXAMPLE_DIR``, the SQL returned by ``sql_for_dry_run``
    is written to a scratch file and checked with ``DryRun(...).is_valid()``.
    All examples are checked, even after a failure has been seen.

    If at least one example is invalid, prints "Invalid examples." and
    exits the process with status 1.
    """
    # parse UDFs
    parsed_udfs = read_udf_dirs(*project_dirs)
    is_valid = True

    # A single scratch directory for the whole run that is removed on exit,
    # rather than one mkdtemp() directory per example that is never deleted.
    with tempfile.TemporaryDirectory() as scratch:
        scratch_root = Path(scratch)

        for project_dir in project_dirs:
            if not os.path.isdir(project_dir):
                continue

            for root, _dirs, files in os.walk(project_dir):
                if os.path.basename(root) != EXAMPLE_DIR:
                    continue

                # Mirror the example's location below the scratch directory.
                # Joining an absolute path would discard the scratch prefix
                # and make the write below overwrite the real example file,
                # so absolute roots are made relative to their anchor first.
                mirrored_root = Path(root)
                if mirrored_root.is_absolute():
                    mirrored_root = mirrored_root.relative_to(
                        mirrored_root.anchor)
                tmp_dir = scratch_root / mirrored_root
                tmp_dir.mkdir(parents=True, exist_ok=True)

                for file in files:
                    dry_run_sql = sql_for_dry_run(os.path.join(root, file),
                                                  parsed_udfs, project_dir)

                    # store sql in temporary file for dry_run
                    tmp_example_file = tmp_dir / file
                    tmp_example_file.write_text(dry_run_sql)

                    if not DryRun(str(tmp_example_file)).is_valid():
                        is_valid = False

    if not is_valid:
        print("Invalid examples.")
        sys.exit(1)
Example #3
0
 def test_read_udf_dirs(self):
     """Check that ``read_udf_dirs`` parses the UDF test data directory."""
     udf_dir = TEST_DIR / "data" / "udf"
     raw_udfs = parse_udf.read_udf_dirs(udf_dir)
     assert len(raw_udfs) == 5
     # A bare `assert "<name>"` used to follow these checks; a non-empty
     # string is always truthy, so it verified nothing and was dropped.
     assert "udf.test_shift_28_bits_one_day" in raw_udfs
     assert "udf.test_safe_crc32_uuid" in raw_udfs
     assert "udf.test_safe_sample_id" in raw_udfs
     assert (raw_udfs["udf.test_shift_28_bits_one_day"].name ==
             "udf.test_shift_28_bits_one_day")
     assert isinstance(raw_udfs["udf.test_shift_28_bits_one_day"],
                       parse_udf.RawUdf)
Example #4
0
    def test_accumulate_dependencies(self):
        """Check which UDF names ``accumulate_dependencies`` returns."""
        udfs = parse_udf.read_udf_dirs(TEST_DIR / "data" / "udf")

        # a UDF that references another one yields both names
        shift_deps = parse_udf.accumulate_dependencies(
            [], udfs, "udf.test_shift_28_bits_one_day")
        for expected in ("udf.test_shift_28_bits_one_day",
                         "udf.test_bitmask_lowest_28"):
            assert expected in shift_deps

        bitmask_deps = parse_udf.accumulate_dependencies(
            [], udfs, "udf.test_bitmask_lowest_28")
        assert "udf.test_bitmask_lowest_28" in bitmask_deps
Example #5
0
    def test_udf_tests_sql(self):
        """Check the SQL that ``udf_tests_sql`` builds for two sample UDFs."""
        data_dir = TEST_DIR / "data" / "udf"
        all_udfs = parse_udf.read_udf_dirs(data_dir)

        shift_udf = parse_udf.RawUdf.from_file(
            data_dir / "test_shift_28_bits_one_day" / "udf.sql")
        first_test_sql = parse_udf.udf_tests_sql(shift_udf, all_udfs)[0]
        expected_definitions = (
            "CREATE TEMP FUNCTION udf_test_shift_28_bits_one_day",
            "CREATE TEMP FUNCTION udf_test_bitmask_lowest_28",
        )
        for definition in expected_definitions:
            assert definition in first_test_sql

        # for test_bitmask_lowest_28 an empty list is expected
        bitmask_udf = parse_udf.RawUdf.from_file(
            data_dir / "test_bitmask_lowest_28" / "udf.sql")
        assert parse_udf.udf_tests_sql(bitmask_udf, all_udfs) == []
Example #6
0
    def test_udf_usage_definitions(self):
        """Check the definitions ``udf_usage_definitions`` returns for a query."""
        raw_udfs = parse_udf.read_udf_dirs(TEST_DIR / "data" / "udf")
        query = "SELECT udf.test_bitmask_lowest_28(0), udf.test_safe_sample_id('')"

        definitions = parse_udf.udf_usage_definitions(query, raw_udfs)

        bitmask_definition = (
            "CREATE OR REPLACE FUNCTION udf.test_bitmask_lowest_28()"
            " AS (\n  0x0FFFFFFF\n);"
        )
        sample_id_definition = (
            "CREATE OR REPLACE FUNCTION udf.test_safe_sample_id(client_id STRING) AS"
            " (\n  MOD(udf.test_safe_crc32_uuid(CAST(client_id AS BYTES)), 100)\n);"
        )

        assert len(definitions) == 11
        assert bitmask_definition in definitions
        assert sample_id_definition in definitions
Example #7
0
    def test_persistent_udf_as_temp(self):
        """Check that ``persistent_udf_as_temp`` rewrites SQL to temp functions."""
        data_dir = TEST_DIR / "data" / "udf"
        all_udfs = parse_udf.read_udf_dirs(data_dir)
        shift_udf = parse_udf.RawUdf.from_file(
            data_dir / "test_shift_28_bits_one_day" / "udf.sql")
        test_sql = shift_udf.tests[0]

        # before conversion the test SQL defines no temp functions
        assert "CREATE TEMP FUNCTION" not in test_sql
        assert "CREATE TEMP FUNCTION udf_test_bitmask_lowest_28" not in test_sql

        converted = parse_udf.persistent_udf_as_temp(test_sql, all_udfs)
        assert "CREATE TEMP FUNCTION udf_test_shift_28_bits_one_day" in converted
        assert "CREATE TEMP FUNCTION udf_test_bitmask_lowest_28" in converted

        # a query that also references a mozfun routine
        query = "SELECT udf.test_bitmask_lowest_28(23), mozfun.hist.range('{}')"
        converted = parse_udf.persistent_udf_as_temp(query, all_udfs)
        assert "CREATE TEMP FUNCTION udf_test_bitmask_lowest_28" in converted
        assert "hist.range" in converted
        assert "mozfun.hist.range" not in converted
Example #8
0
# Type alias covering the three BigQuery query-parameter classes.
QueryParameter = Union[bigquery.ArrayQueryParameter,
                       bigquery.ScalarQueryParameter,
                       bigquery.StructQueryParameter, ]

# Maps a file extension to the matching ``bigquery.SourceFormat`` member.
# Both "backup_info" and "export_metadata" map to DATASTORE_BACKUP.
TABLE_EXTENSIONS = {
    "ndjson": bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    "csv": bigquery.SourceFormat.CSV,
    "backup_info": bigquery.SourceFormat.DATASTORE_BACKUP,
    "export_metadata": bigquery.SourceFormat.DATASTORE_BACKUP,
    "avro": bigquery.SourceFormat.AVRO,
    "parquet": bigquery.SourceFormat.PARQUET,
    "orc": bigquery.SourceFormat.ORC,
}

# Evaluated once at import time. read_udf_dirs() is called without
# arguments here, so it uses whatever directories it defaults to.
# NOTE(review): confirm which directories those defaults are.
raw_udfs = parse_udf.read_udf_dirs()


@dataclass
class Table:
    """Define info needed to create a table for a generated test."""

    name: str
    source_format: str
    # a tuple means read via `load(*source_path)` and format as source_format
    # a string means source_path is already in source_format
    source_path: Union[str, Tuple[str, str]]
    # post_init fields
    schema: Optional[List[bigquery.SchemaField]] = None

    def __post_init__(self):
def udfs():
    """Return what ``read_udf_dirs`` parses from the assertion and UDF dirs."""
    udf_directories = ("tests/assert", "udf", "udf_js")
    return read_udf_dirs(*udf_directories)