Example #1
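Round-trip smoke test for RedisPal: integers, floats, strings, functions and numpy arrays are written with set and read back with get, printing each value's type before and after.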
# Imports assumed for a self-contained run (module name as published by the
# redis-pal package):
from redis_pal import RedisPal

import numpy as np


def example():
    # RedisPal layers transparent serialization over Redis, so values
    # should round-trip with their original Python types.
    rp = RedisPal()
    key = "test"

    # Integers
    inp = 1
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))

    # Floating points
    inp = 1.23
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))

    # Strings
    inp = "Test"
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))

    # Functions
    def echo(arg):
        return arg

    inp = echo
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))

    # Numpy arrays (np is imported at module level above)
    inp = np.array([0, 1, 2, 3, 4])
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))
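
    # A hedged extension of the same round-trip check: dicts, which
    # Examples #2 through #5 store in Redis, should come back unchanged too.
    inp = {"views": {"dataset.view": {"materialized": True}}}
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))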
Example #2
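A pipeline step (the context.log calls and the solid config in Example #8 suggest Dagster solids) that, while holding a Redlock, removes the views listed in blob_names from the managed-views registry in Redis and drops the corresponding BigQuery views.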
def delete_managed_views(
    context,
    blob_names,
    materialization_locked: bool,
    materialization_lock: Redlock,
):
    """Deletes the views in ``blob_names`` from the Redis registry of managed
    views and drops the matching BigQuery views."""
    try:
        r = Redis(constants.REDIS_HOST.value)
        rp = RedisPal(constants.REDIS_HOST.value)
        lock = Redlock(
            key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
            masters=[r],
            auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
        )
        with lock:
            materialized_views: dict = rp.get(
                constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
            if materialized_views is None:
                materialized_views = {}
                materialized_views["views"] = {}
            for blob_name in blob_names:
                context.log.info(f"Deleting managed view {blob_name}")
                if blob_name in materialized_views["views"]:
                    del materialized_views["views"][blob_name]
                    prefix: str = os.getenv("BQ_PROJECT_NAME", "rj-smtr-dev")
                    table_name: str = f"{prefix}.{blob_name}"
                    update_view(table_name, {}, "", "", "", delete=True)
                    context.log.info("Success!")
                else:
                    context.log.info("View not found, skipping...")
            rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                   materialized_views)
    except Exception as e:
        # Best-effort release of the outer lock before re-raising.
        try:
            materialization_lock.release()
        except Exception:
            pass
        raise e
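The lock-wrapped read-modify-write above is the pattern shared by Examples #2 through #5. A minimal standalone sketch, assuming rp is a RedisPal client, lock a pottery Redlock as constructed above, and an illustrative key name:

state_key = "managed_views"  # illustrative key name
with lock:
    state = rp.get(state_key) or {"views": {}}
    state["views"].pop("dataset.view", None)  # mutate while holding the lock
    rp.set(state_key, state)

Without the lock, two concurrent runs could read the same snapshot and the later set would silently drop the earlier run's changes.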
Example #3
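For each query, this solid merges defaults.yaml with view-specific YAML from GCS, updates run metadata in Redis under the Redlock, collects parent queries, and fans out one DynamicOutput per backfill window.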
def get_configs_for_materialized_view(context, query_names: list,
                                      materialization_locked: bool,
                                      materialization_lock) -> dict:
    """Retrieves configs for materialized views"""
    try:
        for query_name in query_names:

            # Split query name into dataset_name and view_name
            dataset_name, view_name = query_name.split(".")

            # Load configs from GCS
            view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name, view_name)}.yaml'
            defaults_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name)}/defaults.yaml'
            context.log.info(f"Defaults blob: {defaults_yaml}")
            context.log.info(f"View blob: {view_yaml}")
            defaults_blob = get_blob(defaults_yaml,
                                     SENSOR_BUCKET,
                                     mode="staging")
            view_blob = get_blob(view_yaml, SENSOR_BUCKET, mode="staging")
            if defaults_blob is None:
                raise Exception(f"Blob {defaults_yaml} not found!")
            defaults_dict = yaml.safe_load(defaults_blob.download_as_string())
            if view_blob:
                view_dict = yaml.safe_load(view_blob.download_as_string())
            else:
                context.log.warning(
                    f"Blob {view_yaml} not found. This is not an error.")
                view_dict = {}

            # Merge configs
            query_params = {**defaults_dict, **view_dict}

            # Build base configs
            now = datetime.datetime.now(pytz.timezone("America/Sao_Paulo"))
            run_key = build_run_key(query_name, now)
            base_config_path = (Path(__file__).parent /
                                "materialized_views_base_config.yaml")
            with open(base_config_path, "r") as f:
                base_params: dict = yaml.safe_load(f)
            base_params["run_timestamp"] = "'{}'".format(
                convert_datetime_to_datetime_string(now))
            base_params["maestro_sha"] = "'{}'".format(
                fetch_branch_sha(constants.MAESTRO_REPOSITORY.value,
                                 constants.MAESTRO_DEFAULT_BRANCH.value))
            base_params["maestro_bq_sha"] = "'{}'".format(
                fetch_branch_sha(constants.MAESTRO_BQ_REPOSITORY.value,
                                 constants.MAESTRO_BQ_DEFAULT_BRANCH.value))
            base_params["run_key"] = "'{}'".format(run_key)

            # Few more params
            r = Redis(constants.REDIS_HOST.value)
            rp = RedisPal(constants.REDIS_HOST.value)
            lock = Redlock(
                key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
                masters=[r],
                auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
            )
            table_name = parse_filepath_to_tablename(view_yaml)
            with lock:
                managed = rp.get(
                    constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                if managed is None:
                    managed = {}
                    managed["views"] = {}
                if query_name not in managed["views"]:
                    raise Exception(
                        f"Query {query_name} not found in managed views: {managed}"
                    )
                d = managed["views"][query_name]
                changed = d["query_modified"]
                context.log.info(f"{query_name} changed: {changed}")
                d["query_modified"] = False
                last_run = d["last_run"]
                d["last_run"] = now
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       managed)

            # Get query on GCS
            query_file = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name, view_name)}.sql'
            query_blob = get_blob(query_file, SENSOR_BUCKET, mode="staging")
            if query_blob is None:
                raise Exception(f"Blob {query_file} not found!")
            base_query = query_blob.download_as_string().decode("utf-8")

            # Get parent queries on GCS (a distinct loop variable avoids
            # shadowing the outer ``query_name``)
            parent_queries = {}
            for parent_name in d["depends_on"]:
                if parent_name in managed["views"] and managed["views"][
                        parent_name]["materialized"]:
                    continue
                parent_query_file = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(parent_name.split(".")[:2]))}.sql'
                parent_query_blob = get_blob(parent_query_file,
                                             SENSOR_BUCKET,
                                             mode="staging")
                if parent_query_blob is None:
                    context.log.warning(
                        f"Blob for parent query \"{parent_query_file}\" not found, skipping..."
                    )
                    continue
                parent_view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(parent_name.split(".")[:2]))}.yaml'
                parent_view_blob = get_blob(parent_view_yaml,
                                            SENSOR_BUCKET,
                                            mode="staging")
                if parent_view_blob is not None:
                    parent_view_dict = yaml.safe_load(
                        parent_view_blob.download_as_string())
                else:
                    parent_view_dict = {}
                parent_defaults_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(parent_name.split(".")[:1]))}/defaults.yaml'
                parent_defaults_blob = get_blob(parent_defaults_yaml,
                                                SENSOR_BUCKET,
                                                mode="staging")
                if parent_defaults_blob is not None:
                    parent_defaults_dict = yaml.safe_load(
                        parent_defaults_blob.download_as_string())
                else:
                    context.log.warning(
                        f"Blob for parent defaults \"{parent_defaults_yaml}\" not found, skipping..."
                    )
                    continue
                parent_queries[parent_name] = {
                    "base_query":
                    parent_query_blob.download_as_string().decode("utf-8"),
                    "query_params": {
                        **parent_defaults_dict,
                        **parent_view_dict
                    },
                }
            context.log.info(f"Parent queries: {parent_queries}")

            # Build configs
            # - table_name: str
            # - changed: bool
            # - base_query: str
            # - base_params: dict
            # - query_params: dict
            # - now: str
            # - last_run: str
            start_timestamp = (last_run if last_run else
                               query_params["backfill"]["start_timestamp"])
            date_ranges = get_date_ranges(
                start_timestamp, query_params["backfill"]["interval"], now)
            context.log.info(f"{date_ranges}")
            for i, _ in enumerate(date_ranges[:-1]):
                configs = {
                    "table_name": table_name,
                    "changed": changed if i == 0 else False,
                    "base_query": base_query,
                    "base_params": base_params,
                    "query_params": query_params,
                    "now": date_ranges[i + 1],
                    "last_run": date_ranges[i],
                    "parent_queries": parent_queries,
                }
                mapping_key = (
                    f'{configs["table_name"]}_{configs["last_run"]}_{configs["now"]}'
                )
                for char in (".", "-", " ", ":"):
                    mapping_key = mapping_key.replace(char, "_")
                yield DynamicOutput(
                    {
                        "config_dict": configs,
                        "materialization_lock": materialization_lock
                    },
                    mapping_key=mapping_key)
    except Exception as e:
        # Best-effort release of the outer lock before re-raising.
        try:
            locks.release(materialization_lock)
        except Exception:
            pass
        raise e
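How the fan-out windows line up, as a hypothetical illustration (get_date_ranges is assumed to return consecutive boundary timestamps from the start point up to now):

boundaries = ["2022-01-01", "2022-01-02", "2022-01-03"]  # e.g. a daily interval
windows = [(boundaries[i], boundaries[i + 1])
           for i in range(len(boundaries) - 1)]
# windows -> [("2022-01-01", "2022-01-02"), ("2022-01-02", "2022-01-03")],
# i.e. one config per window, with last_run and now set to its endpoints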
Example #4
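Marks a single managed view for rematerialization in Redis and rebuilds it as a plain BigQuery view from its defaults on GCS.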
def manage_view(context, input_dict):
    """Flags ``view_name`` for rematerialization in Redis and (re)creates it
    as a plain BigQuery view from its defaults on GCS."""
    view_name = input_dict["view_name"]
    materialization_lock = input_dict["materialization_lock"]

    try:
        # Setup Redis and Redlock
        r = Redis(constants.REDIS_HOST.value)
        rp = RedisPal(constants.REDIS_HOST.value)
        lock = Redlock(
            key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
            masters=[r],
            auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
        )

        # Get materialization information from Redis
        materialized_views: dict = rp.get(
            constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
        if materialized_views is None:
            materialized_views = {"views": {}}
        # .get avoids a KeyError for views that are not yet registered
        materialized = materialized_views["views"].get(view_name,
                                                       {}).get("materialized")

        # If this is materialized, generate temp view
        if materialized:
            with lock:
                materialized_views["views"][view_name]["query_modified"] = True
                materialized_views["views"][view_name]["last_run"] = None
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       materialized_views)
            context.log.info(
                f"Generate {view_name} as a view for now, materialization comes later"
            )

        # We need to build the query using
        # latest parameters and build a view with it.

        # Get defaults for view_name
        blob_path = os.path.join(MATERIALIZED_VIEWS_PREFIX,
                                 *view_name.split(".")[:-1])
        defaults_path = blob_path + "/defaults.yaml"
        context.log.info(f"Defaults path -> {defaults_path}")
        defaults_blob = get_blob(defaults_path, SENSOR_BUCKET, mode="staging")
        if defaults_blob is None:
            raise Exception(f"Blob {defaults_path} not found")
        defaults_dict: dict = yaml.safe_load(
            defaults_blob.download_as_string())

        # Parse dataset_name
        dataset_name = view_name.split(".")[0]

        # Parse view yaml path
        view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, view_name)}.yaml'

        # Parse table_name
        prefix: str = os.getenv("BQ_PROJECT_NAME", "rj-smtr-dev")
        table_name: str = f"{prefix}.{view_name}"
        context.log.info(f"Table name is {table_name}")

        # Update view
        update_view(table_name,
                    defaults_dict,
                    dataset_name,
                    view_name.split(".")[-1],
                    view_yaml,
                    delete=False,
                    context=context)

    except Exception as e:
        # Best-effort release of the outer lock before re-raising.
        try:
            materialization_lock.release()
        except Exception:
            pass
        raise e
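For concreteness, the path logic above resolves as follows for a hypothetical view_name = "dataset.some_view" with MATERIALIZED_VIEWS_PREFIX = "views":

# blob_path     -> "views/dataset"
# defaults_path -> "views/dataset/defaults.yaml"
# view_yaml     -> "views/dataset.some_view.yaml"
# table_name    -> "rj-smtr-dev.dataset.some_view"  (default BQ_PROJECT_NAME)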
Example #5
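Folds changed defaults.yaml and view-specific YAML blobs into the Redis registry, builds a networkx dependency graph, and yields the affected views as DynamicOutputs in topological order.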
def update_managed_views(
    context,
    blob_names,
    materialization_locked: bool,
    materialization_lock: Redlock,
):
    """Updates the Redis registry from changed YAML blobs and yields the
    affected views in dependency (topological) order."""
    try:
        # Setup Redis and Redlock
        r = Redis(constants.REDIS_HOST.value)
        rp = RedisPal(constants.REDIS_HOST.value)
        views_lock = Redlock(
            key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
            masters=[r],
            auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
        )

        # Initialize graph
        graph = nx.DiGraph()

        # If blob_name ends with "defaults.yaml", we need to
        # either add it to Redis or update its values and add
        # runs for every child it has and its dependencies.
        for blob_name in [
                b for b in blob_names if b.endswith("defaults.yaml")
        ]:

            # Get dataset name
            blob_path = "/".join([n for n in blob_name.split("/")
                                  if n != ""][:-1])
            dataset_name: str = blob_path.split("/")[-1]

            context.log.info("#" * 80)
            context.log.info(f"Updating {dataset_name} defaults")

            # Read the blob
            blob = get_blob(blob_name, SENSOR_BUCKET, mode="staging")
            if blob is None:
                raise Exception(f"Blob {blob_name} not found")
            blob_dict: dict = yaml.safe_load(blob.download_as_string())

            # Add it to Redis
            with views_lock:
                materialized_views: dict = rp.get(
                    constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                if materialized_views is None:
                    materialized_views = {}
                    materialized_views["views"] = {}
                # Add every child to Redis
                if "views" not in blob_dict:
                    raise Exception(
                        f"Malformed blob (missing views key): {blob_name}")
                for key in blob_dict["views"].keys():

                    # Build key with dataset_name
                    m_key = f"{dataset_name}.{key}"

                    # This child also needs a run
                    context.log.info(f"Adding {m_key} to runs")
                    if m_key not in graph.nodes:
                        graph.add_node(m_key)

                    # Avoid KeyError
                    if "views" not in materialized_views:
                        materialized_views["views"] = {}

                    # Add to Redis
                    if m_key not in materialized_views["views"]:
                        materialized_views["views"][m_key] = {}
                    update_dict_with_dict(
                        materialized_views["views"][m_key], {
                            "cron_expression": blob_dict["scheduling"]["cron"],
                            "last_run": None,
                            "materialized":
                            blob_dict["views"][key]["materialized"],
                            "query_modified": True,
                            "depends_on":
                            blob_dict["views"][key]["depends_on"],
                        })

                    # Adds dependencies to runs
                    for dep in blob_dict["views"][key]["depends_on"]:
                        context.log.info(
                            f"Adding {dep} to runs as dependency of {m_key}")
                        if dep not in graph.nodes:
                            graph.add_node(dep)
                        graph.add_edge(dep, m_key)

                    # Try to find specific values for this view
                    blob = get_blob(f"{blob_path}/{key}.yaml",
                                    SENSOR_BUCKET,
                                    mode="staging")
                    if blob:
                        # Replace values in Redis
                        specific = yaml.safe_load(
                            blob.download_as_string().decode("utf-8"))
                        materialized_views["views"][m_key][
                            "cron_expression"] = specific["scheduling"]["cron"]
                    else:
                        context.log.warning(
                            f"No specific values for {m_key} found. This is not an error."
                        )

                # Update Redis effectively
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       materialized_views)

        # Otherwise, we need to add the blob_name and its
        # dependencies to the graph.
        for blob_name in [
                b for b in blob_names if not b.endswith("defaults.yaml")
        ]:

            # Get table name
            file_name = ".".join(blob_name.split("/")[-2:])
            table_name = ".".join(file_name.split(".")[:-1])

            context.log.info("#" * 80)
            context.log.info(f"Updating {table_name} specific values...")

            # If it's YAML file, update values on Redis
            if blob_name.endswith(".yaml"):

                # Read the blob
                blob = get_blob(blob_name, SENSOR_BUCKET, mode="staging")
                if blob is None:
                    raise Exception(f"Blob {blob_name} not found")
                blob_dict: dict = yaml.safe_load(blob.download_as_string())

                # Update Redis
                with views_lock:
                    materialized_views: dict = rp.get(
                        constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                    if materialized_views is None:
                        materialized_views = {}
                        materialized_views["views"] = {}

                    if table_name not in materialized_views["views"]:
                        materialized_views["views"][table_name] = {}
                    update_dict_with_dict(
                        materialized_views["views"][table_name], {
                            "cron_expression": blob_dict["scheduling"]["cron"],
                            "last_run": None,
                            "query_modified": True,
                        })
                    rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                           materialized_views)

            # Add table_name and its dependencies to runs
            context.log.info(f"Adding {table_name} to runs")
            if table_name not in graph.nodes:
                graph.add_node(table_name)

            materialized_views: dict = rp.get(
                constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
            if materialized_views is None:
                materialized_views = {}
                materialized_views["views"] = {}
            if table_name in materialized_views["views"]:
                for dep in materialized_views["views"][table_name][
                        "depends_on"]:
                    context.log.info(
                        f"Adding {dep} to runs as dependency of {table_name}")
                    if dep not in graph.nodes:
                        graph.add_node(dep)
                    graph.add_edge(dep, table_name)

        context.log.info(f"Graph edges: {graph.edges()}")

        # Get topological order
        order = list(nx.topological_sort(graph))

        # Filter out views that are not in the Redis registry (re-read so the
        # name is always bound, even when no blobs were processed above)
        materialized_views: dict = rp.get(
            constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value) or {"views": {}}
        order = [o for o in order if o in materialized_views["views"]]

        # Log topological order
        context.log.info(f"Order: {order}")

        # Execute queries in topological order
        for q in order:
            yield DynamicOutput(
                {
                    "view_name": q,
                    "materialization_lock": materialization_lock
                },
                mapping_key=q.replace(".", "_"))
    except Exception as e:
        # Best-effort release of the outer lock before re-raising.
        try:
            materialization_lock.release()
        except Exception:
            pass
        raise e
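The graph handling reduces to a few networkx calls; a minimal standalone sketch:

import networkx as nx

graph = nx.DiGraph()
graph.add_edge("dataset.parent", "dataset.child")  # edge: parent runs first
print(list(nx.topological_sort(graph)))
# ['dataset.parent', 'dataset.child']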
Example #6
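A failure hook: even when the solid fails, it refreshes the keep-alive key in Redis and posts a notice to a Discord webhook.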
def redis_keepalive_on_failure(context: HookContext):
    """On solid failure, still refresh the Redis keep-alive key and notify a
    Discord channel via webhook."""
    rp = RedisPal(host=constants.REDIS_HOST.value)
    rp.set(context.resources.keepalive_key["key"], 1)
    message = f"Although solid {context.solid.name} has failed, a keep-alive was sent to Redis!"
    url = context.resources.discord_webhook["url"]
    requests.post(url, data={"content": message})
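Hooks like this are typically declared with Dagster's decorators; a sketch, assuming the resources are provided under these (illustrative) keys:

from dagster import HookContext, failure_hook

@failure_hook(required_resource_keys={"keepalive_key", "discord_webhook"})
def redis_keepalive_on_failure(context: HookContext):
    ...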
Example #7
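The success-side counterpart of Example #6: it only refreshes the keep-alive key, with no Discord notification.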
def redis_keepalive_on_succes(context: HookContext):
    """On solid success, refresh the Redis keep-alive key."""
    rp = RedisPal(host=constants.REDIS_HOST.value)
    rp.set(context.resources.keepalive_key["key"], 1)
Example #8
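A Dagster sensor that diffs the GCS bucket against state kept in Redis (a set of blob names plus the last run's largest mtime) and either triggers the update pipeline or yields a SkipReason.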
def materialized_views_update_sensor(context: SensorExecutionContext):
    """Sensor for updating materialized views on file changes.

    For every new or modified file, the pipeline `update_managed_materialized_views`
    is triggered. This ensures BQ materialized views are always up-to-date.
    """
    # Store largest mtime
    largest_mtime = 0

    # Store deleted and modified blobs
    deleted_blobs = []
    modified_blobs = []

    # Get connection to Redis
    rp = RedisPal(host=constants.REDIS_HOST.value)

    # Get list of blobs in bucket
    blobs_list = get_list_of_blobs(MATERIALIZED_VIEWS_PREFIX, SENSOR_BUCKET)

    # Get previous set of blobs in Redis
    previous_blobs_set: set = rp.get(
        constants.REDIS_KEY_MAT_VIEWS_BLOBS_SET.value)

    # If there is no previous set, create it
    if not previous_blobs_set:
        rp.set(constants.REDIS_KEY_MAT_VIEWS_BLOBS_SET.value,
               {b.name for b in blobs_list})

    # If there is a previous set, compare it to the current set
    else:
        deleted_blobs: set = (previous_blobs_set -
                              {b.name for b in blobs_list})

    # Get previous run mtime
    previous_run_mtime = rp.get(
        constants.REDIS_KEY_MAT_VIEWS_LAST_RUN_MTIME.value)

    # If there is no previous run mtime, set modified blobs to the current blobs
    if not previous_run_mtime:
        modified_blobs = blobs_list

    # If there is a previous run mtime, compare it to the current list
    # and get modified files
    else:
        modified_blobs = filter_blobs_by_mtime(blobs_list, previous_run_mtime)

    # Update last run time
    largest_mtime = get_largest_blob_mtime(blobs_list)
    rp.set(constants.REDIS_KEY_MAT_VIEWS_LAST_RUN_MTIME.value, largest_mtime)

    # If there are modified or deleted files, trigger pipeline
    if modified_blobs or deleted_blobs:

        # Load run configuration and set inputs
        config: dict = read_config(
            Path(__file__).parent / "materialized_views_update.yaml")
        config["solids"]["delete_managed_views"]["inputs"]["blob_names"][
            "value"] = list(deleted_blobs)
        config["solids"]["update_managed_views"]["inputs"]["blob_names"][
            "value"] = [b.name for b in modified_blobs]

        # Set a run key
        run_key: str = build_run_key("update-managed-views", largest_mtime)

        # Yield a run request
        yield RunRequest(run_key=run_key, run_config=config)

    # If there are no modified or deleted files,
    # skip the pipeline
    else:
        yield SkipReason(
            f"Modified files: {len(modified_blobs)}. Deleted files: {len(deleted_blobs)}"
        )
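The two mtime helpers are not shown here; a plausible minimal sketch, assuming google-cloud-storage blobs (whose updated attribute is a datetime) and epoch-seconds mtimes (helper bodies are assumptions):

def filter_blobs_by_mtime(blobs_list, last_mtime):
    # Keep only blobs modified after the previous run
    return [b for b in blobs_list if b.updated.timestamp() > last_mtime]

def get_largest_blob_mtime(blobs_list):
    # Newest modification time across all blobs
    return max(b.updated.timestamp() for b in blobs_list)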