Example #1
def _id_range_for_time(start: int, end: int, hn_client):
    """Finds the Hacker News item IDs whose timestamps bound the [start, end] window,
    returning that ID range along with metadata entries describing it."""
    check.invariant(end >= start, "End time comes before start time")

    def _get_item_timestamp(item_id):
        item = hn_client.fetch_item_by_id(item_id)
        return item["time"]

    max_item_id = hn_client.fetch_max_item_id()

    # declared by resource to allow testability against snapshot
    min_item_id = hn_client.min_item_id()

    start_id = binary_search_nearest_left(_get_item_timestamp, min_item_id,
                                          max_item_id, start)
    end_id = binary_search_nearest_right(_get_item_timestamp, min_item_id,
                                         max_item_id, end)

    start_timestamp = str(
        datetime.fromtimestamp(_get_item_timestamp(start_id), tz=timezone.utc))
    end_timestamp = str(
        datetime.fromtimestamp(_get_item_timestamp(end_id), tz=timezone.utc))

    metadata_entries = [
        MetadataEntry.int(value=max_item_id, label="max_item_id"),
        MetadataEntry.int(value=start_id, label="start_id"),
        MetadataEntry.int(value=end_id, label="end_id"),
        MetadataEntry.int(value=end_id - start_id, label="items"),
        MetadataEntry.text(text=start_timestamp, label="start_timestamp"),
        MetadataEntry.text(text=end_timestamp, label="end_timestamp"),
    ]

    id_range = (start_id, end_id)
    return id_range, metadata_entries
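
Note: binary_search_nearest_left and binary_search_nearest_right are used above but not shown. Below is a minimal sketch of the left-bound search, assuming the values returned by get_value are non-decreasing in the item ID; the actual helpers in the source may handle edge cases differently:

def binary_search_nearest_left(get_value, start, end, min_target):
    # Smallest id in [start, end] whose value is >= min_target, or None if none exists.
    while start < end:
        mid = (start + end) // 2
        if get_value(mid) < min_target:
            start = mid + 1
        else:
            end = mid
    return start if get_value(start) >= min_target else None
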
Example #2
    def handle_output(self, context: OutputContext, obj: Union[PandasDataFrame, SparkDataFrame]):
        schema, table = DB_SCHEMA, context.asset_key.path[-1]

        time_window = context.asset_partitions_time_window if context.has_asset_partitions else None
        with connect_snowflake(config=self._config, schema=schema) as con:
            con.execute(self._get_cleanup_statement(table, schema, time_window))

        if isinstance(obj, SparkDataFrame):
            yield from self._handle_spark_output(obj, schema, table)
        elif isinstance(obj, PandasDataFrame):
            yield from self._handle_pandas_output(obj, schema, table)
        elif obj is None:  # dbt created the table; we only record metadata about it
            config = dict(SHARED_SNOWFLAKE_CONF)
            config["schema"] = DB_SCHEMA
            with connect_snowflake(config=config) as con:
                df = read_sql(f"SELECT * FROM {context.name} LIMIT 5", con=con)
                # fetchone() returns a row tuple; take the COUNT(*) value
                num_rows = con.execute(f"SELECT COUNT(*) FROM {context.name}").fetchone()[0]

            yield MetadataEntry.md(df.to_markdown(), "Data sample")
            yield MetadataEntry.int(num_rows, "Rows")
        else:
            raise Exception(
                "SnowflakeIOManager only supports pandas DataFrames, Spark DataFrames, "
                "and None (for tables created by dbt)"
            )

        yield MetadataEntry.text(
            self._get_select_statement(
                table,
                schema,
                None,
                time_window,
            ),
            "Query",
        )
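
The _get_cleanup_statement and _get_select_statement helpers are referenced but not shown here. One plausible shape for the select-statement builder is sketched below; the partition column ("TO_TIMESTAMP(time)") and the exact formatting are assumptions, not the actual implementation:

    def _get_select_statement(self, table, schema, columns, time_window):
        # Select the requested columns (or everything), optionally limited to the
        # partition's time window. The "time" column name is an assumption.
        col_str = ", ".join(columns) if columns else "*"
        if time_window:
            start, end = time_window
            return (
                f"SELECT {col_str} FROM {schema}.{table} "
                f"WHERE TO_TIMESTAMP(time) BETWEEN '{start}' AND '{end}'"
            )
        return f"SELECT {col_str} FROM {schema}.{table}"
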
Example #3
    def handle_output(self, context, obj):
        table_name = context.name
        write_dataframe_to_table(name=table_name, dataframe=obj)

        # attach these to the Handled Output event
        yield MetadataEntry.int(len(obj), label="number of rows")
        yield MetadataEntry.text(table_name, label="table name")
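
For context, the load_input half of an IO manager like this one could look as follows; read_dataframe_from_table is a hypothetical helper mirroring write_dataframe_to_table:

    def load_input(self, context):
        # Read back the table that the upstream output wrote.
        table_name = context.upstream_output.name
        return read_dataframe_from_table(name=table_name)
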
Example #4
    def handle_output(self, context, obj):
        file_path = os.path.join("my_base_dir", context.step_key, context.name)

        obj.to_csv(file_path)

        yield MetadataEntry.int(obj.shape[0], label="number of rows")
        yield MetadataEntry.float(obj["some_column"].mean(), label="some_column mean")
Example #5
def materialize(_):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry.text("text is cool", "text"),
            MetadataEntry.url("https://bigty.pe/neato", "url"),
            MetadataEntry.fspath("/tmp/awesome", "path"),
            MetadataEntry.json({"is_dope": True}, "json"),
            MetadataEntry.python_artifact(MetadataEntry, "python class"),
            MetadataEntry.python_artifact(file_relative_path, "python function"),
            MetadataEntry.float(1.2, "float"),
            MetadataEntry.int(1, "int"),
            MetadataEntry.float(float("nan"), "float NaN"),
            MetadataEntry.int(LONG_INT, "long int"),
            MetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            MetadataEntry.asset(AssetKey("my_asset"), "my asset"),
            MetadataEntry.table(
                label="table",
                records=[
                    TableRecord(foo=1, bar=2),
                    TableRecord(foo=3, bar=4),
                ],
            ),
            MetadataEntry.table_schema(
                label="table_schema",
                schema=TableSchema(
                    columns=[
                        TableColumn(
                            name="foo",
                            type="integer",
                            constraints=TableColumnConstraints(unique=True),
                        ),
                        TableColumn(name="bar", type="string"),
                    ],
                    constraints=TableConstraints(other=["some constraint"]),
                ),
            ),
        ],
    )
    yield Output(None)
Example #6
    def handle_output(self, context, obj):
        key = context.asset_key.path[-1]
        bucket = context.resource_config["bucket"]

        context.log.debug("about to pickle object")
        pickled_obj = pickle.dumps(obj)
        yield MetadataEntry.int(len(pickled_obj), "Bytes")
        client = s3_client()
        context.log.debug("created S3 client")
        client.put_object(Bucket=bucket, Key=key, Body=pickled_obj)
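
A matching load_input for this S3 pickle IO manager might fetch the same key and unpickle the body. This is a sketch under the same bucket/asset-key assumptions as the handler above, not code from the source:

    def load_input(self, context):
        # Re-derive the bucket and key used by handle_output, then unpickle the object.
        key = context.upstream_output.asset_key.path[-1]
        bucket = context.resource_config["bucket"]
        response = s3_client().get_object(Bucket=bucket, Key=key)
        return pickle.loads(response["Body"].read())
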
Example #7
def backcompat_materialize(_):
    yield Materialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry.text("text is cool", "text"),
            MetadataEntry.url("https://bigty.pe/neato", "url"),
            MetadataEntry.fspath("/tmp/awesome", "path"),
            MetadataEntry.json({"is_dope": True}, "json"),
            MetadataEntry.python_artifact(MetadataEntry, "python class"),
            MetadataEntry.python_artifact(file_relative_path, "python function"),
            MetadataEntry.float(1.2, "float"),
            MetadataEntry.int(1, "int"),
            MetadataEntry.float(float("nan"), "float NaN"),
            MetadataEntry.int(LONG_INT, "long int"),
            MetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            MetadataEntry.asset(AssetKey("my_asset"), "my asset"),
        ],
    )
    yield Output(None)
Example #8
    def handle_output(
        self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
    ):

        path = self._get_path(context)
        if isinstance(obj, pandas.DataFrame):
            row_count = len(obj)
            obj.to_parquet(path=path, index=False)
        elif isinstance(obj, pyspark.sql.DataFrame):
            row_count = obj.count()
            obj.write.parquet(path=path, mode="overwrite")
        else:
            raise Exception(f"Outputs of type {type(obj)} not supported.")
        yield MetadataEntry.int(value=row_count, label="row_count")
        yield MetadataEntry.path(path=path, label="path")
Example #9
    def handle_output(self, context, obj: pd.DataFrame):
        """This saves the dataframe as a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        obj.to_csv(fpath)
        with open(fpath + ".version", "w") as f:
            f.write(context.version if context.version else "None")

        yield MetadataEntry.int(obj.shape[0], "Rows")
        yield MetadataEntry.path(fpath, "Path")
        yield MetadataEntry.md(obj.head(5).to_markdown(), "Sample")
        yield MetadataEntry.text(context.version, "Resolved version")
        yield MetadataEntry.table_schema(
            self.get_schema(context.dagster_type),
            "Schema",
        )
Example #10
    def _get_metadata(self, result: Dict[str, Any]) -> List[MetadataEntry]:
        """
        Here, we run queries against our output Snowflake database tables to add additional context
        to our asset materializations.
        """

        table_name = result["unique_id"].split(".")[-1]
        with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
            n_rows = pandas.read_sql_query(f"SELECT COUNT(*) FROM {table_name}", con)
            sample_rows = pandas.read_sql_query(
                f"SELECT * FROM {table_name} SAMPLE ROW (10 rows)", con
            )
        return super()._get_metadata(result) + [
            MetadataEntry.int(int(n_rows.iloc[0][0]), "dbt Model Number of Rows"),
            MetadataEntry.md(sample_rows.astype("str").to_markdown(), "dbt Model Sample Rows"),
        ]
Example #11
    def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
        from snowflake import connector  # pylint: disable=no-name-in-module

        yield MetadataEntry.int(obj.shape[0], "Rows")
        yield MetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

        connector.paramstyle = "pyformat"
        with connect_snowflake(config=self._config, schema=schema) as con:
            with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")
            with_uppercase_cols.to_sql(
                table,
                con=con,
                if_exists="append",
                index=False,
                method=pd_writer,
            )
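
pandas_columns_to_markdown is not shown above; here is a plausible sketch that renders column names and dtypes as a Markdown table (the real helper may format things differently):

def pandas_columns_to_markdown(dataframe: PandasDataFrame) -> str:
    # One row per column: name and pandas dtype.
    rows = "\n".join(f"| {name} | {dtype} |" for name, dtype in dataframe.dtypes.items())
    return "| Name | Type |\n| ---- | ---- |\n" + rows
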
Example #12
    def handle_output(self, context: OutputContext,
                      obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]):
        path = self._get_path(context)
        if "://" not in self._base_path:
            os.makedirs(os.path.dirname(path), exist_ok=True)

        if isinstance(obj, pandas.DataFrame):
            row_count = len(obj)
            context.log.info(f"Row count: {row_count}")
            obj.to_parquet(path=path, index=False)
        elif isinstance(obj, pyspark.sql.DataFrame):
            row_count = obj.count()
            obj.write.parquet(path=path, mode="overwrite")
        else:
            raise Exception(f"Outputs of type {type(obj)} not supported.")
        yield MetadataEntry.int(value=row_count, label="row_count")
        yield MetadataEntry.path(path=path, label="path")
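
Both parquet handlers above call self._get_path(context) without showing it. One common approach, sketched here under the assumption of a configured base path and a ".pq" extension, keys the file off run id, step key, and output name:

    def _get_path(self, context: OutputContext) -> str:
        # One parquet file per output, namespaced by run, step, and output name.
        return os.path.join(
            self._base_path, f"{context.run_id}_{context.step_key}_{context.name}.pq"
        )
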
Example #13
@responses.activate
def test_assets():
    ab_resource = airbyte_resource(
        build_init_resource_context(config={
            "host": "some_host",
            "port": "8000",
        }))
    ab_assets = build_airbyte_assets("12345", ["foo", "bar"],
                                     asset_key_prefix=["some", "prefix"])

    assert len(ab_assets[0].op.output_defs) == 2

    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/get",
        json={
            "name": "xyz",
            "syncCatalog": {
                "streams": [
                    {
                        "stream": {
                            "name": "foo",
                            "jsonSchema": {
                                "properties": {
                                    "a": {
                                        "type": "str"
                                    },
                                    "b": {
                                        "type": "int"
                                    }
                                }
                            },
                        },
                        "config": {
                            "selected": True
                        },
                    },
                    {
                        "stream": {
                            "name": "bar",
                            "jsonSchema": {
                                "properties": {
                                    "c": {
                                        "type": "str"
                                    },
                                }
                            },
                        },
                        "config": {
                            "selected": True
                        },
                    },
                    {
                        "stream": {
                            "name": "baz",
                            "jsonSchema": {
                                "properties": {
                                    "d": {
                                        "type": "str"
                                    },
                                }
                            },
                        },
                        "config": {
                            "selected": True
                        },
                    },
                ]
            },
        },
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/sync",
        json={"job": {
            "id": 1
        }},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/jobs/get",
        json={
            "job": {
                "id": 1,
                "status": AirbyteState.SUCCEEDED
            },
            "attempts": [{
                "attempt": {
                    "streamStats": [
                        {
                            "streamName": "foo",
                            "stats": {
                                "bytesEmitted": 1234,
                                "recordsCommitted": 4321,
                            },
                        },
                        {
                            "streamName": "bar",
                            "stats": {
                                "bytesEmitted": 1234,
                                "recordsCommitted": 4321,
                            },
                        },
                        {
                            "streamName": "baz",
                            "stats": {
                                "bytesEmitted": 1111,
                                "recordsCommitted": 1111,
                            },
                        },
                    ]
                }
            }],
        },
        status=200,
    )

    ab_job = build_assets_job(
        "ab_job",
        ab_assets,
        resource_defs={
            "airbyte":
            airbyte_resource.configured({
                "host": "some_host",
                "port": "8000",
            })
        },
    )

    res = ab_job.execute_in_process()

    materializations = [
        event for event in res.events_for_node("airbyte_sync_12345")
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 3
    assert (
        MetadataEntry.text("a,b", "columns")
        in materializations[0].event_specific_data.materialization.metadata_entries
    )
    assert (
        MetadataEntry.int(1234, "bytesEmitted")
        in materializations[0].event_specific_data.materialization.metadata_entries
    )
    assert (
        MetadataEntry.int(4321, "recordsCommitted")
        in materializations[0].event_specific_data.materialization.metadata_entries
    )
Example #14
@responses.activate
def test_assets():
    ab_resource = airbyte_resource(
        build_init_resource_context(config={
            "host": "some_host",
            "port": "8000",
        }))
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/get",
        json={
            "name": "xyz",
            "syncCatalog": {
                "streams": [
                    {
                        "stream": {
                            "name": "foo",
                            "jsonSchema": {
                                "properties": {
                                    "a": {
                                        "type": "str"
                                    },
                                    "b": {
                                        "type": "int"
                                    }
                                }
                            },
                        },
                        "config": {
                            "selected": True
                        },
                    },
                    {
                        "stream": {
                            "name": "bar",
                            "jsonSchema": {
                                "properties": {
                                    "c": {
                                        "type": "str"
                                    },
                                }
                            },
                        },
                        "config": {
                            "selected": True
                        },
                    },
                    {
                        "stream": {
                            "name": "baz",
                            "jsonSchema": {
                                "properties": {
                                    "d": {
                                        "type": "str"
                                    },
                                }
                            },
                        },
                        "config": {
                            "selected": False
                        },
                    },
                ]
            },
        },
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/sync",
        json={"job": {
            "id": 1
        }},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/jobs/get",
        json={
            "job": {
                "id": 1,
                "status": AirbyteState.SUCCEEDED
            },
            "attempts": [{
                "attempt": {
                    "streamStats": [
                        {
                            "streamName": "foo",
                            "stats": {
                                "bytesEmitted": 1234,
                                "recordsCommitted": 4321,
                            },
                        },
                        {
                            "streamName": "bar",
                            "stats": {
                                "bytesEmitted": 1234,
                                "recordsCommitted": 4321,
                            },
                        },
                    ]
                }
            }],
        },
        status=200,
    )

    airbyte_output = ab_resource.sync_and_poll("some_connection", 0, None)

    materializations = list(generate_materializations(airbyte_output, []))
    assert len(materializations) == 2

    assert MetadataEntry.text("a,b", "columns") in materializations[0].metadata_entries
    assert MetadataEntry.int(1234, "bytesEmitted") in materializations[0].metadata_entries
    assert MetadataEntry.int(4321, "recordsCommitted") in materializations[0].metadata_entries