Exemple #1
0
def test_register_after_solid_definition():
    """Registering a Dagster type for a class after a solid already uses it must raise."""

    class MyClass:
        pass

    # Defining the solid first means MyClass has already been used as an
    # output annotation before the registration attempt below.
    @solid
    def _my_solid(_) -> MyClass:
        return MyClass()

    late_dagster_type = DagsterType(
        name="aaaa",
        type_check_fn=lambda _, _a: True,
    )

    with pytest.raises(DagsterInvalidDefinitionError):
        make_python_type_usable_as_dagster_type(MyClass, late_dagster_type)
Exemple #2
0
def test_make_usable_as_dagster_type():
    """A Python class registered with a Dagster type can be used in solid annotations."""

    class EvenType:
        def __init__(self, num):
            # Compare by value: ``is 0`` tests object identity, which only
            # happens to work because CPython caches small ints, and it emits
            # a SyntaxWarning on Python 3.8+.
            assert num % 2 == 0
            self.num = num

    EvenDagsterType = PythonObjectDagsterType(EvenType, name="EvenDagsterType")

    make_python_type_usable_as_dagster_type(EvenType, EvenDagsterType)

    @solid
    def double_even(_, even_num: EvenType) -> EvenType:
        return EvenType(even_num.num * 2)

    # The solid must execute successfully when fed an EvenType input.
    assert execute_solid(double_even, input_values={"even_num": EvenType(2)}).success
Exemple #3
0
def test_make_usable_as_dagster_type_called_twice():
    """Repeating an identical registration is allowed; a conflicting one is rejected."""

    class AType:
        pass

    first_type = PythonObjectDagsterType(AType, name="ADagsterType")
    conflicting_type = PythonObjectDagsterType(AType, name="BDagsterType")

    # Registering the same (python type, dagster type) pair twice should not
    # raise an error.
    for _ in range(2):
        make_python_type_usable_as_dagster_type(AType, first_type)

    # Mapping the same Python type to a different Dagster type must fail.
    with pytest.raises(DagsterInvalidDefinitionError):
        make_python_type_usable_as_dagster_type(AType, conflicting_type)
Exemple #4
0
# start-snippet
from pathlib import Path

from dagster import graph, make_python_type_usable_as_dagster_type, op, repository
from dagster.core.definitions.no_step_launcher import no_step_launcher
from dagster_aws.emr import emr_pyspark_step_launcher
from dagster_aws.s3 import s3_pickle_io_manager, s3_resource
from dagster_pyspark import DataFrame as DagsterPySparkDataFrame
from dagster_pyspark import pyspark_resource
from pyspark.sql import DataFrame, Row
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Register the mapping pyspark.sql.DataFrame -> dagster_pyspark.DataFrame so the
# plain PySpark class can be used in the type annotations below.
make_python_type_usable_as_dagster_type(
    python_type=DataFrame,
    dagster_type=DagsterPySparkDataFrame,
)


# Builds a three-row (name, age) DataFrame with the Spark session exposed by
# the "pyspark" resource.
@op(required_resource_keys={"pyspark", "pyspark_step_launcher"})
def make_people(context) -> DataFrame:
    people_schema = StructType(
        [
            StructField("name", StringType()),
            StructField("age", IntegerType()),
        ]
    )
    people_rows = [
        Row(name="Thom", age=51),
        Row(name="Jonny", age=48),
        Row(name="Nigel", age=49),
    ]
    spark = context.resources.pyspark.spark_session
    return spark.createDataFrame(people_rows, people_schema)


# Keeps only the rows whose "age" column is strictly greater than 50.
@op(required_resource_keys={"pyspark_step_launcher"})
def filter_over_50(people: DataFrame) -> DataFrame:
    is_over_50 = people["age"] > 50
    return people.filter(is_over_50)


# Returns the number of rows in the incoming DataFrame.
@op(required_resource_keys={"pyspark_step_launcher"})
def count_people(people: DataFrame) -> int:
    row_count = people.count()
    return row_count
Exemple #5
0
    check,
    composite_solid,
    make_python_type_usable_as_dagster_type,
    solid,
)
from dagster.core.types.dagster_type import create_string_type

from .cache_file_from_s3 import cache_file_from_s3
from .unzip_file_handle import unzip_file_handle

# Named type built with create_string_type, used for database table names.
SqlTableName = create_string_type(
    "SqlTableName",
    description="The name of a database table",
)


# Make pyspark.sql.DataFrame map to dagster_pyspark.DataFrame
make_python_type_usable_as_dagster_type(
    python_type=DataFrame,
    dagster_type=dagster_pyspark.DataFrame,
)


# Regex character class matching a space, comma, semicolon, brace,
# parenthesis, newline, tab, or equals sign.
PARQUET_SPECIAL_CHARACTERS = r"[ ,;{}()\n\t=]"


def _notebook_path(name):
    """Return the path of ``name`` inside the ``notebooks`` directory beside this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    notebooks_dir = os.path.join(module_dir, "notebooks")
    return os.path.join(notebooks_dir, name)


# start_solids_marker_3
def notebook_solid(name, notebook_path, input_defs, output_defs, required_resource_keys):
    return define_dagstermill_solid(
        name,
        _notebook_path(notebook_path),
Exemple #6
0
from dagster import make_python_type_usable_as_dagster_type, PythonObjectDagsterType, input_hydration_config, Selector, \
    Int, Field
from datetime import date
import pandas as pd


# Let pandas DataFrames be used directly as a Dagster type.
make_python_type_usable_as_dagster_type(
    python_type=pd.DataFrame,
    dagster_type=PythonObjectDagsterType(pd.DataFrame),
)


# Builds a ``datetime.date`` from a selector of the form
# {"date": {"year": ..., "month": ..., "day": ...}}.
@input_hydration_config(
    Selector(
        {
            "date": {
                "year": Field(Int),
                "month": Field(Int),
                "day": Field(Int),
            }
        }
    )
)
def parse_date(context, selector):
    parts = selector["date"]
    return date(year=parts["year"], month=parts["month"], day=parts["day"])


# Register ``date`` with a Dagster type whose inputs are hydrated by parse_date.
make_python_type_usable_as_dagster_type(
    python_type=date,
    dagster_type=PythonObjectDagsterType(date, input_hydration_config=parse_date),
)
Exemple #7
0
import argparse
import csv
from dataclasses import dataclass
import logging
import functools
import sys
from typing import Any, Callable, NoReturn, TextIO, TypeVar, cast

from dagster import make_python_type_usable_as_dagster_type
from dagster.core.types.dagster_type import String as DagsterString

from dagster_utils.contrib.google import default_google_access_token
from dagster_utils.contrib.data_repo.typing import JobId
from data_repo_client import ApiClient, Configuration, RepositoryApi, ApiException, ResourcesApi

# Map JobId onto Dagster's String type.
make_python_type_usable_as_dagster_type(
    python_type=JobId,
    dagster_type=DagsterString,
)

# Data repo base URL for each environment name.
data_repo_host = {
    "dev": "https://jade.datarepo-dev.broadinstitute.org/",
    "prod": "https://jade-terra.datarepo-prod.broadinstitute.org/",
    "real_prod": "https://data.terra.bio/",
}

# Profile id for each environment name.
# NOTE(review): there is no "real_prod" entry here, unlike data_repo_host —
# confirm that this is intentional.
data_repo_profile_ids = {
    "dev": "390e7a85-d47f-4531-b612-165fc977d3bd",
    "prod": "db61c343-6dfe-4d14-84e9-60ddf97ea73f",
}


@dataclass
class ProblemCount:
Exemple #8
0
from dagster import (
    execute_pipeline,
    make_python_type_usable_as_dagster_type,
    pipeline,
    repository,
    solid,
)
from dagster_pandas import DataFrame as DagsterPandasDataFrame
from pandas import DataFrame

# Map pandas.DataFrame onto dagster_pandas.DataFrame so the plain pandas class
# can be used in the solid annotations below.
make_python_type_usable_as_dagster_type(
    python_type=DataFrame,
    dagster_type=DagsterPandasDataFrame,
)

# start_intro_0
@solid(description="Calculates the grams of sugar per cup of each kind of cereal.")
def sugar_by_volume(_, cereals: DataFrame) -> DataFrame:
    # Work on a copy of the "name" column so the input frame is not modified.
    result = cereals[["name"]].copy()
    sugar_per_cup = cereals["sugars"] / cereals["cups"]
    result["sugar_per_cup"] = sugar_per_cup
    return result


@solid(description="Finds the sugar-per-cup cutoff for the top quartile of cereals.")
def top_quartile_cutoff(_, cereals: DataFrame) -> float:
    # The 0.75 quantile is the boundary of the top quarter.
    sugar_per_cup = cereals["sugar_per_cup"]
    return sugar_per_cup.quantile(0.75)


@solid(description="Selects cereals whose sugar-per-cup exceeds the given cutoff.")
def sugariest_cereals(_, cereals: DataFrame, cutoff: float) -> DataFrame:
    # Boolean mask: strictly greater than the cutoff.
    above_cutoff = cereals["sugar_per_cup"] > cutoff
    return cereals[above_cutoff]


@pipeline
Exemple #9
0
from dagster import solid, SolidExecutionContext, Field, Array, String, PythonObjectDagsterType, make_python_type_usable_as_dagster_type
from typing import Any, Optional, List, TYPE_CHECKING
from azmeta.access.specifications import AzureComputeSpecifications, load_compute_specifications

# Dagster type wrapping AzureComputeSpecifications, registered so the class
# itself can be used as a type annotation.
AzureComputeSpecificationsDagsterType = PythonObjectDagsterType(AzureComputeSpecifications)
make_python_type_usable_as_dagster_type(
    python_type=AzureComputeSpecifications,
    dagster_type=AzureComputeSpecificationsDagsterType,
)


# Loads the compute specifications, logging through the solid's context logger.
# NOTE(review): the 'subscription' config field is declared but never read in
# the body — confirm whether it should be passed to load_compute_specifications.
@solid(
    config_schema={
        'subscription': Field(
            String,
            is_required=False,
            description='The subscription ID to list SKUs from.',
        )
    }
)
def load_compute_specs(context: SolidExecutionContext) -> AzureComputeSpecifications:
    run_logger = context.log
    return load_compute_specifications(logger=run_logger)
Exemple #10
0
from dagster import PythonObjectDagsterType, make_python_type_usable_as_dagster_type, op


class EvenType:
    """Wrapper around a number that is asserted to be even.

    Attributes:
        num: The wrapped value; ``num % 2`` must equal zero.

    Raises:
        AssertionError: If ``num`` is not even.
    """

    def __init__(self, num):
        # Compare by value: ``is 0`` tests object identity, which only happens
        # to work for plain ints because CPython caches small integers. It
        # fails for other numeric types (e.g. ``4.0 % 2`` is ``0.0``) and
        # emits a SyntaxWarning on Python 3.8+.
        assert num % 2 == 0
        self.num = num


# Dagster type for EvenType, registered so EvenType itself can be used as an
# annotation on ops.
EvenDagsterType = PythonObjectDagsterType(
    EvenType,
    name="EvenDagsterType",
)

make_python_type_usable_as_dagster_type(
    python_type=EvenType,
    dagster_type=EvenDagsterType,
)


# Returns a new EvenType holding twice the input's number.
@op
def double_even(even_num: EvenType) -> EvenType:
    doubled = even_num.num * 2
    return EvenType(doubled)
Exemple #11
0
"""
Complex type signatures that appear multiple times throughout the code
base can live here, for easy reference and descriptive naming.
"""

from typing import NamedTuple

from dagster import make_python_type_usable_as_dagster_type
from dagster.core.types.dagster_type import String as DagsterString


class HcaScratchDatasetName(str):
    """Distinct ``str`` subclass for scratch dataset names; adds no behavior of its own."""


# Map HcaScratchDatasetName onto Dagster's String type.
make_python_type_usable_as_dagster_type(
    python_type=HcaScratchDatasetName,
    dagster_type=DagsterString,
)


class MetadataType(str):
    """Distinct ``str`` subclass for metadata type names; adds no behavior of its own."""


class MetadataTypeFanoutResult(NamedTuple):
    """Immutable record bundling a scratch dataset name, a metadata type, and a path.

    NOTE(review): presumably one record is produced per metadata type during a
    fan-out step — confirm against the code that constructs it.
    """

    # Name of the scratch dataset this record belongs to.
    scratch_dataset_name: HcaScratchDatasetName
    # Metadata type this record describes.
    metadata_type: MetadataType
    # Path associated with the record, as a plain string.
    path: str