class Iris(Relation):
    # setting some additional Field attributes, by example:
    # 1. most importantly, one can set a datatype to complete the
    #    Relation's schema definition in the classical sense:
    SEPAL_LENGTH = Field("sepal_length", datatype=float)

    # for use with Pandas, `datatype` accepts Python built-ins (e.g. float, object, int),
    # quoted names of Pandas datatypes, and also numpy types, such as
    # numpy.int64, numpy.number, etc.

    # 2. to capture metadata and for documentation purposes, a description can be set:
    SEPAL_WIDTH = Field("sepal_width", description="The sepal width")

    # 3. similarly, `default` captures a default row value for a Field:
    PETAL_LENGTH = Field("petal_length", default=0.0)
    PETAL_WIDTH = Field("petal_width")

    # 4. the boolean flag `key` marks key fields:
    SPECIES = Field("species", key=True)

    @classmethod
    def load(cls) -> pd.DataFrame:
        iris = pd.read_csv(
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv")
        # using all defined key fields, one can run a simple deduplication command:
        iris_dedup = iris.drop_duplicates(subset=cls.get_key_fields())
        return iris_dedup
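
# A minimal usage sketch (an illustration added here, not part of the original
# example): `load` returns the deduplicated frame, and Field constants double
# as column labels, as the Spark examples further below suggest.
iris = Iris.load()
print(Iris.get_key_fields())           # -> the Fields flagged with key=True
print(iris[Iris.SEPAL_LENGTH].mean())  # assumes a Field behaves as its name string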
Example #2
def test_field_type_to_string():
    assert _field_type_to_string(Field("test").datatype) == "Undefined"
    assert _field_type_to_string(Field("test", datatype="str").datatype) == "str"
    assert (
        _field_type_to_string(Field("test", datatype=pyspark.sql.types.StringType).datatype)
        == "string"
    )
Example #3
class TestApplyDedupFieldIsMissing(Relation):
    """

    """

    NAME = Field(name="name")
    AGE = Field(name="age")

    @classmethod
    @blizz.apply.deduplication(key=AGE, sort_order=blizz.apply.ASC)
    def load(cls) -> pd.DataFrame:
        return pd.DataFrame(data={"name": ["Tom", "Mike", "Mike"]})
Example #4
class TestApplyDefaultsPartial2(Relation):
    """

    """

    NAME = Field(name="name", default="Peter")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults(fill=NAME)
    def load(cls) -> pd.DataFrame:
        return pd.DataFrame(data={"name": ["Tom", None, "Sarah"], "age": [5, 31, None]})
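
# For intuition: with `fill=NAME`, only the NAME Field's default is applied. In
# plain Pandas terms the expected effect is roughly this (an assumption about
# blizz.apply.defaults, not its actual implementation):
df = pd.DataFrame(data={"name": ["Tom", None, "Sarah"], "age": [5, 31, None]})
df["name"] = df["name"].fillna("Peter")  # AGE keeps its null, as it is not listed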
Example #5
class Iris(Relation):

    SEPAL_LENGTH = Field("sepal_length")
    SEPAL_WIDTH = Field("sepal_width")
    PETAL_LENGTH = Field("petal_length")
    PETAL_WIDTH = Field("petal_width")
    SPECIES = Field("species")

    @classmethod
    def load(cls) -> pd.DataFrame:
        return pd.read_csv(
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv")
Example #6
class TestApplyDedupSort(Relation):
    """

    """

    NAME = Field(name="name")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication(key=[NAME], sort_on=[AGE], sort_order=blizz.apply.ASC)
    def load(cls) -> pd.DataFrame:
        return pd.DataFrame(data={"name": ["Tom", "Mike", "Mike"], "age": [5, 25, 31]})
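
# The sort arguments control which duplicate survives. A plain-Pandas sketch of
# the expected semantics (an assumption: sort by AGE ascending, then keep the
# first row per NAME key):
df = pd.DataFrame(data={"name": ["Tom", "Mike", "Mike"], "age": [5, 25, 31]})
deduped = df.sort_values("age").drop_duplicates(subset=["name"], keep="first")
# -> keeps the "Mike" row with age 25, the smallest age per name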
Example #7
class TestApplyDedupPartial(Relation):
    """

    """

    NAME = Field(name="name")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication(key=[AGE])
    def load(cls) -> pd.DataFrame:
        return pd.DataFrame(data={"name": ["Tom", "Sarah", "Mike"], "age": [5, 31, 31]})
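
# Here the key passed to the decorator overrides the Relation's declared keys.
# In plain Pandas the expected effect (an assumption) mirrors the manual call
# from the first example:
df = pd.DataFrame(data={"name": ["Tom", "Sarah", "Mike"], "age": [5, 31, 31]})
deduped = df.drop_duplicates(subset=["age"])  # one of the two age-31 rows is dropped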
Example #8
class TestApplyDefaults1(Relation):
    """

    """

    NAME = Field(name="name", default="")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    def load(cls) -> pd.DataFrame:
        return pd.DataFrame(
            data={"name": ["Tom", "Mike", "Sarah"], "age": [5, 31, None]}
        )
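
# Expected effect of a bare @blizz.apply.defaults (an assumption: every Field
# that defines a default gets its nulls filled), sketched in plain Pandas:
df = pd.DataFrame(data={"name": ["Tom", "Mike", "Sarah"], "age": [5, 31, None]})
df = df.fillna({"name": "", "age": 20})  # the missing age becomes 20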
Example #9
class TestApplyRenameThroughArg(Relation):
    """

    """

    NAME = Field(name="name_renamed")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication
    @blizz.apply.renames(columns={"name": "name_renamed"})
    def load(cls) -> pd.DataFrame:
        return pd.DataFrame(data={"name": ["Tom", "Mike", "Mike"], "age": [5, 31, 31]})
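
# @blizz.apply.renames with an explicit mapping is expected (an assumption) to
# boil down to a plain column rename, applied before the other decorators see
# the frame:
df = pd.DataFrame(data={"name": ["Tom", "Mike", "Mike"], "age": [5, 31, 31]})
df = df.rename(columns={"name": "name_renamed"})  # now matches the NAME Field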
Example #10
class StudentPerformance(Relation):
    """
    This is the example data source "StudentPerformance" for testing.
    """

    STUDENT_ID = Field(name="Student_ID", datatype=object)
    SEMSTER_NAME = Field("Semster_Name", datatype=object)
    PAPER_ID = Field(name="Paper_ID", datatype=object)
    MARKS = Field(name="Marks", datatype=int)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        return pd.read_csv(path_student_performance_test().as_posix())
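
# A rough sketch of what @blizz.check.types verifies for Pandas (an assumption
# about the check, not blizz's implementation): each declared datatype is
# compared against the loaded frame's dtypes.
df = StudentPerformance.load()
assert df[StudentPerformance.MARKS].dtype.kind == "i"  # MARKS was declared as int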
Example #11
class Iris(Relation):

    SEPAL_LENGTH = Field("sepal_length", datatype=float)
    SEPAL_WIDTH = Field("sepal_width", datatype=float)
    PETAL_LENGTH = Field("petal_length", default=0.0)
    PETAL_WIDTH = Field("petal_width", datatype=float)
    SPECIES = Field("species_renamed", datatype=object, key=True, source_name="species")

    @classmethod
    @blizz.check.func(function=my_custom_check, on_fail=blizz.check.WARN)
    @blizz.apply.func(function=my_custom_apply)
    def load(cls) -> pd.DataFrame:
        return pd.read_csv(
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"
        )
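
# `my_custom_check` and `my_custom_apply` are user-supplied and not defined in
# this example. Hypothetical implementations could look like this (the exact
# signatures are an assumption; they would need to be defined before the class):
def my_custom_check(data: pd.DataFrame) -> bool:
    # a check returns whether the loaded data passes; on_fail=WARN only logs
    return bool((data["sepal_length"] > 0).all())

def my_custom_apply(data: pd.DataFrame) -> pd.DataFrame:
    # an apply transforms the loaded data and returns the result
    return data.dropna()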
Example #12
class TestApplyDedup1(Relation):
    """

    """

    NAME = Field(name="name", key=True)
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.keys
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication
    def load(cls) -> pd.DataFrame:
        return pd.DataFrame(
            data={"name": ["Tom", "Mike", "Mike"], "age": [5, 31, None]}
        )
Example #13
class RelationWithNoKey(Relation):
    C1 = Field("test")

    @classmethod
    def load(cls, *args, **kwargs) -> Union["pyspark.sql.DataFrame", "pandas.DataFrame"]:
        pass
Example #14
class StudentCouncelingInformation(Relation):
    """
    This is the example data source "StudentCouncelingInformation" of the tutorial.
    """

    STUDENT_ID = Field(
        name="Student_ID",
        datatype=StringType,
        description="The ID of the student",
        key=True,
    )

    DATE_OF_ADMISSION = Field("DOA",
                              datatype=DateType,
                              description="Date of admission to university.")

    DATE_OF_BIRTH = Field(name="DOB",
                          datatype=DateType,
                          description="Student's birth date.")

    DEPARTMENT_CHOICES = Field(
        name="Department_Choices",
        datatype=StringType,
        description="Choice of department a student submitted",
    )

    DEPARTMENT_ADMISSION = Field(
        name="Department_Admission",
        datatype=StringType,
        description="Department where student got admitted",
    )

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        return (
            get_or_create_spark_session()
            .read.csv(
                path=os.path.join(PATH_TEST_DATA, "Student_Counceling_Information.csv"),
                inferSchema=True,
                header=True,
            )
            .withColumn(cls.DATE_OF_ADMISSION, F.expr(f"cast({cls.DATE_OF_ADMISSION} as date)"))
            .withColumn(cls.DATE_OF_BIRTH, F.expr(f"cast({cls.DATE_OF_BIRTH} as date)"))
        )
Example #15
class StudentPerformanceFaulty3(Relation):
    """
    Example of a duplicated field defined as key.
    """

    STUDENT_ID = Field(name="Student_ID")
    # this is actually not the key:
    SEMSTER_NAME = Field("Semster_Name", key=True)
    PAPER_ID = Field(name="Paper_ID")
    MARKS = Field(name="Marks")

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    @blizz.check.keys
    def load(cls) -> pd.DataFrame:
        return pd.read_csv(path_student_performance_test().as_posix())
Example #16
class TestApplyDefaults1(Relation):
    """

    """

    NAME = Field(name="name", default="")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    def load(cls,
             spark_session: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
        return spark_session.createDataFrame(
            pd.DataFrame(data={
                "name": ["Tom", "Mike", "Sarah"],
                "age": [5, 31, None]
            }))
Example #17
class DepartmentInformation(Relation):
    """
    This is the example data source "DepartmentInformation" for testing.
    """

    DEPARTMENT_ID = Field(name="Department_ID", datatype=StringType)
    DATE_OF_ESTABLISHMENT = Field(name="DOE",
                                  datatype=TimestampType,
                                  description="Department Establishment Date")
    DEPARTMENT_NAME = Field(name="Department_Name", datatype=StringType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        return get_or_create_spark_session().read.csv(
            path=path_department_test().as_posix(),
            inferSchema=True,
            header=True)
Example #18
class TestApplyDedupPartial(Relation):
    """

    """

    NAME = Field(name="name")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication(key=[AGE])
    def load(cls,
             spark_session: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
        return spark_session.createDataFrame(
            pd.DataFrame(data={
                "name": ["Tom", "Sarah", "Mike"],
                "age": [5, 31, 31]
            }))
Example #19
class StudentPerformance(Relation):
    """
    This is the example data source "StudentPerformance" for testing.
    """

    STUDENT_ID = Field(name="Student_ID", datatype=StringType, key=True)
    SEMSTER_NAME = Field("Semster_Name", datatype=StringType)
    PAPER_ID = Field(name="Paper_ID", datatype=StringType)
    MARKS = Field(name="Marks", datatype=IntegerType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        return get_or_create_spark_session().read.csv(
            path=path_student_performance_test().as_posix(),
            inferSchema=True,
            header=True,
        )
Example #20
class DepartmentInformation(Relation):
    """
    This is the example data source "DepartmentInformation" of the tutorial.
    """

    DEPARTMENT_ID = Field(name="Department_ID", datatype=StringType)
    DATE_OF_ESTABLISHMENT = Field(name="DOE",
                                  datatype=DateType,
                                  description="Department Establishment Date")
    DEPARTMENT_NAME = Field(name="Department_Name", datatype=StringType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        return get_or_create_spark_session().read.csv(
            path=os.path.join(PATH_TEST_DATA, "Department_Information.csv"),
            inferSchema=True,
            header=True,
        )
Example #21
class StudentPerformance(Relation):
    """
    This is the example data source "StudentPerformance" of the tutorial.
    """

    STUDENT_ID = Field(name="Student_ID", datatype=StringType)
    SEMSTER_NAME = Field("Semster_Name", datatype=StringType)
    PAPER_ID = Field(name="Paper_ID", datatype=StringType)
    MARKS = Field(name="Marks", datatype=IntegerType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        return get_or_create_spark_session().read.csv(
            path=os.path.join(PATH_TEST_DATA,
                              "Student_Performance_Data.csv.gz"),
            inferSchema=True,
            header=True,
        )
Example #22
class Iris(Relation):

    SEPAL_LENGTH = Field("sepal_length", datatype=float)
    SEPAL_WIDTH = Field("sepal_width", datatype=float)
    PETAL_LENGTH = Field("petal_length", default=0.0)
    PETAL_WIDTH = Field("petal_width", datatype=float)
    SPECIES = Field("species_renamed", datatype=object, key=True, source_name="species")

    @classmethod
    @blizz.check.types
    @blizz.check.fields
    @blizz.check.keys
    @blizz.apply.defaults(fill=PETAL_LENGTH)  # you can use Fields as arguments, too!
    @blizz.apply.deduplication
    @blizz.apply.renames  # renames should be applied first (decorators run bottom-up)
    def load(cls) -> pd.DataFrame:
        return pd.read_csv(
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"
        )
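
# Reading the stack: Python applies decorators bottom-up, so on load the frame
# is renamed first, then deduplicated, then filled with defaults, and only then
# do the checks run against the finished schema. A usage sketch:
iris = Iris.load()  # the renamed, deduplicated, defaults-filled frame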
Example #23
class TestApplyRenameThroughField(Relation):
    """

    """

    NAME = Field(name="name_renamed", source_name="name")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication
    @blizz.apply.renames
    def load(cls,
             spark_session: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
        return spark_session.createDataFrame(
            pd.DataFrame(data={
                "name": ["Tom", "Mike", "Mike"],
                "age": [5, 31, 31]
            }))
Example #24
class EmployeeInformation(Relation):
    """
    This is the example data source "EmployeeInformation" for testing.
    """

    EMPLOYEE_ID = Field(name="Employee ID", datatype=StringType)
    DATE_OF_BIRTH = Field(name="DOB",
                          datatype=DateType,
                          description="Employee's birth date.")
    DOJ = Field(name="DOJ", datatype=DateType, description="Date Of Joining")
    DEPARTMENT_ID = Field(name="Department_ID", datatype=StringType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        return get_or_create_spark_session().read.csv(
            path=path_employee_test().as_posix(),
            inferSchema=True,
            header=True)
Example #25
class StudentPerformanceFaultyWarn(Relation):
    """
    Example of a defined field missing.
    """

    THIS_IS_MISSING = Field(name="I'm missing")

    @classmethod
    @blizz.check.fields(on_fail=blizz.check.WARN)
    def load(cls) -> pd.DataFrame:
        return pd.read_csv(path_student_performance_test().as_posix())
Example #26
class Iris(Relation):

    SEPAL_LENGTH = Field("sepal_length", datatype=DoubleType)
    SEPAL_WIDTH = Field("sepal_width", datatype="double")
    PETAL_LENGTH = Field("petal_length", datatype=DoubleType)
    PETAL_WIDTH = Field("petal_width", datatype=DoubleType)
    SPECIES = Field("species", datatype="string")

    @classmethod
    @blizz.check.types
    @blizz.check.fields
    def load(cls, spark_session) -> DataFrame:

        spark_session.sparkContext.addFile(
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv")
        df = spark_session.read.csv(SparkFiles.get("iris.csv"),
                                    inferSchema=True,
                                    header=True)

        return df
Example #27
class StudentPerformanceFaulty3(Relation):
    """
    Example of a duplicated field defined as key.
    """

    STUDENT_ID = Field(name="Student_ID", datatype=StringType)
    # this is actually not the key:
    SEMSTER_NAME = Field("Semster_Name", datatype=StringType, key=True)
    PAPER_ID = Field(name="Paper_ID", datatype=StringType)
    MARKS = Field(name="Marks", datatype=IntegerType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    @blizz.check.keys
    def load(cls) -> DataFrame:
        return get_or_create_spark_session().read.csv(
            path=path_student_performance_test().as_posix(),
            inferSchema=True,
            header=True,
        )
Example #28
class RelationTypeDefVariance1(Relation):
    """
    This is an example data source for testing,
    here specifically testing various ways of stating a Pandas datatype:
        -   quoted string, e.g. "int", "float",...
        -   numpy type, e.g. np.int64, np.float64
        -   numpy umbrella type, e.g. np.number
    """

    F_STRING_1 = Field(name="F_String_1", datatype=object)

    # type float using Python inbuilt:
    F_FLOAT_1 = Field(name="F_Float_1", datatype=float)
    # type float as string:
    F_FLOAT_2 = Field(name="F_Float_2", datatype="float")

    # type integer using Python inbuilt:
    F_INTEGER_1 = Field(name="F_Integer_1", datatype=int)

    # type integer from numpy:
    F_INTEGER_2 = Field(name="F_Integer_2", datatype=np.int64)

    # instead of integer, use "umbrella" type np.number:
    F_INTEGER_3 = Field(name="F_Integer_3", datatype=np.number)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> pd.DataFrame:
        return pd.read_csv(
            path_to_test_data().joinpath("test_type_variance.csv").as_posix())
Example #29
class StudentPerformanceFaulty2(Relation):
    """
    Example of a defined field with a wrong type.
    """

    # this is actually an int:
    MARKS = Field(name="Marks", datatype=float)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> pd.DataFrame:
        return pd.read_csv(path_student_performance_test().as_posix())
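
# Without an explicit on_fail, the check is expected to raise (an assumption,
# inferred from the on_fail=blizz.check.WARN variant shown in these examples):
try:
    StudentPerformanceFaulty2.load()
except Exception as exc:  # the declared float vs. actual int mismatch is reported
    print(exc)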
Example #30
class StudentPerformanceFaulty1(Relation):
    """
    Example of a defined field missing.
    """

    THIS_IS_MISSING = Field(name="I'm missing")

    @classmethod
    @blizz.check.fields
    def load(cls) -> DataFrame:
        return get_or_create_spark_session().read.csv(
            path=path_student_performance_test().as_posix(),
            inferSchema=True,
            header=True,
        )