class Iris(Relation):
    # Example of additional ``Field`` attributes one can set:
    # 1. most importantly, a datatype completes the Relation's schema
    #    definition in the classical sense. For use with Pandas, `datatype`
    #    accepts Python inbuilts (e.g. float, object, int), quoted names of
    #    Pandas datatypes, and also instances of numpy types, such as
    #    numpy.int_, numpy.number, etc.
    SEPAL_LENGTH = Field("sepal_length", datatype=float)

    # 2. to capture metadata, and for documentation purposes, a description
    #    can be set (fixed: previously said "Sepal length" for sepal_width):
    SEPAL_WIDTH = Field("sepal_width", description="The Sepal width")

    # 3. `default` also captures a default row value for a Field:
    PETAL_LENGTH = Field("petal_length", default=0.0)

    PETAL_WIDTH = Field("petal_width")

    # 4. the boolean flag `key` allows to specify key fields:
    SPECIES = Field("species", key=True)

    @classmethod
    def load(cls) -> pd.DataFrame:
        """Fetch the Iris CSV and deduplicate rows over the defined key fields."""
        iris = pd.read_csv(
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"
        )
        # using all defined key fields, one can run a simple deduplication:
        iris_dedup = iris.drop_duplicates(subset=cls.get_key_fields())
        return iris_dedup
def test_field_type_to_string():
    """_field_type_to_string renders undefined, string-named and Spark types."""
    undefined_type = Field("test").datatype
    assert _field_type_to_string(undefined_type) == "Undefined"

    string_named = Field("test", datatype="str").datatype
    assert _field_type_to_string(string_named) == "str"

    spark_typed = Field("test", datatype=pyspark.sql.types.StringType).datatype
    assert _field_type_to_string(spark_typed) == "string"
class TestApplyDedupFieldIsMissing(Relation):
    """Fixture: deduplication keyed on a field absent from the loaded data."""

    NAME = Field(name="name")
    AGE = Field(name="age")

    @classmethod
    @blizz.apply.deduplication(key=AGE, sort_order=blizz.apply.ASC)
    def load(cls) -> pd.DataFrame:
        # note: the frame deliberately has no "age" column
        names = ["Tom", "Mike", "Mike"]
        return pd.DataFrame({"name": names})
class TestApplyDefaultsPartial2(Relation):
    """Fixture: apply defined defaults, filling only the NAME field."""

    NAME = Field(name="name", default="Peter")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults(fill=NAME)
    def load(cls) -> pd.DataFrame:
        records = {"name": ["Tom", None, "Sarah"], "age": [5, 31, None]}
        return pd.DataFrame(records)
class Iris(Relation):
    """Minimal Iris relation: field names only, no datatypes or keys."""

    SEPAL_LENGTH = Field(name="sepal_length")
    SEPAL_WIDTH = Field(name="sepal_width")
    PETAL_LENGTH = Field(name="petal_length")
    PETAL_WIDTH = Field(name="petal_width")
    SPECIES = Field(name="species")

    @classmethod
    def load(cls) -> pd.DataFrame:
        iris_url = (
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"
        )
        return pd.read_csv(iris_url)
class TestApplyDedupSort(Relation):
    """Fixture: deduplication with an explicit sort column and order."""

    NAME = Field(name="name")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication(key=[NAME], sort_on=[AGE], sort_order=blizz.apply.ASC)
    def load(cls) -> pd.DataFrame:
        records = {"name": ["Tom", "Mike", "Mike"], "age": [5, 25, 31]}
        return pd.DataFrame(records)
class TestApplyDedupPartial(Relation):
    """Fixture: deduplication on a subset of fields (AGE only)."""

    NAME = Field(name="name")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication(key=[AGE])
    def load(cls) -> pd.DataFrame:
        rows = {"name": ["Tom", "Sarah", "Mike"], "age": [5, 31, 31]}
        return pd.DataFrame(rows)
class TestApplyDefaults1(Relation):
    """Fixture: fill defined field defaults into missing values (Pandas)."""

    NAME = Field(name="name", default="")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    def load(cls) -> pd.DataFrame:
        frame = {"name": ["Tom", "Mike", "Sarah"], "age": [5, 31, None]}
        return pd.DataFrame(frame)
class TestApplyRenameThroughArg(Relation):
    """Fixture: rename a source column via the `columns` decorator argument."""

    NAME = Field(name="name_renamed")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication
    @blizz.apply.renames(columns={"name": "name_renamed"})
    def load(cls) -> pd.DataFrame:
        raw = {"name": ["Tom", "Mike", "Mike"], "age": [5, 31, 31]}
        return pd.DataFrame(raw)
class StudentPerformance(Relation):
    """This is the example data source "StudentPerformance" for testing."""

    STUDENT_ID = Field(name="Student_ID", datatype=object)
    SEMSTER_NAME = Field(name="Semster_Name", datatype=object)
    PAPER_ID = Field(name="Paper_ID", datatype=object)
    MARKS = Field(name="Marks", datatype=int)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        # read the CSV fixture shipped with the test data
        csv_path = path_student_performance_test().as_posix()
        return pd.read_csv(csv_path)
class Iris(Relation):
    """Iris relation wired up with a custom check and a custom apply hook."""

    SEPAL_LENGTH = Field("sepal_length", datatype=float)
    SEPAL_WIDTH = Field("sepal_width", datatype=float)
    PETAL_LENGTH = Field("petal_length", default=0.0)
    PETAL_WIDTH = Field("petal_width", datatype=float)
    # renamed on load: the raw column is "species"
    SPECIES = Field(
        "species_renamed", datatype=object, key=True, source_name="species"
    )

    @classmethod
    @blizz.check.func(function=my_custom_check, on_fail=blizz.check.WARN)
    @blizz.apply.func(function=my_custom_apply)
    def load(cls) -> pd.DataFrame:
        source_url = (
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"
        )
        return pd.read_csv(source_url)
class TestApplyDedup1(Relation):
    """Fixture: key-based deduplication after filling defaults."""

    NAME = Field(name="name", key=True)
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.keys
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication
    def load(cls) -> pd.DataFrame:
        source = {"name": ["Tom", "Mike", "Mike"], "age": [5, 31, None]}
        return pd.DataFrame(source)
class RelationWithNoKey(Relation):
    """A relation defining a single non-key field and a no-op loader."""

    @classmethod
    def load(
        cls, *args, **kwargs
    ) -> Union["pyspark.sql.DataFrame", "pandas.DataFrame"]:
        # loader intentionally not implemented for this fixture
        pass

    C1 = Field("test")
class StudentCouncelingInformation(Relation):
    """This is the example data source "StudentCouncelingInformation" of the tutorial."""

    STUDENT_ID = Field(
        name="Student_ID",
        datatype=StringType,
        description="The ID of the student",
        key=True,
    )
    DATE_OF_ADMISSION = Field(
        name="DOA",
        datatype=DateType,
        description="Date of admission to university.",
    )
    DATE_OF_BIRTH = Field(
        name="DOB", datatype=DateType, description="Student's birth date."
    )
    DEPARTMENT_CHOICES = Field(
        name="Department_Choices",
        datatype=StringType,
        description="Choice of department a student submitted",
    )
    DEPARTMENT_ADMISSION = Field(
        name="Department_Admission",
        datatype=StringType,
        description="Department where student got admitted",
    )

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        spark = get_or_create_spark_session()
        raw = spark.read.csv(
            path=os.path.join(PATH_TEST_DATA, "Student_Counceling_Information.csv"),
            inferSchema=True,
            header=True,
        )
        # cast the two date columns to DateType via SQL expressions
        with_doa = raw.withColumn(
            cls.DATE_OF_ADMISSION, F.expr(f"cast({cls.DATE_OF_ADMISSION} as date)")
        )
        return with_doa.withColumn(
            cls.DATE_OF_BIRTH, F.expr(f"cast({cls.DATE_OF_BIRTH} as date)")
        )
class StudentPerformanceFaulty3(Relation):
    """Example of a duplicated field defined as key."""

    STUDENT_ID = Field(name="Student_ID")
    # this is actually not the key:
    SEMSTER_NAME = Field("Semster_Name", key=True)
    PAPER_ID = Field(name="Paper_ID")
    MARKS = Field(name="Marks")

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    @blizz.check.keys
    def load(cls) -> pd.DataFrame:
        performance_csv = path_student_performance_test().as_posix()
        return pd.read_csv(performance_csv)
class TestApplyDefaults1(Relation):
    """Fixture: fill defined field defaults into missing values (Spark)."""

    NAME = Field(name="name", default="")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    def load(cls, spark_session: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
        pdf = pd.DataFrame({"name": ["Tom", "Mike", "Sarah"], "age": [5, 31, None]})
        return spark_session.createDataFrame(pdf)
class DepartmentInformation(Relation):
    """This is the example data source "DepartmentInformation" for testing."""

    DEPARTMENT_ID = Field(name="Department_ID", datatype=StringType)
    DATE_OF_ESTABLISHMENT = Field(
        name="DOE",
        datatype=TimestampType,
        description="Department Establishment Date",
    )
    DEPARTMENT_NAME = Field(name="Department_Name", datatype=StringType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        spark = get_or_create_spark_session()
        return spark.read.csv(
            path=path_department_test().as_posix(), inferSchema=True, header=True
        )
class TestApplyDedupPartial(Relation):
    """Fixture: deduplication on a subset of fields (AGE only, Spark)."""

    NAME = Field(name="name")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication(key=[AGE])
    def load(cls, spark_session: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
        pdf = pd.DataFrame({"name": ["Tom", "Sarah", "Mike"], "age": [5, 31, 31]})
        return spark_session.createDataFrame(pdf)
class StudentPerformance(Relation):
    """This is the example data source "StudentPerformance" for testing."""

    STUDENT_ID = Field(name="Student_ID", datatype=StringType, key=True)
    SEMSTER_NAME = Field(name="Semster_Name", datatype=StringType)
    PAPER_ID = Field(name="Paper_ID", datatype=StringType)
    MARKS = Field(name="Marks", datatype=IntegerType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        spark = get_or_create_spark_session()
        return spark.read.csv(
            path=path_student_performance_test().as_posix(),
            inferSchema=True,
            header=True,
        )
class DepartmentInformation(Relation):
    """This is the example data source "DepartmentInformation" of the tutorial."""

    DEPARTMENT_ID = Field(name="Department_ID", datatype=StringType)
    DATE_OF_ESTABLISHMENT = Field(
        name="DOE", datatype=DateType, description="Department Establishment Date"
    )
    DEPARTMENT_NAME = Field(name="Department_Name", datatype=StringType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        csv_file = os.path.join(PATH_TEST_DATA, "Department_Information.csv")
        return get_or_create_spark_session().read.csv(
            path=csv_file, inferSchema=True, header=True
        )
class StudentPerformance(Relation):
    """This is the example data source "StudentPerformance" of the tutorial."""

    STUDENT_ID = Field(name="Student_ID", datatype=StringType)
    SEMSTER_NAME = Field(name="Semster_Name", datatype=StringType)
    PAPER_ID = Field(name="Paper_ID", datatype=StringType)
    MARKS = Field(name="Marks", datatype=IntegerType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        csv_file = os.path.join(PATH_TEST_DATA, "Student_Performance_Data.csv.gz")
        return get_or_create_spark_session().read.csv(
            path=csv_file, inferSchema=True, header=True
        )
class Iris(Relation):
    """Iris relation combining checks, defaults, deduplication and renames."""

    SEPAL_LENGTH = Field("sepal_length", datatype=float)
    SEPAL_WIDTH = Field("sepal_width", datatype=float)
    PETAL_LENGTH = Field("petal_length", default=0.0)
    PETAL_WIDTH = Field("petal_width", datatype=float)
    # renamed on load: the raw column is "species"
    SPECIES = Field(
        "species_renamed", datatype=object, key=True, source_name="species"
    )

    # note: decorator order is meaningful and preserved as-is
    @classmethod
    @blizz.check.types
    @blizz.check.fields
    @blizz.check.keys
    @blizz.apply.defaults(fill=PETAL_LENGTH)  # fields can be used as arguments, too!
    @blizz.apply.deduplication
    @blizz.apply.renames  # renames should be applied first
    def load(cls) -> pd.DataFrame:
        iris_url = (
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"
        )
        return pd.read_csv(iris_url)
class TestApplyRenameThroughField(Relation):
    """Fixture: rename a source column via the field's `source_name` (Spark)."""

    NAME = Field(name="name_renamed", source_name="name")
    AGE = Field(name="age", default=20)

    @classmethod
    @blizz.check.fields
    @blizz.apply.defaults
    @blizz.apply.deduplication
    @blizz.apply.renames
    def load(cls, spark_session: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
        pdf = pd.DataFrame({"name": ["Tom", "Mike", "Mike"], "age": [5, 31, 31]})
        return spark_session.createDataFrame(pdf)
class EmployeeInformation(Relation):
    """This is the example data source "EmployeeInformation" for testing."""

    # NOTE(review): "Employee ID" contains a space — presumably matching the
    # raw CSV header; verify against the test data before renaming.
    EMPLOYEE_ID = Field(name="Employee ID", datatype=StringType)
    DATE_OF_BIRTH = Field(
        name="DOB", datatype=DateType, description="Employee's birth date."
    )
    DOJ = Field(name="DOJ", datatype=DateType, description="Date Of Joining")
    DEPARTMENT_ID = Field(name="Department_ID", datatype=StringType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> DataFrame:
        spark = get_or_create_spark_session()
        return spark.read.csv(
            path=path_employee_test().as_posix(), inferSchema=True, header=True
        )
class StudentPerformanceFaultyWarn(Relation):
    """Example of a defined field missing (check demoted to a warning)."""

    THIS_IS_MISSING = Field(name="I'm missing")

    @classmethod
    @blizz.check.fields(on_fail=blizz.check.WARN)
    def load(cls) -> pd.DataFrame:
        source_path = path_student_performance_test().as_posix()
        return pd.read_csv(source_path)
class Iris(Relation):
    """Iris relation loaded through Spark with double/string datatypes."""

    SEPAL_LENGTH = Field("sepal_length", datatype=DoubleType)
    SEPAL_WIDTH = Field("sepal_width", datatype="double")
    PETAL_LENGTH = Field("petal_length", datatype=DoubleType)
    PETAL_WIDTH = Field("petal_width", datatype=DoubleType)
    SPECIES = Field("species", datatype="string")

    @classmethod
    @blizz.check.types
    @blizz.check.fields
    def load(cls, spark_session) -> DataFrame:
        # distribute the CSV via SparkContext, then read it back with Spark
        spark_session.sparkContext.addFile(
            "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7"
            "/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"
        )
        return spark_session.read.csv(
            SparkFiles.get("iris.csv"), inferSchema=True, header=True
        )
class StudentPerformanceFaulty3(Relation):
    """Example of a duplicated field defined as key."""

    STUDENT_ID = Field(name="Student_ID", datatype=StringType)
    # this is actually not the key:
    SEMSTER_NAME = Field("Semster_Name", datatype=StringType, key=True)
    PAPER_ID = Field(name="Paper_ID", datatype=StringType)
    MARKS = Field(name="Marks", datatype=IntegerType)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    @blizz.check.keys
    def load(cls) -> DataFrame:
        spark = get_or_create_spark_session()
        return spark.read.csv(
            path=path_student_performance_test().as_posix(),
            inferSchema=True,
            header=True,
        )
class RelationTypeDefVariance1(Relation):
    """
    This is an example data source for testing, here specifically testing
    various ways of stating a Pandas datatype:
    - quoted string, e.g. "int", "float",...
    - numpy type, e.g. np.int_, np.float64
    - numpy umbrella type, e.g. np.number
    """

    F_STRING_1 = Field(name="F_String_1", datatype=object)
    # type float using Python inbuilt:
    F_FLOAT_1 = Field(name="F_Float_1", datatype=float)
    # type float as string:
    F_FLOAT_2 = Field(name="F_Float_2", datatype="float")
    # type integer using Python inbuilt:
    F_INTEGER_1 = Field(name="F_Integer_1", datatype=int)
    # type integer from numpy. `np.int` was a deprecated alias of the builtin
    # int and was removed in NumPy 1.24; `np.int_` is the numpy scalar type:
    F_INTEGER_2 = Field(name="F_Integer_2", datatype=np.int_)
    # instead of integer, use "umbrella" type np.number:
    F_INTEGER_3 = Field(name="F_Integer_3", datatype=np.number)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> pd.DataFrame:
        return pd.read_csv(
            path_to_test_data().joinpath("test_type_variance.csv").as_posix()
        )
class StudentPerformanceFaulty2(Relation):
    """Example of a defined field with a wrong type."""

    # this is actually an int:
    MARKS = Field(name="Marks", datatype=float)

    @classmethod
    @blizz.check.fields
    @blizz.check.types
    def load(cls) -> pd.DataFrame:
        data_file = path_student_performance_test().as_posix()
        return pd.read_csv(data_file)
class StudentPerformanceFaulty1(Relation):
    """Example of a defined field missing."""

    THIS_IS_MISSING = Field(name="I'm missing")

    @classmethod
    @blizz.check.fields
    def load(cls) -> DataFrame:
        session = get_or_create_spark_session()
        return session.read.csv(
            path=path_student_performance_test().as_posix(),
            inferSchema=True,
            header=True,
        )