def create_question(
    self,
    category: non_null(non_blank(str)),
    question: non_null(non_blank(str)),
    q_type: non_null(non_blank(str)) = 'range',
) -> Union[BooleanMarkQuestion, RangeMarkQuestion]:
    """Create a mark question of the requested type.

    Dispatches to the range-question service for ``q_type == 'range'``
    (the default); any other value falls back to the boolean service.
    """
    service = (
        RangeMarkQuestionService if q_type == 'range' else BooleanMarkQuestionService
    )
    return service.create(category, question)
def __init__(
    self,
    h3_resolutions: non_blank(List[int]),
    lat_column: non_blank(str),
    lng_column: non_blank(str),
):
    """Store the H3 configuration used by this transform.

    Args:
        h3_resolutions: H3 resolution levels to generate; must be non-blank.
        lat_column: name of the latitude column; must be non-blank.
        lng_column: name of the longitude column; must be non-blank.
    """
    super().__init__()
    # Keep the raw configuration; the stacked transform is built lazily later.
    self.h3_resolutions = h3_resolutions
    self.lat_column = lat_column
    self.lng_column = lng_column
    self.stack_transform = None
def create(
    cls,
    name: non_null(non_blank(str)),
    surname: non_null(non_blank(str)),
    position: non_null(non_blank(str)),
    level: non_null(non_blank(str)),
    email: non_null(non_blank(str)),
    pay: non_negative(non_null(int)),
) -> User:
    """Persist and return a new ``User`` hired today.

    Raises whatever ``validate_email`` raises for a malformed address,
    before anything is written to the database.
    """
    validate_email(email)
    fields = dict(
        name=name,
        surname=surname,
        position=position,
        hire_date=date.today(),
        level=level,
        email=email,
        pay=pay,
    )
    return cls.model.objects.create(**fields)
def __init__(
    self,
    name: non_blank(str),
    description: non_blank(str),
    dtype: non_blank(DataType) = None,
    from_column: non_blank(str) = None,
    transformation: non_null(TransformComponent) = None,
) -> None:
    """Describe a feature column and its optional transformation.

    Args:
        name: output column name; must be non-blank.
        description: human-readable description; must be non-blank.
        dtype: optional target data type for the column.
        from_column: optional source column this feature is derived from.
        transformation: optional transform component applied to the column.
    """
    # Plain value-object initialization; attributes mirror the signature order.
    self.name = name
    self.description = description
    self.dtype = dtype
    self.from_column = from_column
    self.transformation = transformation
def foo(
    a: non_blank(),
    b: non_null(),
    c: non_empty(),
    d: no_whitespaces(),
    e: non_negative(),
    f: strongly_typed(List),
):
    """Echo every argument back as a tuple (validation-annotation fixture)."""
    values = (a, b, c, d, e, f)
    return values
def create(cls, category: non_null(non_blank(str)), question: non_null(non_blank(str))):
    """Persist a new mark question under *category* and return it."""
    attrs = {"question_string": question, "category": category}
    return cls.model.objects.create(**attrs)
def forward_fill(
    dataframe: DataFrame,
    partition_by: non_blank(Union[str, List[str]]),
    order_by: non_blank(Union[str, List[str]]),
    fill_column: non_blank(str),
    filled_column: non_blank(str) = None,
):
    """Forward-fill a single column.

    Null values in ``fill_column`` are replaced by the last known non-null
    value within each partition (rows ordered by ``order_by``); leading
    nulls — rows with no earlier non-null value — are left untouched.

    Attributes:
        dataframe: dataframe to be transformed.
        partition_by: column name(s) used as the partition for the operation.
        order_by: column name(s) used to sort rows inside each partition.
        fill_column: column to be forward filled.
        filled_column: name of the new output column. Optional; when falsy
            the operation is done in place over ``fill_column``.

    Returns:
        The dataframe with the forward-filled column added (or replaced,
        when no ``filled_column`` is given).
    """
    # Window reaching from the partition start up to the current row, so
    # `last(..., ignorenulls=True)` picks the most recent non-null value.
    fill_window = (
        Window.partitionBy(partition_by)
        .orderBy(order_by)
        .rowsBetween(-sys.maxsize, 0)
    )
    filled_values = functions.last(
        dataframe[fill_column], ignorenulls=True
    ).over(fill_window)
    # Truthiness (not `is None`) on purpose: empty names also mean "in place".
    target_column = filled_column or fill_column
    return dataframe.withColumn(target_column, filled_values)
def positional_args_validations_method(self, a: non_blank(str), b, *, c: dict, d):
    """No-op fixture: a method mixing one validated positional argument
    with plain positional and keyword-only parameters."""
def __init__(
    self,
    expression: non_blank(str),
):
    """Keep the SQL-like expression this transform will evaluate.

    Args:
        expression: expression string; must be non-blank.
    """
    super().__init__()
    self.expression = expression
def guinea_pig(s: no_whitespaces(non_blank(str)) = default_value):
    """Return the argument unchanged (fixture for validated default values)."""
    return s
def pivot(
    dataframe: DataFrame,
    group_by_columns: non_blank(List[str]),
    pivot_column: non_blank(str),
    agg_column: non_blank(str),
    aggregation: non_null(Callable),
    mock_value: non_null(object) = None,
    mock_type: non_null(object) = None,
    with_forward_fill: non_null(bool) = False,
):
    """Pivot a dataframe, optionally forward filling the pivoted columns.

    Attributes:
        dataframe: dataframe to be pivoted.
        group_by_columns: list of columns' names to be grouped.
        pivot_column: column to be pivoted.
        agg_column: column to be aggregated by pivoted category.
        aggregation: desired spark aggregation function to be performed.
            An example: spark_agg(col_name). See docs for all spark_agg:
            https://spark.apache.org/docs/2.3.1/api/python/_modules/pyspark/sql/functions.html
        mock_value: value used to make a difference between true nulls
            resulting from the aggregation and empty values created by the
            pivot transformation itself.
        mock_type: mock_value data type (compatible with spark).
        with_forward_fill: applies a forward fill to null values after the
            pivot operation.

    When ``with_forward_fill`` is set, every new pivoted column is forward
    filled using the last group-by column as the ordering column and the
    remaining ones as the partition. When a ``mock_value``/``mock_type``
    pair is given, the aggregated column is cast to ``mock_type`` and its
    nulls replaced by ``mock_value`` *before* pivoting, so that nulls
    produced by the pivot (eligible for forward fill) can be told apart
    from "true nulls" already present in the data; after the fill, mock
    values are turned back into nulls and the pivoted columns are cast
    back to the original aggregated column type.

    Raises:
        AttributeError: if ``mock_value`` is given without ``mock_type``.
    """
    agg_column_type = None
    if mock_value is not None:
        if mock_type is None:
            # Typo fixed in the user-facing message ("proving" -> "providing").
            raise AttributeError(
                "When providing a mock value, users must inform the data type,"
                " which should be supported by Spark."
            )
        # Remember the original dtype so the pivoted columns can be cast back.
        agg_column_type = dict(dataframe.dtypes).get(agg_column)
        dataframe = dataframe.withColumn(
            agg_column, functions.col(agg_column).cast(mock_type)
        ).fillna({agg_column: mock_value})

    pivoted = (
        dataframe.groupBy(*group_by_columns)
        .pivot(pivot_column)
        .agg(aggregation(agg_column))
    )

    # Columns created by the pivot (one per category of pivot_column).
    new_columns = [c for c in pivoted.columns if c not in group_by_columns]

    if with_forward_fill:
        for c in new_columns:
            pivoted = forward_fill(
                dataframe=pivoted,
                partition_by=group_by_columns[:-1],
                order_by=group_by_columns[-1],
                fill_column=c,
            )

    if mock_value is not None:
        for c in new_columns:
            # Turn mock sentinels back into nulls and restore the original type.
            pivoted = pivoted.withColumn(
                c,
                functions.when(
                    functions.col(c) != mock_value, functions.col(c)
                ).cast(agg_column_type),
            )
    return pivoted
def args_validations_function(a: non_blank(str), b, *, c: non_empty(dict), d):
    """No-op fixture: a free function mixing validated and plain parameters,
    including a validated keyword-only argument."""
def __init__(self, func: non_blank(callable), data_type: non_blank(DataType)):
    """Bind a callable to the data type its result should be cast to.

    Args:
        func: the transformation callable; must be non-blank.
        data_type: target data type for the result; must be non-blank.
    """
    self.func = func
    self.data_type = data_type
def bar(arg: non_blank(str)):
    """No-op fixture taking a single validated argument."""
def __init__(self, functions: non_blank(List[Function])):
    """Keep the aggregation functions; window definitions start empty.

    Args:
        functions: list of aggregation functions; must be non-blank.
    """
    super().__init__()
    self.functions = functions
    # Windows are registered later; start with none.
    self._windows = []
def args_validations_staticmethod(
    a: non_blank(str), b, *, c: non_empty(dict), d
):
    """No-op fixture: validated and plain parameters on a staticmethod-style
    callable (no self/cls)."""
def args_validations_method(self, a: non_blank(str), b, *, c: non_empty(dict), d):
    """No-op fixture: validated and plain parameters on an instance method,
    including a validated keyword-only argument."""
def positional_args_validations_staticmethod(
    a: non_blank(str), b, *, c: dict, d
):
    """No-op fixture: one validated positional argument on a
    staticmethod-style callable; keyword-only args are unvalidated."""
def __init__(self, functions: non_blank(List[Function]), filter_expression: str = None):
    """Configure the aggregated transform.

    Args:
        functions: aggregation functions to apply; must be non-blank.
        filter_expression: optional SQL-like filter applied before
            aggregating; None means no filtering.
    """
    # Zero-argument super() (Python 3 form) for consistency with the other
    # __init__ methods in this file; behavior is identical.
    super().__init__()
    self.functions = functions
    self.filter_expression = filter_expression
def positional_args_validations_function(a: non_blank(str), b, *, c: dict, d):
    """No-op fixture: one validated positional argument on a free function;
    keyword-only args are unvalidated."""