def _spark(cls, column, strftime_format, **kwargs):
        # Below is a simple validation that the provided format can both format and parse a datetime object.
        # %D is an example of a format that can format but not parse, e.g.
        try:
            datetime.strptime(
                datetime.strftime(datetime.now(), strftime_format),
                strftime_format)
        except ValueError as e:
            raise ValueError(
                f"Unable to use provided strftime_format: {str(e)}")

        def is_parseable_by_format(val):
            if val is None:
                return False
            try:
                datetime.strptime(val, strftime_format)
                return True
            except TypeError:
                raise TypeError(
                    "Values passed to expect_column_values_to_match_strftime_format must be of type string.\nIf you want to validate a column of dates or timestamps, please call the expectation before converting from string format."
                )
            except ValueError:
                return False

        success_udf = F.udf(is_parseable_by_format, sparktypes.BooleanType())
        return success_udf(column)
    def _spark(cls, column, **kwargs):
        """Return ``column`` passed through a boolean UDF that tests for ASCII text.

        Each value is converted with ``str()`` before ``str.isascii`` is called,
        so non-string values are judged by their string form.
        """

        def is_ascii(val):
            text = str(val)
            return text.isascii()

        # The inner function keeps its name because the UDF is built from it.
        return F.udf(is_ascii, sparktypes.BooleanType())(column)
    def _spark(cls, column, **kwargs):
        """Return ``column`` passed through a boolean UDF that tests for parseable XML.

        Args:
            cls: Not used by this function.
            column: Spark column the UDF is applied to.
            **kwargs: Accepted and ignored.

        Returns:
            The result of applying the UDF to ``column``. The UDF yields ``True``
            when ``etree.fromstring`` accepts the value and ``False`` when it
            raises any ``Exception``.
        """

        def is_xml(val):
            try:
                etree.fromstring(val)
            except Exception:
                # Any error raised while parsing is treated as "not XML".
                # ``Exception`` (rather than a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                return False
            return True

        is_xml_udf = F.udf(is_xml, sparktypes.BooleanType())

        return is_xml_udf(column)
    def _spark(cls, column, json_schema, **kwargs):
        """Return ``column`` passed through a boolean UDF that tests for parseable JSON.

        Args:
            cls: Not used by this function.
            column: Spark column the UDF is applied to.
            json_schema: Accepted for signature compatibility; not used here.
            **kwargs: Accepted and ignored.

        Returns:
            The result of applying the UDF to ``column``. The UDF yields ``True``
            when ``json.loads`` accepts the value and ``False`` when it raises
            any ``Exception``.
        """

        def is_json(val):
            try:
                json.loads(val)
            except Exception:
                # Any error raised while decoding is treated as "not JSON".
                # ``Exception`` (rather than a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                return False
            return True

        is_json_udf = F.udf(is_json, sparktypes.BooleanType())

        return is_json_udf(column)
Beispiel #5
0
    def _spark(cls, column, **kwargs):
        center_point = kwargs.get("center_point")
        unit = kwargs.get("unit")
        range = kwargs.get("range")
        projection = kwargs.get("projection")

        if projection == "fcc":
            if unit == "kilometers":
                distances = F.udf(
                    lambda x, y=center_point: fcc_projection(x, y),
                    sparktypes.FloatType(),
                )
            elif unit == "miles":
                distances = F.udf(
                    lambda x, y=center_point: fcc_projection(x, y) * 1.609344,
                    sparktypes.FloatType(),
                )
                range = range * 1.609344

            return F.when(distances(column) < range,
                          F.lit(True)).otherwise(F.lit(False))

        elif projection == "pythagorean":
            if unit == "kilometers":
                distances = F.udf(
                    lambda x, y=center_point: pythagorean_projection(x, y),
                    sparktypes.FloatType(),
                )
            elif unit == "miles":
                distances = F.udf(
                    lambda x, y=center_point: pythagorean_projection(x, y) *
                    1.609344,
                    sparktypes.FloatType(),
                )
                range = range * 1.609344

            return F.when(distances(column) < range,
                          F.lit(True)).otherwise(F.lit(False))
    def _spark(cls, column, json_schema, **kwargs):
        """Return ``column`` passed through a boolean UDF that validates JSON against a schema.

        Args:
            cls: Not used by this function.
            column: Spark column the UDF is applied to.
            json_schema: Schema passed to ``jsonschema.validate``.
            **kwargs: Accepted and ignored.

        Returns:
            The result of applying the UDF to ``column``. The UDF yields ``False``
            for ``None`` and for values that fail validation, ``True`` otherwise.

        Raises:
            Errors from ``json.loads`` and any ``jsonschema`` error other than
            ``ValidationError`` (for example ``SchemaError``) are not handled and
            propagate out of the UDF.
        """

        def matches_json_schema(val):
            if val is None:
                return False
            # Decoding errors are deliberately left to propagate.
            val_json = json.loads(val)
            try:
                # jsonschema.validate raises an error if validation fails.
                jsonschema.validate(val_json, json_schema)
            except jsonschema.ValidationError:
                return False
            # Reaching this point means validation succeeded.
            return True

        matches_json_schema_udf = F.udf(matches_json_schema, sparktypes.BooleanType())

        return matches_json_schema_udf(column)
Beispiel #7
0
    def _spark(cls, column, xml_schema, **kwargs):
        """Return ``column`` passed through a boolean UDF that validates XML against a schema.

        Args:
            cls: Not used by this function.
            column: Spark column the UDF is applied to.
            xml_schema: Schema source parsed with ``etree.fromstring`` and
                compiled with ``etree.XMLSchema``.
            **kwargs: Accepted and ignored.

        Returns:
            The result of applying the UDF to ``column``. The UDF yields ``False``
            for ``None``; otherwise it returns whatever calling the compiled
            schema on the parsed value returns.

        Raises:
            Any error raised while parsing or compiling ``xml_schema`` propagates
            to the caller. Any error raised while parsing a value propagates out
            of the UDF.
        """
        # Schema errors are not handled here; they surface to the caller as-is.
        xmlschema = etree.XMLSchema(etree.fromstring(xml_schema))

        def matches_xml_schema(val):
            if val is None:
                return False
            # Unparseable values raise instead of being reported as non-matching.
            return xmlschema(etree.fromstring(val))

        # NOTE(review): the compiled schema is captured by the UDF closure —
        # confirm it serializes correctly when shipped to Spark executors.
        matches_xml_schema_udf = F.udf(matches_xml_schema,
                                       sparktypes.BooleanType())

        return matches_xml_schema_udf(column)
Beispiel #8
0
    def _spark(cls, column, **kwargs):
        """Return ``column`` passed through a boolean UDF built from ``is_valid_timezone``."""
        # Wrap the helper as a boolean-typed UDF and apply it in one step.
        return F.udf(is_valid_timezone, sparktypes.BooleanType())(column)