Example #1
def resolve_partitions(patterns):
    """
    Given a list of patterns, returns all the files matching or in folders matching
    one of them.

    The file are returned in a list of tuple of 2 elements:
    - The first tuple is the file path
    - The second being the partition keys and values if any were encountered else None

    In addition to this list, return, if the data was partitioned, a schema for the
    partition keys, else None

    :type patterns: list of str
    :rtype: Tuple[List[str], List[Optional[Row]], Optional[StructType]]
    """
    file_paths = File.get_content(patterns)
    if not file_paths:
        raise AnalysisException('Path does not exist: {0}'.format(patterns))
    partitions = {}
    for file_path in file_paths:
        if "=" in file_path:
            row = row_from_keyed_values(
                folder.split("=")
                for folder in file_path.split("/")[:-1]
                if folder.count("=") == 1
            )
            partitions[file_path] = row
        else:
            partitions[file_path] = None

    partitioning_field_sets = set(
        tuple(p.__fields__)  # tuples so the field lists are hashable
        for p in partitions.values()
        if p is not None
    )
    if len(partitioning_field_sets) > 1:
        raise Exception(
            "Conflicting directory structures detected while reading {0}. "
            "All partitions must have the same partitioning fields, found fields {1}".format(
                ",".join(patterns),
                " and also ".join(
                    str(fields) for fields in partitioning_field_sets
                )
            )
        )

    if partitioning_field_sets:
        if any(value is None for value in partitions.values()):
            raise AnalysisException(
                "Unable to parse those malformed folders: {1} of {0}".format(
                    file_paths,
                    [path for path, value in partitions.items() if value is None]
                )
            )
        partitioning_fields = partitioning_field_sets.pop()
        partition_schema = guess_schema_from_strings(
            partitioning_fields, partitions.values(), options={}
        )
    else:
        partition_schema = None

    return partitions, partition_schema
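
The partition detection above keys entirely off folder names of the form key=value. A minimal standalone sketch of that folder-splitting step, in plain Python with a hypothetical path:

path = "data/year=2021/month=02/part-0000.csv"
pairs = [
    folder.split("=")
    for folder in path.split("/")[:-1]  # folders only; the file name is dropped
    if folder.count("=") == 1
]
print(pairs)  # [['year', '2021'], ['month', '02']]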
Example #2
def get_checked_matches(matches, field_name, schema, show_id):
    if not matches:
        raise AnalysisException(
            "Unable to find the column '{0}' among {1}".format(
                field_name, format_schema(schema, show_id)))

    if len(matches) > 1:
        raise AnalysisException(
            "Reference '{0}' is ambiguous, found {1} columns matching it.".
            format(field_name, len(matches)))

    return matches.pop()
Example #3
    def eval(self, row, schema):
        value_1 = self.arg1.eval(row, schema)
        value_2 = self.arg2.eval(row, schema)
        if value_1 is None or value_2 is None:
            return None

        type_1 = value_1.__class__
        type_2 = value_2.__class__
        if type_1 == type_2:
            return self.unsafe_operation(value_1, value_2)

        try:
            order_1 = INTERNAL_TYPE_ORDER.index(type_1)
            order_2 = INTERNAL_TYPE_ORDER.index(type_2)
        except ValueError as e:
            raise AnalysisException("Unable to process type: {0}".format(e))

        spark_type_1 = python_to_spark_type(type_1)
        spark_type_2 = python_to_spark_type(type_2)

        if order_1 > order_2:
            caster = get_caster(from_type=spark_type_2,
                                to_type=spark_type_1,
                                options={})
            value_2 = caster(value_2)
        elif order_1 < order_2:
            caster = get_caster(from_type=spark_type_1,
                                to_type=spark_type_2,
                                options={})
            value_1 = caster(value_1)

        return self.unsafe_operation(value_1, value_2)
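
A plain-Python sketch of the promotion rule above, with a hypothetical precedence list standing in for INTERNAL_TYPE_ORDER (narrowest to widest): the operand whose type has the lower precedence is cast to the other operand's type.

TYPE_ORDER = [bool, int, float, str]  # hypothetical stand-in for INTERNAL_TYPE_ORDER

def promote(value_1, value_2):
    # Mirror eval() above: cast the lower-precedence value up.
    if TYPE_ORDER.index(type(value_1)) < TYPE_ORDER.index(type(value_2)):
        return type(value_2)(value_1), value_2
    return value_1, type(value_1)(value_2)

print(promote(1, 2.5))  # (1.0, 2.5): the int is widened to float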
Example #4
    def eval(self, row, schema):
        column_value = self.column.eval(row, schema)
        if isinstance(column_value, (list, dict)):
            return len(column_value)
        raise AnalysisException(
            "{0} value should be an array or a map, got {1}".format(
                self.column, type(column_value)))
Example #5
def guess_type_from_values_as_string(values, options):
    # Reproduces inferences available in Spark
    # PartitioningUtils.inferPartitionColumnValue()
    # located in org.apache.spark.sql.execution.datasources
    tested_types = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType()
    )
    string_type = StringType()
    for tested_type in tested_types:
        type_caster = get_caster(from_type=string_type, to_type=tested_type, options=options)
        try:
            for value in values:
                casted_value = type_caster(value)
                if casted_value is None and value not in ("null", None):
                    raise ValueError
            return tested_type
        except ValueError:
            pass
    # Should never happen
    raise AnalysisException(
        "Unable to find a matching type for some fields, even StringType did not work"
    )
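
The inference is a simple fall-through: try each caster from the narrowest type to the widest and keep the first one that accepts every value; StringType always succeeds, which is why the final exception should be unreachable. A minimal plain-Python analogue:

def guess_type(values):
    # Try casters from narrowest to widest; str always succeeds.
    for caster in (int, float, str):
        try:
            for value in values:
                caster(value)
            return caster
        except ValueError:
            continue

print(guess_type(["1", "2"]))    # <class 'int'>
print(guess_type(["1.5", "2"]))  # <class 'float'>
print(guess_type(["a", "2"]))    # <class 'str'>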
Example #6
def cast_to_binary(value, from_type, options):
    if isinstance(from_type, StringType):
        # noinspection PyTypeChecker
        return bytearray(value, 'utf-8')
    if isinstance(from_type, BinaryType):
        return value
    raise AnalysisException("Cannot cast type {0} to binary".format(from_type))
Example #7
    def eval(self, row, schema):
        metadata = row.get_metadata()
        if metadata is None or "grouping" not in metadata:
            raise AnalysisException(
                "grouping_id() can only be used with GroupingSets/Cube/Rollup")
        id_binary_string_value = "".join(
            "1" if grouping else "0" for grouping in metadata["grouping"]
        )
        return int(id_binary_string_value, 2)
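
A worked example of the bitmask built above: one bit per grouping column, 1 when the column is aggregated away, and the whole string is read as a binary number.

grouping = [True, False, True]
bits = "".join("1" if g else "0" for g in grouping)
print(bits, int(bits, 2))  # 101 5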
Example #8
def cast_to_array(value, from_type, to_type, options):
    if isinstance(from_type, ArrayType):
        caster = get_caster(from_type=from_type.elementType,
                            to_type=to_type.elementType,
                            options=options)
        return [
            caster(sub_value) if sub_value is not None else None
            for sub_value in value
        ]
    raise AnalysisException("Cannot cast type {0} to array".format(from_type))
Example #9
def cast_to_boolean(value, from_type, options):
    if value == "" or value is None:
        return None
    if isinstance(from_type, StringType):
        lowered = value.lower()
        if lowered == "true":
            return True
        if lowered == "false":
            return False
        return None
    if isinstance(from_type, (NumericType, BooleanType)):
        return bool(value)
    raise AnalysisException(
        "Cannot cast type {0} to boolean".format(from_type))
Example #10
def cast_to_float(value, from_type, options):
    # NB: pysparkling does not mimic Spark's loss of accuracy, nor the
    # bounding of values between float min and max
    try:
        # cast_value is a conversion helper defined elsewhere in the module
        # (not shown in this excerpt)
        return cast_value(value, options=options)
    except ValueError:
        if isinstance(from_type,
                      (DateType, TimestampType, NumericType, StringType)):
            return None
        raise AnalysisException(
            "Cannot cast type {0} to float".format(from_type))
Example #11
def cast_to_map(value, from_type, to_type, options):
    if isinstance(from_type, MapType):
        key_caster = get_caster(from_type=from_type.keyType,
                                to_type=to_type.keyType,
                                options=options)
        value_caster = get_caster(from_type=from_type.valueType,
                                  to_type=to_type.valueType,
                                  options=options)
        return {
            key_caster(key):
            (value_caster(sub_value) if sub_value is not None else None)
            for key, sub_value in value.items()
        }
    raise AnalysisException("Cannot cast type {0} to map".format(from_type))
Example #12
    def eval(self, row, schema):
        value = self.column.eval(row, schema)
        if not isinstance(value, str) or value == "":
            raise AnalysisException(
                "type mismatch: The input csv should be a string literal and not null; "
                "however, got {0}.".format(value))
        # pylint: disable=import-outside-toplevel; circular import
        from pysparkling.sql.internal_utils.readers.csvreader import csv_record_to_row
        from pysparkling.sql.internal_utils.readers.utils import guess_schema_from_strings

        record_as_row = csv_record_to_row(value, self.options)
        schema = guess_schema_from_strings(record_as_row.__fields__,
                                           [record_as_row], self.options)
        return schema.simpleString()
Example #13
    def eval(self, row, schema):
        value_1 = self.arg1.eval(row, schema)
        value_2 = self.arg2.eval(row, schema)
        if value_1 is None or value_2 is None:
            return None

        type_1 = value_1.__class__
        type_2 = value_2.__class__
        if type_1 == type_2 or (isinstance(value_1, (int, float))
                                and isinstance(value_2, (int, float))):
            return self.unsafe_operation(value_1, value_2)

        raise AnalysisException(
            "Cannot resolve {0} due to data type mismatch, first value is {1}, second value is {2}."
            "".format(self, type_1, type_2))
Example #14
def get_caster(from_type, to_type, options):
    to_type_class = to_type.__class__
    if from_type == to_type:
        return partial(identity, options=options)
    if to_type_class == NullType:
        return partial(cast_from_none, from_type=from_type, options=options)
    if to_type_class == TimestampType:
        return get_datetime_parser(options.get("timestampFormat"))
    if to_type_class in DESTINATION_DEPENDENT_CASTERS:
        caster = DESTINATION_DEPENDENT_CASTERS[to_type_class]
        return partial(caster,
                       from_type=from_type,
                       to_type=to_type,
                       options=options)
    if to_type_class in CASTERS:
        return partial(CASTERS[to_type_class],
                       from_type=from_type,
                       options=options)
    raise AnalysisException("Cannot cast from {0} to {1}".format(
        from_type, to_type))
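
get_caster is the dispatch point behind the cast helpers in the other examples: it returns a partially-applied cast function for a (from_type, to_type) pair. A minimal usage sketch, with assumed import paths:

# Import paths are an assumption; adjust to your pysparkling version.
from pysparkling.sql.casts import get_caster
from pysparkling.sql.types import IntegerType, StringType

to_int = get_caster(from_type=StringType(), to_type=IntegerType(), options={})
print(to_int("42"))  # 42, presumably via the bounded cast of Example #15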
Example #15
def _cast_to_bounded_type(name, min_value, max_value, value, from_type,
                          options):
    if value == "" or value is None:
        return None
    size = max_value - min_value + 1
    if isinstance(from_type, DateType):
        return None
    if isinstance(from_type, TimestampType):
        return _cast_to_bounded_type(name,
                                     min_value,
                                     max_value,
                                     cast_to_float(value,
                                                   from_type,
                                                   options=options),
                                     FloatType(),
                                     options=options)
    if isinstance(from_type, StringType):
        casted_value = int(value)
        return casted_value if min_value <= casted_value <= max_value else None
    if isinstance(from_type, (NumericType, BooleanType)):
        value = int(value)
        return value % size if value % size <= max_value else value % -size
    raise AnalysisException("Cannot cast type {0} to {1}".format(
        from_type, name))
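
The modulo branch (value % size, falling back to value % -size) reproduces two's-complement wrap-around for out-of-range numerics. A standalone worked example for a signed byte (min -128, max 127, size 256):

min_value, max_value = -128, 127
size = max_value - min_value + 1  # 256

def wrap(value):
    # Same arithmetic as the NumericType/BooleanType branch above.
    return value % size if value % size <= max_value else value % -size

print(wrap(127))  # 127
print(wrap(128))  # -128
print(wrap(300))  # 44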
Example #16
def cast_to_timestamp(value, from_type, options):
    if value == "" or value is None:
        return None
    if isinstance(value, str):
        date_as_string, time_as_string = split_datetime_as_string(value)
        date = cast_to_date(date_as_string, from_type, options=options)
        time_of_day = parse_time_as_string(time_as_string)

        if date is None or time_of_day is None:
            return None
        naive_datetime = datetime.datetime(
            year=date.year, month=date.month, day=date.day, **time_of_day
        )
        return naive_datetime.astimezone(tzlocal()).replace(tzinfo=None)
    if isinstance(value, datetime.datetime):
        return value
    if isinstance(value, datetime.date):
        return datetime.datetime(year=value.year,
                                 month=value.month,
                                 day=value.day)
    if isinstance(value, (int, float)):
        return datetime.datetime.fromtimestamp(value)
    if isinstance(from_type,
                  (StringType, TimestampType, NumericType, BooleanType)):
        return None
    raise AnalysisException(
        "Cannot cast type {0} to timestamp".format(from_type))
Example #17
def cast_to_date(value, from_type, options):
    if isinstance(value, datetime.datetime):
        return value.date()
    if isinstance(value, datetime.date):
        return value
    if isinstance(value, str):
        # Spark's cast only considers the first non-empty part before a ' ' or a 'T'
        if ' ' in value:
            value = value.strip().split(" ")[0]
        if 'T' in value:
            value = value.split("T")[0]
        date_components = value.split("-")
        if len(date_components) > 3 or len(date_components[0]) != 4:
            return None
        # default month and day to 1
        date_components += ([1] * (3 - len(date_components)))
        try:
            return datetime.date(*map(int, date_components))
        except ValueError:
            return None
    if isinstance(from_type, (TimestampType, DateType, StringType)):
        return None  # other values would have been handled in the lines above

    raise AnalysisException("Cannot cast type {0} to date".format(from_type))
Example #18
    def get_literal_value(self):
        raise AnalysisException("Expecting a Literal, but got {0}: {1}".format(
            type(self), self))
Example #19
    def get_literal_value(self):
        if isinstance(self.expr, Expression):
            return self.expr.get_literal_value()
        raise AnalysisException("Expecting a Literal, but got {0}: {1}".format(
            type(self), self))
Example #20
    def eval(self, row, schema):
        metadata = row.get_metadata()
        if metadata is None or "grouping" not in metadata:
            raise AnalysisException(
                "grouping_id() can only be used with GroupingSets/Cube/Rollup")
        pos = self.column.find_position_in_schema(schema)
        return int(metadata["grouping"][pos])
Example #21
    def get_literal_value(self):
        if hasattr(self.value, "expr") or isinstance(self.value, Expression):
            raise AnalysisException(
                "Value should not be a Column or an Expression, "
                "but got {0}: {1}".format(type(self), self))
        return self.value
Example #22
def cast_from_none(value, from_type, options):
    if value is None:
        return None
    raise AnalysisException(
        "Expected a null value from a field with type {0}, got {1}".format(
            from_type, value))