def resolve_partitions(patterns):
    """
    Given a list of patterns, returns all the files matching or in folders matching
    one of them.

    The file are returned in a list of tuple of 2 elements:
    - The first tuple is the file path
    - The second being the partition keys and values if any were encountered else None

    In addition to this list, return, if the data was partitioned, a schema for the
    partition keys, else None

    :type patterns: list of str
    :rtype: Tuple[List[str], List[Optional[Row]], Optional[StructType]]
    """
    file_paths = File.get_content(patterns)
    if not file_paths:
        raise AnalysisException('Path does not exist: {0}'.format(patterns))
    partitions = {}
    for file_path in file_paths:
        if '=' in file_path:
            row = row_from_keyed_values(
                folder.split('=') for folder in file_path.split('/')[:-1]
                if folder.count('=') == 1)
            partitions[file_path] = row
        else:
            partitions[file_path] = None

    partitioning_field_sets = set(tuple(p.__fields__) for p in partitions.values()
                                  if p is not None)
    if len(partitioning_field_sets) > 1:
        raise Exception(
            'Conflicting directory structures detected while reading {0}. '
            'All partitions must have the same partitioning fields, found fields {1}'
            .format(
                ','.join(patterns),
                ' and also '.join(
                    str(fields) for fields in partitioning_field_sets),
            ))

    if partitioning_field_sets:
        if any(value is None for value in partitions.values()):
            raise AnalysisException(
                'Unable to parse malformed partition folders {0} among {1}'.format(
                    [
                        path
                        for path, value in partitions.items() if value is None
                    ],
                    file_paths,
                ))
        partitioning_fields = partitioning_field_sets.pop()
        partition_schema = guess_schema_from_strings(partitioning_fields,
                                                     partitions.values(),
                                                     options={})
    else:
        partition_schema = None

    return partitions, partition_schema
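
A minimal sketch of the folder parsing performed above, in plain Python with a hypothetical Hive-style path (no library imports needed):

path = 'data/year=2021/month=3/part-00000.csv'
pairs = [folder.split('=')
         for folder in path.split('/')[:-1]
         if folder.count('=') == 1]
print(pairs)  # [['year', '2021'], ['month', '3']]
# row_from_keyed_values(pairs) would then build Row(year='2021', month='3')
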
def get_checked_matches(matches, field_name, schema, show_id):
    if not matches:
        raise AnalysisException(
            "Unable to find the column '{0}' among {1}".format(
                field_name, format_schema(schema, show_id)))

    if len(matches) > 1:
        raise AnalysisException(
            "Reference '{0}' is ambiguous, found {1} columns matching it.".
            format(field_name, len(matches)))

    return matches.pop()
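
Behaviour sketch for the resolver above; the set literal stands in for whatever match collection the caller built, and the field and schema arguments are illustrative:

matches = {'age'}
print(get_checked_matches(matches, 'age', schema=None, show_id=False))  # 'age'
# an empty set would raise AnalysisException (column not found);
# two or more matches would raise AnalysisException (ambiguous reference)
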
    def eval(self, row, schema):
        value_1 = self.arg1.eval(row, schema)
        value_2 = self.arg2.eval(row, schema)
        if value_1 is None or value_2 is None:
            return None

        type_1 = value_1.__class__
        type_2 = value_2.__class__
        if type_1 == type_2:
            return self.unsafe_operation(value_1, value_2)

        try:
            order_1 = INTERNAL_TYPE_ORDER.index(type_1)
            order_2 = INTERNAL_TYPE_ORDER.index(type_2)
        except ValueError as e:
            raise AnalysisException(f'Unable to process type: {e}') from None

        spark_type_1 = python_to_spark_type(type_1)
        spark_type_2 = python_to_spark_type(type_2)

        if order_1 > order_2:
            caster = get_caster(from_type=spark_type_2,
                                to_type=spark_type_1,
                                options={})
            value_2 = caster(value_2)
        elif order_1 < order_2:
            caster = get_caster(from_type=spark_type_1,
                                to_type=spark_type_2,
                                options={})
            value_1 = caster(value_1)

        return self.unsafe_operation(value_1, value_2)
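
The promotion rule above in isolation: the operand whose type sits lower in the precedence list is cast up to the other operand's type before the operation runs. A self-contained mimic (the order below is illustrative, not the library's actual INTERNAL_TYPE_ORDER):

order = [bool, int, float, str]
value_1, value_2 = 2, 3.5
if order.index(type(value_1)) < order.index(type(value_2)):
    value_1 = type(value_2)(value_1)  # int 2 is promoted to float 2.0
print(value_1 + value_2)  # 6.0
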
def guess_type_from_values_as_string(values, options):
    # Reproduces inferences available in Spark
    # PartitioningUtils.inferPartitionColumnValue()
    # located in org.apache.spark.sql.execution.datasources
    tested_types = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType(),
    )
    string_type = StringType()
    for tested_type in tested_types:
        type_caster = get_caster(from_type=string_type,
                                 to_type=tested_type,
                                 options=options)
        try:
            for value in values:
                casted_value = type_caster(value)
                if casted_value is None and value not in ('null', None):
                    raise ValueError
            return tested_type
        except ValueError:
            pass
    # Should never happen
    raise AnalysisException(
        'Unable to find a matching type for some fields, even StringType did not work'
    )
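
A self-contained mimic of the inference loop above, with plain Python casters standing in for the library's (same first-match-wins logic):

def guess(values):
    for caster, name in ((int, 'IntegerType'), (float, 'DoubleType'), (str, 'StringType')):
        try:
            for value in values:
                caster(value)
            return name
        except ValueError:
            continue

print(guess(['1', '2']))    # IntegerType
print(guess(['1', '2.5']))  # DoubleType
print(guess(['a', 'b']))    # StringType
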
def cast_to_binary(value, from_type, options):
    if isinstance(from_type, StringType):
        # noinspection PyTypeChecker
        return bytearray(value, 'utf-8')
    if isinstance(from_type, BinaryType):
        return value
    raise AnalysisException('Cannot cast type {0} to binary'.format(from_type))

    def eval(self, row, schema):
        column_value = self.column.eval(row, schema)
        if isinstance(column_value, (list, dict)):
            return len(column_value)
        raise AnalysisException(
            '{0} value should be an array or a map, got {1}'.format(
                self.column, type(column_value)))

    def eval(self, row, schema):
        metadata = row.get_metadata()
        if metadata is None or 'grouping' not in metadata:
            raise AnalysisException(
                'grouping() can only be used with GroupingSets/Cube/Rollup')
        pos = self.column.find_position_in_schema(schema)
        return int(metadata['grouping'][pos])
def cast_to_boolean(value, from_type, options):
    if value == '' or value is None:
        return None
    if isinstance(from_type, StringType):
        lowered = value.lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False
        return None
    if isinstance(from_type, (NumericType, BooleanType)):
        return bool(value)
    raise AnalysisException('Cannot cast type {0} to boolean'.format(from_type))
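
Quick checks of the branches above, assuming cast_to_boolean and the type classes are imported from fast_pyspark_tester:

for raw, expected in (('true', True), ('FALSE', False), ('yes', None), ('', None)):
    assert cast_to_boolean(raw, StringType(), options={}) is expected
assert cast_to_boolean(0, BooleanType(), options={}) is False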

    def eval(self, row, schema):
        metadata = row.get_metadata()
        if metadata is None or 'grouping' not in metadata:
            raise AnalysisException(
                'grouping_id() can only be used with GroupingSets/Cube/Rollup')
        id_binary_string_value = ''.join('1' if grouping else '0'
                                         for grouping in metadata['grouping'])
        return int(id_binary_string_value, 2)
Example #10
def cast_to_map(value, from_type, to_type, options):
    if isinstance(from_type, MapType):
        key_caster = get_caster(from_type=from_type.keyType, to_type=to_type.keyType, options=options)
        value_caster = get_caster(from_type=from_type.valueType, to_type=to_type.valueType, options=options)
        return {
            key_caster(key): (value_caster(sub_value) if sub_value is not None else None)
            for key, sub_value in value.items()
        }
    raise AnalysisException('Cannot cast type {0} to map'.format(from_type))
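
Hedged usage sketch, assuming the fast_pyspark_tester type classes are importable: recast map keys from string to int while leaving values alone; None values survive, as in the comprehension above:

print(cast_to_map(
    {'1': 'a', '2': None},
    MapType(StringType(), StringType()),
    MapType(IntegerType(), StringType()),
    options={},
))  # {1: 'a', 2: None}
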
Example #11
def cast_to_float(value, from_type, options):
    # NB: fast_pyspark_tester mimics neither Spark's loss of precision nor its
    # bounding of values between the float min and max
    try:
        return cast_value(value, options=options)
    except ValueError:
        if isinstance(from_type, (DateType, TimestampType, NumericType, StringType)):
            return None
        raise AnalysisException(f'Cannot cast type {from_type} to float') from None
Example #12
def get_caster(from_type, to_type, options):
    to_type_class = to_type.__class__
    if from_type == to_type:
        return partial(identity, options=options)
    if to_type_class == NullType:
        return partial(cast_from_none, from_type=from_type, options=options)
    if to_type_class == TimestampType:
        return get_datetime_parser(options.get('timestampFormat'))
    if to_type_class in DESTINATION_DEPENDENT_CASTERS:
        caster = DESTINATION_DEPENDENT_CASTERS[to_type_class]
        return partial(caster, from_type=from_type, to_type=to_type, options=options)
    if to_type_class in CASTERS:
        return partial(CASTERS[to_type_class], from_type=from_type, options=options)
    raise AnalysisException('Cannot cast from {0} to {1}'.format(from_type, to_type))
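
Usage sketch for the dispatcher above, assuming the standard fast_pyspark_tester types: build a caster once, then apply it to many values:

to_int = get_caster(from_type=StringType(), to_type=IntegerType(), options={})
print([to_int(raw) for raw in ('1', '2', '3')])  # [1, 2, 3]
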
Example #13
    def eval(self, row, schema):
        value = self.column.eval(row, schema)
        if not isinstance(value, str) or value == '':
            raise AnalysisException(
                'type mismatch: The input csv should be a string literal and not null; '
                'however, got {0}.'.format(value))
        # pylint: disable=import-outside-toplevel; circular import
        from fast_pyspark_tester.sql.internal_utils.readers.csvreader import csv_record_to_row
        from fast_pyspark_tester.sql.internal_utils.readers.utils import guess_schema_from_strings

        record_as_row = csv_record_to_row(value, self.options)
        schema = guess_schema_from_strings(record_as_row.__fields__,
                                           [record_as_row], self.options)
        return schema.simpleString()
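
Hedged illustration of what the expression above evaluates to, mirroring Spark's schema_of_csv (the _cN column names are the CSV reader's defaults):

# evaluated over the literal '1,abc', the guessed schema's simpleString
# would be something like: 'struct<_c0:int,_c1:string>'
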
    def eval(self, row, schema):
        value_1 = self.arg1.eval(row, schema)
        value_2 = self.arg2.eval(row, schema)
        if value_1 is None or value_2 is None:
            return None

        type_1 = value_1.__class__
        type_2 = value_2.__class__
        if type_1 == type_2 or (isinstance(value_1, (int, float))
                                and isinstance(value_2, (int, float))):
            return self.unsafe_operation(value_1, value_2)

        raise AnalysisException(
            'Cannot resolve {0} due to data type mismatch, first value is {1}, second value is {2}.'
            ''.format(self, type_1, type_2))
Example #15
def _cast_to_bounded_type(name, min_value, max_value, value, from_type, options):
    if value == '' or value is None:
        return None
    size = max_value - min_value + 1
    if isinstance(from_type, DateType):
        return None
    if isinstance(from_type, TimestampType):
        return _cast_to_bounded_type(
            name,
            min_value,
            max_value,
            cast_to_float(value, from_type, options=options),
            FloatType(),
            options=options,
        )
    if isinstance(from_type, StringType):
        casted_value = int(value)
        return casted_value if min_value <= casted_value <= max_value else None
    if isinstance(from_type, (NumericType, BooleanType)):
        value = int(value)
        return value % size if value % size <= max_value else value % -size
    raise AnalysisException('Cannot cast type {0} to {1}'.format(from_type, name))
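
Worked example of the wrap-around branch above for a ByteType-like range of [-128, 127], in plain Python:

min_value, max_value = -128, 127
size = max_value - min_value + 1  # 256
value = 130
# 130 % 256 == 130 > 127, so the negative modulo applies: 130 % -256 == -126,
# matching Spark's byte overflow behaviour
print(value % size if value % size <= max_value else value % -size)  # -126
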
Example #16
    def save(self):
        output_path = self.path
        mode = self.mode
        if os.path.exists(output_path):
            if mode == 'ignore':
                return
            if mode in ('error', 'errorifexists'):
                raise AnalysisException(
                    'path {0} already exists.;'.format(output_path))
            if mode == 'overwrite':
                shutil.rmtree(output_path)
                os.makedirs(output_path)
        else:
            os.makedirs(output_path)

        self.apply_on_aggregated_data(col(
            WriteInFolder(writer=self))).collect()

        success_path = os.path.join(output_path, '_SUCCESS')

        with open(success_path, 'w'):
            pass
Example #17
def cast_to_date(value, from_type, options):
    if isinstance(value, datetime.datetime):
        return value.date()
    if isinstance(value, datetime.date):
        return value
    if isinstance(value, str):
        # Spark's cast only considers the first non-empty part before a ' ' or a 'T'
        if ' ' in value:
            value = value.strip().split(' ')[0]
        if 'T' in value:
            value = value.split('T')[0]
        date_components = value.split('-')
        if len(date_components) > 3 or len(date_components[0]) != 4:
            return None
        # default month and day to 1
        date_components += [1] * (3 - len(date_components))
        try:
            return datetime.date(*map(int, date_components))
        except ValueError:
            return None
    if isinstance(from_type, (TimestampType, DateType, StringType)):
        return None  # other values would have been handled by the branches above

    raise AnalysisException('Cannot cast type {0} to date'.format(from_type))
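
Expected outcomes of the string branch above, assuming cast_to_date and StringType are imported from fast_pyspark_tester:

print(cast_to_date('2021-03-01 12:00:00', StringType(), options={}))  # 2021-03-01
print(cast_to_date('2021-03', StringType(), options={}))  # 2021-03-01 (day defaults to 1)
print(cast_to_date('21-03-01', StringType(), options={}))  # None: the year needs 4 digits
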
Example #18
def cast_to_timestamp(value, from_type, options):
    if value == '' or value is None:
        return None
    if isinstance(value, str):
        date_as_string, time_as_string = split_datetime_as_string(value)
        date = cast_to_date(date_as_string, from_type, options=options)
        time_of_day = parse_time_as_string(time_as_string)

        return (
            None
            if date is None or time_of_day is None
            else datetime.datetime(year=date.year, month=date.month, day=date.day, **time_of_day)
            .astimezone(tzlocal())
            .replace(tzinfo=None)
        )
    if isinstance(value, datetime.datetime):
        return value
    if isinstance(value, datetime.date):
        return datetime.datetime(year=value.year, month=value.month, day=value.day)
    if isinstance(value, (int, float)):
        return datetime.datetime.fromtimestamp(value)
    if isinstance(from_type, (StringType, TimestampType, NumericType, BooleanType)):
        return None
    raise AnalysisException('Cannot cast type {0} to timestamp'.format(from_type))
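
Expected outcomes for the non-string branches above, assuming imports as before (the epoch result depends on the local timezone):

import datetime
print(cast_to_timestamp(datetime.date(2021, 3, 1), DateType(), options={}))
# -> datetime.datetime(2021, 3, 1, 0, 0): a date becomes midnight of that day
print(cast_to_timestamp(0, IntegerType(), options={}))
# -> the Unix epoch rendered in local time
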
Example #19
    def get_literal_value(self):
        if isinstance(self.expr, Expression):
            return self.expr.get_literal_value()
        raise AnalysisException("Expecting a Literal, but got {0}: {1}".format(
            type(self), self))
Example #20
def cast_to_array(value, from_type, to_type, options):
    if isinstance(from_type, ArrayType):
        caster = get_caster(from_type=from_type.elementType,
                            to_type=to_type.elementType,
                            options=options)
        return [caster(sub_value) if sub_value is not None else None
                for sub_value in value]
    raise AnalysisException('Cannot cast type {0} to array'.format(from_type))
Example #21
    def get_literal_value(self):
        raise AnalysisException("Expecting a Literal, but got {0}: {1}".format(
            type(self), self))
Example #22
def cast_from_none(value, from_type, options):
    if value is None:
        return None
    raise AnalysisException('Expected a null value from a field with type {0}, got {1}'.format(from_type, value))