Example #1
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid instantiation
    of multiple objects in each call."""

    # Refer to the function attribute used for caching the map via a variable holding its name, rather than
    # 'dot' notation, to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)
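A brief usage sketch for the cached mapping above; it assumes numpy is imported as np (as the snippet itself does) and that PySpark is installed. The column name "features" is purely illustrative.

import numpy as np
import pyspark.sql.types as T

# Look up the Spark type for a numpy array's scalar type.
arr = np.arange(10, dtype=np.int16)
spark_type = _numpy_to_spark_mapping()[arr.dtype.type]
assert spark_type == T.ShortType()

# The mapped type can then be used when building a schema for the array column.
field = T.StructField("features", T.ArrayType(spark_type), nullable=True)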
Example #2
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
Example #3
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
Example #4
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the appropriate precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example #5
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (decimal.Decimal, ):
        return types.DecimalType(38, 18)
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray, ):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example #6
def as_spark_type(tpe: Union[str, type, Dtype],
                  *,
                  raise_error: bool = True,
                  prefer_timestamp_ntz: bool = False) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(
            np.__version__) >= LooseVersion("1.21"):
        if (hasattr(tpe, "__origin__")
                and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
                and hasattr(tpe, "__args__")
                and len(tpe.__args__) > 1  # type: ignore[union-attr]
            ):
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(
                    tpe.__args__[1].__args__[0],
                    raise_error=raise_error  # type: ignore[union-attr]
                ))

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
            tpe.__origin__,
            list  # type: ignore[union-attr]
    ):
        element_type = as_spark_type(
            tpe.__args__[0],
            raise_error=raise_error  # type: ignore[union-attr]
        )
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: consider the appropriate precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType(
        ) if prefer_timestamp_ntz else types.TimestampType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str)
                                          and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str)
                                                 and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str)
                                                  and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str)
                                                 and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str)
                                                   and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
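A hedged sketch of how the extra keyword arguments above behave, assuming the pandas extension dtypes referenced by the module are available and PySpark is 3.2+ (for TimestampNTZType):

import datetime
import pandas as pd

as_spark_type("Int64")                                        # LongType()
as_spark_type(pd.StringDtype())                               # StringType()
as_spark_type(datetime.datetime, prefer_timestamp_ntz=True)   # TimestampNTZType()
as_spark_type(object, raise_error=False)                      # None instead of TypeError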
Example #7
        "PolyPhen",
    )
)
parsed_df.printSchema()

# +
# x = parsed_df.filter(f.col("Condel") != '-').limit(10).toPandas()
# x
# -

dtypes = {
    "consequence": t.ArrayType(t.StringType()),
    "Existing_variation": t.ArrayType(t.StringType()),
    "ALLELE_NUM": t.IntegerType(),
    "DISTANCE": t.IntegerType(),
    "STRAND": t.ShortType(),
    "FLAGS": t.ArrayType(t.StringType()),
    "HGNC_ID": t.IntegerType(),
#     "CANONICAL": t.BooleanType(), # needs manual check if column equals "CANONICAL"
    "TREMBL": t.ArrayType(t.StringType()),
    "REFSEQ_MATCH": t.ArrayType(t.StringType()),
    "GENE_PHENO": t.BooleanType(),
    "sift_score": t.FloatType(),
    "polyphen_score": t.FloatType(),
    "EXON": t.ArrayType(t.IntegerType()),
    "INTRON": t.ArrayType(t.IntegerType()),
    "HGVS_OFFSET": t.IntegerType(),
    "AF": t.ArrayType(t.FloatType()),
    "AFR_AF": t.ArrayType(t.FloatType()),
    "AMR_AF": t.ArrayType(t.FloatType()),
    "EAS_AF": t.ArrayType(t.FloatType()),
Example #8
         types.StructField(
             field,
             types.StructType(
                 [types.StructField('0', types.DoubleType())])))
 elif 'Altitude' in field or 'diff_' in field:
     schema_fields.add(
         types.StructField(
             field,
             types.StructType(
                 [types.StructField('0', types.FloatType())])))
 elif 'RSSI_' in field:
     schema_fields.add(
         types.StructField(
             field,
             types.StructType(
                 [types.StructField('0', types.ShortType())])))
 elif 'tmp_' in field:
     schema_fields.add(
         types.StructField(
             field,
             types.StructType(
                 [types.StructField('0', types.LongType())])))
 elif 'mean' in field:
     schema_fields.add(
         types.StructField(
             field,
             types.StructType(
                 [types.StructField('0', types.DoubleType())])))
 else:
     schema_fields.add(
         types.StructField(
Example #9
def _to_stype(tpe) -> X:
    if _is_col(tpe):
        inner = as_spark_type(_get_col_inner(tpe))
        return _Column(inner)
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Regular(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, 'str', 'string'],
    types.ByteType(): [np.int8, 'int8', 'byte'],
    types.ShortType(): [np.int16, 'int16', 'short'],
    types.IntegerType(): [int, 'int', np.int],
    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
    types.FloatType(): [float, 'float', np.float],
    types.DoubleType(): [np.float64, 'float64', 'double'],
    types.TimestampType(): [np.datetime64],
    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items()
                 for other_type in l])


def _build_py_type_dict():
Example #10
from pyspark.sql import types

# base type
DType = types.DataType
# individual types
String = types.StringType()
Date = types.DateType()
Datetime = types.TimestampType()
# numeric types
Float = types.FloatType()
Double = types.DoubleType()
Byte = types.ByteType()
Short = types.ShortType()
Integer = types.IntegerType()
Long = types.LongType()
# groups
Floats = (Float, Double)
Integers = (Byte, Short, Integer, Long)
Numerics = Floats + Integers
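One possible way to use the alias groups above, e.g. to pick out the numeric columns of a DataFrame schema; assumes an existing SparkSession named spark and illustrative column names.

df = spark.createDataFrame([(1, 2.5, "a")], ["id", "score", "label"])
numeric_cols = [f.name for f in df.schema.fields if f.dataType in Numerics]
# numeric_cols == ['id', 'score']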
Example #11
 def make_udf(threshs):
     return F.udf(lambda x: sum([x > y for y in threshs]), T.ShortType())
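A hedged usage sketch for the UDF factory above: it buckets a value by counting how many thresholds it exceeds. It assumes `F` is pyspark.sql.functions and `T` is pyspark.sql.types (as the fragment implies), plus an existing SparkSession named spark; the thresholds are illustrative.

df = spark.createDataFrame([(5,), (15,), (25,)], ["value"])
bucket = make_udf([10, 20])
df.withColumn("bucket", bucket("value")).show()
# value=5 -> bucket=0, value=15 -> bucket=1, value=25 -> bucket=2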
Example #12
def get_common_spark_testing_client(data_directory, connect):
    spark = (
        SparkSession.builder
        .appName("ibis_testing")
        .master("local[1]")
        .config("spark.cores.max", 1)
        .config("spark.executor.heartbeatInterval", "3600s")
        .config("spark.executor.instances", 1)
        .config("spark.network.timeout", "4200s")
        .config("spark.sql.execution.arrow.pyspark.enabled", False)
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
        .config("spark.storage.blockManagerSlaveTimeoutMs", "4200s")
        .config("spark.ui.showConsoleProgress", False)
        .config('spark.default.parallelism', 1)
        .config('spark.dynamicAllocation.enabled', False)
        .config('spark.rdd.compress', False)
        .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
        .config('spark.shuffle.compress', False)
        .config('spark.shuffle.spill.compress', False)
        .config('spark.sql.shuffle.partitions', 1)
        .config('spark.ui.enabled', False)
        .getOrCreate()
    )
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        ).repartition(num_partitions).sort('index'))

    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_batting.createOrReplaceTempView("batting")

    df_awards_players = (s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {
            'a': [[2, 4], [3, 5]]
        })],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame([({
        (1, 3): [[2, 4], [3, 5]]
    }, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i])
         for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key':
            list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
Example #13
def spark_client_testing(data_directory):
    pytest.importorskip('pyspark')

    import pyspark.sql.types as pt

    client = ibis.spark.connect()

    df_functional_alltypes = client._session.read.csv(
        path=str(data_directory / 'functional_alltypes.csv'),
        schema=pt.StructType([
            pt.StructField('index', pt.IntegerType(), True),
            pt.StructField('Unnamed: 0', pt.IntegerType(), True),
            pt.StructField('id', pt.IntegerType(), True),
            # cast below, Spark can't read 0/1 as bool
            pt.StructField('bool_col', pt.ByteType(), True),
            pt.StructField('tinyint_col', pt.ByteType(), True),
            pt.StructField('smallint_col', pt.ShortType(), True),
            pt.StructField('int_col', pt.IntegerType(), True),
            pt.StructField('bigint_col', pt.LongType(), True),
            pt.StructField('float_col', pt.FloatType(), True),
            pt.StructField('double_col', pt.DoubleType(), True),
            pt.StructField('date_string_col', pt.StringType(), True),
            pt.StructField('string_col', pt.StringType(), True),
            pt.StructField('timestamp_col', pt.TimestampType(), True),
            pt.StructField('year', pt.IntegerType(), True),
            pt.StructField('month', pt.IntegerType(), True),
        ]),
        mode='FAILFAST',
        header=True,
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = client._session.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = client._session.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = client._session.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = client._session.createDataFrame([((1, 2, 'a'), )],
                                                ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = client._session.createDataFrame([(
        [1, 2],
        [[3, 4], [5, 6]],
        {
            'a': [[2, 4], [3, 5]]
        },
    )], [
        'list_of_ints', 'list_of_list_of_ints',
        'map_string_list_of_list_of_ints'
    ])
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = client._session.createDataFrame([({
        (1, 3): [[2, 4], [3, 5]]
    }, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    return client
Example #14
    pass


PRIMITIVES = [
    (NoneType, t.NullType()),
    (int, t.IntegerType()),
    (float, t.DoubleType()),
    (str, t.StringType()),
    (bytes, t.BinaryType()),
    (bytearray, t.BinaryType()),
    (bool, t.BooleanType()),
]

SYNTHETIC_PRIMITIVES = [
    (long, t.LongType()),
    (short, t.ShortType()),
    (byte, t.ByteType()),
]

DATE_TYPES = [
    (date, t.DateType()),
    (datetime, t.TimestampType()),
]

OPTIONALS_PY = [
    (str, (False, str)),
    (decimal(1, 2), (False, decimal(1, 2))),
    (Optional[decimal(3, 4)], (True, decimal(3, 4))),
    (List[str], (False, List[str])),
    (Optional[str], (True, str)),
    (Optional[List[str]], (True, List[str])),
Example #15
def infer_spark_dtype(df, col):
    """
    Deduce the correct Spark dtype from the pandas dtype of column col in pandas DataFrame df.
    """

    logger = logging.getLogger(__name__ + ".infer_spark_dtype")

    pd_dtype = df.dtypes[col]

    # get a sample from column col
    sample = df[col].dropna()

    if sample.shape[0] == 0:
        logger.warning("column %s of dtype %s containing nulls found" % (col, pd_dtype))
        sample = None
    else:
        sample = sample.iloc[0]

    # infer spark dtype

    # datetimes
    if pd.api.types.is_datetime64_any_dtype(pd_dtype):
        ret = T.TimestampType()

    # ints
    elif (pd_dtype == 'int8') or (pd_dtype == 'int16'):  # int8, int16
        ret = T.ShortType()

    elif pd_dtype == 'int32':
        ret = T.IntegerType()

    elif pd.api.types.is_int64_dtype(pd_dtype):
        ret = T.LongType()

    # uints
    elif pd_dtype == 'uint8':
        ret = T.ShortType()

    elif pd_dtype == 'uint16':
        ret = T.IntegerType()

    elif pd_dtype == 'uint32':
        ret = T.LongType()

    elif pd_dtype == 'uint64':
        logger.warning("converting column %s of type uint64 to spark LongType - overflows will be nulls" % col)
        ret = T.LongType()

    # floats
    elif (pd_dtype == 'float16') or (pd_dtype == 'float32'):
        ret = T.FloatType()

    elif pd_dtype == 'float64':  # float64
        ret = T.DoubleType()

    elif pd_dtype == 'bool':
        ret = T.BooleanType()

    # object
    elif pd_dtype == 'object':

        if (sample is None) or (isinstance(sample, str)):
            logger.warning("converting column %s of type object to spark StringType" % col)
            ret = T.StringType()

        elif isinstance(sample, tuple):
            raise NotImplementedError("cannot convert column %s containing tuples to spark" % col)

        else:
            raise NotImplementedError("values in column %s of type object not understood" % col)

    # category
    elif pd.api.types.is_categorical_dtype(pd_dtype):
        logger.warning("converting column %s of type category to spark StringType" % col)
        ret = T.StringType()

    else:
        raise NotImplementedError("column %s of type %s not understood" % (col, pd_dtype))

    return ret
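A short sketch of building a full Spark schema with the helper above; it assumes `pd` is pandas, `T` is pyspark.sql.types and `logging` is configured, as the snippet implies. The column names are illustrative.

import pandas as pd

pdf = pd.DataFrame({
    "id": pd.Series([1, 2], dtype="int32"),
    "name": ["a", "b"],
    "score": [0.5, 1.5],
})
schema = T.StructType(
    [T.StructField(c, infer_spark_dtype(pdf, c), True) for c in pdf.columns])
# id -> IntegerType, name -> StringType (logged as object), score -> DoubleType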
Example #16
def get_common_spark_testing_client(data_directory, connect):
    pytest.importorskip('pyspark')
    import pyspark.sql.types as pt
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .config('spark.default.parallelism', 4)
        .config('spark.driver.bindAddress', '127.0.0.1')
        .getOrCreate()
    )
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        ).repartition(num_partitions).sort('index'))

    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = (s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {
            'a': [[2, 4], [3, 5]]
        })],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame([({
        (1, 3): [[2, 4], [3, 5]]
    }, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i])
         for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key':
            list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
Example #17
                    .master("yarn")\
                    .getOrCreate()

spark_session.sparkContext.addFile('parse_tool.py')

from parse_tool import parse_logs

# User logs collection
user_logs = spark_session.sparkContext.textFile("/data/access_logs/big_log/")

parsed_logs = user_logs.map(parse_logs) \
                       .filter(lambda parse_res : parse_res[8] != "") \
                       .map(lambda parse_res : [
                          parse_res[0],                       # ip
                          parse_res[0] + '_' + parse_res[7],  # ip+user_agent
                          parse_res[8]                        # hour
                       ])

schema = tp.StructType().add("ip", tp.StringType()) \
                        .add("user_id", tp.StringType()) \
                        .add("hour", tp.ShortType())

user_log_df = spark_session.createDataFrame(parsed_logs, schema)

res = user_log_df.groupby("hour") \
                 .agg(fn.countDistinct("user_id").alias("unique_visitors_count")) \
                 .orderBy("hour") \
                 .collect()

for hour, count in res:
    print(hour, count, sep='\t')
Example #18
        else:
            parameters = getattr(tuple_type, "__args__")
        return _DataFrame([as_spark_type(t) for t in parameters])
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, "str", "string"],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, "int8", "byte"],
    types.ShortType(): [np.int16, "int16", "short"],
    types.IntegerType(): [int, "int", np.int, np.int32],
    types.LongType(): [np.int64, "int64", "long", "bigint"],
    types.FloatType(): [float, "float", np.float],
    types.DoubleType(): [np.float64, "float64", "double"],
    types.TimestampType(): [datetime.datetime, np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, "boolean", "bool", np.bool],
    types.ArrayType(types.StringType()): [],
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items()
                 for other_type in l] + [(spark_type, spark_type)
                                         for (spark_type, _) in _base.items()])
Example #19
def as_spark_type(tpe: typing.Union[str, type, Dtype],
                  *,
                  raise_error: bool = True) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__,
                                                   list):  # type: ignore
        element_type = as_spark_type(tpe.__args__[0],
                                     raise_error=raise_error)  # type: ignore
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: consider the appropriate precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str)
                                          and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str)
                                                 and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str)
                                                  and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str)
                                                 and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str)
                                                   and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
Example #20
def process_song_data(spark, input_data, output_data):
    """
    Description: Processes song-data files from the given input path, transforms
    the data from the JSON files into songs and artists Spark tables, and writes
    these tables to the given output path as parquet tables.

    Arguments:
        spark: SparkSession object.
        input_data: Path to the input JSON files.
        output_data: Path to the output directory that stores output parquet tables.

    Returns:
        df: Song data dataframe.
    """
    # get filepath to song data file
    song_data = input_data + 'song-data/A/B/*/*.json'

    # define schema for song data file
    song_schema = t.StructType([
        t.StructField("artist_id", t.StringType(), True),
        t.StructField("artist_latitude", t.DecimalType(11, 7), True),
        t.StructField("artist_location", t.StringType(), True),
        t.StructField("artist_longitude", t.DecimalType(11, 7), True),
        t.StructField("artist_name", t.StringType(), True),
        t.StructField("duration", t.DecimalType(11, 7), True),
        t.StructField("num_songs", t.IntegerType(), True),
        t.StructField("song_id", t.StringType(), True),
        t.StructField("title", t.StringType(), True),
        t.StructField("year", t.ShortType(), True)
    ])

    # read song data file using schema
    df = spark \
        .read \
        .format("json") \
        .schema(song_schema) \
        .load(song_data)

    # extract columns to create songs table
    songs_table = df \
        .select(['song_id', 'title', 'artist_id', 'year', 'duration']) \
        .dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_output = output_data + 'songs'

    songs_table \
        .write \
        .partitionBy('year', 'artist_id') \
        .option("path", songs_output) \
        .saveAsTable('songs', format='parquet')

    # extract columns to create artists table
    artists_table = df \
        .select(['artist_id', 'artist_name', 'artist_location', 'artist_longitude', 'artist_latitude']) \
        .dropDuplicates()

    # write artists table to parquet files
    artists_output = output_data + 'artists'

    artists_table \
        .write \
        .option("path", artists_output) \
        .saveAsTable('artists', format='parquet')

    return df
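A hypothetical driver for the function above; the bucket paths are placeholders, and `t` must be bound to pyspark.sql.types, as the schema definition assumes.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sparkify-etl").getOrCreate()
song_df = process_song_data(spark,
                            input_data="s3a://my-input-bucket/",
                            output_data="s3a://my-output-bucket/")
song_df.printSchema()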