def schema(self):
        '''
            Return the data type that represents a row from the received data list.
        '''
        from pyspark.sql.types import (IntegerType, LongType, ShortType, StringType,
                                        StructField, StructType)

        return StructType(
            [
                StructField('frame_time', StringType(), True),
                StructField('unix_tstamp', LongType(), True),
                StructField('frame_len', IntegerType(), True),
                StructField('ip_dst', StringType(), True),
                StructField('ip_src', StringType(), True),
                StructField('dns_qry_name', StringType(), True),
                StructField('dns_qry_class', StringType(), True),
                StructField('dns_qry_type', IntegerType(), True),
                StructField('dns_qry_rcode', IntegerType(), True),
                StructField('dns_a', StringType(), True),
                StructField('y', ShortType(), True),
                StructField('m', ShortType(), True),
                StructField('d', ShortType(), True),
                StructField('h', ShortType(), True)
            ]
        )
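A minimal usage sketch for a schema method like the one above; the SparkSession named spark, the parser object holding the method, and the sample row are assumptions, not part of the source:

# Hypothetical: apply the DNS schema to parsed rows; the tuple order must match the StructType above.
sample_rows = [('Jan  1, 2019 00:00:00.000000000', 1546300800, 74,
                '10.0.0.2', '10.0.0.1', 'example.com', '0x00000001', 1, 0,
                '93.184.216.34', 2019, 1, 1, 0)]
dns_df = spark.createDataFrame(sample_rows, schema=parser.schema())
dns_df.printSchema()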
Example #2
    def test_apply_schema(self):
        from datetime import date, datetime
        rdd = self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                    date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                                    {"a": 1}, (2,), [1, 2, 3], None)])
        schema = StructType([
            StructField("byte1", ByteType(), False),
            StructField("byte2", ByteType(), False),
            StructField("short1", ShortType(), False),
            StructField("short2", ShortType(), False),
            StructField("int1", IntegerType(), False),
            StructField("float1", FloatType(), False),
            StructField("date1", DateType(), False),
            StructField("time1", TimestampType(), False),
            StructField("map1", MapType(StringType(), IntegerType(), False), False),
            StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
            StructField("list1", ArrayType(ByteType(), False), False),
            StructField("null1", DoubleType(), True)])
        df = self.spark.createDataFrame(rdd, schema)
        results = df.rdd.map(lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int1, x.float1,
                             x.date1, x.time1, x.map1["a"], x.struct1.b, x.list1, x.null1))
        r = (127, -128, -32768, 32767, 2147483647, 1.0, date(2010, 1, 1),
             datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None)
        self.assertEqual(r, results.first())

        with self.tempView("table2"):
            df.createOrReplaceTempView("table2")
            r = self.spark.sql("SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " +
                               "short1 + 1 AS short1, short2 - 1 AS short2, int1 - 1 AS int1, " +
                               "float1 + 1.5 as float1 FROM table2").first()

            self.assertEqual((126, -127, -32767, 32766, 2147483646, 2.5), tuple(r))
def load_df_with_schema(spark):
    schema = StructType([
        StructField("dateCrawled", TimestampType(), True),
        StructField("name", StringType(), True),
        StructField("seller", StringType(), False),
        StructField("offerType", StringType(), True),
        StructField("price", LongType(), True),
        StructField("abtest", StringType(), True),
        StructField("vehicleType", StringType(), True),
        StructField("yearOfRegistration", StringType(), True),
        StructField("gearbox", StringType(), True),
        StructField("powerPS", ShortType(), True),
        StructField("model", StringType(), True),
        StructField("kilometer", LongType(), True),
        StructField("monthOfRegistration", StringType(), True),
        StructField("fuelType", StringType(), True),
        StructField("brand", StringType(), True),
        StructField("notRepairedDamage", StringType(), True),
        StructField("dateCreated", DateType(), True),
        StructField("nrOfPictures", ShortType(), True),
        StructField("postalCode", StringType(), True),
        StructField("lastSeen", TimestampType(), True)
    ])

    df = spark \
        .read \
        .format("csv") \
        .schema(schema) \
        .option("header", "true") \
        .load("/home/gavaskarrathnam/dataeng/etl-analytics-pyspark/data/autos.csv")

    print("Data loaded into PySpark", "\n")
    return df
Example #4
 @classmethod
 def setUpClass(cls):
     cls._TestField1a = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
     cls._TestField1b = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
     cls._TestField1c = UnischemaField('Random', np.string_, (), ScalarCodec(StringType()), False)
     cls._TestField2a = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
     cls._TestField2b = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
     cls._TestField2c = UnischemaField('ID', np.int32, (), ScalarCodec(ShortType()), False)
Example #5
def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_,
                       (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32,
                       (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8,
                       (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32,
                       (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal,
                       (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc',
                               int32_scalar=10,
                               uint8_scalar=20,
                               int32_matrix=np.int32((10, 20, 3)),
                               decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None,
                               int32_scalar=10,
                               uint8_scalar=20,
                               int32_matrix=None,
                               decimal_scalar=Decimal(123) / Decimal(10))
Example #6
def test_assert_schema_equal(spark):
    expected = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', ShortType(), True)
    ])
    result = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', ShortType(), True)
    ])

    assert_schema_equal(expected, result)

    result = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', IntegerType(), True)
    ])

    with pytest.raises(AssertionError):
        assert_schema_equal(expected, result)

    result = StructType([
        StructField('Name', StringType(), True),
        StructField('age', ShortType(), True)
    ])

    with pytest.raises(AssertionError):
        assert_schema_equal(expected, result)

    result = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', ShortType(), False)
    ])

    with pytest.raises(AssertionError):
        assert_schema_equal(expected, result)
Example #7
def loadDFWithSchema(spark):
    schema = StructType([
        StructField('dateCrawled', TimestampType(), True),
        StructField('name', StringType(), True),
        StructField('seller', StringType(), True),
        StructField('offerType', StringType(), True),
        StructField('price', LongType(), True),
        StructField('abtest', StringType(), True),
        StructField('vehicleType', StringType(), True),
        StructField('yearOfRegistration', StringType(), True),
        StructField('gearbox', StringType(), True),
        StructField('powerPS', ShortType(), True),
        StructField('model', StringType(), True),
        StructField('kilometer', LongType(), True),
        StructField('monthOfRegistration', StringType(), True),
        StructField('fuelType', StringType(), True),
        StructField('brand', StringType(), True),
        StructField('notRepairedDamage', StringType(), True),
        StructField('dateCreated', DateType(), True),
        StructField('nrOfPictures', ShortType(), True),
        StructField('postalCode', StringType(), True),
        StructField('lastSeen', TimestampType(), True)
    ])

    df = spark \
            .read \
            .format('csv') \
            .schema(schema) \
            .option('header', 'true') \
            .load(DATASET)

    print('Data loaded into PySpark', '\n')

    # Returning the DataFrame
    return df
 class Fixture(object):
     string1 = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
     string2 = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
     string_implicit = UnischemaField('random', np.string_, ())
     string_nullable = UnischemaField('random', np.string_, (), nullable=True)
     other_string = UnischemaField('Random', np.string_, (), ScalarCodec(StringType()), False)
     int1 = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
     int2 = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
     other_int = UnischemaField('ID', np.int32, (), ScalarCodec(ShortType()), False)
Example #9
    class Fixture(object):
        TestField1a = UnischemaField('random', np.string_, (),
                                     ScalarCodec(StringType()), False)
        TestField1b = UnischemaField('random', np.string_, (),
                                     ScalarCodec(StringType()), False)
        TestField1c = UnischemaField('Random', np.string_, (),
                                     ScalarCodec(StringType()), False)

        TestField2a = UnischemaField('id', np.int32, (),
                                     ScalarCodec(ShortType()), False)
        TestField2b = UnischemaField('id', np.int32, (),
                                     ScalarCodec(ShortType()), False)
        TestField2c = UnischemaField('ID', np.int32, (),
                                     ScalarCodec(ShortType()), False)
Example #10
    def test_insert_explicit_nulls(self):
        TestSchema = Unischema('TestSchema', [
            UnischemaField('nullable', np.int32, (), ScalarCodec(StringType()), True),
            UnischemaField('not_nullable', np.int32, (), ScalarCodec(ShortType()), False),
        ])

        # insert_explicit_nulls should leave the dictionary as is.
        row_dict = {'nullable': 0, 'not_nullable': 1}
        insert_explicit_nulls(TestSchema, row_dict)
        self.assertEqual(len(row_dict), 2)
        self.assertEqual(row_dict['nullable'], 0)
        self.assertEqual(row_dict['not_nullable'], 1)

        # insert_explicit_nulls should leave the dictionary as is.
        row_dict = {'nullable': None, 'not_nullable': 1}
        insert_explicit_nulls(TestSchema, row_dict)
        self.assertEqual(len(row_dict), 2)
        self.assertEqual(row_dict['nullable'], None)
        self.assertEqual(row_dict['not_nullable'], 1)

        # We are missing a nullable field here. insert_explicit_nulls should add a None entry.
        row_dict = {'not_nullable': 1}
        insert_explicit_nulls(TestSchema, row_dict)
        self.assertEqual(len(row_dict), 2)
        self.assertEqual(row_dict['nullable'], None)
        self.assertEqual(row_dict['not_nullable'], 1)

        # We are missing a not_nullable field here. Should raise a ValueError.
        row_dict = {'nullable': 0}
        with self.assertRaises(ValueError):
            insert_explicit_nulls(TestSchema, row_dict)
Example #11
def test_primitives():
    assert BooleanType() == parse_schema("bool")
    assert BooleanType() == parse_schema("boolean")

    assert ByteType() == parse_schema("byte")
    assert ByteType() == parse_schema("tinyint")

    assert ShortType() == parse_schema("short")
    assert ShortType() == parse_schema("smallint")

    assert IntegerType() == parse_schema("int")
    assert FloatType() == parse_schema("float")
    assert DoubleType() == parse_schema("double")

    assert StringType() == parse_schema("string")
    assert BinaryType() == parse_schema("binary")
def test_insert_explicit_nulls():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('nullable', np.int32, (), ScalarCodec(StringType()), True),
        UnischemaField('not_nullable', np.int32, (), ScalarCodec(ShortType()), False),
    ])

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': 0, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] == 0
    assert row_dict['not_nullable'] == 1

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': None, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] is None
    assert row_dict['not_nullable'] == 1

    # We are missing a nullable field here. insert_explicit_nulls should add a None entry.
    row_dict = {'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] is None
    assert row_dict['not_nullable'] == 1

    # We are missing a not_nullable field here. Should raise a ValueError.
    row_dict = {'nullable': 0}
    with pytest.raises(ValueError):
        insert_explicit_nulls(TestSchema, row_dict)
Example #13
def read_song_data(spark, song_files):
    """Read song data to spark DataFrame.
    Args:
        spark (obj): SparkSession
        song_files (list of str): list of full paths to song files on S3
    Returns:
         song_df (obj): DataFrame with song data
    """
    song_df_schema = StructType([
        StructField('artist_id', StringType()),
        StructField('artist_latitude', FloatType()),
        StructField('artist_longitude', FloatType()),
        StructField('artist_location', StringType()),
        StructField('artist_name', StringType()),
        StructField('song_id', StringType()),
        StructField('title', StringType()),
        StructField('duration', FloatType()),
        StructField('year', ShortType())
    ])

    # read song data files to DataFrame
    song_df = spark.read.json(song_files,
                              schema=song_df_schema,
                              mode='DROPMALFORMED')
    return song_df
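A hedged sketch of calling the loader above; the SparkSession and the S3 path are placeholders, not from the source:

# Hypothetical file list; in practice these paths would be discovered on S3 first.
song_files = ['s3a://my-bucket/song_data/A/A/A/sample_song.json']
song_df = read_song_data(spark, song_files)
song_df.select('song_id', 'title', 'artist_name', 'year').show(5, truncate=False)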
Example #14
 def iff_schema(self):
     from pyspark.sql.types import (ShortType, StringType, StructType, StructField,
                                    LongType, IntegerType, DoubleType)
     myschema = StructType([
         StructField("recType", ShortType(), True),  # 1  //track point record type number
         StructField("recTime", StringType(), True),  # 2  //seconds since midnigght 1/1/70 UTC
         StructField("fltKey", LongType(), True),  # 3  //flight key
         StructField("bcnCode", IntegerType(), True),  # 4  //digit range from 0 to 7
         StructField("cid", IntegerType(), True),  # 5  //computer flight id
         StructField("Source", StringType(), True),  # 6  //source of the record
         StructField("msgType", StringType(), True),  # 7
         StructField("callsign", StringType(), True),  # 8  //call sign
         StructField("recTypeCat", IntegerType(), True),  # 9
         StructField("latitude", DoubleType(), True),  # 10
         StructField("longitude", DoubleType(), True),  # 11
         StructField("altitude", DoubleType(), True),  # 12  //in 100s of feet
         StructField("significance", ShortType(), True),  # 13 //digit range from 1 to 10
         StructField("latAcc", DoubleType(), True),  # 14
         StructField("lonAcc", DoubleType(), True),  # 15
         StructField("altAcc", DoubleType(), True),  # 16
         StructField("tas", IntegerType(), True),  # 17 //in knots
         StructField("heading", DoubleType(), True),  # 18  //in degrees from true north
         StructField("rateOfClimb", DoubleType(), True),  # 19  //in feet per minute
         StructField("altQualifier", StringType(), True),  # 20  //Altitude qualifier (the “B4 character”)
         StructField("altIndicator", StringType(), True),  # 21  //Altitude indicator (the “C4 character”)
         StructField("trackPtStatus", StringType(), True),  # 22  //Track point status (e.g., ‘C’ for coast)
         StructField("leaderDir", IntegerType(), True),  # 23  //int 0-8 representing the direction of the leader line
         StructField("scratchPad", StringType(), True),  # 24
         StructField("msawInhibitInd", ShortType(), True),  # 25 // MSAW Inhibit Indicator (0=not inhibited, 1=inhibited)
         StructField("assignedAltString", StringType(), True),  # 26
         StructField("controllingFac", StringType(), True),  # 27
         StructField("controllingSec", StringType(), True),  # 28
         StructField("receivingFac", StringType(), True),  # 29
         StructField("receivingSec", StringType(), True),  # 30
         StructField("activeContr", IntegerType(), True),  # 31  // the active control number
         StructField("primaryContr", IntegerType(), True),
         # 32  //The primary (previous, controlling, or possible next) controller number
         StructField("kybrdSubset", StringType(), True),  # 33  //identifies a subset of controller keyboards
         StructField("kybrdSymbol", StringType(), True),  # 34  //identifies a keyboard within the keyboard subsets
         StructField("adsCode", IntegerType(), True),  # 35  //arrival departure status code
         StructField("opsType", StringType(), True),  # 36  //Operations type (O/E/A/D/I/U)from ARTS and ARTS 3A data
         StructField("airportCode", StringType(), True),  # 37
         StructField("trackNumber", IntegerType(), True),  # 38
         StructField("tptReturnType", StringType(), True),  # 39
         StructField("modeSCode", StringType(), True)  # 40
     ])
     return myschema
Example #15
def check_column_numeric(df, column):
    return df.schema[column].dataType in [
        IntegerType(),
        ShortType(),
        LongType(),
        FloatType(),
        DecimalType(),
        DoubleType()
    ]
Example #16
    def from_arrow_schema(cls, parquet_dataset):
        """
        Convert an apache arrow schema into a unischema object. This is useful for datasets of only scalars
        which need no special encoding/decoding. If there is an unsupported type in the arrow schema, it will
        throw an exception.

        :param parquet_dataset: :class:`pyarrow.parquet.ParquetDataset` whose Arrow schema and partition names are inspected
        :return: A :class:`Unischema` object.
        """
        meta = parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open)
        arrow_schema = meta.schema.to_arrow_schema()
        unischema_fields = []

        for partition_name in parquet_dataset.partitions.partition_names:
            unischema_fields.append(UnischemaField(partition_name, np.str_, (), ScalarCodec(StringType()), False))

        for column_name in arrow_schema.names:
            arrow_field = arrow_schema.field_by_name(column_name)
            field_type = arrow_field.type
            if field_type == pyarrow.int8():
                np_type = np.int8
                codec = ScalarCodec(ByteType())
            elif field_type == pyarrow.int16():
                np_type = np.int16
                codec = ScalarCodec(ShortType())
            elif field_type == pyarrow.int32():
                np_type = np.int32
                codec = ScalarCodec(IntegerType())
            elif field_type == pyarrow.int64():
                np_type = np.int64
                codec = ScalarCodec(LongType())
            elif field_type == pyarrow.string():
                np_type = np.unicode_
                codec = ScalarCodec(StringType())
            elif field_type == pyarrow.bool_():
                np_type = np.bool_
                codec = ScalarCodec(BooleanType())
            elif field_type == pyarrow.float32():
                np_type = np.float32
                codec = ScalarCodec(FloatType())
            elif field_type == pyarrow.float64():
                np_type = np.float64
                codec = ScalarCodec(DoubleType())
            elif isinstance(field_type, pyarrow.lib.Decimal128Type):
                np_type = Decimal
                codec = ScalarCodec(DecimalType(field_type.precision, field_type.scale))
            elif field_type == pyarrow.binary():
                np_type = np.string_
                codec = ScalarCodec(StringType())
            elif isinstance(field_type, pyarrow.lib.FixedSizeBinaryType):
                np_type = np.string_
                codec = ScalarCodec(StringType())
            else:
                raise ValueError('Cannot auto-create unischema due to unsupported column type {}'.format(field_type))

            unischema_fields.append(UnischemaField(column_name, np_type, (), codec, arrow_field.nullable))
        return Unischema('inferred_schema', unischema_fields)
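A sketch of how this classmethod might be invoked, assuming it lives on Unischema (as in petastorm) and that a Parquet dataset of supported scalar columns already exists at the path below:

import pyarrow.parquet as pq

# Hypothetical path; the dataset may only contain the scalar column types handled above.
parquet_dataset = pq.ParquetDataset('/tmp/scalar_dataset')
inferred_schema = Unischema.from_arrow_schema(parquet_dataset)
print(list(inferred_schema.fields.keys()))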
def test_primitive(test_ctx):
    schema = StructType([
        StructField("bool_col", BooleanType(), False),
        StructField("float_col", FloatType(), False),
        StructField("double_col", DoubleType(), False),
        StructField("short_col", ShortType(), False),
        StructField("int_col", IntegerType(), False),
        StructField("long_col", LongType(), False),
        StructField("str_col", StringType(), False),
        StructField("bin_col", BinaryType(), False),
        StructField("byte_col", ByteType(), False),
    ])
    df = test_ctx.spark.createDataFrame(
        [(True, 0.12, 432.1, 5, 5, 0, "hello", bytearray(b"spark\x01\x02"),
          -128),
         (False, 123.45, 0.987, 9, 908, 765, "petastorm",
          bytearray(b"\x0012345"), 127)],
        schema=schema).coalesce(1)
    # If we use numPartition > 1, the order of the loaded dataset would
    # be non-deterministic.
    expected_df = df.collect()

    converter = make_spark_converter(df)
    with converter.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
            # TODO: we will improve the test once the batch_size argument
            # is added.
            # Now we only have one batch.
        for i in range(converter.dataset_size):
            for col in df.schema.names:
                actual_ele = getattr(ts, col)[i]
                expected_ele = expected_df[i][col]
                if col == "str_col":
                    actual_ele = actual_ele.decode()
                if col == "bin_col":
                    actual_ele = bytearray(actual_ele)
                if col == "float_col" or col == "double_col":
                    # Note that the default dtype is float32
                    assert pytest.approx(expected_ele, rel=1e-6) == actual_ele
                else:
                    assert expected_ele == actual_ele

        assert len(expected_df) == len(converter)

    assert np.bool_ == ts.bool_col.dtype.type
    assert np.float32 == ts.float_col.dtype.type
    # Default dtype float32
    assert np.float32 == ts.double_col.dtype.type
    assert np.int16 == ts.short_col.dtype.type
    assert np.int32 == ts.int_col.dtype.type
    assert np.int64 == ts.long_col.dtype.type
    assert np.object_ == ts.str_col.dtype.type
    assert np.object_ == ts.bin_col.dtype.type
Example #18
def enrich_puzzle_data(puzzles: DataFrame) -> DataFrame:
    """Merge puzzle data with corresponding puzzles.

    :param puzzles: PySpark DataFrame containing un-enriched puzzle data.
    :return: PySpark DataFrame containing metadata-enriched puzzles.
    """
    data_folder = Path('resources')
    meta = data_folder / 'puzzle_meta.json'
    f_enrichment = open(meta)
    enrichment_data = json.load(f_enrichment)

    enrichment_schema = StructType([
        StructField('puzzle_no', ShortType(), False),
        StructField('final_jumble_groupings', ArrayType(ShortType()), False)
    ])

    enriched_puzzles = spark.createDataFrame(enrichment_data['puzzles'],
                                             enrichment_schema)
    return puzzles.join(other=enriched_puzzles, how='left', on='puzzle_no')
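A hedged sketch of feeding the enrichment step, assuming a puzzles DataFrame that already carries a puzzle_no column (as produced by load_puzzles further below) and that resources/puzzle_meta.json exists:

# Hypothetical inspection of the enriched result.
enriched = enrich_puzzle_data(puzzles)
enriched.select('puzzle_no', 'final_jumble_groupings').show(truncate=False)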
def test_assert_frame_equal(spark):

    expected = create_dataframe({
        'Name': ['Tom', 'Charlie'],
        'Age': [25, 24]
    },
                                spark=spark)
    result = create_dataframe({
        'Name': ['Tom', 'Charlie'],
        'Age': [25, 24]
    },
                              spark=spark)

    assert_dataframe_equal(expected, result)

    result = create_dataframe({
        'Name': ['Tom', 'Charlie'],
        'Age': [25, 23]
    },
                              spark=spark)

    with pytest.raises(AssertionError):
        assert_dataframe_equal(expected, result)

    result = create_dataframe({'Name': ['Tom'], 'Age': [25]}, spark=spark)

    with pytest.raises(AssertionError):
        assert_dataframe_equal(expected, result)

    new_schema = StructType([
        StructField('Name', StringType(), True),
        StructField('Age', ShortType(), True),
    ])
    result = create_dataframe({
        'Name': ['Tom', 'Charlie'],
        'Age': [25, 24]
    },
                              schema=new_schema,
                              spark=spark)

    with pytest.raises(AssertionError):
        assert_dataframe_equal(expected, result)

    assert_dataframe_equal(expected, result, check_schema=False)

    result = create_dataframe({
        'Name': ['Charlie', 'Tom'],
        'Age': [24, 25]
    },
                              spark=spark)

    with pytest.raises(AssertionError):
        assert_dataframe_equal(expected, result)

    assert_dataframe_equal(expected, result, check_order=False)
    def schema(self):
        '''
            Return the data type that represents a row from the received data list.
        '''
        from pyspark.sql.types import (FloatType, IntegerType, LongType,
                                       ShortType, StringType, StructField,
                                       StructType)

        return StructType([
            StructField('treceived', StringType(), True),
            StructField('unix_tstamp', LongType(), True),
            StructField('tryear', IntegerType(), True),
            StructField('trmonth', IntegerType(), True),
            StructField('trday', IntegerType(), True),
            StructField('trhour', IntegerType(), True),
            StructField('trminute', IntegerType(), True),
            StructField('trsecond', IntegerType(), True),
            StructField('tdur', FloatType(), True),
            StructField('sip', StringType(), True),
            StructField('dip', StringType(), True),
            StructField('sport', IntegerType(), True),
            StructField('dport', IntegerType(), True),
            StructField('proto', StringType(), True),
            StructField('flag', StringType(), True),
            StructField('fwd', IntegerType(), True),
            StructField('stos', IntegerType(), True),
            StructField('ipkt', LongType(), True),
            StructField('ibyt', LongType(), True),
            StructField('opkt', LongType(), True),
            StructField('obyt', LongType(), True),
            StructField('input', IntegerType(), True),
            StructField('output', IntegerType(), True),
            StructField('sas', IntegerType(), True),
            StructField('das', IntegerType(), True),
            StructField('dtos', IntegerType(), True),
            StructField('dir', IntegerType(), True),
            StructField('rip', StringType(), True),
            StructField('y', ShortType(), True),
            StructField('m', ShortType(), True),
            StructField('d', ShortType(), True),
            StructField('h', ShortType(), True)
        ])
Example #21
def load_puzzles() -> DataFrame:
    """Automated puzzle loading.
    :return: PySpark DataFrame of puzzles and information about them.
    """
    data_folder = Path('resources')
    puzzle_file = data_folder / 'puzzles.json'
    f_puzzle = open(puzzle_file)
    puzzle_data = json.load(f_puzzle)
    sort_word = udf(lambda j: ''.join(sorted(j)))

    puzzle_schema = StructType([
        StructField('puzzle_no', ShortType(), False),
        StructField('scrambled_word', StringType(), False),
        StructField('key_indices', ArrayType(ShortType()), False)
    ])

    puzzles = spark.createDataFrame(puzzle_data['jumbles'], puzzle_schema) \
        .withColumn('sorted_word', sort_word(col('scrambled_word')))

    return enrich_puzzle_data(puzzles)
Example #22
def struct_type():
    '''
        Return the data type that represents a row from the received data list.
    '''
    from pyspark.sql.types import (StructType, StructField, StringType,
                                   ShortType, IntegerType, FloatType)

    return StructType([
        StructField('data', StringType(), True),
        StructField('event_id', IntegerType(), True),
        StructField('event_second', IntegerType(), True),
        StructField('length', IntegerType(), True),
        StructField('linktype', ShortType(), True),
        StructField('sensor_id', IntegerType(), True),
        StructField('unix_tstamp', FloatType(), True),
        StructField('y', ShortType(), True),
        StructField('m', ShortType(), True),
        StructField('d', ShortType(), True),
        StructField('h', ShortType(), True)
    ])
Example #23
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
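A few illustrative checks that follow the branches above (a sketch, not part of the source):

import pyarrow as pa
from pyspark.sql.types import ArrayType, IntegerType, ShortType, StructField, StructType

# int16 maps to ShortType, a list of int32 to ArrayType(IntegerType()),
# and a struct is converted field by field.
assert from_arrow_type(pa.int16()) == ShortType()
assert from_arrow_type(pa.list_(pa.int32())) == ArrayType(IntegerType())
assert from_arrow_type(pa.struct([pa.field('a', pa.int16())])) == \
    StructType([StructField('a', ShortType(), True)])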
Example #24
    def test_as_spark_type_extension_dtypes(self):
        from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype

        type_mapper = {
            Int8Dtype(): ByteType(),
            Int16Dtype(): ShortType(),
            Int32Dtype(): IntegerType(),
            Int64Dtype(): LongType(),
        }

        for extension_dtype, spark_type in type_mapper.items():
            self.assertEqual(as_spark_type(extension_dtype), spark_type)
Example #25
 @classmethod
 def sqlType(cls):
     """
     Mirrors `schema` in scala companion object org.apache.spark.sql.rf.TileUDT
     """
     return StructType([
         StructField(
             "cell_context",
             StructType([
                 StructField(
                     "cellType",
                     StructType(
                         [StructField("cellTypeName", StringType(),
                                      False)]), False),
                 StructField(
                     "dimensions",
                     StructType([
                         StructField("cols", ShortType(), False),
                         StructField("rows", ShortType(), False)
                     ]), False),
             ]), False),
         StructField(
             "cell_data",
             StructType([
                 StructField("cells", BinaryType(), True),
                 StructField(
                     "ref",
                     StructType([
                         StructField("source", RasterSourceUDT(), False),
                         StructField("bandIndex", IntegerType(), False),
                         StructField(
                             "subextent",
                             StructType([
                                 StructField("xmin", DoubleType(), False),
                                 StructField("ymin", DoubleType(), False),
                                 StructField("xmax", DoubleType(), False),
                                 StructField("ymax", DoubleType(), False)
                             ]), True)
                     ]), True)
             ]), False)
     ])
Example #26
def struct_type():
    '''
        Return the data type that represents a row from the received data list.
    '''
    from pyspark.sql.types import (StructType, StructField, StringType, ShortType,
                                    IntegerType, LongType, FloatType)

    return StructType([
        StructField('blocked', ShortType(), True),
        StructField('classification', StringType(), True),
        StructField('classification_id', IntegerType(), True),
        StructField('destination_ip', StringType(), True),
        StructField('dport_icode', IntegerType(), True),
        StructField('event_id', IntegerType(), True),
        StructField('generator_id', IntegerType(), True),
        StructField('impact', IntegerType(), True),
        StructField('impact_flag', ShortType(), True),
        StructField('priority', IntegerType(), True),
        StructField('protocol', IntegerType(), True),
        StructField('sensor_id', IntegerType(), True),
        StructField('signature_id', LongType(), True),
        StructField('signature_revision', IntegerType(), True),
        StructField('sport_itype', LongType(), True),
        StructField('source_ip', StringType(), True),
        StructField('vlan_id', IntegerType(), True),
        StructField('unix_tstamp', FloatType(), True),
        StructField('y', ShortType(), True),
        StructField('m', ShortType(), True),
        StructField('d', ShortType(), True),
        StructField('h', ShortType(), True)
    ])
Example #27
def _numpy_and_codec_from_arrow_type(field_type):
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
        codec = ScalarCodec(ByteType())
    elif types.is_int16(field_type):
        np_type = np.int16
        codec = ScalarCodec(ShortType())
    elif types.is_int32(field_type):
        np_type = np.int32
        codec = ScalarCodec(IntegerType())
    elif types.is_int64(field_type):
        np_type = np.int64
        codec = ScalarCodec(LongType())
    elif types.is_string(field_type):
        np_type = np.unicode_
        codec = ScalarCodec(StringType())
    elif types.is_boolean(field_type):
        np_type = np.bool_
        codec = ScalarCodec(BooleanType())
    elif types.is_float32(field_type):
        np_type = np.float32
        codec = ScalarCodec(FloatType())
    elif types.is_float64(field_type):
        np_type = np.float64
        codec = ScalarCodec(DoubleType())
    elif types.is_decimal(field_type):
        np_type = Decimal
        codec = ScalarCodec(DecimalType(field_type.precision,
                                        field_type.scale))
    elif types.is_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(DateType())
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(TimestampType())
    elif types.is_list(field_type):
        _, np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
        codec = None
    else:
        raise ValueError(
            'Cannot auto-create unischema due to unsupported column type {}'.
            format(field_type))
    return codec, np_type
Example #28
def test_invalid_schema_field(synthetic_dataset):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. We would expect to get a reasonable error message.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(), shuffle_options=ShuffleOptions(False),
               predicate=EqualPredicate(expected_values), reader_pool=ThreadPool(1))

    assert 'bogus_key' in str(e)
Example #29
def map_precision_col_df(df: pyspark.sql.dataframe.DataFrame,
                         col_name: str) -> pyspark.sql.types.DataType:
    if isinstance(df.schema[col_name].dataType, DecimalType):
        data_precision = df.schema[col_name].dataType.precision
        if data_precision <= 2:
            return ByteType()
        elif 2 < data_precision <= 4:
            return ShortType()
        elif 4 < data_precision <= 9:
            return IntegerType()
        else:
            return LongType()

    return StringType()
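A worked example of the precision mapping; the column name and data are hypothetical, and an active SparkSession named spark is assumed:

from decimal import Decimal
from pyspark.sql.types import ShortType

# DecimalType(3, 0) has precision 3, which falls into the 2 < precision <= 4
# branch above and therefore maps to ShortType().
df = spark.createDataFrame([(Decimal('123'),)], 'amount decimal(3,0)')
assert map_precision_col_df(df, 'amount') == ShortType()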
def make_songplay_data(d_artist_df, d_song_df, event_df):
    """
    Create the songplay fact dataframe.

    Parameters:
    d_artist_df (DataFrame): The artist dimension dataframe.
    d_song_df (DataFrame): The song dimension dataframe.
    event_df (DataFrame): The raw song play event dataframe.

    Returns:
    f_songplay_df (DataFrame): A songplay fact dataframe.
    """
    print('\nmake_songplay_data...')

    tmp_df = d_song_df.withColumnRenamed('artist_id', 'song_artist_id')
    tmp_df = tmp_df.join(d_artist_df, d_artist_df.artist_id == tmp_df.song_artist_id) \
        .select('song_id', 'title', 'duration', 'artist_id', 'artist_name')

    comparison = [
        event_df.song == tmp_df.title,
        event_df.length.cast(ShortType()) == tmp_df.duration.cast(ShortType())
    ]

    # extract columns from joined song and log datasets to create songplays table
    # create a hash of timestamp, userId and song as a unique songplay ID
    # year and month columns exist for partitioning the parquet files
    f_songplay_df = event_df.withColumn('songplay_id', F.sha1(F.concat_ws('|', 'timestamp', 'userId', 'song'))) \
        .withColumn('year', F.year('timestamp')) \
        .withColumn('month', F.month('timestamp')) \
        .join(tmp_df, comparison, 'left') \
        .select(['songplay_id', 'start_time', 'year', 'month', 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent'])
    print('songplay fact record count:', f_songplay_df.count())

    not_null_count = f_songplay_df.filter(F.col('song_id').isNotNull()).count()
    print('songplay fact records with song_id value:', not_null_count)

    return f_songplay_df
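A hedged sketch of the downstream write step; the output location and write mode are assumptions, not from the source:

# Hypothetical: persist the fact table partitioned by the year/month columns
# that make_songplay_data adds for this purpose.
f_songplay_df = make_songplay_data(d_artist_df, d_song_df, event_df)
f_songplay_df.write.mode('overwrite') \
    .partitionBy('year', 'month') \
    .parquet('s3a://my-bucket/analytics/songplays/')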