Example #1
    def setUpClass(cls):
        from datetime import date, datetime
        from decimal import Decimal
        super(ArrowTests, cls).setUpClass()
        cls.warnings_lock = threading.Lock()

        # Synchronize default timezone between Python and Java
        cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
        tz = "America/Los_Angeles"
        os.environ["TZ"] = tz
        time.tzset()

        cls.spark.conf.set("spark.sql.session.timeZone", tz)

        # Test fallback
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "false"
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "true"

        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

        # Enable Arrow optimization in these tests.
        cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        # Disable fallback by default to easily detect the failures.
        cls.spark.conf.set(
            "spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

        cls.schema_wo_null = StructType([
            StructField("1_str_t", StringType(), True),
            StructField("2_int_t", IntegerType(), True),
            StructField("3_long_t", LongType(), True),
            StructField("4_float_t", FloatType(), True),
            StructField("5_double_t", DoubleType(), True),
            StructField("6_decimal_t", DecimalType(38, 18), True),
            StructField("7_date_t", DateType(), True),
            StructField("8_timestamp_t", TimestampType(), True),
            StructField("9_binary_t", BinaryType(), True)
        ])
        cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
        cls.data_wo_null = [
            (u"a", 1, 10, 0.2, 2.0, Decimal("2.0"), date(1969, 1, 1),
             datetime(1969, 1, 1, 1, 1, 1), bytearray(b"a")),
            (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"), date(2012, 2, 2),
             datetime(2012, 2, 2, 2, 2, 2), bytearray(b"bb")),
            (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"), date(2100, 3, 3),
             datetime(2100, 3, 3, 3, 3, 3), bytearray(b"ccc")),
            (u"d", 4, 40, 1.0, 8.0, Decimal("8.0"), date(2262, 4, 12),
             datetime(2262, 3, 3, 3, 3, 3), bytearray(b"dddd")),
        ]
        cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
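
    # A hypothetical follow-up test sketching how the fixtures above are consumed;
    # not part of the original snippet, and it assumes the test base class exposes
    # self.spark just as setUpClass uses cls.spark.
    def test_round_trip_sketch(self):
        df = self.spark.createDataFrame(self.data, schema=self.schema)
        pdf = df.toPandas()  # goes through Arrow because the flag was enabled in setUpClass
        self.assertEqual(len(pdf), len(self.data))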
Example #2
    def test_data_type_ops(self):
        _mock_spark_type = DataType()
        _mock_dtype = ExtensionDtype()
        _mappings = (
            (CategoricalDtype(), _mock_spark_type, CategoricalOps),
            (_mock_dtype, DecimalType(), DecimalOps),
            (_mock_dtype, FractionalType(), FractionalOps),
            (_mock_dtype, IntegralType(), IntegralOps),
            (_mock_dtype, StringType(), StringOps),
            (_mock_dtype, BooleanType(), BooleanOps),
            (_mock_dtype, TimestampType(), DatetimeOps),
            (_mock_dtype, TimestampNTZType(), DatetimeNTZOps),
            (_mock_dtype, DateType(), DateOps),
            (_mock_dtype, DayTimeIntervalType(), TimedeltaOps),
            (_mock_dtype, BinaryType(), BinaryOps),
            (_mock_dtype, ArrayType(StringType()), ArrayOps),
            (_mock_dtype, MapType(StringType(), IntegralType()), MapOps),
            (_mock_dtype, StructType(), StructOps),
            (_mock_dtype, NullType(), NullOps),
            (_mock_dtype, UserDefinedType(), UDTOps),
        )
        for _dtype, _spark_type, _ops in _mappings:
            self.assertIsInstance(DataTypeOps(_dtype, _spark_type), _ops)

        _unknown_spark_type = _mock_spark_type
        self.assertRaises(TypeError, DataTypeOps, BooleanType(),
                          _unknown_spark_type)
Example #3
def makeSchema(columns):
    struct_field_map = {'string': StringType(),
                        'date': TimestampType(),
                        'double': DoubleType(),
                        'int': IntegerType(),
                        'none': NullType()}
    fields = [StructField(k, struct_field_map[v], True) for k, v in columns]

    return StructType(fields)
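
# A minimal usage sketch of makeSchema; the column list below is invented for
# illustration. columns is an iterable of (name, type_key) pairs, and unknown
# type keys raise KeyError.
schema = makeSchema([('name', 'string'), ('created', 'date'),
                     ('amount', 'double'), ('missing', 'none')])
print(schema.simpleString())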
Example #4
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
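
# A minimal sketch of driving from_arrow_type with a few pyarrow types; assumes
# pyarrow is installed, and the printed form varies across PySpark versions.
import pyarrow as pa

print(from_arrow_type(pa.int64()))             # -> LongType
print(from_arrow_type(pa.list_(pa.string())))  # -> ArrayType(StringType, ...)
print(from_arrow_type(pa.null()))              # -> NullType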
Example #5
    def test_unsupported_types(self):
        common_err_msg = 'Invalid return type.*grouped map Pandas UDF.*'
        unsupported_types = [
            StructField('arr_ts', ArrayType(TimestampType())),
            StructField('null', NullType()),
            StructField('struct', StructType([StructField('l', LongType())])),
        ]

        for unsupported_type in unsupported_types:
            schema = StructType([StructField('id', LongType(), True), unsupported_type])
            with QuietTest(self.sc):
                with self.assertRaisesRegex(NotImplementedError, common_err_msg):
                    pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP)
Example #6
    def test_get_col_info(self):
        with spark_session('test_get_col_info') as spark:
            data = [[
                0,
                0.0,
                None,
                [1, 1],
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                DenseVector([1.0, 1.0])
            ], [
                1,
                None,
                None,
                [1, 1],
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                SparseVector(2, {1: 1.0})
            ]]

            schema = StructType([
                StructField('int', IntegerType()),
                StructField('float', FloatType()),
                StructField('null', NullType()),
                StructField('array', ArrayType(IntegerType())),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])

            df = create_test_data_from_schema(spark, data, schema)
            all_col_types, col_shapes, col_max_sizes = util._get_col_info(df)

            expected = [
                ('int', {int}, 1, 1),
                ('float', {float, NullType}, 1, 1),
                ('null', {NullType}, 1, 1),
                ('array', {list}, 2, 2),
                ('dense', {DenseVector}, 2, 2),
                ('sparse', {SparseVector}, 2, 1),
                ('mixed', {DenseVector, SparseVector}, 2, 2)
            ]

            for expected_col_info in expected:
                col_name, col_types, col_shape, col_size = expected_col_info
                assert all_col_types[col_name] == col_types, col_name
                assert col_shapes[col_name] == col_shape, col_name
                assert col_max_sizes[col_name] == col_size, col_name
Example #7
    def test_toPandas_empty_df_arrow_enabled(self):
        # SPARK-30537 test that toPandas() on an empty dataframe has the correct dtypes
        # when arrow is enabled
        import datetime
        from datetime import date
        from decimal import Decimal

        schema = StructType([
            StructField("a", StringType(), True),
            StructField("a", IntegerType(), True),
            StructField("c", TimestampType(), True),
            StructField("d", NullType(), True),
            StructField("e", LongType(), True),
            StructField("f", FloatType(), True),
            StructField("g", DateType(), True),
            StructField("h", BinaryType(), True),
            StructField("i", DecimalType(38, 18), True),
            StructField("k", TimestampNTZType(), True),
            StructField("L", DayTimeIntervalType(0, 3), True),
        ])
        df = self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(),
                                        schema=schema)
        non_empty_df = self.spark.createDataFrame(
            [(
                "a",
                1,
                datetime.datetime(1969, 1, 1, 1, 1, 1),
                None,
                10,
                0.2,
                date(1969, 1, 1),
                bytearray(b"a"),
                Decimal("2.0"),
                datetime.datetime(1969, 1, 1, 1, 1, 1),
                datetime.timedelta(microseconds=123),
            )],
            schema=schema,
        )

        pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
        pdf_non_empty, pdf_arrow_non_empty = self._toPandas_arrow_toggle(
            non_empty_df)
        assert_frame_equal(pdf, pdf_arrow)
        self.assertTrue(pdf_arrow.dtypes.equals(pdf_arrow_non_empty.dtypes))
        self.assertTrue(pdf_arrow.dtypes.equals(pdf_non_empty.dtypes))
Example #8
package_id_product_id_map = entity_package_data_frame.dropna(
    subset=['PACKAGE_FDC_ID', 'ESD_PRODUCT_FDC_ID']).set_index(
        'PACKAGE_FDC_ID')['ESD_PRODUCT_FDC_ID'].to_dict()

# In[ ]:

# package_id_product_id_map

# In[ ]:

from pyspark.sql.types import NullType
audit_data_frame = audit_data_frame.dropna(subset=['PACKAGE_FDC_ID'])
# Return None (becomes SQL NULL) for unmapped ids; NullType is a schema type, not a value
get_product_id = spark_session.udf.register(
    'get_product_id',
    lambda package_id: package_id_product_id_map.get(package_id))
audit_data_frame = audit_data_frame.withColumn(
    'PRODUCT_FDC_ID', get_product_id('PACKAGE_FDC_ID'))
audit_data_frame = audit_data_frame.dropna(subset=['PRODUCT_FDC_ID'])
audit_data_frame.head(1)

# In[ ]:

audit_data_frame.count()

# In[ ]:

from pyspark.ml.feature import StringIndexer
str_indexer = StringIndexer(inputCol='PRODUCT_FDC_ID', outputCol='label')
audit_data_frame = str_indexer.fit(audit_data_frame).transform(
    audit_data_frame)
Example #9
    def test_supported_types(self):

        values = [
            1, 2, 3, 4, 5, 1.1, 2.2,
            Decimal(1.123), [1, 2, 2], True, 'hello',
            bytearray([0x01, 0x02]), None
        ]
        output_fields = [('id', IntegerType()), ('byte', ByteType()),
                         ('short', ShortType()), ('int', IntegerType()),
                         ('long', LongType()), ('float', FloatType()),
                         ('double', DoubleType()),
                         ('decim', DecimalType(10, 3)),
                         ('array', ArrayType(IntegerType())),
                         ('bool', BooleanType()), ('str', StringType()),
                         ('bin', BinaryType()), ('null', NullType())]

        output_schema = StructType([StructField(*x) for x in output_fields])
        df = self.spark.createDataFrame([values], schema=output_schema)

        # Different forms of group map pandas UDF, results of these are the same
        udf1 = pandas_udf(
            lambda pdf: pdf.assign(byte=pdf.byte * 2,
                                   short=pdf.short * 2,
                                   int=pdf.int * 2,
                                   long=pdf.long * 2,
                                   float=pdf.float * 2,
                                   double=pdf.double * 2,
                                   decim=pdf.decim * 2,
                                   bool=False if pdf.bool else True,
                                   str=pdf.str + 'there',
                                   array=pdf.array,
                                   bin=pdf.bin,
                                   null=pdf.null), output_schema,
            PandasUDFType.GROUPED_MAP)

        udf2 = pandas_udf(
            lambda _, pdf: pdf.assign(byte=pdf.byte * 2,
                                      short=pdf.short * 2,
                                      int=pdf.int * 2,
                                      long=pdf.long * 2,
                                      float=pdf.float * 2,
                                      double=pdf.double * 2,
                                      decim=pdf.decim * 2,
                                      bool=False if pdf.bool else True,
                                      str=pdf.str + 'there',
                                      array=pdf.array,
                                      bin=pdf.bin,
                                      null=pdf.null), output_schema,
            PandasUDFType.GROUPED_MAP)

        udf3 = pandas_udf(
            lambda key, pdf: pdf.assign(id=key[0],
                                        byte=pdf.byte * 2,
                                        short=pdf.short * 2,
                                        int=pdf.int * 2,
                                        long=pdf.long * 2,
                                        float=pdf.float * 2,
                                        double=pdf.double * 2,
                                        decim=pdf.decim * 2,
                                        bool=False if pdf.bool else True,
                                        str=pdf.str + 'there',
                                        array=pdf.array,
                                        bin=pdf.bin,
                                        null=pdf.null), output_schema,
            PandasUDFType.GROUPED_MAP)

        result1 = df.groupby('id').apply(udf1).sort('id').toPandas()
        expected1 = df.toPandas().groupby('id').apply(
            udf1.func).reset_index(drop=True)

        result2 = df.groupby('id').apply(udf2).sort('id').toPandas()
        expected2 = expected1

        result3 = df.groupby('id').apply(udf3).sort('id').toPandas()
        expected3 = expected1

        assert_frame_equal(expected1, result1)
        assert_frame_equal(expected2, result2)
        assert_frame_equal(expected3, result3)
Example #10
                      "null": "null",
                      "vector": "vector",
                      "timestamp": "datetime"
                      }

SPARK_DTYPES_DICT = {"string": StringType, "int": IntegerType, "float": FloatType,
                     "double": DoubleType, "boolean": BooleanType, "struct": StructType, "array": ArrayType,
                     "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType,
                     "datetime": TimestampType, "binary": BinaryType, "null": NullType, "vector": VectorUDT
                     }

SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }
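
# A minimal lookup sketch, assuming the dictionaries above are imported from this
# constants module: a type-name string maps either to the Spark type class or to
# a ready-made instance.
assert SPARK_DTYPES_DICT["null"] is NullType
assert isinstance(SPARK_DTYPES_DICT_OBJECTS["null"], NullType)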

# Profiler
PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "null", "array", "binary"}

SPARK_DTYPES_TO_PROFILER = {"int": ["smallint", "tinyint", "bigint", "int"], "decimal": ["float", "double"],
                            "string": "string", "date": {"date", "timestamp"}, "boolean": "boolean", "binary": "binary",
                            "array": "array", "object": "object", "null": "null", "missing": "missing"}

from enum import Enum


class Actions(Enum):
    """
    Actions that modify a column.
    """
Example #11
    def test_supported_types(self):

        values = [
            1,
            2,
            3,
            4,
            5,
            1.1,
            2.2,
            Decimal(1.123),
            [1, 2, 2],
            True,
            "hello",
            bytearray([0x01, 0x02]),
            None,
        ]
        output_fields = [
            ("id", IntegerType()),
            ("byte", ByteType()),
            ("short", ShortType()),
            ("int", IntegerType()),
            ("long", LongType()),
            ("float", FloatType()),
            ("double", DoubleType()),
            ("decim", DecimalType(10, 3)),
            ("array", ArrayType(IntegerType())),
            ("bool", BooleanType()),
            ("str", StringType()),
            ("bin", BinaryType()),
            ("null", NullType()),
        ]

        output_schema = StructType([StructField(*x) for x in output_fields])
        df = self.spark.createDataFrame([values], schema=output_schema)

        # Different forms of group map pandas UDF, results of these are the same
        udf1 = pandas_udf(
            lambda pdf: pdf.assign(
                byte=pdf.byte * 2,
                short=pdf.short * 2,
                int=pdf.int * 2,
                long=pdf.long * 2,
                float=pdf.float * 2,
                double=pdf.double * 2,
                decim=pdf.decim * 2,
                bool=False if pdf.bool else True,
                str=pdf.str + "there",
                array=pdf.array,
                bin=pdf.bin,
                null=pdf.null,
            ),
            output_schema,
            PandasUDFType.GROUPED_MAP,
        )

        udf2 = pandas_udf(
            lambda _, pdf: pdf.assign(
                byte=pdf.byte * 2,
                short=pdf.short * 2,
                int=pdf.int * 2,
                long=pdf.long * 2,
                float=pdf.float * 2,
                double=pdf.double * 2,
                decim=pdf.decim * 2,
                bool=False if pdf.bool else True,
                str=pdf.str + "there",
                array=pdf.array,
                bin=pdf.bin,
                null=pdf.null,
            ),
            output_schema,
            PandasUDFType.GROUPED_MAP,
        )

        udf3 = pandas_udf(
            lambda key, pdf: pdf.assign(
                id=key[0],
                byte=pdf.byte * 2,
                short=pdf.short * 2,
                int=pdf.int * 2,
                long=pdf.long * 2,
                float=pdf.float * 2,
                double=pdf.double * 2,
                decim=pdf.decim * 2,
                bool=False if pdf.bool else True,
                str=pdf.str + "there",
                array=pdf.array,
                bin=pdf.bin,
                null=pdf.null,
            ),
            output_schema,
            PandasUDFType.GROUPED_MAP,
        )

        result1 = df.groupby("id").apply(udf1).sort("id").toPandas()
        expected1 = df.toPandas().groupby("id").apply(udf1.func).reset_index(drop=True)

        result2 = df.groupby("id").apply(udf2).sort("id").toPandas()
        expected2 = expected1

        result3 = df.groupby("id").apply(udf3).sort("id").toPandas()
        expected3 = expected1

        assert_frame_equal(expected1, result1)
        assert_frame_equal(expected2, result2)
        assert_frame_equal(expected3, result3)
Example #12
def sql_types_example(spark):
    
    # DataType
    dp = DataType()
    
    python_obj = dp.fromInternal(1)
    print(python_obj, type(python_obj))
    
    sql_obj = dp.toInternal(1)
    print(sql_obj, type(sql_obj))
    
    print(dp.json())
    print(dp.jsonValue())
    print(dp.needConversion())
    print(dp.simpleString())
    print(DataType.typeName())

    # NullType
    nt = NullType()
    
    python_obj = nt.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = nt.toInternal(1)
    print(sql_obj, type(sql_obj))
    
    print(nt.json())
    print(nt.jsonValue())
    print(nt.needConversion())
    print(nt.simpleString())
    print(NullType.typeName())

    # AtomicType
    at = AtomicType()
    
    python_obj = at.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = at.toInternal(1)
    print(sql_obj, type(sql_obj))
    
    print(at.json())
    print(at.jsonValue())
    print(at.needConversion())
    print(at.simpleString())
    print(AtomicType.typeName())

    # NumericType
    nt = NumericType()
    
    python_obj = nt.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = nt.toInternal(1)
    print(sql_obj, type(sql_obj))
    
    print(nt.json())
    print(nt.jsonValue())
    print(nt.needConversion())
    print(nt.simpleString())
    print(NumericType.typeName())

    # IntegralType
    it = IntegralType()
    
    python_obj = it.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = it.toInternal(1)
    print(sql_obj, type(sql_obj))
    
    print(it.json())
    print(it.jsonValue())
    print(it.needConversion())
    print(it.simpleString())
    print(IntegralType.typeName())

    # FractionalType
    ft = FractionalType()
    
    python_obj = ft.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = ft.toInternal(1)
    print(sql_obj, type(sql_obj))
    
    print(ft.json())
    print(ft.jsonValue())
    print(ft.needConversion())
    print(ft.simpleString())
    print(FractionalType.typeName())

    # StringType
    st = StringType()
    
    python_obj = st.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = st.toInternal(1)
    print(sql_obj, type(sql_obj))
    
    print(st.json())
    print(st.jsonValue())
    print(st.needConversion())
    print(st.simpleString())
    print(StringType.typeName())

    # BinaryType
    bt = BinaryType()
    
    python_obj = bt.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = bt.toInternal(1)
    print(sql_obj, type(sql_obj))
    
    print(bt.json())
    print(bt.jsonValue())
    print(bt.needConversion())
    print(bt.simpleString())
    print(BinaryType.typeName())

    # BooleanType
    bt = BooleanType()
    
    python_obj = bt.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = bt.toInternal(1)
    print(sql_obj, type(sql_obj))
    
    print(bt.json())
    print(bt.jsonValue())
    print(bt.needConversion())
    print(bt.simpleString())
    print(BooleanType.typeName())

    # DateType
    from datetime import datetime
    dt = DateType()
    
    python_obj = dt.fromInternal(1000)
    print(python_obj, type(python_obj))
    
    today = datetime.today()
    sql_obj = dt.toInternal(today)
    print(sql_obj, type(sql_obj))

    print(dt.json())
    print(dt.jsonValue())
    print(dt.needConversion())
    print(dt.simpleString())
    print(DateType.typeName())

    # TimestampType
    tt = TimestampType()
    
    python_obj = tt.fromInternal(365000000)
    print(python_obj, type(python_obj))

    today = datetime.today()
    sql_obj = tt.toInternal(today)
    print(sql_obj, type(sql_obj))
    
    print(tt.json())
    print(tt.jsonValue())
    print(tt.needConversion())
    print(tt.simpleString())
    print(TimestampType.typeName())

    # DecimalType
    dt = DecimalType()

    python_obj = dt.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = dt.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(dt.json())
    print(dt.jsonValue())
    print(dt.needConversion())
    print(dt.simpleString())
    print(DecimalType.typeName())

    # DoubleType
    dt = DoubleType()

    python_obj = dt.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = dt.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(dt.json())
    print(dt.jsonValue())
    print(dt.needConversion())
    print(dt.simpleString())
    print(DoubleType.typeName())

    # FloatType
    ft = FloatType()

    python_obj = ft.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = ft.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(ft.json())
    print(ft.jsonValue())
    print(ft.needConversion())
    print(ft.simpleString())
    print(FloatType.typeName())

    # ByteType
    bt = ByteType()

    python_obj = bt.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = bt.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(bt.json())
    print(bt.jsonValue())
    print(bt.needConversion())
    print(bt.simpleString())
    print(ByteType.typeName())

    # IntegerType
    it = IntegerType()
    
    python_obj = it.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = it.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(it.json())
    print(it.jsonValue())
    print(it.needConversion())
    print(it.simpleString())
    print(IntegerType.typeName())

    # LongType
    lt = LongType()

    python_obj = lt.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = lt.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(lt.json())
    print(lt.jsonValue())
    print(lt.needConversion())
    print(lt.simpleString())
    print(LongType.typeName())

    # ShortType
    st = ShortType()

    python_obj = st.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = st.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(st.json())
    print(st.jsonValue())
    print(st.needConversion())
    print(st.simpleString())
    print(ShortType.typeName())

    # ArrayType
    dt = DataType()
    at = ArrayType(dt)

    python_obj = at.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = at.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(at.json())
    print(at.jsonValue())
    print(at.needConversion())
    print(at.simpleString())
    print(ArrayType.typeName())
    print(ArrayType.fromJson({"containsNull": True, "elementType": "string"}))

    # MapType
    key_type = DataType()
    value_type = DataType()
    mt = MapType(key_type, value_type)
    
    python_obj = mt.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = mt.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(mt.json())
    print(mt.jsonValue())
    print(mt.needConversion())
    print(mt.simpleString())
    print(MapType.typeName())
    print(MapType.fromJson({"valueContainsNull": True, "keyType": "string", "valueType": "integer"}))

    # StructField
    dt = DataType()
    sf = StructField("first_struct", dt)

    python_obj = sf.fromInternal(1)
    print(python_obj, type(python_obj))

    sql_obj = sf.toInternal(1)
    print(sql_obj, type(sql_obj))

    print(sf.json())
    print(sf.jsonValue())
    print(sf.needConversion())
    print(sf.simpleString())
    print(StructField.fromJson({"metadata": None, "nullable": True, "name": "first_struct", "type": "string"}))

    # StructType
    string_type = StringType()
    st = StructType([StructField("first_struct", StringType()), StructField("second_struct", DataType())])
    print("------")
    print(st.names)
    print(st.fields)
    print(st._needConversion)
    print(st._needSerializeAnyField)

    python_obj = st.fromInternal(["first_struct", "second_struct"])
    print(python_obj, type(python_obj))

    sql_obj = st.toInternal(["first_struct", "second_struct"])
    print(sql_obj, type(sql_obj))

    print(st.json())
    print(st.jsonValue())
    print(st.needConversion())
    print(st.simpleString())
    print(st.fieldNames())
    fields = {
        "fields": [
            {"metadata": None, "nullable": True, "name": "first", "type": "string"},
            {"metadata": None, "nullable": True, "name": "second", "type": "integer"}
        ]
    }
    print(st.fromJson(fields))
    
    st.add(StructField("first_struct", StringType()))
    
    print("st.add success!")

    print("Finish running types module API")
Example #13
    def test_merge_type(self):
        self.assertEqual(_merge_type(LongType(), NullType()), LongType())
        self.assertEqual(_merge_type(NullType(), LongType()), LongType())

        self.assertEqual(_merge_type(LongType(), LongType()), LongType())

        self.assertEqual(
            _merge_type(ArrayType(LongType()), ArrayType(LongType())),
            ArrayType(LongType()))
        with self.assertRaisesRegexp(TypeError, 'element in array'):
            _merge_type(ArrayType(LongType()), ArrayType(DoubleType()))

        self.assertEqual(
            _merge_type(MapType(StringType(), LongType()),
                        MapType(StringType(), LongType())),
            MapType(StringType(), LongType()))
        with self.assertRaisesRegexp(TypeError, 'key of map'):
            _merge_type(MapType(StringType(), LongType()),
                        MapType(DoubleType(), LongType()))
        with self.assertRaisesRegexp(TypeError, 'value of map'):
            _merge_type(MapType(StringType(), LongType()),
                        MapType(StringType(), DoubleType()))

        self.assertEqual(
            _merge_type(
                StructType([
                    StructField("f1", LongType()),
                    StructField("f2", StringType())
                ]),
                StructType([
                    StructField("f1", LongType()),
                    StructField("f2", StringType())
                ])),
            StructType([
                StructField("f1", LongType()),
                StructField("f2", StringType())
            ]))
        with self.assertRaisesRegexp(TypeError, 'field f1'):
            _merge_type(
                StructType([
                    StructField("f1", LongType()),
                    StructField("f2", StringType())
                ]),
                StructType([
                    StructField("f1", DoubleType()),
                    StructField("f2", StringType())
                ]))

        self.assertEqual(
            _merge_type(
                StructType([
                    StructField("f1",
                                StructType([StructField("f2", LongType())]))
                ]),
                StructType([
                    StructField("f1",
                                StructType([StructField("f2", LongType())]))
                ])),
            StructType([
                StructField("f1", StructType([StructField("f2", LongType())]))
            ]))
        with self.assertRaisesRegexp(TypeError, 'field f2 in field f1'):
            _merge_type(
                StructType([
                    StructField("f1",
                                StructType([StructField("f2", LongType())]))
                ]),
                StructType([
                    StructField("f1",
                                StructType([StructField("f2", StringType())]))
                ]))

        self.assertEqual(
            _merge_type(
                StructType([
                    StructField("f1", ArrayType(LongType())),
                    StructField("f2", StringType())
                ]),
                StructType([
                    StructField("f1", ArrayType(LongType())),
                    StructField("f2", StringType())
                ])),
            StructType([
                StructField("f1", ArrayType(LongType())),
                StructField("f2", StringType())
            ]))
        with self.assertRaisesRegexp(TypeError, 'element in array field f1'):
            _merge_type(
                StructType([
                    StructField("f1", ArrayType(LongType())),
                    StructField("f2", StringType())
                ]),
                StructType([
                    StructField("f1", ArrayType(DoubleType())),
                    StructField("f2", StringType())
                ]))

        self.assertEqual(
            _merge_type(
                StructType([
                    StructField("f1", MapType(StringType(), LongType())),
                    StructField("f2", StringType())
                ]),
                StructType([
                    StructField("f1", MapType(StringType(), LongType())),
                    StructField("f2", StringType())
                ])),
            StructType([
                StructField("f1", MapType(StringType(), LongType())),
                StructField("f2", StringType())
            ]))
        with self.assertRaisesRegexp(TypeError, 'value of map field f1'):
            _merge_type(
                StructType([
                    StructField("f1", MapType(StringType(), LongType())),
                    StructField("f2", StringType())
                ]),
                StructType([
                    StructField("f1", MapType(StringType(), DoubleType())),
                    StructField("f2", StringType())
                ]))

        self.assertEqual(
            _merge_type(
                StructType([
                    StructField("f1",
                                ArrayType(MapType(StringType(), LongType())))
                ]),
                StructType([
                    StructField("f1",
                                ArrayType(MapType(StringType(), LongType())))
                ])),
            StructType([
                StructField("f1", ArrayType(MapType(StringType(), LongType())))
            ]))
        with self.assertRaisesRegexp(TypeError,
                                     'key of map element in array field f1'):
            _merge_type(
                StructType([
                    StructField("f1",
                                ArrayType(MapType(StringType(), LongType())))
                ]),
                StructType([
                    StructField("f1",
                                ArrayType(MapType(DoubleType(), LongType())))
                ]))
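
# For context, a minimal sketch of the schema inference these merges support;
# it assumes an active SparkSession bound to `spark`. A None value is typed as
# NullType for its row and merged with the LongType seen in the other row.
df = spark.createDataFrame([(None,), (1,)], ["x"])
df.printSchema()  # root |-- x: long (nullable = true)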
Example #14
    spark = SparkSession.builder.master("local[2]").appName(
        'link-prediction').getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    # Create spark context
    sc = spark.sparkContext

    # Create a dataframe from training_set and testing_set
    trainingRDD = sc.textFile("training_set.txt").map(
        lambda x: x.strip().split(' '))
    trainingDF = trainingRDD.toDF(['from_node_id', 'to_node_id',
                                   'label']).sample(False, 0.3, 10)
    predRDD = sc.textFile("testing_set.txt").map(
        lambda x: x.strip().split(' '))
    predDF = predRDD.toDF(['from_node_id', 'to_node_id'])
    predictDF = predDF.withColumn('label', lit(None).cast(NullType()))
    combinedDF = trainingDF.union(predictDF)

    print("Input DataFrame contains %d elements" % trainingDF.count())
    print("To predict DataFrame contains %d elements" % predictDF.count())
    print("The combined DataFrame contains %d elements" % combinedDF.count())

    # Create a dataframe for paper information (title, authors, abstract, etc)
    infoRDD = sc.textFile("node_information.csv")
    infoRDD = infoRDD.mapPartitions(lambda x: csv.reader(x))
    infoDF = infoRDD.toDF(
        ['node_id', 'year', 'title', 'authors', 'journal', 'abstract'])
    infoDF.printSchema()
    infoDF.show(5)
    input("Press enter ... ")
Example #15
    '_TIMESTAMP_TYPE',
    '_DATETIME_TYPES',

    '_DECIMAL_10_0_TYPE',
    '_DECIMAL_38_18_TYPE',
    '_DECIMAL_TYPE_PREFIX',

    '_ARRAY_TYPE_PREFIX',
    '_MAP_TYPE_PREFIX',
    '_STRUCT_TYPE_PREFIX',

    '_VECTOR_TYPE',
)


__null_type: NullType = NullType()
_NULL_TYPE: str = __null_type.simpleString()
assert _NULL_TYPE == __null_type.typeName()


__bool_type: BooleanType = BooleanType()
_BOOL_TYPE: str = __bool_type.simpleString()
assert _BOOL_TYPE == __bool_type.typeName()


__str_type: StringType = StringType()
_STR_TYPE: str = __str_type.simpleString()
assert _STR_TYPE == __str_type.typeName()


__binary_type: BinaryType = BinaryType()