Example #1
    def _inferSchema(self, rdd, samplingRatio=None):
        """
        Infer schema from an RDD of Row or tuple.

        :param rdd: an RDD of Row or tuple
        :param samplingRatio: sampling ratio, or no sampling (default)
        :return: StructType
        """
        first = rdd.first()
        if not first:
            raise ValueError("The first row in RDD is empty, "
                             "can not infer schema")
        if type(first) is dict:
            warnings.warn("Using RDD of dict to inferSchema is deprecated. "
                          "Use pyspark.sql.Row instead")

        if samplingRatio is None:
            schema = _infer_schema(first)
            if _has_nulltype(schema):
                for row in rdd.take(100)[1:]:
                    schema = _merge_type(schema, _infer_schema(row))
                    if not _has_nulltype(schema):
                        break
                else:
                    raise ValueError("Some of types cannot be determined by the "
                                     "first 100 rows, please try again with sampling")
        else:
            if samplingRatio < 0.99:
                rdd = rdd.sample(False, float(samplingRatio))
            schema = rdd.map(_infer_schema).reduce(_merge_type)
        return schema
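
To see the no-sampling path in isolation, here is a minimal sketch run against a plain Python list instead of an RDD. It leans on the private pyspark.sql.types helpers the method uses (_infer_schema, _merge_type, _has_nulltype); these are internal APIs and may move between versions:

from pyspark.sql import Row
from pyspark.sql.types import _has_nulltype, _infer_schema, _merge_type

rows = [Row(a=1, b=None), Row(a=2, b="x")]

schema = _infer_schema(rows[0])  # b is still NullType at this point
if _has_nulltype(schema):
    # Mirrors the rdd.take(100)[1:] scan above.
    for row in rows[1:]:
        schema = _merge_type(schema, _infer_schema(row))
        if not _has_nulltype(schema):
            break
    else:
        raise ValueError("some types could not be determined")

print(schema)  # StructType with a: long and b: string
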
Example #2
    def _inferSchema(self, rdd, samplingRatio=None):
        first = rdd.first()
        if not first:
            raise ValueError("The first row in RDD is empty, "
                             "can not infer schema")
        if type(first) is dict:
            warnings.warn("Using RDD of dict to inferSchema is deprecated. "
                          "Use pyspark.sql.Row instead")

        if samplingRatio is None:
            schema = _infer_schema(first)
            if _has_nulltype(schema):
                for row in rdd.take(100)[1:]:
                    schema = _merge_type(schema, _infer_schema(row))
                    if not _has_nulltype(schema):
                        break
                else:
                    raise ValueError(
                        "Some of types cannot be determined by the "
                        "first 100 rows, please try again with sampling")
        else:
            if samplingRatio < 0.99:
                rdd = rdd.sample(False, float(samplingRatio))
            schema = rdd.map(_infer_schema).reduce(_merge_type)
        return schema
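
This variant is identical apart from the dropped docstring. Note the for ... else construct both versions rely on: the else clause runs only when the loop completes without a break, i.e. when 100 rows were not enough to resolve every NullType. A standalone illustration:

def first_even(numbers):
    for n in numbers:
        if n % 2 == 0:
            break  # skips the else clause
    else:
        # Reached only when no element triggered the break.
        raise ValueError("no even number found")
    return n

print(first_even([1, 3, 4]))  # 4
# first_even([1, 3, 5]) raises ValueError, like the schema loop above.
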
Example #3
    def _inferSchema(self, rdd, samplingRatio=None, names=None):
        """
        Infer schema from an RDD of Row or tuple.

        :param rdd: an RDD of Row or tuple
        :param samplingRatio: sampling ratio, or no sampling (default)
        :return: :class:`pyspark.sql.types.StructType`
        """
        first = rdd.first()
        if not first:
            raise ValueError("The first row in RDD is empty, "
                             "can not infer schema")
        if type(first) is dict:
            warnings.warn("Using RDD of dict to inferSchema is deprecated. "
                          "Use pyspark.sql.Row instead")

        if samplingRatio is None:
            schema = _infer_schema(first, names=names)
            if _has_nulltype(schema):
                for row in rdd.take(100)[1:]:
                    schema = _merge_type(schema, _infer_schema(row,
                                                               names=names))
                    if not _has_nulltype(schema):
                        break
                else:
                    raise ValueError(
                        "Some of types cannot be determined by the "
                        "first 100 rows, please try again with sampling")
        else:
            if samplingRatio < 0.99:
                rdd = rdd.sample(False, float(samplingRatio))
            schema = rdd.map(lambda row: _infer_schema(row, names)).reduce(
                _merge_type)
        return schema
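
The new names parameter supplies column names when rows are plain tuples, which otherwise get positional names. A quick illustration with the same private helper (its signature may differ across versions):

from pyspark.sql.types import _infer_schema

schema = _infer_schema((1, "x"))
print([f.name for f in schema.fields])  # ['_1', '_2']

schema = _infer_schema((1, "x"), names=["id", "label"])
print([f.name for f in schema.fields])  # ['id', 'label']
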
Example #4
    def _inferSchema(self,
                     rdd: "RDD[Any]",
                     samplingRatio: Optional[float] = None,
                     names: Optional[List[str]] = None) -> StructType:
        """
        Infer schema from an RDD of Row, dict, or tuple.

        Parameters
        ----------
        rdd : :class:`RDD`
            an RDD of Row, dict, or tuple
        samplingRatio : float, optional
            sampling ratio, or no sampling (default)
        names : list, optional
            a list of column names

        Returns
        -------
        :class:`pyspark.sql.types.StructType`
        """
        first = rdd.first()
        if not first:
            raise ValueError("The first row in RDD is empty, "
                             "can not infer schema")

        infer_dict_as_struct = self._wrapped._conf.inferDictAsStruct()  # type: ignore[attr-defined]
        prefer_timestamp_ntz = is_timestamp_ntz_preferred()
        if samplingRatio is None:
            schema = _infer_schema(first,
                                   names=names,
                                   infer_dict_as_struct=infer_dict_as_struct,
                                   prefer_timestamp_ntz=prefer_timestamp_ntz)
            if _has_nulltype(schema):
                for row in rdd.take(100)[1:]:
                    schema = _merge_type(
                        schema,
                        _infer_schema(
                            row,
                            names=names,
                            infer_dict_as_struct=infer_dict_as_struct,
                            prefer_timestamp_ntz=prefer_timestamp_ntz))
                    if not _has_nulltype(schema):
                        break
                else:
                    raise ValueError(
                        "Some of types cannot be determined by the "
                        "first 100 rows, please try again with sampling")
        else:
            if samplingRatio < 0.99:
                rdd = rdd.sample(False, float(samplingRatio))
            schema = rdd.map(lambda row: _infer_schema(
                row,
                names,
                infer_dict_as_struct=infer_dict_as_struct,
                prefer_timestamp_ntz=prefer_timestamp_ntz)).reduce(_merge_type)
        return schema
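
Both flags come from session configuration. Below is a sketch of the settings they reflect; the config key names are taken from recent Spark releases and should be treated as assumptions for other versions:

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # Infer nested dicts as StructType instead of MapType.
    .config("spark.sql.pyspark.inferNestedDictAsStruct.enabled", "true")
    # Prefer TIMESTAMP_NTZ when inferring naive datetimes.
    .config("spark.sql.timestampType", "TIMESTAMP_NTZ")
    .getOrCreate()
)

df = spark.createDataFrame([{"point": {"x": 1, "y": 2}}])
df.printSchema()  # point inferred as struct<x:bigint,y:bigint>
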
Example #5
    def _inferSchemaFromList(self, data):
        """
        Infer schema from list of Row or tuple.

        :param data: list of Row or tuple
        :return: StructType
        """
        if not data:
            raise ValueError("can not infer schema from empty dataset")
        first = data[0]
        if type(first) is dict:
            warnings.warn("inferring schema from dict is deprecated,"
                          "please use pyspark.sql.Row instead")
        schema = _infer_schema(first)
        if _has_nulltype(schema):
            for r in data:
                schema = _merge_type(schema, _infer_schema(r))
                if not _has_nulltype(schema):
                    break
            else:
                raise ValueError("Some of types cannot be determined after inferring")
        return schema
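
In practice this list variant is reached through the public createDataFrame entry point. The None in the first row below leaves a NullType that a later row has to resolve:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([Row(a=1, b=None), Row(a=2, b="x")])
df.printSchema()  # b resolved to string by the second row

# If b were None in every row, inference would fail with
# "Some of types cannot be determined after inferring".
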
Example #6
    def _inferSchema(self, rdd, samplingRatio=None, names=None):
        """
        Infer schema from an RDD of Row, dict, or tuple.

        Parameters
        ----------
        rdd : :class:`RDD`
            an RDD of Row, dict, or tuple
        samplingRatio : float, optional
            sampling ratio, or no sampling (default)
        names : list, optional
            a list of column names

        Returns
        -------
        :class:`pyspark.sql.types.StructType`
        """
        first = rdd.first()
        if not first:
            raise ValueError("The first row in RDD is empty, "
                             "can not infer schema")

        if samplingRatio is None:
            schema = _infer_schema(first, names=names)
            if _has_nulltype(schema):
                for row in rdd.take(100)[1:]:
                    schema = _merge_type(schema, _infer_schema(row,
                                                               names=names))
                    if not _has_nulltype(schema):
                        break
                else:
                    raise ValueError(
                        "Some of types cannot be determined by the "
                        "first 100 rows, please try again with sampling")
        else:
            if samplingRatio < 0.99:
                rdd = rdd.sample(False, float(samplingRatio))
            schema = rdd.map(lambda row: _infer_schema(row, names)).reduce(
                _merge_type)
        return schema
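
The sampling branch is a plain map/reduce over per-row schemas. The same fold, sketched locally with functools.reduce and the private helpers:

from functools import reduce

from pyspark.sql import Row
from pyspark.sql.types import _infer_schema, _merge_type

rows = [Row(a=1, b=None), Row(a=None, b="x"), Row(a=2, b="y")]

# Local equivalent of rdd.map(_infer_schema).reduce(_merge_type).
schema = reduce(_merge_type, (_infer_schema(r) for r in rows))
print(schema)  # a and b both resolved, since every sampled row is inspected
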
Example #7
    def test_merge_type(self):
        self.assertEqual(_merge_type(LongType(), NullType()), LongType())
        self.assertEqual(_merge_type(NullType(), LongType()), LongType())

        self.assertEqual(_merge_type(LongType(), LongType()), LongType())

        self.assertEqual(_merge_type(
            ArrayType(LongType()),
            ArrayType(LongType())
        ), ArrayType(LongType()))
        with self.assertRaisesRegex(TypeError, 'element in array'):
            _merge_type(ArrayType(LongType()), ArrayType(DoubleType()))

        self.assertEqual(_merge_type(
            MapType(StringType(), LongType()),
            MapType(StringType(), LongType())
        ), MapType(StringType(), LongType()))
        with self.assertRaisesRegex(TypeError, 'key of map'):
            _merge_type(
                MapType(StringType(), LongType()),
                MapType(DoubleType(), LongType()))
        with self.assertRaisesRegex(TypeError, 'value of map'):
            _merge_type(
                MapType(StringType(), LongType()),
                MapType(StringType(), DoubleType()))

        self.assertEqual(_merge_type(
            StructType([StructField("f1", LongType()), StructField("f2", StringType())]),
            StructType([StructField("f1", LongType()), StructField("f2", StringType())])
        ), StructType([StructField("f1", LongType()), StructField("f2", StringType())]))
        with self.assertRaisesRegex(TypeError, 'field f1'):
            _merge_type(
                StructType([StructField("f1", LongType()), StructField("f2", StringType())]),
                StructType([StructField("f1", DoubleType()), StructField("f2", StringType())]))

        self.assertEqual(_merge_type(
            StructType([StructField("f1", StructType([StructField("f2", LongType())]))]),
            StructType([StructField("f1", StructType([StructField("f2", LongType())]))])
        ), StructType([StructField("f1", StructType([StructField("f2", LongType())]))]))
        with self.assertRaisesRegex(TypeError, 'field f2 in field f1'):
            _merge_type(
                StructType([StructField("f1", StructType([StructField("f2", LongType())]))]),
                StructType([StructField("f1", StructType([StructField("f2", StringType())]))]))

        self.assertEqual(_merge_type(
            StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]),
            StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())])
        ), StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]))
        with self.assertRaisesRegex(TypeError, 'element in array field f1'):
            _merge_type(
                StructType([
                    StructField("f1", ArrayType(LongType())),
                    StructField("f2", StringType())]),
                StructType([
                    StructField("f1", ArrayType(DoubleType())),
                    StructField("f2", StringType())]))

        self.assertEqual(_merge_type(
            StructType([
                StructField("f1", MapType(StringType(), LongType())),
                StructField("f2", StringType())]),
            StructType([
                StructField("f1", MapType(StringType(), LongType())),
                StructField("f2", StringType())])
        ), StructType([
            StructField("f1", MapType(StringType(), LongType())),
            StructField("f2", StringType())]))
        with self.assertRaisesRegex(TypeError, 'value of map field f1'):
            _merge_type(
                StructType([
                    StructField("f1", MapType(StringType(), LongType())),
                    StructField("f2", StringType())]),
                StructType([
                    StructField("f1", MapType(StringType(), DoubleType())),
                    StructField("f2", StringType())]))

        self.assertEqual(_merge_type(
            StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]),
            StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))])
        ), StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]))
        with self.assertRaisesRegex(TypeError, 'key of map element in array field f1'):
            _merge_type(
                StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]),
                StructType([StructField("f1", ArrayType(MapType(DoubleType(), LongType())))])
            )
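
For reference, the same kind of checks written with pytest, where match plays the role of the regexp argument. A sketch assuming pytest is available:

import pytest
from pyspark.sql.types import (ArrayType, DoubleType, LongType, MapType,
                               NullType, StringType, _merge_type)


def test_merge_type_conflicts():
    # NullType is absorbed by any concrete type.
    assert _merge_type(LongType(), NullType()) == LongType()

    # Conflicting nested types report the path to the conflict.
    with pytest.raises(TypeError, match="element in array"):
        _merge_type(ArrayType(LongType()), ArrayType(DoubleType()))
    with pytest.raises(TypeError, match="key of map"):
        _merge_type(MapType(StringType(), LongType()),
                    MapType(DoubleType(), LongType()))
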
Example #8
    def _inferSchema(
        self,
        rdd: RDD[Any],
        samplingRatio: Optional[float] = None,
        names: Optional[List[str]] = None,
    ) -> StructType:
        """
        Infer schema from an RDD of Row, dict, or tuple.

        Parameters
        ----------
        rdd : :class:`RDD`
            an RDD of Row, dict, or tuple
        samplingRatio : float, optional
            sampling ratio, or no sampling (default)
        names : list, optional
            a list of column names

        Returns
        -------
        :class:`pyspark.sql.types.StructType`
        """
        first = rdd.first()
        if isinstance(first, Sized) and len(first) == 0:
            raise ValueError(
                "The first row in RDD is empty, can not infer schema")

        infer_dict_as_struct = self._jconf.inferDictAsStruct()
        infer_array_from_first_element = self._jconf.legacyInferArrayTypeFromFirstElement()
        prefer_timestamp_ntz = is_timestamp_ntz_preferred()
        if samplingRatio is None:
            schema = _infer_schema(
                first,
                names=names,
                infer_dict_as_struct=infer_dict_as_struct,
                infer_array_from_first_element=infer_array_from_first_element,
                prefer_timestamp_ntz=prefer_timestamp_ntz,
            )
            if _has_nulltype(schema):
                for row in rdd.take(100)[1:]:
                    schema = _merge_type(
                        schema,
                        _infer_schema(
                            row,
                            names=names,
                            infer_dict_as_struct=infer_dict_as_struct,
                            infer_array_from_first_element=infer_array_from_first_element,
                            prefer_timestamp_ntz=prefer_timestamp_ntz,
                        ),
                    )
                    if not _has_nulltype(schema):
                        break
                else:
                    raise ValueError(
                        "Some of types cannot be determined by the "
                        "first 100 rows, please try again with sampling")
        else:
            if samplingRatio < 0.99:
                rdd = rdd.sample(False, float(samplingRatio))
            schema = rdd.map(lambda row: _infer_schema(
                row,
                names,
                infer_dict_as_struct=infer_dict_as_struct,
                infer_array_from_first_element=infer_array_from_first_element,
                prefer_timestamp_ntz=prefer_timestamp_ntz,
            )).reduce(_merge_type)
        return schema
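
The extra infer_array_from_first_element flag restores the legacy behaviour of typing an array from its first element only, instead of merging the types of all elements. A sketch of the difference; the config key name follows recent releases and is an assumption for yours:

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # False (the non-legacy behaviour): element types are merged.
    .config("spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled",
            "false")
    .getOrCreate()
)

# [None, "x"]: merging NullType with StringType yields array<string>.
# With the legacy flag on, only the leading None would be inspected and
# inference would fail with an undetermined type.
df = spark.createDataFrame([{"xs": [None, "x"]}])
df.printSchema()
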
Example #9
    def inferSchema(self, rdd, samplingRatio=None):
        """Infer and apply a schema to an RDD of L{Row}.

        .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead

        When samplingRatio is specified, the schema is inferred by looking
        at the types of each row in the sampled dataset. Otherwise, the
        first 100 rows of the RDD are inspected. Nested collections are
        supported, which can include array, dict, list, Row, tuple,
        namedtuple, or object.

        Each row could be L{pyspark.sql.Row} object or namedtuple or objects.
        Using top level dicts is deprecated, as dict is used to represent Maps.

        If a single column has multiple distinct inferred types, it may cause
        runtime exceptions.

        >>> rdd = sc.parallelize(
        ...     [Row(field1=1, field2="row1"),
        ...      Row(field1=2, field2="row2"),
        ...      Row(field1=3, field2="row3")])
        >>> df = sqlCtx.inferSchema(rdd)
        >>> df.collect()[0]
        Row(field1=1, field2=u'row1')

        >>> NestedRow = Row("f1", "f2")
        >>> nestedRdd1 = sc.parallelize([
        ...     NestedRow(array('i', [1, 2]), {"row1": 1.0}),
        ...     NestedRow(array('i', [2, 3]), {"row2": 2.0})])
        >>> df = sqlCtx.inferSchema(nestedRdd1)
        >>> df.collect()
        [Row(f1=[1, 2], f2={u'row1': 1.0}), ..., f2={u'row2': 2.0})]

        >>> nestedRdd2 = sc.parallelize([
        ...     NestedRow([[1, 2], [2, 3]], [1, 2]),
        ...     NestedRow([[2, 3], [3, 4]], [2, 3])])
        >>> df = sqlCtx.inferSchema(nestedRdd2)
        >>> df.collect()
        [Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), ..., f2=[2, 3])]

        >>> from collections import namedtuple
        >>> CustomRow = namedtuple('CustomRow', 'field1 field2')
        >>> rdd = sc.parallelize(
        ...     [CustomRow(field1=1, field2="row1"),
        ...      CustomRow(field1=2, field2="row2"),
        ...      CustomRow(field1=3, field2="row3")])
        >>> df = sqlCtx.inferSchema(rdd)
        >>> df.collect()[0]
        Row(field1=1, field2=u'row1')
        """

        if isinstance(rdd, DataFrame):
            raise TypeError("Cannot apply schema to DataFrame")

        first = rdd.first()
        if not first:
            raise ValueError("The first row in RDD is empty, "
                             "can not infer schema")
        if type(first) is dict:
            warnings.warn("Using RDD of dict to inferSchema is deprecated,"
                          "please use pyspark.sql.Row instead")

        if samplingRatio is None:
            schema = _infer_schema(first)
            if _has_nulltype(schema):
                for row in rdd.take(100)[1:]:
                    schema = _merge_type(schema, _infer_schema(row))
                    if not _has_nulltype(schema):
                        break
                else:
                    warnings.warn(
                        "Some of types cannot be determined by the "
                        "first 100 rows, please try again with sampling")
        else:
            if samplingRatio < 0.99:
                rdd = rdd.sample(False, float(samplingRatio))
            schema = rdd.map(_infer_schema).reduce(_merge_type)

        converter = _create_converter(schema)
        rdd = rdd.map(converter)
        return self.applySchema(rdd, schema)
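
Since inferSchema is deprecated here, the modern equivalent is createDataFrame, which funnels into the same inference code:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()
rdd = spark.sparkContext.parallelize(
    [Row(field1=1, field2="row1"), Row(field1=2, field2="row2")])

df = spark.createDataFrame(rdd)                    # no sampling: first 100 rows
df_sampled = spark.createDataFrame(rdd, samplingRatio=0.5)  # sampled inference
print(df.schema)
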
Example #10
    def inferSchema(self, rdd, samplingRatio=None):
        """Infer and apply a schema to an RDD of L{Row}.

        When samplingRatio is specified, the schema is inferred by looking
        at the types of each row in the sampled dataset. Otherwise, the
        first 100 rows of the RDD are inspected. Nested collections are
        supported, which can include array, dict, list, Row, tuple,
        namedtuple, or object.

        Each row could be L{pyspark.sql.Row} object or namedtuple or objects.
        Using top level dicts is deprecated, as dict is used to represent Maps.

        If a single column has multiple distinct inferred types, it may cause
        runtime exceptions.

        >>> rdd = sc.parallelize(
        ...     [Row(field1=1, field2="row1"),
        ...      Row(field1=2, field2="row2"),
        ...      Row(field1=3, field2="row3")])
        >>> df = sqlCtx.inferSchema(rdd)
        >>> df.collect()[0]
        Row(field1=1, field2=u'row1')

        >>> NestedRow = Row("f1", "f2")
        >>> nestedRdd1 = sc.parallelize([
        ...     NestedRow(array('i', [1, 2]), {"row1": 1.0}),
        ...     NestedRow(array('i', [2, 3]), {"row2": 2.0})])
        >>> df = sqlCtx.inferSchema(nestedRdd1)
        >>> df.collect()
        [Row(f1=[1, 2], f2={u'row1': 1.0}), ..., f2={u'row2': 2.0})]

        >>> nestedRdd2 = sc.parallelize([
        ...     NestedRow([[1, 2], [2, 3]], [1, 2]),
        ...     NestedRow([[2, 3], [3, 4]], [2, 3])])
        >>> df = sqlCtx.inferSchema(nestedRdd2)
        >>> df.collect()
        [Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), ..., f2=[2, 3])]

        >>> from collections import namedtuple
        >>> CustomRow = namedtuple('CustomRow', 'field1 field2')
        >>> rdd = sc.parallelize(
        ...     [CustomRow(field1=1, field2="row1"),
        ...      CustomRow(field1=2, field2="row2"),
        ...      CustomRow(field1=3, field2="row3")])
        >>> df = sqlCtx.inferSchema(rdd)
        >>> df.collect()[0]
        Row(field1=1, field2=u'row1')
        """

        if isinstance(rdd, DataFrame):
            raise TypeError("Cannot apply schema to DataFrame")

        first = rdd.first()
        if not first:
            raise ValueError("The first row in RDD is empty, "
                             "can not infer schema")
        if type(first) is dict:
            warnings.warn("Using RDD of dict to inferSchema is deprecated,"
                          "please use pyspark.sql.Row instead")

        if samplingRatio is None:
            schema = _infer_schema(first)
            if _has_nulltype(schema):
                for row in rdd.take(100)[1:]:
                    schema = _merge_type(schema, _infer_schema(row))
                    if not _has_nulltype(schema):
                        break
                else:
                    warnings.warn("Some of types cannot be determined by the "
                                  "first 100 rows, please try again with sampling")
        else:
            if samplingRatio < 0.99:
                rdd = rdd.sample(False, float(samplingRatio))
            schema = rdd.map(_infer_schema).reduce(_merge_type)

        converter = _create_converter(schema)
        rdd = rdd.map(converter)
        return self.applySchema(rdd, schema)
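
Unlike the later versions above, this variant only warns when the first 100 rows leave types unresolved. The samplingRatio contract is the same, though: ratios below 0.99 subsample before inference, while 0.99 and above scan every row. For example:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()
rdd = spark.sparkContext.parallelize(
    [Row(a=i, b=None if i % 2 else str(i)) for i in range(1000)])

# Inspect roughly 10% of the rows; plenty here to resolve b to string.
df = spark.createDataFrame(rdd, samplingRatio=0.1)
print(df.schema)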