def _inferSchema(self, rdd, samplingRatio=None, names=None):
    """
    Derive a schema from an RDD whose elements are Row or tuple objects.

    With no sampling ratio, the first row is inspected; if
    ``_has_nulltype`` still flags the result, the remaining rows of
    ``rdd.take(100)`` are merged in one by one until it no longer does.
    With a sampling ratio, the schemas of all rows (of a sample when the
    ratio is below 0.99) are merged together.

    :param rdd: an RDD of Row or tuple
    :param samplingRatio: sampling ratio, or no sampling (default)
    :param names: passed through to ``_infer_schema`` for every row
    :return: :class:`pyspark.sql.types.StructType`
    :raises ValueError: when the first row is empty, or when the first
        100 rows still leave some types undetermined
    """
    head = rdd.first()
    if not head:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")
    if type(head) is dict:
        warnings.warn("Using RDD of dict to inferSchema is deprecated. "
                      "Use pyspark.sql.Row instead")

    # Sampling path: every (sampled) row contributes to the merged schema.
    if samplingRatio is not None:
        rows = rdd.sample(False, float(samplingRatio)) if samplingRatio < 0.99 else rdd
        per_row = rows.map(lambda row: _infer_schema(row, names))
        return per_row.reduce(_merge_type)

    # Non-sampling path: start from the first row, widen only if needed.
    inferred = _infer_schema(head, names=names)
    if not _has_nulltype(inferred):
        return inferred
    for candidate in rdd.take(100)[1:]:
        inferred = _merge_type(inferred, _infer_schema(candidate, names=names))
        if not _has_nulltype(inferred):
            return inferred
    raise ValueError(
        "Some of types cannot be determined by the "
        "first 100 rows, please try again with sampling")
def _inferSchema(self, rdd, samplingRatio=None):
    """
    Infer a schema from the rows of ``rdd``.

    :param rdd: RDD to inspect; its first row must be non-empty
    :param samplingRatio: when given, schemas of all rows are merged
        (rows are sampled first if the ratio is below 0.99); when None,
        only the rows returned by ``rdd.take(100)`` are consulted
    :return: the schema built by ``_infer_schema`` / ``_merge_type``
    :raises ValueError: if the first row is empty or the first 100 rows
        are not enough to determine every type
    """
    head = rdd.first()
    if not head:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")
    if type(head) is dict:
        warnings.warn("Using RDD of dict to inferSchema is deprecated. "
                      "Use pyspark.sql.Row instead")

    if samplingRatio is not None:
        rows = rdd.sample(False, float(samplingRatio)) if samplingRatio < 0.99 else rdd
        return rows.map(_infer_schema).reduce(_merge_type)

    result = _infer_schema(head)
    if not _has_nulltype(result):
        return result
    # Skip the first taken row: it is the one already inferred above.
    for extra in rdd.take(100)[1:]:
        result = _merge_type(result, _infer_schema(extra))
        if not _has_nulltype(result):
            return result
    raise ValueError(
        "Some of types cannot be determined by the "
        "first 100 rows, please try again with sampling")
def _inferSchema(self, rdd, samplingRatio=None):
    """
    Build a schema by inspecting the rows of an RDD of Row or tuple.

    :param rdd: an RDD of Row or tuple
    :param samplingRatio: sampling ratio, or no sampling (default); with
        no sampling only the rows from ``rdd.take(100)`` are examined
    :return: StructType
    :raises ValueError: on an empty first row, or when types remain
        undetermined after the first 100 rows
    """
    first_row = rdd.first()
    if not first_row:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")
    if type(first_row) is dict:
        warnings.warn("Using RDD of dict to inferSchema is deprecated. "
                      "Use pyspark.sql.Row instead")

    if samplingRatio is None:
        merged = _infer_schema(first_row)
        if not _has_nulltype(merged):
            return merged
        remaining = rdd.take(100)[1:]
        for current in remaining:
            merged = _merge_type(merged, _infer_schema(current))
            if not _has_nulltype(merged):
                return merged
        raise ValueError("Some of types cannot be determined by the "
                         "first 100 rows, please try again with sampling")

    # A ratio of (almost) 1 means "use everything", so skip the sampling step.
    if samplingRatio < 0.99:
        rdd = rdd.sample(False, float(samplingRatio))
    row_schemas = rdd.map(_infer_schema)
    return row_schemas.reduce(_merge_type)
def _inferSchema(self, rdd: "RDD[Any]", samplingRatio: Optional[float] = None, names: Optional[List[str]] = None) -> StructType:
    """
    Work out a schema for an RDD of Row, dict, or tuple.

    Parameters
    ----------
    rdd : :class:`RDD`
        an RDD of Row, dict, or tuple
    samplingRatio : float, optional
        sampling ratio, or no sampling (default). Without it, only the
        rows returned by ``rdd.take(100)`` are examined.
    names : list, optional
        forwarded to ``_infer_schema`` for each row

    Returns
    -------
    :class:`pyspark.sql.types.StructType`

    Raises
    ------
    ValueError
        If the first row is empty, or the first 100 rows leave some
        types undetermined.
    """
    head = rdd.first()
    if not head:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")

    dict_as_struct = self._wrapped._conf.inferDictAsStruct()  # type: ignore[attr-defined]
    ntz_preferred = is_timestamp_ntz_preferred()

    if samplingRatio is not None:
        rows = rdd.sample(False, float(samplingRatio)) if samplingRatio < 0.99 else rdd
        per_row = rows.map(
            lambda row: _infer_schema(
                row,
                names,
                infer_dict_as_struct=dict_as_struct,
                prefer_timestamp_ntz=ntz_preferred,
            )
        )
        return per_row.reduce(_merge_type)

    inferred = _infer_schema(
        head,
        names=names,
        infer_dict_as_struct=dict_as_struct,
        prefer_timestamp_ntz=ntz_preferred,
    )
    if not _has_nulltype(inferred):
        return inferred
    # Widen the schema row by row until no undetermined type is left.
    for candidate in rdd.take(100)[1:]:
        candidate_schema = _infer_schema(
            candidate,
            names=names,
            infer_dict_as_struct=dict_as_struct,
            prefer_timestamp_ntz=ntz_preferred,
        )
        inferred = _merge_type(inferred, candidate_schema)
        if not _has_nulltype(inferred):
            return inferred
    raise ValueError(
        "Some of types cannot be determined by the "
        "first 100 rows, please try again with sampling")
def _inferSchemaFromList(
    self, data: Iterable[Any], names: Optional[List[str]] = None
) -> StructType:
    """
    Merge the per-row schemas of a local collection into one schema.

    Parameters
    ----------
    data : iterable
        list of Row, dict, or tuple
    names : list, optional
        list of column names

    Returns
    -------
    :class:`pyspark.sql.types.StructType`

    Raises
    ------
    ValueError
        If ``data`` is empty, or the merged schema still contains an
        undetermined type.
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")

    dict_as_struct = self._wrapped._conf.inferDictAsStruct()  # type: ignore[attr-defined]
    ntz_preferred = is_timestamp_ntz_preferred()

    # Lazily infer one schema per row and fold them together.
    row_schemas = (
        _infer_schema(item, names, dict_as_struct, ntz_preferred) for item in data
    )
    merged = reduce(_merge_type, row_schemas)
    if _has_nulltype(merged):
        raise ValueError("Some of types cannot be determined after inferring")
    return merged
def _inferSchemaFromList(self, data, names=None):
    """
    Fold the schemas of every row in ``data`` into a single schema.

    Parameters
    ----------
    data : iterable
        list of Row, dict, or tuple
    names : list, optional
        list of column names

    Returns
    -------
    :class:`pyspark.sql.types.StructType`

    Raises
    ------
    ValueError
        If ``data`` is empty, or the merged schema still contains an
        undetermined type.
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")

    dict_as_struct = self._wrapped._conf.inferDictAsStruct()
    row_schemas = (_infer_schema(item, names, dict_as_struct) for item in data)
    merged = reduce(_merge_type, row_schemas)
    if _has_nulltype(merged):
        raise ValueError(
            "Some of types cannot be determined after inferring")
    return merged
def _inferSchema(self, rdd, samplingRatio=None, names=None):
    """
    Determine the schema of an RDD of Row, dict, or tuple.

    Parameters
    ----------
    rdd : :class:`RDD`
        an RDD of Row, dict, or tuple
    samplingRatio : float, optional
        sampling ratio, or no sampling (default). Without it, only the
        rows returned by ``rdd.take(100)`` are examined.
    names : list, optional
        forwarded to ``_infer_schema`` for each row

    Returns
    -------
    :class:`pyspark.sql.types.StructType`

    Raises
    ------
    ValueError
        If the first row is empty, or the first 100 rows leave some
        types undetermined.
    """
    head = rdd.first()
    if not head:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")

    if samplingRatio is not None:
        rows = rdd.sample(False, float(samplingRatio)) if samplingRatio < 0.99 else rdd
        return rows.map(lambda row: _infer_schema(row, names)).reduce(_merge_type)

    inferred = _infer_schema(head, names=names)
    if not _has_nulltype(inferred):
        return inferred
    # The first taken row was already handled above, so start from the second.
    for candidate in rdd.take(100)[1:]:
        inferred = _merge_type(inferred, _infer_schema(candidate, names=names))
        if not _has_nulltype(inferred):
            return inferred
    raise ValueError(
        "Some of types cannot be determined by the "
        "first 100 rows, please try again with sampling")
def _inferSchemaFromList(self, data):
    """
    Infer a schema from a local list of Row or tuple.

    The first element is inferred on its own; the list is only walked
    (merging each element's schema) when ``_has_nulltype`` flags that
    initial result.

    :param data: list of Row or tuple
    :return: StructType
    :raises ValueError: if ``data`` is empty, or if merging every element
        still leaves some type undetermined
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")

    head = data[0]
    if type(head) is dict:
        warnings.warn("inferring schema from dict is deprecated,"
                      "please use pyspark.sql.Row instead")

    merged = _infer_schema(head)
    if not _has_nulltype(merged):
        return merged
    for item in data:
        merged = _merge_type(merged, _infer_schema(item))
        if not _has_nulltype(merged):
            return merged
    raise ValueError("Some of types cannot be determined after inferring")
def _inferSchemaFromList(self, data, names=None):
    """
    Merge the schemas of every element of ``data`` into one schema.

    :param data: list of Row, dict, or tuple
    :param names: list of column names
    :return: :class:`pyspark.sql.types.StructType`
    :raises ValueError: if ``data`` is empty, or the merged schema still
        contains an undetermined type
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")

    row_schemas = (_infer_schema(item, names) for item in data)
    merged = reduce(_merge_type, row_schemas)
    if _has_nulltype(merged):
        raise ValueError("Some of types cannot be determined after inferring")
    return merged
def _inferSchemaFromList(self, data, names=None):
    """
    Merge the schemas of every element of a list of Row or tuple.

    A deprecation warning is issued when the first element is a plain dict.

    :param data: list of Row or tuple
    :param names: list of column names
    :return: :class:`pyspark.sql.types.StructType`
    :raises ValueError: if ``data`` is empty, or the merged schema still
        contains an undetermined type
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")

    head = data[0]
    if type(head) is dict:
        warnings.warn("inferring schema from dict is deprecated,"
                      "please use pyspark.sql.Row instead")

    row_schemas = (_infer_schema(item, names) for item in data)
    merged = reduce(_merge_type, row_schemas)
    if _has_nulltype(merged):
        raise ValueError("Some of types cannot be determined after inferring")
    return merged
def _inferSchemaFromList(
    self, data: Iterable[Any], names: Optional[List[str]] = None
) -> StructType:
    """
    Fold the per-row schemas of a local collection into one schema.

    Parameters
    ----------
    data : iterable
        list of Row, dict, or tuple
    names : list, optional
        list of column names

    Returns
    -------
    :class:`pyspark.sql.types.StructType`

    Raises
    ------
    ValueError
        If ``data`` is empty, or the merged schema still contains an
        undetermined type.
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")

    # Inference switches are read once and applied uniformly to every row.
    dict_as_struct = self._jconf.inferDictAsStruct()
    array_from_first = self._jconf.legacyInferArrayTypeFromFirstElement()
    ntz_preferred = is_timestamp_ntz_preferred()

    row_schemas = (
        _infer_schema(
            item,
            names,
            infer_dict_as_struct=dict_as_struct,
            infer_array_from_first_element=array_from_first,
            prefer_timestamp_ntz=ntz_preferred,
        )
        for item in data
    )
    merged = reduce(_merge_type, row_schemas)
    if _has_nulltype(merged):
        raise ValueError("Some of types cannot be determined after inferring")
    return merged
def to_schema(cls, obj):
    """
    Convert ``obj`` with ``cls.to_row`` and return the schema that
    ``pyspark.sql.types._infer_schema`` infers for the resulting row.
    """
    converted = cls.to_row(obj)
    # Imported here rather than at module level, as in the original code.
    from pyspark.sql import types as spark_types
    return spark_types._infer_schema(converted)
def _inferSchema(
    self,
    rdd: RDD[Any],
    samplingRatio: Optional[float] = None,
    names: Optional[List[str]] = None,
) -> StructType:
    """
    Infer schema from an RDD of Row, dict, or tuple.

    Parameters
    ----------
    rdd : :class:`RDD`
        an RDD of Row, dict, or tuple
    samplingRatio : float, optional
        sampling ratio, or no sampling (default). Without it, the first
        row is inferred and, only if ``_has_nulltype`` flags the result,
        the remaining rows of ``rdd.take(100)`` are merged in. With it,
        the schemas of all rows (of a sample when the ratio is below 0.99)
        are merged.
    names : list, optional
        forwarded to ``_infer_schema`` for each row

    Returns
    -------
    :class:`pyspark.sql.types.StructType`

    Raises
    ------
    ValueError
        If the first row is a sized object of length zero, or the first
        100 rows leave some types undetermined.
    """
    first = rdd.first()
    if isinstance(first, Sized) and len(first) == 0:
        raise ValueError(
            "The first row in RDD is empty, can not infer schema")

    infer_dict_as_struct = self._jconf.inferDictAsStruct()
    infer_array_from_first_element = self._jconf.legacyInferArrayTypeFromFirstElement()
    prefer_timestamp_ntz = is_timestamp_ntz_preferred()

    if samplingRatio is None:
        # The first row must be inferred with the same switches as every
        # other row; previously infer_array_from_first_element was omitted
        # here, so the setting was ignored whenever the first row alone
        # determined the schema.
        schema = _infer_schema(
            first,
            names=names,
            infer_dict_as_struct=infer_dict_as_struct,
            infer_array_from_first_element=infer_array_from_first_element,
            prefer_timestamp_ntz=prefer_timestamp_ntz,
        )
        if _has_nulltype(schema):
            # Skip the first taken row: it is the one inferred above.
            for row in rdd.take(100)[1:]:
                schema = _merge_type(
                    schema,
                    _infer_schema(
                        row,
                        names=names,
                        infer_dict_as_struct=infer_dict_as_struct,
                        infer_array_from_first_element=infer_array_from_first_element,
                        prefer_timestamp_ntz=prefer_timestamp_ntz,
                    ),
                )
                if not _has_nulltype(schema):
                    break
            else:
                raise ValueError(
                    "Some of types cannot be determined by the "
                    "first 100 rows, please try again with sampling")
    else:
        if samplingRatio < 0.99:
            rdd = rdd.sample(False, float(samplingRatio))
        schema = rdd.map(
            lambda row: _infer_schema(
                row,
                names,
                infer_dict_as_struct=infer_dict_as_struct,
                infer_array_from_first_element=infer_array_from_first_element,
                prefer_timestamp_ntz=prefer_timestamp_ntz,
            )
        ).reduce(_merge_type)
    return schema
def inferSchema(self, rdd, samplingRatio=None):
    """Infer and apply a schema to an RDD of L{Row}.

    ::note:
        Deprecated in 1.3, use :func:`createDataFrame` instead

    When samplingRatio is specified, the schema is inferred by looking
    at the types of each row in the sampled dataset. Otherwise, the
    first 100 rows of the RDD are inspected. The sample is used for
    inference only: the schema is always applied to the full input RDD.

    Nested collections are supported, which can include array, dict,
    list, Row, tuple, namedtuple, or object.

    Each row could be L{pyspark.sql.Row} object or namedtuple or objects.
    Using top level dicts is deprecated, as dict is used to represent Maps.

    If a single column has multiple distinct inferred types, it may cause
    runtime exceptions.

    :param rdd: the RDD to infer a schema for; must not be a DataFrame
    :param samplingRatio: fraction of rows to sample for inference, or
        None (default) to inspect only the first 100 rows
    :return: the result of ``self.applySchema`` on the converted RDD
    :raises TypeError: if ``rdd`` is a DataFrame
    :raises ValueError: if the first row of ``rdd`` is empty

    >>> rdd = sc.parallelize(
    ...     [Row(field1=1, field2="row1"),
    ...      Row(field1=2, field2="row2"),
    ...      Row(field1=3, field2="row3")])
    >>> df = sqlCtx.inferSchema(rdd)
    >>> df.collect()[0]
    Row(field1=1, field2=u'row1')

    >>> NestedRow = Row("f1", "f2")
    >>> nestedRdd1 = sc.parallelize([
    ...     NestedRow(array('i', [1, 2]), {"row1": 1.0}),
    ...     NestedRow(array('i', [2, 3]), {"row2": 2.0})])
    >>> df = sqlCtx.inferSchema(nestedRdd1)
    >>> df.collect()
    [Row(f1=[1, 2], f2={u'row1': 1.0}), ..., f2={u'row2': 2.0})]

    >>> nestedRdd2 = sc.parallelize([
    ...     NestedRow([[1, 2], [2, 3]], [1, 2]),
    ...     NestedRow([[2, 3], [3, 4]], [2, 3])])
    >>> df = sqlCtx.inferSchema(nestedRdd2)
    >>> df.collect()
    [Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), ..., f2=[2, 3])]

    >>> from collections import namedtuple
    >>> CustomRow = namedtuple('CustomRow', 'field1 field2')
    >>> rdd = sc.parallelize(
    ...     [CustomRow(field1=1, field2="row1"),
    ...      CustomRow(field1=2, field2="row2"),
    ...      CustomRow(field1=3, field2="row3")])
    >>> df = sqlCtx.inferSchema(rdd)
    >>> df.collect()[0]
    Row(field1=1, field2=u'row1')
    """
    if isinstance(rdd, DataFrame):
        raise TypeError("Cannot apply schema to DataFrame")

    first = rdd.first()
    if not first:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")
    if type(first) is dict:
        warnings.warn("Using RDD of dict to inferSchema is deprecated,"
                      "please use pyspark.sql.Row instead")

    if samplingRatio is None:
        schema = _infer_schema(first)
        if _has_nulltype(schema):
            # Skip the first taken row: it is the one inferred above.
            for row in rdd.take(100)[1:]:
                schema = _merge_type(schema, _infer_schema(row))
                if not _has_nulltype(schema):
                    break
            else:
                # Best effort: warn instead of failing and proceed with
                # the schema as inferred so far.
                warnings.warn(
                    "Some of types cannot be determined by the "
                    "first 100 rows, please try again with sampling")
    else:
        # Keep the sample in its own variable. Rebinding ``rdd`` here
        # would make the conversion below run on the sample only and
        # drop every unsampled row from the result.
        sampled = rdd
        if samplingRatio < 0.99:
            sampled = rdd.sample(False, float(samplingRatio))
        schema = sampled.map(_infer_schema).reduce(_merge_type)

    converter = _create_converter(schema)
    return self.applySchema(rdd.map(converter), schema)
def inferSchema(self, rdd, samplingRatio=None):
    """Infer and apply a schema to an RDD of L{Row}.

    When samplingRatio is specified, the schema is inferred by looking
    at the types of each row in the sampled dataset. Otherwise, the
    first 100 rows of the RDD are inspected. The sample is used for
    inference only: the schema is always applied to the full input RDD.

    Nested collections are supported, which can include array, dict,
    list, Row, tuple, namedtuple, or object.

    Each row could be L{pyspark.sql.Row} object or namedtuple or objects.
    Using top level dicts is deprecated, as dict is used to represent Maps.

    If a single column has multiple distinct inferred types, it may cause
    runtime exceptions.

    :param rdd: the RDD to infer a schema for; must not be a DataFrame
    :param samplingRatio: fraction of rows to sample for inference, or
        None (default) to inspect only the first 100 rows
    :return: the result of ``self.applySchema`` on the converted RDD
    :raises TypeError: if ``rdd`` is a DataFrame
    :raises ValueError: if the first row of ``rdd`` is empty

    >>> rdd = sc.parallelize(
    ...     [Row(field1=1, field2="row1"),
    ...      Row(field1=2, field2="row2"),
    ...      Row(field1=3, field2="row3")])
    >>> df = sqlCtx.inferSchema(rdd)
    >>> df.collect()[0]
    Row(field1=1, field2=u'row1')

    >>> NestedRow = Row("f1", "f2")
    >>> nestedRdd1 = sc.parallelize([
    ...     NestedRow(array('i', [1, 2]), {"row1": 1.0}),
    ...     NestedRow(array('i', [2, 3]), {"row2": 2.0})])
    >>> df = sqlCtx.inferSchema(nestedRdd1)
    >>> df.collect()
    [Row(f1=[1, 2], f2={u'row1': 1.0}), ..., f2={u'row2': 2.0})]

    >>> nestedRdd2 = sc.parallelize([
    ...     NestedRow([[1, 2], [2, 3]], [1, 2]),
    ...     NestedRow([[2, 3], [3, 4]], [2, 3])])
    >>> df = sqlCtx.inferSchema(nestedRdd2)
    >>> df.collect()
    [Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), ..., f2=[2, 3])]

    >>> from collections import namedtuple
    >>> CustomRow = namedtuple('CustomRow', 'field1 field2')
    >>> rdd = sc.parallelize(
    ...     [CustomRow(field1=1, field2="row1"),
    ...      CustomRow(field1=2, field2="row2"),
    ...      CustomRow(field1=3, field2="row3")])
    >>> df = sqlCtx.inferSchema(rdd)
    >>> df.collect()[0]
    Row(field1=1, field2=u'row1')
    """
    if isinstance(rdd, DataFrame):
        raise TypeError("Cannot apply schema to DataFrame")

    first = rdd.first()
    if not first:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")
    if type(first) is dict:
        warnings.warn("Using RDD of dict to inferSchema is deprecated,"
                      "please use pyspark.sql.Row instead")

    if samplingRatio is None:
        schema = _infer_schema(first)
        if _has_nulltype(schema):
            # Skip the first taken row: it is the one inferred above.
            for row in rdd.take(100)[1:]:
                schema = _merge_type(schema, _infer_schema(row))
                if not _has_nulltype(schema):
                    break
            else:
                # Best effort: warn instead of failing and proceed with
                # the schema as inferred so far.
                warnings.warn("Some of types cannot be determined by the "
                              "first 100 rows, please try again with sampling")
    else:
        # Sample only when the ratio is meaningfully below 1. The previous
        # ``> 0.99`` test was inverted, so small ratios scanned the whole
        # RDD. The sample is kept in its own variable so the conversion
        # below still runs on every row of the input.
        sampled = rdd
        if samplingRatio < 0.99:
            sampled = rdd.sample(False, float(samplingRatio))
        schema = sampled.map(_infer_schema).reduce(_merge_type)

    converter = _create_converter(schema)
    return self.applySchema(rdd.map(converter), schema)