def create_array(s, t):
    """
    Convert the pandas.Series `s` into an Arrow array of type `t`.

    Null entries of `s` (per ``s.isnull()``) are handed to Arrow as the mask. The names
    `timezone` and `safecheck` are not defined in this function; they are read from the
    enclosing scope.

    :param s: the pandas.Series to convert
    :param t: the target Arrow data type; may be None, in which case it is forwarded to
        ``pa.Array.from_pandas`` as ``type=None``
    :return: the Arrow array built from `s`
    :raises RuntimeError: when the final, safe-checked conversion raises
        ``pa.ArrowException``
    """
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    # TODO: maybe don't need None check anymore as of Arrow 0.9.1
    if t is not None and pa.types.is_timestamp(t):
        # Nulls are replaced by 0 for the conversion; `mask` still marks them as null.
        s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    # Test the numeric major version; comparing the `sys.version` string is
    # lexicographic and gives the wrong answer for a two-digit major version.
    if t is not None and pa.types.is_string(t) and sys.version_info[0] < 3:
        # TODO: need decode before converting to Arrow in Python 2
        # TODO: don't need as of Arrow 0.9.1
        return pa.Array.from_pandas(
            s.apply(lambda v: v.decode("utf-8") if isinstance(v, str) else v),
            mask=mask, type=t)
    if t is not None and pa.types.is_decimal(t) and \
            LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
        return pa.Array.from_pandas(
            s.apply(lambda v: decimal.Decimal('NaN') if v is None else v),
            mask=mask, type=t)
    if LooseVersion(pa.__version__) < LooseVersion("0.11.0"):
        # TODO: see ARROW-1949. Remove when the minimum PyArrow version becomes 0.11.0.
        return pa.Array.from_pandas(s, mask=mask, type=t)
    try:
        array = pa.Array.from_pandas(s, mask=mask, type=t, safe=safecheck)
    except pa.ArrowException as e:
        error_msg = ("Exception thrown when converting pandas.Series (%s) to Arrow "
                     "Array (%s). It can be caused by overflows or other unsafe "
                     "conversions warned by Arrow. Arrow safe type check can be "
                     "disabled by using SQL config "
                     "`spark.sql.execution.pandas.arrowSafeTypeConversion`.")
        # The original Arrow exception is attached as a second argument of the error.
        raise RuntimeError(error_msg % (s.dtype, t), e)
    return array
def create_array(s, t):
    """
    Convert the pandas.Series `s` into an Arrow array of type `t`.

    Null entries of `s` (per ``s.isnull()``) are handed to Arrow as the mask. The name
    `timezone` is not defined in this function; it is read from the enclosing scope.

    :param s: the pandas.Series to convert
    :param t: the target Arrow data type; may be None, in which case it is forwarded to
        ``pa.Array.from_pandas`` as ``type=None``
    :return: the Arrow array built from `s`
    """
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    # TODO: maybe don't need None check anymore as of Arrow 0.9.1
    if t is not None and pa.types.is_timestamp(t):
        # Nulls are replaced by 0 for the conversion; `mask` still marks them as null.
        s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    # Test the numeric major version; comparing the `sys.version` string is
    # lexicographic and gives the wrong answer for a two-digit major version.
    if t is not None and pa.types.is_string(t) and sys.version_info[0] < 3:
        # TODO: need decode before converting to Arrow in Python 2
        # TODO: don't need as of Arrow 0.9.1
        return pa.Array.from_pandas(
            s.apply(lambda v: v.decode("utf-8") if isinstance(v, str) else v),
            mask=mask, type=t)
    if t is not None and pa.types.is_decimal(t) and \
            LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
        return pa.Array.from_pandas(
            s.apply(lambda v: decimal.Decimal('NaN') if v is None else v),
            mask=mask, type=t)
    if LooseVersion(pa.__version__) < LooseVersion("0.11.0"):
        # TODO: see ARROW-1949. Remove when the minimum PyArrow version becomes 0.11.0.
        return pa.Array.from_pandas(s, mask=mask, type=t)
    # Remaining case: the conversion is requested with Arrow's safe check turned off.
    return pa.Array.from_pandas(s, mask=mask, type=t, safe=False)
def create_array(s, t):
    """
    Build an Arrow array of type `t` from the pandas.Series `s`.

    Entries flagged by ``s.isnull()`` are passed to Arrow as the mask. `timezone` is not
    defined here; it is read from the enclosing scope.

    :param s: the pandas.Series to convert
    :param t: the target Arrow data type, forwarded as ``type=t`` (may be None)
    :return: the Arrow array built from `s`
    """
    null_mask = s.isnull()
    wants_timestamp = t is not None and pa.types.is_timestamp(t)
    if not wants_timestamp:
        return pa.Array.from_pandas(s, mask=null_mask, type=t)

    # Ensure timestamp series are in expected form for Spark internal representation.
    # Nulls are filled with 0 for the conversion and stay null through `null_mask`.
    internal = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
    # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
    uncast = pa.Array.from_pandas(internal, mask=null_mask)
    return uncast.cast(t, safe=False)
def create_array(s, t):
    """
    Build an Arrow array of type `t` from the pandas.Series `s`.

    Entries flagged by ``s.isnull()`` are passed to Arrow as the mask. `timezone` is not
    defined here; it is read from the enclosing scope.

    :param s: the pandas.Series to convert
    :param t: the target Arrow data type, forwarded as ``type=t`` (may be None)
    :return: the Arrow array built from `s`
    """
    null_mask = s.isnull()
    wants_timestamp = t is not None and pa.types.is_timestamp(t)
    if not wants_timestamp:
        return pa.Array.from_pandas(s, mask=null_mask, type=t)

    # Ensure timestamp series are in expected form for Spark internal representation.
    # Nulls are filled with 0 for the conversion and stay null through `null_mask`.
    internal = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
    # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
    uncast = pa.Array.from_pandas(internal, mask=null_mask)
    return uncast.cast(t, safe=False)
def create_array(s, t):
    """
    Convert the pandas.Series `s` into an Arrow array of type `t`.

    Null entries of `s` (per ``s.isnull()``) are handed to Arrow as the mask. The name
    `timezone` is not defined in this function; it is read from the enclosing scope.

    :param s: the pandas.Series to convert
    :param t: the target Arrow data type; may be None, in which case it is forwarded to
        ``pa.Array.from_pandas`` as ``type=None``
    :return: the Arrow array built from `s`
    """
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    if t is not None and pa.types.is_timestamp(t):
        # Nulls are replaced by 0 for the conversion; `mask` still marks them as null.
        s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    # Test the numeric major version; comparing the `sys.version` string is
    # lexicographic and gives the wrong answer for a two-digit major version.
    if t is not None and pa.types.is_string(t) and sys.version_info[0] < 3:
        # TODO: need decode before converting to Arrow in Python 2
        return pa.Array.from_pandas(
            s.apply(lambda v: v.decode("utf-8") if isinstance(v, str) else v),
            mask=mask, type=t)
    return pa.Array.from_pandas(s, mask=mask, type=t)
def cast_series(s, t):
    """
    Cast the pandas.Series `s` to the representation matching the Arrow type `t`.

    :param s: the pandas.Series to cast
    :param t: the target Arrow data type, or None to leave `s` untouched
    :return: for a timestamp type, the series values as ``datetime64[us]``; for
        ``pa.date32()``, ``s.dt.date``; `s` itself when `t` is None or the dtype already
        matches ``t.to_pandas_dtype()``; otherwise `s` with nulls filled with 0 and cast
        to that dtype
    """
    if isinstance(t, pa.TimestampType):
        # NOTE: convert to 'us' with astype here, unit ignored in `from_pandas` see ARROW-1680
        internal = _check_series_convert_timestamps_internal(s.fillna(0))
        return internal.values.astype('datetime64[us]', copy=False)
    if t == pa.date32():
        # TODO: this converts the series to Python objects, possibly avoid with Arrow >= 0.8
        return s.dt.date
    if t is None:
        return s
    # Look the pandas dtype up once; it is needed for both the check and the cast.
    target_dtype = t.to_pandas_dtype()
    if s.dtype == target_dtype:
        return s
    return s.fillna(0).astype(target_dtype, copy=False)
def cast_series(s, t):
    """
    Cast the pandas.Series `s` to the representation matching the Arrow type `t`.

    :param s: the pandas.Series to cast
    :param t: the target Arrow data type, or None to leave `s` untouched
    :return: for a timestamp type, the series values as ``datetime64[us]``; for
        ``pa.date32()``, ``s.dt.date``; `s` itself when `t` is None or the dtype already
        matches ``t.to_pandas_dtype()``; otherwise `s` with nulls filled with 0 and cast
        to that dtype
    """
    if isinstance(t, pa.TimestampType):
        # NOTE: convert to 'us' with astype here, unit ignored in `from_pandas` see ARROW-1680
        internal = _check_series_convert_timestamps_internal(s.fillna(0))
        return internal.values.astype('datetime64[us]', copy=False)
    if t == pa.date32():
        # TODO: this converts the series to Python objects, possibly avoid with Arrow >= 0.8
        return s.dt.date
    if t is None:
        return s
    # Look the pandas dtype up once; it is needed for both the check and the cast.
    target_dtype = t.to_pandas_dtype()
    if s.dtype == target_dtype:
        return s
    return s.fillna(0).astype(target_dtype, copy=False)
def create_array(s, t):
    """
    Build an Arrow array of type `t` from the pandas.Series `s`.

    Entries flagged by ``s.isnull()`` are passed to Arrow as the mask. `self` is not a
    parameter; ``self._timezone`` and ``self._safecheck`` are read from the enclosing
    scope.

    :param s: the pandas.Series to convert
    :param t: the target Arrow data type, forwarded as ``type=t`` (may be None)
    :return: the Arrow array built from `s`
    :raises RuntimeError: when the conversion raises ``pa.ArrowException``
    """
    null_mask = s.isnull()
    series = s
    if t is not None and pa.types.is_timestamp(t):
        # Ensure timestamp series are in expected form for Spark internal representation
        series = _check_series_convert_timestamps_internal(series, self._timezone)
    try:
        return pa.Array.from_pandas(series, mask=null_mask, type=t, safe=self._safecheck)
    except pa.ArrowException as e:
        error_msg = ("Exception thrown when converting pandas.Series (%s) to Arrow "
                     "Array (%s). It can be caused by overflows or other unsafe "
                     "conversions warned by Arrow. Arrow safe type check can be "
                     "disabled by using SQL config "
                     "`spark.sql.execution.pandas.arrowSafeTypeConversion`.")
        # The original Arrow exception is attached as a second argument of the error.
        raise RuntimeError(error_msg % (series.dtype, t), e)
def create_array(s, t):
    """
    Convert the pandas.Series `s` into an Arrow array of type `t`.

    Null entries of `s` (per ``s.isnull()``) are handed to Arrow as the mask. The name
    `timezone` is not defined in this function; it is read from the enclosing scope.

    :param s: the pandas.Series to convert
    :param t: the target Arrow data type; may be None, in which case it is forwarded to
        ``pa.Array.from_pandas`` as ``type=None``
    :return: the Arrow array built from `s`
    """
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    if t is not None and pa.types.is_timestamp(t):
        # Nulls are replaced by 0 for the conversion; `mask` still marks them as null.
        s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    # Test the numeric major version; comparing the `sys.version` string is
    # lexicographic and gives the wrong answer for a two-digit major version.
    if t is not None and pa.types.is_string(t) and sys.version_info[0] < 3:
        # TODO: need decode before converting to Arrow in Python 2
        return pa.Array.from_pandas(
            s.apply(lambda v: v.decode("utf-8") if isinstance(v, str) else v),
            mask=mask, type=t)
    if t is not None and pa.types.is_decimal(t) and \
            LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
        return pa.Array.from_pandas(
            s.apply(lambda v: decimal.Decimal('NaN') if v is None else v),
            mask=mask, type=t)
    return pa.Array.from_pandas(s, mask=mask, type=t)
def create_array(s, t):
    """
    Build an Arrow array of type `t` from the pandas.Series `s`.

    Entries flagged by ``s.isnull()`` are passed to Arrow as the mask. `self` is not a
    parameter; ``self._timezone`` and ``self._safecheck`` are read from the enclosing
    scope.

    :param s: the pandas.Series to convert
    :param t: the target Arrow data type, forwarded as ``type=t`` (may be None)
    :return: the Arrow array built from `s`
    :raises RuntimeError: when the non-timestamp conversion raises ``pa.ArrowException``
    """
    null_mask = s.isnull()
    if t is not None and pa.types.is_timestamp(t):
        # Ensure timestamp series are in expected form for Spark internal representation.
        # Nulls are filled with 0 for the conversion and stay null through `null_mask`.
        internal = _check_series_convert_timestamps_internal(s.fillna(0), self._timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        uncast = pa.Array.from_pandas(internal, mask=null_mask)
        return uncast.cast(t, safe=False)
    try:
        return pa.Array.from_pandas(s, mask=null_mask, type=t, safe=self._safecheck)
    except pa.ArrowException as e:
        error_msg = ("Exception thrown when converting pandas.Series (%s) to Arrow "
                     "Array (%s). It can be caused by overflows or other unsafe "
                     "conversions warned by Arrow. Arrow safe type check can be "
                     "disabled by using SQL config "
                     "`spark.sql.execution.pandas.arrowSafeTypeConversion`.")
        # The original Arrow exception is attached as a second argument of the error.
        raise RuntimeError(error_msg % (s.dtype, t), e)
def create_array(s, t):
    """
    Convert the pandas.Series `s` into an Arrow array of type `t`.

    Null entries of `s` (per ``s.isnull()``) are handed to Arrow as the mask. `self` is
    not a parameter; ``self._timezone`` and ``self._safecheck`` are read from the
    enclosing scope.

    :param s: the pandas.Series to convert
    :param t: the target Arrow data type; may be None, in which case it is forwarded to
        ``pa.Array.from_pandas`` as ``type=None``
    :return: the Arrow array built from `s`
    :raises RuntimeError: when the final, safe-checked conversion raises
        ``pa.ArrowException``
    """
    mask = s.isnull()
    # Ensure timestamp series are in expected form for Spark internal representation
    # TODO: maybe don't need None check anymore as of Arrow 0.9.1
    if t is not None and pa.types.is_timestamp(t):
        # Nulls are replaced by 0 for the conversion; `mask` still marks them as null.
        s = _check_series_convert_timestamps_internal(s.fillna(0), self._timezone)
        # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
        return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
    # Test the numeric major version; comparing the `sys.version` string is
    # lexicographic and gives the wrong answer for a two-digit major version.
    if t is not None and pa.types.is_string(t) and sys.version_info[0] < 3:
        # TODO: need decode before converting to Arrow in Python 2
        # TODO: don't need as of Arrow 0.9.1
        return pa.Array.from_pandas(
            s.apply(lambda v: v.decode("utf-8") if isinstance(v, str) else v),
            mask=mask, type=t)
    if t is not None and pa.types.is_decimal(t) and \
            LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
        return pa.Array.from_pandas(
            s.apply(lambda v: decimal.Decimal('NaN') if v is None else v),
            mask=mask, type=t)
    if LooseVersion(pa.__version__) < LooseVersion("0.11.0"):
        # TODO: see ARROW-1949. Remove when the minimum PyArrow version becomes 0.11.0.
        return pa.Array.from_pandas(s, mask=mask, type=t)
    try:
        array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)
    except pa.ArrowException as e:
        error_msg = ("Exception thrown when converting pandas.Series (%s) to Arrow "
                     "Array (%s). It can be caused by overflows or other unsafe "
                     "conversions warned by Arrow. Arrow safe type check can be "
                     "disabled by using SQL config "
                     "`spark.sql.execution.pandas.arrowSafeTypeConversion`.")
        # The original Arrow exception is attached as a second argument of the error.
        raise RuntimeError(error_msg % (s.dtype, t), e)
    return array