def from_pandas(pdf: pd.DataFrame) -> '_InternalFrame':
    """ Create an immutable DataFrame from pandas DataFrame.

    :param pdf: :class:`pd.DataFrame`
    :return: the created immutable DataFrame
    """
    pdf_columns = pdf.columns
    data_columns = [name_like_string(label) for label in pdf_columns]
    # Only a pandas MultiIndex on the columns carries tuple labels worth keeping.
    column_index = pdf_columns.tolist() if isinstance(pdf_columns, pd.MultiIndex) else None
    column_index_names = pdf_columns.names

    def _as_tuple(name):
        # Index names are stored as tuples (or None) inside the index map.
        if name is None or isinstance(name, tuple):
            return name
        return (name, )

    index = pdf.index
    index_map = []  # type: List[IndexMap]
    if isinstance(index, pd.MultiIndex):
        if index.names is None:
            # Fully anonymous levels: generate positional column names, keep no name.
            for level in range(len(index.levels)):
                index_map.append((SPARK_INDEX_NAME_FORMAT(level), None))
        else:
            for level, name in enumerate(index.names):
                spark_name = (SPARK_INDEX_NAME_FORMAT(level) if name is None
                              else name_like_string(name))
                index_map.append((spark_name, _as_tuple(name)))
    else:
        name = index.name
        spark_name = (SPARK_INDEX_NAME_FORMAT(0) if name is None
                      else name_like_string(name))
        index_map.append((spark_name, _as_tuple(name)))

    index_columns = [spark_column for spark_column, _ in index_map]

    reset_index = pdf.reset_index()
    reset_index.columns = index_columns + data_columns
    fields = []
    for field_name, series in reset_index.iteritems():
        fields.append(
            StructField(name_like_string(field_name),
                        infer_pd_series_spark_type(series),
                        nullable=bool(series.isnull().any())))
    schema = StructType(fields)
    for field_name, series in reset_index.iteritems():
        dtype = series.dtype
        if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
            continue
        # Spark expects None (not NaN) for missing values in non-datetime columns.
        reset_index[field_name] = series.replace({np.nan: None})
    sdf = default_session().createDataFrame(reset_index, schema=schema)
    return _InternalFrame(
        sdf=sdf,
        index_map=index_map,
        column_index=column_index,
        column_scols=[scol_for(sdf, col) for col in data_columns],
        column_index_names=column_index_names)
def _init_from_pandas(self, pdf, *args):
    """Initialize this frame by converting a pandas DataFrame into a Spark DataFrame.

    The pandas index is flattened into regular columns (tracked by ``metadata``),
    NaNs are converted to None for Spark compatibility, and the resulting Spark
    DataFrame plus metadata are handed to ``_init_from_spark``.

    :param pdf: the source :class:`pd.DataFrame`
    :param args: ignored; accepted for signature compatibility
    """
    metadata = Metadata.from_pandas(pdf)
    reset_index = pdf.reset_index()
    reset_index.columns = metadata.all_fields
    fields = []
    for field_name, series in reset_index.iteritems():
        fields.append(
            StructField(field_name,
                        infer_pd_series_spark_type(series),
                        nullable=bool(series.isnull().any())))
    schema = StructType(fields)
    for field_name, series in reset_index.iteritems():
        dtype = series.dtype
        if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
            continue
        # Spark expects None (not NaN) for missing values in non-datetime columns.
        reset_index[field_name] = series.replace({np.nan: None})
    sdf = default_session().createDataFrame(reset_index, schema=schema)
    self._init_from_spark(sdf, metadata)
def from_pandas(pdf: pd.DataFrame) -> "InternalFrame":
    """ Create an immutable DataFrame from pandas DataFrame.

    :param pdf: :class:`pd.DataFrame`
    :return: the created immutable DataFrame
    """

    def _as_label(name):
        # Column/index names are stored as tuples (or None) internally.
        if name is None or isinstance(name, tuple):
            return name
        return (name, )

    pdf_columns = pdf.columns
    data_columns = [name_like_string(label) for label in pdf_columns]
    if isinstance(pdf_columns, pd.MultiIndex):
        column_labels = pdf_columns.tolist()
    else:
        column_labels = [(label, ) for label in pdf_columns]
    column_label_names = [_as_label(name) for name in pdf_columns.names]

    index_names = [_as_label(name) for name in pdf.index.names]
    # Index levels always get generated positional Spark column names.
    index_columns = [SPARK_INDEX_NAME_FORMAT(level) for level in range(len(index_names))]

    # Work on a copy so the caller's DataFrame is left untouched.
    pdf = pdf.copy()
    pdf.index.names = index_columns
    reset_index = pdf.reset_index()
    reset_index.columns = index_columns + data_columns

    fields = []
    for field_name, series in reset_index.iteritems():
        fields.append(
            StructField(
                field_name,
                infer_pd_series_spark_type(series),
                nullable=bool(series.isnull().any()),
            ))
    schema = StructType(fields)
    for field_name, series in reset_index.iteritems():
        dtype = series.dtype
        if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
            continue
        # Spark expects None (not NaN) for missing values in non-datetime columns.
        reset_index[field_name] = series.replace({np.nan: None})

    sdf = default_session().createDataFrame(reset_index, schema=schema)
    return InternalFrame(
        spark_frame=sdf,
        index_spark_columns=[scol_for(sdf, col) for col in index_columns],
        index_names=index_names,
        column_labels=column_labels,
        data_spark_columns=[scol_for(sdf, col) for col in data_columns],
        column_label_names=column_label_names,
    )
def from_pandas(pdf: pd.DataFrame) -> '_InternalFrame':
    """ Create an immutable DataFrame from pandas DataFrame.

    :param pdf: :class:`pd.DataFrame`
    :return: the created immutable DataFrame
    """
    data_columns = [str(label) for label in pdf.columns]

    index = pdf.index
    index_map = []  # type: List[IndexMap]
    if isinstance(index, pd.MultiIndex):
        if index.names is None:
            # Fully anonymous levels: generate positional column names, keep no name.
            for level in range(len(index.levels)):
                index_map.append(('__index_level_{}__'.format(level), None))
        else:
            for level, name in enumerate(index.names):
                spark_column = ('__index_level_{}__'.format(level) if name is None
                                else name)
                index_map.append((spark_column, name))
    else:
        spark_column = index.name if index.name is not None else '__index_level_0__'
        index_map.append((spark_column, index.name))
    index_columns = [spark_column for spark_column, _ in index_map]

    reset_index = pdf.reset_index()
    reset_index.columns = index_columns + data_columns
    fields = []
    for field_name, series in reset_index.iteritems():
        fields.append(
            StructField(field_name,
                        infer_pd_series_spark_type(series),
                        nullable=bool(series.isnull().any())))
    schema = StructType(fields)
    for field_name, series in reset_index.iteritems():
        dtype = series.dtype
        if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
            continue
        # Spark expects None (not NaN) for missing values in non-datetime columns.
        reset_index[field_name] = series.replace({np.nan: None})
    sdf = default_session().createDataFrame(reset_index, schema=schema)
    return _InternalFrame(sdf=sdf, index_map=index_map, data_columns=data_columns)