Ejemplo n.º 1
0
    def from_pandas(pdf: pd.DataFrame) -> '_InternalFrame':
        """ Create an immutable DataFrame from pandas DataFrame.

        The index of ``pdf`` is moved into ordinary columns via
        ``reset_index()``, a Spark schema is built from the resulting columns,
        and the frame is passed to ``createDataFrame`` on the default session.

        :param pdf: :class:`pd.DataFrame`
        :return: the created immutable DataFrame
        """
        columns = pdf.columns
        data_columns = [name_like_string(col) for col in columns]
        # Explicit column labels are only kept for MultiIndex columns.
        column_index = columns.tolist() if isinstance(columns, pd.MultiIndex) else None
        column_index_names = columns.names

        # ``Index.names`` is a one-element list for a flat index, so flat and
        # multi-level indexes can share the same per-level handling.
        level_names = pdf.index.names
        if level_names is None:
            # Defensive fallback: treat every level as unnamed.
            level_names = [None] * pdf.index.nlevels

        index_map = []  # type: List[IndexMap]
        for i, name in enumerate(level_names):
            # Unnamed levels get a generated column name keyed by position.
            if name is None:
                index_column = SPARK_INDEX_NAME_FORMAT(i)
            else:
                index_column = name_like_string(name)
            # The stored label is None, or a tuple (non-tuple names are wrapped).
            if name is None or isinstance(name, tuple):
                label = name
            else:
                label = (name, )
            index_map.append((index_column, label))

        index_columns = [index_column for index_column, _ in index_map]

        reset_index = pdf.reset_index()
        reset_index.columns = index_columns + data_columns

        # The schema is derived before NaN values are replaced below.
        # ``items()`` is used because ``iteritems()`` was removed in pandas 2.0.
        schema = StructType([
            StructField(name_like_string(name),
                        infer_pd_series_spark_type(col),
                        nullable=bool(col.isnull().any()))
            for name, col in reset_index.items()
        ])

        # Replace NaN with None in every column that is not datetime-typed.
        # NOTE(review): presumably so Spark receives nulls rather than NaN - confirm.
        for name, col in reset_index.items():
            dt = col.dtype
            if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                continue
            reset_index[name] = col.replace({np.nan: None})

        sdf = default_session().createDataFrame(reset_index, schema=schema)
        return _InternalFrame(
            sdf=sdf,
            index_map=index_map,
            column_index=column_index,
            column_scols=[scol_for(sdf, col) for col in data_columns],
            column_index_names=column_index_names)
Ejemplo n.º 2
0
 def _init_from_pandas(self, pdf, *args):
     """Initialise this object from a pandas DataFrame.

     The index of ``pdf`` is moved into ordinary columns, the columns are
     renamed to ``metadata.all_fields``, and the result is converted to a
     Spark DataFrame that is passed on to ``_init_from_spark``.

     :param pdf: the pandas DataFrame to convert
     :param args: accepted but not used by this method
     """
     metadata = Metadata.from_pandas(pdf)
     reset_index = pdf.reset_index()
     reset_index.columns = metadata.all_fields

     # The schema is derived before NaN values are replaced below.
     # ``items()`` is used because ``iteritems()`` was removed in pandas 2.0.
     schema = StructType([StructField(name, infer_pd_series_spark_type(col),
                                      nullable=bool(col.isnull().any()))
                          for name, col in reset_index.items()])

     # Replace NaN with None in every column that is not datetime-typed.
     for name, col in reset_index.items():
         dt = col.dtype
         if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
             continue
         reset_index[name] = col.replace({np.nan: None})

     sdf = default_session().createDataFrame(reset_index, schema=schema)
     self._init_from_spark(sdf, metadata)
Ejemplo n.º 3
0
    def from_pandas(pdf: pd.DataFrame) -> "InternalFrame":
        """ Create an immutable DataFrame from pandas DataFrame.

        Every index level is renamed to a generated column name, the index is
        moved into ordinary columns, and the result is converted to a Spark
        DataFrame on the default session. The original index names and column
        labels are passed to ``InternalFrame`` separately.

        :param pdf: :class:`pd.DataFrame`
        :return: the created immutable DataFrame
        """
        columns = pdf.columns
        data_columns = [name_like_string(col) for col in columns]
        # Column labels are always tuples: MultiIndex entries already are,
        # flat column names are wrapped in a 1-tuple.
        if isinstance(columns, pd.MultiIndex):
            column_labels = columns.tolist()
        else:
            column_labels = [(col, ) for col in columns]
        column_label_names = [
            name if name is None or isinstance(name, tuple) else (name, )
            for name in columns.names
        ]

        index_names = [
            name if name is None or isinstance(name, tuple) else (name, )
            for name in pdf.index.names
        ]
        index_columns = [
            SPARK_INDEX_NAME_FORMAT(i) for i in range(len(index_names))
        ]

        # Rename the index levels on a copy so the caller's frame is untouched.
        pdf = pdf.copy()
        pdf.index.names = index_columns
        reset_index = pdf.reset_index()
        reset_index.columns = index_columns + data_columns

        # The schema is derived before NaN values are replaced below.
        # ``items()`` is used because ``iteritems()`` was removed in pandas 2.0.
        schema = StructType([
            StructField(
                name,
                infer_pd_series_spark_type(col),
                nullable=bool(col.isnull().any()),
            ) for name, col in reset_index.items()
        ])

        # Replace NaN with None in every column that is not datetime-typed.
        for name, col in reset_index.items():
            dt = col.dtype
            if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                continue
            reset_index[name] = col.replace({np.nan: None})

        sdf = default_session().createDataFrame(reset_index, schema=schema)
        return InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[scol_for(sdf, col) for col in index_columns],
            index_names=index_names,
            column_labels=column_labels,
            data_spark_columns=[scol_for(sdf, col) for col in data_columns],
            column_label_names=column_label_names,
        )
Ejemplo n.º 4
0
    def from_pandas(pdf: pd.DataFrame) -> '_InternalFrame':
        """ Create an immutable DataFrame from pandas DataFrame.

        The index of ``pdf`` is moved into ordinary columns via
        ``reset_index()``, a Spark schema is built from the resulting columns,
        and the frame is passed to ``createDataFrame`` on the default session.

        :param pdf: :class:`pd.DataFrame`
        :return: the created immutable DataFrame
        """
        data_columns = [str(col) for col in pdf.columns]

        # ``Index.names`` is a one-element list for a flat index, so flat and
        # multi-level indexes can share the same per-level handling.
        level_names = pdf.index.names
        if level_names is None:
            # Defensive fallback: treat every level as unnamed.
            level_names = [None] * pdf.index.nlevels

        index_map = []  # type: List[IndexMap]
        for i, name in enumerate(level_names):
            # Unnamed levels get a generated column name keyed by position;
            # the original name (possibly None) is kept as the second element.
            if name is None:
                index_column = '__index_level_{}__'.format(i)
            else:
                index_column = name
            index_map.append((index_column, name))

        index_columns = [index_column for index_column, _ in index_map]

        reset_index = pdf.reset_index()
        reset_index.columns = index_columns + data_columns

        # The schema is derived before NaN values are replaced below.
        # ``items()`` is used because ``iteritems()`` was removed in pandas 2.0.
        schema = StructType([
            StructField(name,
                        infer_pd_series_spark_type(col),
                        nullable=bool(col.isnull().any()))
            for name, col in reset_index.items()
        ])

        # Replace NaN with None in every column that is not datetime-typed.
        for name, col in reset_index.items():
            dt = col.dtype
            if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                continue
            reset_index[name] = col.replace({np.nan: None})

        sdf = default_session().createDataFrame(reset_index, schema=schema)
        return _InternalFrame(sdf=sdf,
                              index_map=index_map,
                              data_columns=data_columns)