Beispiel #1
0
def _get_columns_to_convert(df, schema, preserve_index, columns):
    """
    Collect the data columns and index levels of ``df`` that have to be
    converted, together with the names under which they are stored.

    When ``schema`` is given, all the work is delegated to
    ``_get_columns_to_convert_given_schema``.

    Returns
    -------
    tuple of 7 elements
        all_names : names of every column in the resulting table, i.e. the
            data columns followed by the serialized index columns
        column_names : names of the data columns
        index_column_names : names of the serialized index columns
        index_descriptors : one entry per index level, used for
            reconstruction; either the serialized column name or, for a
            RangeIndex with ``preserve_index=None``, the result of
            ``_get_range_index_descriptor``
        index_levels : the extracted index level values (empty when
            ``preserve_index`` is False)
        columns_to_convert : the raw data (data columns first, then the
            serialized index levels) to be converted
        convert_fields : per-column field to use for coercion / casting;
            all ``None`` on this code path since no schema was provided

    Raises
    ------
    ValueError
        If ``df`` has duplicate column names.
    TypeError
        If one of the selected columns holds sparse data.
    """
    columns = _resolve_columns_of_interest(df, schema, columns)

    if not df.columns.is_unique:
        raise ValueError(
            'Duplicate column names found: {}'.format(list(df.columns)))

    if schema is not None:
        return _get_columns_to_convert_given_schema(df, schema, preserve_index)

    # Index levels are only looked at when the index is not explicitly
    # dropped by the caller.
    if preserve_index is False:
        index_levels = []
    else:
        index_levels = _get_index_level_values(df.index)

    # Data columns first.
    column_names = []
    columns_to_convert = []
    for key in columns:
        series = df[key]
        label = _column_name_to_strings(key)

        if _pandas_api.is_sparse(series):
            raise TypeError(
                "Sparse pandas data (column {}) not supported.".format(label))

        columns_to_convert.append(series)
        column_names.append(label)

    # Then the index levels. A RangeIndex is kept as a descriptor only
    # (no serialized column) when preserve_index was left at None.
    index_descriptors = []
    index_column_names = []
    for position, level in enumerate(index_levels):
        label = _index_level_name(level, position, column_names)

        if (isinstance(level, _pandas_api.pd.RangeIndex)
                and preserve_index is None):
            index_descriptors.append(_get_range_index_descriptor(level))
            continue

        columns_to_convert.append(level)
        index_column_names.append(label)
        index_descriptors.append(label)

    # Without a schema there is nothing to coerce to: one None per
    # converted column.
    convert_fields = [None] * len(columns_to_convert)
    all_names = column_names + index_column_names

    return (all_names, column_names, index_column_names, index_descriptors,
            index_levels, columns_to_convert, convert_fields)
Beispiel #2
0
def _get_columns_to_convert_given_schema(df, schema, preserve_index):
    """
    Schema-driven variant of _get_columns_to_convert.

    The names in ``schema`` decide which columns are picked and in which
    order. Each name is looked up in the columns of ``df`` first; if it is
    not a column and ``preserve_index`` is not False, it is looked up among
    the index level names.

    Returns the same 7-tuple layout as _get_columns_to_convert:
    (all_names, column_names, index_column_names, index_descriptors,
    index_levels, columns_to_convert, convert_fields), where
    ``convert_fields`` holds the schema field of every selected column.

    Note that a name resolved from the index is recorded in both
    ``column_names`` and ``index_column_names``, so it occurs twice in
    ``all_names``.

    Raises
    ------
    KeyError
        If a schema name is neither a column nor a usable index level.
    ValueError
        If a schema name refers to a RangeIndex while ``preserve_index`` is
        None.
    TypeError
        If a selected column holds sparse data.
    """
    column_names = []
    columns_to_convert = []
    convert_fields = []
    index_column_names = []
    index_descriptors = []
    index_levels = []

    for key in schema.names:
        try:
            values = df[key]
        except KeyError:
            # Not a data column: fall back to the index levels, unless the
            # index was explicitly excluded.
            if preserve_index is False or key not in df.index.names:
                raise KeyError(
                    "name '{}' present in the specified schema is not found "
                    "in the columns or index".format(key))

            values = df.index.get_level_values(key)
            if (preserve_index is None
                    and isinstance(values, _pandas_api.pd.RangeIndex)):
                raise ValueError(
                    "name '{}' is present in the schema, but it is a "
                    "RangeIndex which will not be converted as a column "
                    "in the Table, but saved as metadata-only not in "
                    "columns. Specify 'preserve_index=True' to force it "
                    "being added as a column, or remove it from the "
                    "specified schema".format(key))
            from_index = True
        else:
            from_index = False

        label = _column_name_to_strings(key)

        if _pandas_api.is_sparse(values):
            raise TypeError(
                "Sparse pandas data (column {}) not supported.".format(label))

        target_field = schema.field(label)
        columns_to_convert.append(values)
        convert_fields.append(target_field)
        column_names.append(label)

        if from_index:
            index_column_names.append(label)
            index_descriptors.append(label)
            index_levels.append(values)

    all_names = column_names + index_column_names

    return (all_names, column_names, index_column_names, index_descriptors,
            index_levels, columns_to_convert, convert_fields)