Beispiel #1
0
def series2col(s, name):
    """Build an ``fpb.Column`` of kind ``SLICE`` from a pandas Series.

    The column's ``dtype`` and the payload field that receives the data
    (``ints``, ``floats``, ``strings``, ``bools`` or ``times``) are chosen
    from the dtype of ``s``.

    :param s: the Series holding the column values
    :param name: name stored in the resulting column
    :return: the populated ``fpb.Column``
    :raises WriteError: if the dtype of ``s`` matches none of the supported
        kinds
    """
    kw = {
        'name': name,
        'kind': fpb.Column.SLICE,
    }

    if is_integer(s.dtype):
        kw['dtype'] = fpb.INTEGER
        kw['ints'] = s
    elif is_float(s.dtype):
        kw['dtype'] = fpb.FLOAT
        kw['floats'] = s
    # Pandas dtype for str is object. The ``np.object`` alias was removed
    # from NumPy; it was only ever an alias for the builtin ``object``.
    elif s.dtype == object:
        kw['strings'] = s
        kw['dtype'] = fpb.STRING
    elif is_bool(s.dtype):
        kw['bools'] = s
        kw['dtype'] = fpb.BOOLEAN
    elif is_datetime(s.dtype):
        if s.dt.tz is not None:
            # tz_localize() raises TypeError on data that is already
            # tz-aware, so conversion is the only applicable operation.
            s = s.dt.tz_convert('UTC')
        kw['times'] = s.astype(np.int64)
        kw['dtype'] = fpb.TIME
    elif is_categorical_dtype(s.dtype):
        # We assume categorical data is strings
        kw['strings'] = s.astype(str)
        kw['dtype'] = fpb.STRING
    else:
        raise WriteError('{} - unsupported type - {}'.format(s.name, s.dtype))

    return fpb.Column(**kw)
Beispiel #2
0
    def _check_op(self, s, op_name, other, exc=None):
        op = self.get_op_from_name(op_name)
        result = op(s, other)

        # compute expected
        mask = s.isna()

        # other array is an Integer
        if isinstance(other, IntegerArray):
            omask = getattr(other, 'mask', None)
            mask = getattr(other, 'data', other)
            if omask is not None:
                mask |= omask

        # float result type or float op
        if ((is_float_dtype(other) or is_float(other) or
             op_name in ['__rtruediv__', '__truediv__',
                         '__rdiv__', '__div__'])):
            rs = s.astype('float')
            expected = op(rs, other)
            self._check_op_float(result, expected, mask, s, op_name, other)

        # integer result type
        else:
            rs = pd.Series(s.values._data)
            expected = op(rs, other)
            self._check_op_integer(result, expected, mask, s, op_name, other)
Beispiel #3
0
def infer_dtype_by_scaladata(data):
    """Return the ``DataType`` member that matches a scalar value.

    Python and NumPy scalars are mapped by their type; anything not
    recognised yields ``DataType.UNKNOWN``.

    :param data: the scalar value to classify
    :return: a ``DataType`` member
    """
    # Order matters: ``bool`` is a subclass of ``int`` and must be tested
    # before it. ``np.float64`` is a subclass of ``float``.
    if isinstance(data, (float, np.float64)):
        return DataType.DOUBLE
    # ``np.bool8`` was only an alias of ``np.bool_`` and no longer exists in
    # current NumPy, so it must not be referenced here.
    if isinstance(data, (bool, np.bool_)):
        return DataType.BOOL
    if isinstance(data, (int, np.int64)):
        return DataType.INT64
    if isinstance(data, str):
        return DataType.STRING
    if isinstance(data, np.float32):
        return DataType.FLOAT
    if isinstance(data, np.int32):
        return DataType.INT32
    if isinstance(data, np.int16):
        return DataType.INT16
    if isinstance(data, np.int8):
        return DataType.INT8
    if isinstance(data, bytes):
        return DataType.BINARY_VECTOR
    # Last resort for float-like scalars not matched above.
    if is_float(data):
        return DataType.DOUBLE

    return DataType.UNKNOWN
Beispiel #4
0
 def default_display_func(x):
     """Format a single value for display.

     Reads ``self.na_rep`` and ``self.precision`` from the enclosing scope.
     Missing values are replaced by ``self.na_rep`` when it is set, floats
     and integers are rendered with the locale-aware ``n`` format, and any
     other value is returned unchanged.
     """
     if self.na_rep is not None and pd.isna(x):
         return self.na_rep
     elif is_float(x):
         try:
             # ``n`` counts significant digits, so the digits of the integer
             # part are added to the precision. abs() keeps the minus sign
             # from being counted as a digit.
             integer_digits = len(str(abs(int(x))))
         except (ValueError, OverflowError):
             # NaN and +/-inf have no integer part to measure.
             return f"{x:n}"
         n_precision = integer_digits + self.precision
         return f"{x:.{n_precision}n}"
     elif is_integer(x):
         return f"{x:n}"
     else:
         return x
Beispiel #5
0
def get_actual_types(df):
    """Map every column name of ``df`` to an ``fpb`` type constant.

    For columns matched by ``is_string`` the type is decided by the first
    non-null value; such a column with no non-null value is reported as
    ``fpb.NULL``.

    :param df: the DataFrame to inspect
    :return: dict of column name to ``fpb`` type constant
    :raises WriteError: if a column dtype, or the first non-null value of a
        column matched by ``is_string``, is of an unsupported type
    """
    column_types = {}

    for col_name in df.columns:
        col = df[col_name]
        dtype = col.dtype

        if is_integer(dtype):
            actual = fpb.INTEGER
        elif is_float(dtype):
            actual = fpb.FLOAT
        elif is_string(dtype):
            # Only the first non-null value is examined.
            sample = next((x for x in col if not pd.isnull(x)), None)
            if sample is None:
                # Nothing but nulls: there is no value to infer a type from.
                actual = fpb.NULL
            elif isinstance(sample, str):
                actual = fpb.STRING
            elif isinstance(sample, bool):
                actual = fpb.BOOLEAN
            elif isinstance(sample, (pd.Timestamp, datetime)):
                actual = fpb.TIME
            else:
                raise WriteError(
                    '{} - contains an unsupported value type - {}'.format(
                        col_name, type(sample)))
        elif is_bool(dtype):
            actual = fpb.BOOLEAN
        elif is_datetime(dtype):
            actual = fpb.TIME
        elif is_categorical_dtype(dtype):
            # We assume categorical data is strings
            actual = fpb.STRING
        else:
            raise WriteError('{} - unsupported type - {}'.format(
                col_name, dtype))

        column_types[col.name] = actual

    return column_types
Beispiel #6
0
    def _check_op(self, s, op_name, other, exc=None):
        op = self.get_op_from_name(op_name)
        result = op(s, other)

        # compute expected
        mask = s.isna()

        # if s is a DataFrame, squeeze to a Series
        # for comparison
        if isinstance(s, pd.DataFrame):
            result = result.squeeze()
            s = s.squeeze()
            mask = mask.squeeze()

        # other array is an Integer
        if isinstance(other, IntegerArray):
            omask = getattr(other, "mask", None)
            mask = getattr(other, "data", other)
            if omask is not None:
                mask |= omask

        # 1 ** na is na, so need to unmask those
        if op_name == "__pow__":
            mask = np.where(~s.isna() & (s == 1), False, mask)

        elif op_name == "__rpow__":
            other_is_one = other == 1
            if isinstance(other_is_one, pd.Series):
                other_is_one = other_is_one.fillna(False)
            mask = np.where(other_is_one, False, mask)

        # float result type or float op
        if (
            is_float_dtype(other)
            or is_float(other)
            or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"]
        ):
            rs = s.astype("float")
            expected = op(rs, other)
            self._check_op_float(result, expected, mask, s, op_name, other)

        # integer result type
        else:
            rs = pd.Series(s.values._data, name=s.name)
            expected = op(rs, other)
            self._check_op_integer(result, expected, mask, s, op_name, other)
Beispiel #7
0
    def _check_op(self, s, op_name, other, exc=None):
        op = self.get_op_from_name(op_name)
        result = op(s, other)

        # compute expected
        mask = s.isna()

        # if s is a DataFrame, squeeze to a Series
        # for comparison
        if isinstance(s, pd.DataFrame):
            result = result.squeeze()
            s = s.squeeze()
            mask = mask.squeeze()

        # other array is an Integer
        if isinstance(other, IntegerArray):
            omask = getattr(other, 'mask', None)
            mask = getattr(other, 'data', other)
            if omask is not None:
                mask |= omask

        # 1 ** na is na, so need to unmask those
        if op_name == '__pow__':
            mask = np.where(s == 1, False, mask)

        elif op_name == '__rpow__':
            mask = np.where(other == 1, False, mask)

        # float result type or float op
        if ((is_float_dtype(other) or is_float(other) or
             op_name in ['__rtruediv__', '__truediv__',
                         '__rdiv__', '__div__'])):
            rs = s.astype('float')
            expected = op(rs, other)
            self._check_op_float(result, expected, mask, s, op_name, other)

        # integer result type
        else:
            rs = pd.Series(s.values._data)
            expected = op(rs, other)
            self._check_op_integer(result, expected, mask, s, op_name, other)
Beispiel #8
0
    def _check_op(self, s, op_name, other, exc=None):
        op = self.get_op_from_name(op_name)
        result = op(s, other)

        # compute expected
        mask = s.isna()

        # if s is a DataFrame, squeeze to a Series
        # for comparison
        if isinstance(s, pd.DataFrame):
            result = result.squeeze()
            s = s.squeeze()
            mask = mask.squeeze()

        # other array is an Integer
        if isinstance(other, IntegerArray):
            omask = getattr(other, 'mask', None)
            mask = getattr(other, 'data', other)
            if omask is not None:
                mask |= omask

        # 1 ** na is na, so need to unmask those
        if op_name == '__pow__':
            mask = np.where(s == 1, False, mask)

        elif op_name == '__rpow__':
            mask = np.where(other == 1, False, mask)

        # float result type or float op
        if ((is_float_dtype(other) or is_float(other) or
             op_name in ['__rtruediv__', '__truediv__',
                         '__rdiv__', '__div__'])):
            rs = s.astype('float')
            expected = op(rs, other)
            self._check_op_float(result, expected, mask, s, op_name, other)

        # integer result type
        else:
            rs = pd.Series(s.values._data)
            expected = op(rs, other)
            self._check_op_integer(result, expected, mask, s, op_name, other)