def series2col(s, name):
    """Convert a pandas Series into an ``fpb.Column`` of kind ``SLICE``.

    The column ``dtype`` and the value field that is filled (``ints``,
    ``floats``, ``strings``, ``bools`` or ``times``) are selected from
    ``s.dtype``.

    Args:
        s: Series holding the column values.
        name: Name stored on the resulting column.

    Returns:
        The ``fpb.Column`` built from the series.

    Raises:
        WriteError: If ``s.dtype`` is not one of the supported types.
    """
    kw = {
        'name': name,
        'kind': fpb.Column.SLICE,
    }

    if is_integer(s.dtype):
        kw['dtype'] = fpb.INTEGER
        kw['ints'] = s
    # Compare against the builtin ``object``: the ``np.object`` alias was
    # removed in NumPy 1.24 and raises AttributeError there.
    elif s.dtype == object:  # Pandas dtype for str is object
        kw['strings'] = s
        kw['dtype'] = fpb.STRING
    elif is_float(s.dtype):
        kw['dtype'] = fpb.FLOAT
        kw['floats'] = s
    elif is_bool(s.dtype):
        kw['bools'] = s
        kw['dtype'] = fpb.BOOLEAN
    elif is_datetime(s.dtype):
        if s.dt.tz:
            # Timezone-aware values are moved to UTC before being turned
            # into integers.
            try:
                s = s.dt.tz_localize(pytz.UTC)
            except TypeError:
                # tz_localize rejects data that already carries a timezone;
                # convert it instead.
                s = s.dt.tz_convert('UTC')
        kw['times'] = s.astype(np.int64)
        kw['dtype'] = fpb.TIME
    elif is_categorical_dtype(s.dtype):
        # We assume categorical data is strings
        kw['strings'] = s.astype(str)
        kw['dtype'] = fpb.STRING
    else:
        raise WriteError('{} - unsupported type - {}'.format(s.name, s.dtype))

    return fpb.Column(**kw)
def get_actual_types(df):
    """Determine the fpb type constant of every column in ``df``.

    Columns whose dtype is classified by ``is_string`` are resolved by
    inspecting their first non-null value; every other column is resolved
    from its dtype alone.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        A dict mapping each column name to an ``fpb`` type constant.

    Raises:
        WriteError: If a column dtype is unsupported, or if the first
            non-null value of a string-classified column is not a ``str``,
            ``bool``, ``pd.Timestamp`` or ``datetime``.
    """
    types_by_column = {}
    for label in df.columns:
        series = df[label]
        dtype = series.dtype

        if is_integer(dtype):
            resolved = fpb.INTEGER
        elif is_float(dtype):
            resolved = fpb.FLOAT
        elif is_string(dtype):
            # The first non-null value decides the type of the whole column.
            for value in series:
                if pd.isnull(value):
                    continue
                if isinstance(value, str):
                    resolved = fpb.STRING
                elif isinstance(value, bool):
                    resolved = fpb.BOOLEAN
                elif isinstance(value, (pd.Timestamp, datetime)):
                    resolved = fpb.TIME
                else:
                    raise WriteError(
                        '{} - contains an unsupported value type - {}'.format(
                            label, type(value)))
                break
            else:
                # Every value is null, so there is nothing to infer a type
                # from; the column is marked as NULL.
                resolved = fpb.NULL
        elif is_bool(dtype):
            resolved = fpb.BOOLEAN
        elif is_datetime(dtype):
            resolved = fpb.TIME
        elif is_categorical_dtype(dtype):
            # We assume categorical data is strings
            resolved = fpb.STRING
        else:
            raise WriteError('{} - unsupported type - {}'.format(
                label, dtype))

        types_by_column[series.name] = resolved

    return types_by_column
def _infer_object_dtype(arr): # TODO: accelerate with Cython/C BOOLEAN, STRING = 0, 1 state = BOOLEAN avalues = arr.values if isinstance(arr, pd.Series) else arr nulls = pd.isnull(avalues) if nulls.any(): for i in compat.range(len(avalues)): if state == BOOLEAN: if not nulls[i] and not pdcom.is_bool(avalues[i]): state = STRING elif state == STRING: break if state == BOOLEAN: return 'boolean' elif state == STRING: return 'string' else: return infer_dtype(avalues)
def check_if_categorical(feature):
    """Report whether ``feature`` passes any of three type predicates.

    The predicates are tried in order (``is_categorical``,
    ``is_string_dtype``, ``is_bool``) and the first truthy result is
    returned; if none is truthy, the result of ``is_bool`` is returned.

    Args:
        feature: The object handed to each predicate.

    Returns:
        The first truthy predicate result, or the last predicate's result.
    """
    # NOTE(review): the three predicates are defined outside this block;
    # confirm that is_bool is meant to receive the same kind of object as
    # the other two.
    verdict = is_categorical(feature)
    if not verdict:
        verdict = is_string_dtype(feature)
    if not verdict:
        verdict = is_bool(feature)
    return verdict
def make_nums_from_bool(df):
    """Convert the True/False boolean columns of ``df`` to 1/0, in place.

    Every column for which ``is_bool`` is truthy is overwritten with the
    integer result of comparing it elementwise to ``True``. Nothing is
    returned; ``df`` itself is modified.

    Args:
        df: DataFrame to modify.
    """
    # NOTE(review): is_bool is defined outside this block and receives a
    # whole column here; confirm it is a column-level check.
    for column_name, column in df.items():
        if not is_bool(column):
            continue
        # Elementwise comparison, not a truthiness test, hence "== True".
        as_flags = df[column_name] == True  # noqa: E712
        df[column_name] = as_flags.astype(int)