def _get_value_hash(value: Union[str, int, bool]) -> int:
    """Compute the integer hash of a single bucketing-column value.

    Integers inside the signed 32-bit range hash to themselves; wider
    integers are first folded with ``(value >> 32) ^ value``.  A negative
    result is then mapped to ``-value - 1`` so the hash is never negative.
    Strings are hashed byte by byte over their encoded form with a
    multiplier of 31, passing every intermediate result through
    ``_simulate_overflow``.  Booleans hash to ``0`` or ``1``.

    Parameters
    ----------
    value : Union[str, int, bool]
        Value to hash.  NumPy integer, string and boolean scalars are
        accepted as well.

    Returns
    -------
    int
        The hash of ``value``.

    Raises
    ------
    ValueError
        If an integer lies outside the signed 64-bit range.
    exceptions.InvalidDataFrame
        If ``value`` is not a string, an integer or a boolean.
    """
    # ``np.integer`` covers every NumPy integer width, not only the platform
    # default ``np.int_``.  Python ``bool`` is a subclass of ``int`` and is
    # therefore handled here too, producing the same 0/1 as the bool branch.
    if isinstance(value, (int, np.integer)):
        value = int(value)
        bigint_min, bigint_max = -(2**63), 2**63 - 1
        int_min, int_max = -(2**31), 2**31 - 1
        if not bigint_min <= value <= bigint_max:
            raise ValueError(
                f"{value} exceeds the range that Athena can handle as bigint."
            )
        if not int_min <= value <= int_max:
            # Fold the upper 32 bits into the lower ones.
            value = (value >> 32) ^ value
        if value < 0:
            return -value - 1
        return int(value)
    if isinstance(value, (str, np.str_)):
        value_hash = 0
        for byte in value.encode():
            value_hash = value_hash * 31 + byte
            value_hash = _simulate_overflow(value_hash)
        return value_hash
    # Reached by ``np.bool_`` only, which is not an ``int`` subclass.
    if isinstance(value, (bool, np.bool_)):
        return int(value)
    raise exceptions.InvalidDataFrame(
        "Column specified for bucketing contains invalid data type. Only string, int and bool are supported."
    )
def check_duplicated_columns(df: pd.DataFrame) -> Any:
    """Raise an exception if the DataFrame has duplicated column names.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame whose column labels are checked.

    Returns
    -------
    None
        Nothing is returned when every column name is unique.

    Raises
    ------
    exceptions.InvalidDataFrame
        If at least one column name occurs more than once; the message
        lists the repeated names.
    """
    # Filter the column Index directly instead of slicing the frame with
    # ``df.loc``: only the labels are needed, so no column data is selected.
    columns = df.columns
    duplicated: List[str] = columns[columns.duplicated()].to_list()
    if duplicated:
        raise exceptions.InvalidDataFrame(
            f"There are duplicated column names in your DataFrame: {duplicated}"
        )
def check_duplicated_columns(df: pd.DataFrame) -> Any:
    """Raise an exception if the DataFrame has duplicated column names.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame whose column labels are checked.

    Returns
    -------
    None
        Nothing is returned when every column name is unique.

    Raises
    ------
    exceptions.InvalidDataFrame
        If at least one column name occurs more than once; the message
        lists the repeated names and hints that column sanitization may
        have produced them.
    """
    # Filter the column Index directly instead of slicing the frame with
    # ``df.loc``: only the labels are needed, so no column data is selected.
    columns = df.columns
    duplicated: List[str] = columns[columns.duplicated()].to_list()
    if duplicated:
        raise exceptions.InvalidDataFrame(
            f"There are duplicated column names in your DataFrame: {duplicated}. "
            "Note that your columns may have been sanitized and it can be the cause of "
            "the duplicity."
        )
def check_duplicated_columns(df: pd.DataFrame) -> Any:
    """Raise an exception if the DataFrame has duplicated column names.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame whose column labels are checked.

    Returns
    -------
    None
        Nothing is returned when every column name is unique.

    Raises
    ------
    exceptions.InvalidDataFrame
        If at least one column name occurs more than once; the message
        lists the repeated names and explains how column sanitization can
        make distinct names collide.
    """
    # Filter the column Index directly instead of slicing the frame with
    # ``df.loc``: only the labels are needed, so no column data is selected.
    columns = df.columns
    duplicated: List[str] = columns[columns.duplicated()].to_list()
    if duplicated:
        raise exceptions.InvalidDataFrame(
            f"There are duplicated column names in your DataFrame: {duplicated}. "
            "Note that your columns may have been sanitized and it can be the cause of "
            "the duplicity. Wrangler sanitization removes all special characters and "
            "also converts CamelCase to snake_case. So you must avoid columns like "
            "['MyCol', 'my_col'] in your DataFrame."
        )
def _get_value_hash(value: Union[str, int, bool]) -> int:
    """Return the integer hash of one bucketing-column value.

    Integers and booleans (Python or NumPy scalars) hash to their own
    integer value.  A string is encoded to bytes and folded left to right as
    ``acc = acc * 31 + byte``, starting from zero; the result is not bounded.

    Raises
    ------
    exceptions.InvalidDataFrame
        If ``value`` is not a string, an integer or a boolean.
    """
    # Integer-like scalars need nothing more than a conversion to ``int``.
    if isinstance(value, (int, np.int_, bool, np.bool_)):
        return int(value)

    if not isinstance(value, (str, np.str_)):
        raise exceptions.InvalidDataFrame(
            "Column specified for bucketing contains invalid data type. Only string, int and bool are supported."
        )

    accumulator = 0
    for octet in value.encode():
        accumulator = 31 * accumulator + octet
    return accumulator