def create_days_between_feature(self, df, feature_name_one, feature_name_two,
                                result_name=None, absolute_value=True):
    """
    Takes two pandas.DataFrame datetime columns and calculates the days
    between the dates.

    Parameters
    ----------
    df : DataFrame
        The dataframe containing the dates to calculate days between.
    feature_name_one : str
        The name of the primary feature column.
    feature_name_two : str
        The name of the secondary feature column.
    result_name : str, optional. Default is None.
        Sets the name of the resulting column to the one provided.
    absolute_value : boolean, optional. Default is True.
        If True, the absolute value of the difference between dates will be used.

    Procedure
    ---------
    1. Ensure both features are pd.datetime.
    2. Subtract the secondary date from the primary and cast the difference
       to days, optionally taking the absolute value.
    3. Add this as a column to the dataframe as
       'days_between_<primary>_and_<secondary>' (or result_name).
    """
    if is_datetime(df[feature_name_one]) and is_datetime(df[feature_name_two]):
        result_col_name = f'days_between_{feature_name_one}_and_{feature_name_two}'
        if result_name is not None:
            result_col_name = result_name
        df[result_col_name] = (df[feature_name_one] - df[feature_name_two]).dt.days
        if absolute_value:
            df[result_col_name] = abs(df[result_col_name])
    else:
        print(f'Expected datetime features, received 1: {df[feature_name_one].dtype} '
              f'2: {df[feature_name_two].dtype}')
    return
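# Usage sketch for create_days_between_feature (assumptions: it lives on a
# feature-engineering class, `builder` is an instance of that class, and
# is_datetime is pandas.api.types.is_datetime64_any_dtype):
#
# import pandas as pd
# df = pd.DataFrame({
#     'admitted': pd.to_datetime(['2021-01-01', '2021-01-10']),
#     'discharged': pd.to_datetime(['2021-01-05', '2021-01-08']),
# })
# builder.create_days_between_feature(df, 'admitted', 'discharged',
#                                     result_name='length_of_stay')
# print(df['length_of_stay'].tolist())  # [4, 2] (absolute_value=True)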
def _find_or_check_datetime_variables(
    X: pd.DataFrame, variables: Variables = None
) -> List[Union[str, int]]:
    """
    Checks that variables provided by the user are of type datetime.
    If None, finds all datetime variables in the DataFrame.

    Parameters
    ----------
    X : pandas DataFrame
    variables : variable or list of variables. Defaults to None.

    Returns
    -------
    variables : List of datetime variables.
    """
    if variables is None:
        variables = [
            column
            for column in X.select_dtypes(exclude="number").columns
            if is_datetime(X[column]) or _is_categorical_and_is_datetime(X[column])
        ]
        if len(variables) == 0:
            raise ValueError("No datetime variables found in this dataframe.")

    elif isinstance(variables, (str, int)):
        if is_datetime(X[variables]) or (
            not is_numeric(X[variables])
            and _is_categorical_and_is_datetime(X[variables])
        ):
            variables = [variables]
        else:
            raise TypeError("The variable entered is not datetime.")

    else:
        if len(variables) == 0:
            raise ValueError("The indicated list of variables is empty.")
        # check that the variables entered by the user are datetime
        else:
            vars_non_dt = [
                column
                for column in X[variables].select_dtypes(exclude="datetime")
                if is_numeric(X[column]) or not _is_categorical_and_is_datetime(X[column])
            ]
            if len(vars_non_dt) > 0:
                raise TypeError("Some of the variables are not datetime.")

    return variables
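def _demo_find_or_check_datetime_variables():
    # Usage sketch (assumption: is_numeric and _is_categorical_and_is_datetime
    # are the feature-engine style helpers defined alongside the function above).
    import pandas as pd
    X = pd.DataFrame({
        "dob": pd.to_datetime(["1990-05-01", "1985-11-12"]),
        "age": [31, 35],
    })
    # With variables=None, non-numeric columns are scanned for datetimes.
    print(_find_or_check_datetime_variables(X, variables=None))  # ['dob']
    # Passing a non-datetime column name instead raises TypeError.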
def series2col(s, name):
    kw = {
        'name': name,
        'kind': fpb.Column.SLICE,
    }

    if is_int_dtype(s.dtype):
        kw['dtype'] = fpb.INTEGER
        kw['ints'] = s
    elif is_float_dtype(s.dtype):
        kw['dtype'] = fpb.FLOAT
        kw['floats'] = s
    elif s.dtype == object:  # Pandas dtype for str is object
        kw['strings'] = s
        kw['dtype'] = fpb.STRING
    elif s.dtype == bool:
        kw['bools'] = s
        kw['dtype'] = fpb.BOOLEAN
    elif is_datetime(s.dtype):
        if s.dt.tz:
            # tz_localize raises TypeError on tz-aware data, so fall back to
            # converting the existing timezone to UTC
            try:
                s = s.dt.tz_localize(pytz.UTC)
            except TypeError:
                s = s.dt.tz_convert('UTC')
        kw['times'] = s.astype(np.int64)
        kw['dtype'] = fpb.TIME
    elif is_categorical_dtype(s.dtype):
        # We assume categorical data is strings
        kw['strings'] = s.astype(str)
        kw['dtype'] = fpb.STRING
    else:
        raise WriteError('{} - unsupported type - {}'.format(s.name, s.dtype))

    return fpb.Column(**kw)
def get_datetime_column_names(df):
    column_names = []
    for column in df.columns:
        if is_datetime(df[column]):
            column_names.append(column)
    return column_names
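def _demo_get_datetime_column_names():
    # Usage sketch (assumption: is_datetime is
    # pandas.api.types.is_datetime64_any_dtype, as in the rest of this file).
    import pandas as pd
    df = pd.DataFrame({
        "created": pd.to_datetime(["2020-01-01", "2020-06-15"]),
        "amount": [10.0, 12.5],
    })
    print(get_datetime_column_names(df))  # ['created']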
def import_static_data(self, name: str, dataframe: pd.DataFrame) -> str:
    if isinstance(dataframe, pd.DataFrame):
        if not os.path.exists('./.data'):
            os.makedirs('./.data')
        for column in dataframe.columns:
            if is_datetime(dataframe[column]):
                dataframe[column] = dataframe[column].apply(
                    lambda x: x.strftime("%Y-%m-%d %H:%M:%S.%f"))
        dataframe.to_csv('./.data/temporary-df.csv', index=False, header=True)
        import_guid = str(uuid.uuid4())
        object_name = f"notebook-imports/{import_guid}/import.csv"
        path = self.__get_file_path(object_name)
        if not self.file_repository.upload_file(self._import_bucket, path,
                                                './.data/temporary-df.csv'):
            raise Exception("Error Uploading file to bucket")
        return self.notebook_repository.import_static_data(
            project_guid=self.project_guid,
            name=name,
            path=path,
            delete_when_complete=True
        )
    else:
        raise Exception("Error: Must import as data frame")
def df_to_datetime_ser(df, col_values, col_date='date', assert_filled=False):
    """
    Obtain the column `col_values` in `df` as a series with datetime index
    from `col_date`.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the value and date columns
    col_values : str
        Column name with values
    col_date : str, default='date'
        Column name with datetime index
    assert_filled : bool, default=False
        Assert that all dates in between have values

    Returns
    -------
    ser : pandas.Series
        Resulting series
    """
    assert col_values in df.columns
    assert col_date in df.columns
    assert is_datetime(df[col_date])

    ser = pd.Series(df[col_values].values, df[col_date])
    # the index must have no duplicate entries
    assert ser.index.duplicated().sum() == 0
    ser.sort_index(inplace=True)
    if assert_filled:
        assert_filled_in_dates(ser)
    return ser
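def _demo_df_to_datetime_ser():
    # Usage sketch: index the values column by its date column and sort.
    import pandas as pd
    df = pd.DataFrame({
        "date": pd.to_datetime(["2021-01-02", "2021-01-01", "2021-01-03"]),
        "sales": [20, 10, 30],
    })
    ser = df_to_datetime_ser(df, "sales", assert_filled=True)
    print(ser.tolist())  # [10, 20, 30], sorted by date with no gaps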
def unevenness(vis: Vis, ldf: LuxDataFrame, measure_lst: list, dimension_lst: list) -> int:
    """
    Measure the unevenness of a bar chart vis.
    If a bar chart is highly uneven across the possible values, then it may be interesting.
    (e.g., USA produces lots of cars compared to Japan and Europe)
    Likewise, if a bar chart shows that the measure is the same for any possible values the
    dimension attribute could take on, then it may not be very informative.
    (e.g., The cars produced across all Origins (Europe, Japan, and USA) have approximately
    the same average Acceleration.)

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame
    measure_lst : list
        List of measures
    dimension_lst : list
        List of dimensions

    Returns
    -------
    int
        Score describing how uneven the bar chart is.
    """
    v = vis.data[measure_lst[0].attribute]
    v = v / v.sum()  # normalize by total to get ratio
    v = v.fillna(0)  # Some bar values may be NaN
    attr = dimension_lst[0].attribute
    if isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
        # If timestamp, use the _repr_ (e.g., TimeStamp('2020-04-05 00.000') --> '2020-04-05')
        attr = str(attr._date_repr)
    C = ldf.cardinality[attr]
    D = (0.9) ** C  # cardinality-based discounting factor
    v_flat = pd.Series([1 / C] * len(v))
    if is_datetime(v):
        v = v.astype("int")
    return D * euclidean(v, v_flat)
def numpy_type_2_xsd_type(value: Any) -> Tuple[Any, URIRef]:
    from pandas.api.types import is_datetime64_any_dtype as is_datetime

    if isinstance(value, str):
        return value, XSD.string
    if isinstance(value, bool):
        return value, XSD.boolean
    if np.issubdtype(type(value), np.integer):
        return value, Literal(value).datatype
    if np.issubdtype(type(value), np.floating):  # np.float was removed in NumPy 1.24
        return value, Literal(value).datatype
    if isinstance(value, Timestamp):  # has to come before the test for type date below
        date_time: datetime = value
        # treat near-midnight timestamps as plain dates
        if (date_time.hour == 0 and date_time.minute == 0
                and date_time.second < 2 and date_time.microsecond < 32):
            return date_time.date(), XSD.date
        return value, XSD.dateTime
    if isinstance(value, date):
        return value, XSD.date
    if is_datetime(value):
        return value, XSD.dateTime
    warning(f"Unknown type in numpy_type_2_xsd_type: {type(value)}")
    return value, None
def format_dates(df, format='%Y-%m-%d'):
    date_cols = [column for column in df.columns if is_datetime(df[column])]
    df.loc[:, date_cols] = df[date_cols].apply(
        lambda x: x.dt.strftime(format).replace('NaT', ''))
    return df
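def _demo_format_dates():
    # Usage sketch: datetime columns are rendered as strings in place.
    # Note: on recent pandas, dt.strftime returns NaN (not the string 'NaT')
    # for missing dates, so the .replace('NaT', '') step may be a no-op there.
    import pandas as pd
    df = pd.DataFrame({"when": pd.to_datetime(["2021-03-01", "2021-04-15"]),
                       "n": [1, 2]})
    print(format_dates(df)["when"].tolist())  # ['2021-03-01', '2021-04-15']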
def Mort_CompressColumn(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    return df
def unevenness(vis: Vis, ldf: LuxDataFrame, measure_lst: list, dimension_lst: list) -> int:
    """
    Measure the unevenness of a bar chart vis.
    If a bar chart is highly uneven across the possible values, then it may be interesting.
    (e.g., USA produces lots of cars compared to Japan and Europe)
    Likewise, if a bar chart shows that the measure is the same for any possible values the
    dimension attribute could take on, then it may not be very informative.
    (e.g., The cars produced across all Origins (Europe, Japan, and USA) have approximately
    the same average Acceleration.)

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame
    measure_lst : list
        List of measures
    dimension_lst : list
        List of dimensions

    Returns
    -------
    int
        Score describing how uneven the bar chart is.
    """
    v = vis.data[measure_lst[0].attribute]
    v = v / v.sum()  # normalize by total to get ratio
    C = ldf.cardinality[dimension_lst[0].attribute]
    D = (0.5) ** C  # cardinality-based discounting factor
    v_flat = pd.Series([1 / C] * len(v))
    if is_datetime(v):
        v = v.astype('int')
    return D * euclidean(v, v_flat)
def fit(self, X):
    """Fits the CustomTimestampFeaturizer.

    :param X: The dataset containing timestamp columns to featurize.
    :type X: numpy.array or pandas.DataFrame or iml.datatypes.DenseData
        or scipy.sparse.csr_matrix
    """
    # If the data was previously successfully summarized, then there are no
    # timestamp columns as it must be numeric.
    # Also, if the dataset is sparse, we can assume there are no timestamps
    if isinstance(X, DenseData) or issparse(X):
        return self
    tmp_dataset = X
    # If numpy array, temporarily convert to pandas for easier and uniform
    # timestamp handling
    if isinstance(X, np.ndarray):
        tmp_dataset = pd.DataFrame(X, columns=self._features)
    self._time_col_names = [
        column for column in tmp_dataset.columns
        if is_datetime(tmp_dataset[column])
    ]
    # Calculate the min date for each column
    self._min = []
    for time_col_name in self._time_col_names:
        self._min.append(
            tmp_dataset[time_col_name].map(lambda x: x.timestamp()).min())
    return self
def reduce_mem_usage(df: pd.DataFrame, cols_exclude: List[str] = []) -> pd.DataFrame:
    '''Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.

    Original code from
    https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
    '''
    start_mem = df.memory_usage().sum() / 1024**2
    cols = [c for c in df.columns if c not in cols_exclude]
    print("Reducing memory for the following columns: ", cols, sep='\n')

    for col in cols:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        print(f"Reducing memory for {col}")
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage before: {start_mem:.2f} MB",
          f"Memory usage after: {end_mem:.2f} MB "
          f"({100 * (start_mem - end_mem) / start_mem:.1f}% decrease)",
          sep='\n')
    return df
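def _demo_reduce_mem_usage():
    # Usage sketch for the downcasting variant above (assumptions: is_datetime
    # is pandas.api.types.is_datetime64_any_dtype and is_categorical_dtype is
    # also imported from pandas.api.types).
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({
        "small_int": np.arange(100, dtype=np.int64),     # downcast to int8
        "price": np.random.rand(100),                    # excluded, stays float64
        "ts": pd.date_range("2021-01-01", periods=100),  # datetime, skipped
    })
    df = reduce_mem_usage(df, cols_exclude=["price"])
    print(df.dtypes)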
def test_read_sources_file(self):
    """Sources tables are read in properly"""
    test_path = os.path.join("tests", "res", "test_sources.txt")
    test_sources = utils.read_sources_file(test_path)
    self.assertEqual(test_sources.shape, (2, len(utils.SOURCES_COLUMNS)))
    self.assertEqual(test_sources.loc[0, "website"], "Google")
    self.assertTrue(is_datetime(test_sources["date"]))
def preprocess_dataframe(df, time_granularity="1s"):  # pragma: no cover
    for feature in df:
        # iterating a DataFrame yields column names, so index into df to
        # inspect each column's dtype
        if df[feature].dtype == object:
            df[feature] = pd.Categorical(df[feature])
        elif is_datetime(df[feature]):
            df[feature] = ((df[feature] - df[feature].min())
                           / pd.Timedelta(time_granularity))
    return
def add_category_data(
    stock_data: pd.DataFrame,
) -> typing.Dict[str, typing.List[str]]:
    if is_datetime(stock_data["date"]):
        data_times = stock_data["date"].dt.strftime("%Y-%m-%d").to_list()
    else:
        data_times = stock_data["date"].to_list()
    return dict(categoryData=data_times)
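def _demo_add_category_data():
    # Usage sketch: the "date" column is serialized to strings either way.
    import pandas as pd
    stock_data = pd.DataFrame({
        "date": pd.to_datetime(["2021-01-04", "2021-01-05"]),
        "close": [100.0, 101.5],
    })
    print(add_category_data(stock_data))
    # {'categoryData': ['2021-01-04', '2021-01-05']}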
def compute_data_type(self, ldf: LuxDataFrame):
    for attr in list(ldf.columns):
        temporal_var_list = ["month", "year", "day", "date", "time"]
        if isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
            # If timestamp, make the dictionary keys the _repr_
            # (e.g., TimeStamp('2020-04-05 00.000') --> '2020-04-05')
            ldf.data_type_lookup[attr] = "temporal"
        # elif any(var in str(attr).lower() for var in temporal_var_list):
        elif str(attr).lower() in temporal_var_list:
            ldf.data_type_lookup[attr] = "temporal"
        elif ldf.dtypes[attr] == "float64":
            ldf.data_type_lookup[attr] = "quantitative"
        elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
            # See if an integer value is quantitative or nominal by checking if
            # the ratio of cardinality/data size is less than 0.4 and if there
            # are fewer than 10 unique values
            if ldf.pre_aggregated:
                if ldf.cardinality[attr] == len(ldf):
                    ldf.data_type_lookup[attr] = "nominal"
            if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 10:
                ldf.data_type_lookup[attr] = "nominal"
            elif check_if_id_like(ldf, attr):
                ldf.data_type_lookup[attr] = "id"
            else:
                ldf.data_type_lookup[attr] = "quantitative"
        # Eliminate this clause because a single NaN value can cause the dtype to be object
        elif ldf.dtypes[attr] == "object":
            ldf.data_type_lookup[attr] = "nominal"
        elif is_datetime_series(ldf.dtypes[attr]):
            # check if attribute is any type of datetime dtype
            ldf.data_type_lookup[attr] = "temporal"
    # for attr in list(df.dtypes[df.dtypes=="int64"].keys()):
    #     if self.cardinality[attr]>50:
    if ldf.index.dtype != 'int64' and ldf.index.name:
        ldf.data_type_lookup[ldf.index.name] = "nominal"
    ldf.data_type = self.mapping(ldf.data_type_lookup)

    from pandas.api.types import is_datetime64_any_dtype as is_datetime
    non_datetime_attrs = []
    for attr in ldf.columns:
        if ldf.data_type_lookup[attr] == 'temporal' and not is_datetime(ldf[attr]):
            non_datetime_attrs.append(attr)
    if len(non_datetime_attrs) == 1:
        warnings.warn(
            f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
            "In order to display visualizations for this attribute accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
            "Please consider converting this attribute using the pd.to_datetime function and providing a 'format' parameter to specify the datetime format of the attribute.\n"
            "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
            "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
            stacklevel=2)
    elif len(non_datetime_attrs) > 1:
        warnings.warn(
            f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
            "In order to display visualizations for these attributes accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
            "Please consider converting these attributes using the pd.to_datetime function and providing a 'format' parameter to specify the datetime format of the attribute.\n"
            "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
            "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
            stacklevel=2)
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    This transformer does not learn any parameter.

    Finds datetime variables or checks that the variables selected by the
    user can be converted to datetime.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y: pandas Series, default=None
        It is not needed in this transformer. You can pass y or None.
    """
    # check input dataframe
    X = check_X(X)

    # special case index
    if self.variables == "index":
        if not (
            is_datetime(X.index)
            or (
                not is_numeric(X.index)
                and _is_categorical_and_is_datetime(X.index)
            )
        ):
            raise TypeError("The dataframe index is not datetime.")

        if self.missing_values == "raise":
            self._check_index_contains_na(X.index)

        self.variables_ = None

    else:
        # find or check for datetime variables
        self.variables_ = _find_or_check_datetime_variables(X, self.variables)

        # check if datetime variables contain na
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_)

    if self.features_to_extract is None:
        self.features_to_extract_ = FEATURES_DEFAULT
    elif isinstance(self.features_to_extract, str):
        self.features_to_extract_ = FEATURES_SUPPORTED
    else:
        self.features_to_extract_ = self.features_to_extract

    # save input features
    self.feature_names_in_ = X.columns.tolist()

    # save train set shape
    self.n_features_in_ = X.shape[1]

    return self
def compress_memory_usage(df_in: pd.DataFrame, replacer: dict = None):
    start_mem_usg = df_in.memory_usage().sum() / 1024**2
    cols_with_nas = []
    df = df_in.copy()  # Avoid changing input df

    for col in tqdm(df.columns, "DataFrame: compress_memory_usage"):
        if df[col].dtype != object and not is_datetime(df[col]):
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(df[col]).all():
                cols_with_nas.append(col)
                if replacer:
                    df[col].fillna(replacer[col], inplace=True)
                else:
                    df[col].fillna(df[col].min() - 1, inplace=True)

            # compute min/max after NA filling so the fill value is taken into
            # account when choosing a (possibly unsigned) integer type
            is_int = False
            mx = df[col].max()
            mn = df[col].min()

            # test whether the column holds only integral values
            as_int = df[col].fillna(0).astype(np.int64)
            result = (df[col] - as_int).sum()
            if -0.01 < result < 0.01:
                is_int = True

            # Make integer/unsigned integer types
            if is_int:
                if mn >= 0:
                    if mx < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif mx < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif mx < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
            # Make float data types 32 bit
            else:
                df[col] = df[col].astype(np.float32)

    mem_usg = df.memory_usage().sum() / 1024**2
    print('Memory usage pre-compression was {:.2f} MB'.format(start_mem_usg))
    print('Memory usage post-compression was {:.2f} MB'.format(mem_usg))
    print("This is {:.1f}% of the initial size".format(100 * mem_usg / start_mem_usg))
    return df, cols_with_nas
def test_get_stations_info(param, expected_output):
    data_all = noaastn.get_stations_info(country=param)

    # check number of columns
    assert data_all.shape[1] == num_column

    # check type of the columns
    for i in range(9):
        assert data_all.dtypes[i] == object
    assert is_datetime(data_all["start"])
    assert is_datetime(data_all["end"])

    # match and check each col value pattern by comparing length
    row_df = data_all.sample(1)
    for col in col_len.keys():
        assert (pd.isna(row_df[col].values[0])
                or pd.isnull(row_df[col].values[0])
                or len(row_df[col].values[0]) == col_len[col])
def nan_filler(self):
    for col in self.df.columns:
        if is_datetime(self.df[col]) or is_categorical_dtype(self.df[col]):
            continue
        self.df[col].fillna(self.df[col].median(), inplace=True)
    return self.df
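def _demo_nan_filler():
    # Usage sketch (assumption: nan_filler belongs to a class that stores its
    # dataframe in self.df; _Holder below is a hypothetical stand-in). Note
    # that fillna(inplace=True) on a column may not propagate on pandas builds
    # with copy-on-write enabled.
    import pandas as pd

    class _Holder:
        nan_filler = nan_filler  # reuse the method defined above

    holder = _Holder()
    holder.df = pd.DataFrame({"x": [1.0, None, 3.0]})
    print(holder.nan_filler()["x"].tolist())  # [1.0, 2.0, 3.0]; median of [1, 3] is 2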
def reduce_mem_usage(self, use_float16: bool = False, info: bool = True):
    """
    Automatically distinguish the type of each column and reset it to a
    suitable, smaller type.

    :param use_float16: use float16 or not
    :param info: stay True if a display of information is required
    """
    # the original memory usage
    start_mem = self.memory_usage().sum() / 1024**2

    # reduce the memory usage
    for col in self.columns:
        if is_datetime(self[col]) or is_categorical_dtype(self[col]):
            continue
        col_type = self[col].dtype
        if col_type != object:
            c_min = self[col].min()
            c_max = self[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    self[col] = self[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    self[col] = self[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    self[col] = self[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    self[col] = self[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    self[col] = self[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    self[col] = self[col].astype(np.float32)
                else:
                    self[col] = self[col].astype(np.float64)
        else:
            self[col] = self[col].astype("category")

    if info:
        end_mem = self.memory_usage().sum() / 1024**2
        print("Memory usage before optimization:\t{:.3f} MB".format(start_mem))
        print("Memory usage after optimization:\t{:.3f} MB".format(end_mem))
        print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
def save_geodataframe(gdf, filename, output_dir, include_shp_files=False):
    if not gdf.empty:
        gdf = sanitiser.sanitise_geodataframe(gdf)
        persistence.ensure_dir(output_dir)
        gdf.to_file(os.path.join(output_dir, f'{filename}.geojson'), driver='GeoJSON')
        for col in [col for col in gdf.columns if is_datetime(gdf[col])]:
            gdf[col] = gdf[col].astype(str)
        if include_shp_files:
            shp_files = os.path.join(output_dir, 'shp_files')
            persistence.ensure_dir(shp_files)
            gdf.to_file(os.path.join(shp_files, f'{filename}.shp'))
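# Usage sketch for save_geodataframe (assumptions: geopandas is installed and
# sanitiser/persistence are the project's own helper modules):
#
# import geopandas as gpd
# from shapely.geometry import Point
# gdf = gpd.GeoDataFrame({"name": ["stop_a"], "geometry": [Point(0, 0)]})
# save_geodataframe(gdf, "stops", "./outputs", include_shp_files=True)
# # writes ./outputs/stops.geojson and ./outputs/shp_files/stops.shp; datetime
# # columns are stringified first because the shapefile driver cannot store them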
def reduce_mem_usage(df, use_float16=False, cols_exclude=[]):
    """
    Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.

    Original code from
    https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
    Modified to support timestamp type, categorical type
    Modified to add option to use float16
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    cols = [c for c in df.columns if c not in cols_exclude]
    print(cols)

    for col in cols:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    return df
def validate_column(vals, n_not_null=N_NOT_NULL,
                    min_observations_in_class=MIN_OBSERVATIONS_IN_CLASS):
    '''Check whether a column can be modeled using ML models, and classify it
    into the type of applicable model - binary, multiclass, regression or unknown.

    Args:
        vals: column to validate
        n_not_null: column needs to have at least n_not_null values in order to be valid
        min_observations_in_class: the function will preserve only groups that have
            at least `min_observations_in_class` observations

    Returns:
        boolean, str: assessment whether the column is valid, and either its
            classification or the reason for not being a valid column
    '''
    valid, reason = True, ''
    vals_not_na = vals.dropna()

    if vals_not_na.shape[0] < n_not_null:
        valid, reason = False, f'Not enough values ({vals_not_na.shape[0]}) provided, required: {n_not_null}'
        return valid, reason

    if is_datetime(vals_not_na):
        valid, reason = True, 'datetime'
        return valid, reason

    # determine column type
    n_unique = vals_not_na.unique().shape[0]
    if n_unique <= 1:
        valid, reason = False, 'Not enough classes (0 or 1).'
    elif n_unique > vals_not_na.shape[0] * 0.9 and (vals_not_na.dtype != 'float64' and vals_not_na.dtype != 'int64'):
        valid, reason = True, 'identifier'
    elif n_unique == 2:
        valid, reason = True, 'binary'
    # 17 is somewhat arbitrary
    elif n_unique > 2 and ((n_unique < 17 and vals_not_na.dtype == 'float64') or
                           (vals_not_na.dtype != 'float64' and vals_not_na.dtype != 'int64')):
        valid, reason = True, 'multiclass'
    elif n_unique > 2 and (vals_not_na.dtype == 'float64' or vals_not_na.dtype == 'int64'):
        valid, reason = True, 'regression'
    else:
        valid, reason = False, 'Type not known.'

    # second pass - this time we drop small groups for binary and multiclass
    # variables and check whether the type changed
    if reason == 'binary' or reason == 'multiclass':
        vals_not_na = drop_infrequent_groups(vals_not_na, min_observations_in_class)
        n_unique = vals_not_na.unique().shape[0]
        if n_unique <= 1:
            valid, reason = False, 'Not enough classes (0 or 1).'
        elif n_unique == 2:
            valid, reason = True, 'binary'
        else:
            # otherwise the type - multiclass - is preserved
            pass

    return valid, reason
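def _demo_validate_column():
    # Usage sketch (assumptions: N_NOT_NULL, MIN_OBSERVATIONS_IN_CLASS and
    # drop_infrequent_groups are defined alongside validate_column; the
    # thresholds passed here are illustrative only).
    import pandas as pd
    clicks = pd.Series([0, 1, 1, 0, 1] * 20)
    print(validate_column(clicks, n_not_null=50))  # expected: (True, 'binary')
    price = pd.Series(range(100), dtype="float64")
    print(validate_column(price, n_not_null=50))   # expected: (True, 'regression')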
def reduce_mem_usage(df, use_float16=False):
    """
    Original function code is from:
    https://www.kaggle.com/aitude/ashrae-kfold-lightgbm-without-leak-1-08

    Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    return df
def _coerce_dates(self, series: pd.Series, timezone_: timezone) -> pd.Series:
    """
    Method to parse dates in the pandas.DataFrame. Leverages the data timezone
    attribute to ensure correct comparison of dates.

    :param series: the series of date-like values to coerce
    :param timezone_: timezone to localize naive timestamps to
    :return: a timezone-aware series converted to UTC
    """
    if not is_datetime(series):
        series = pd.Series(series.map(lambda x: pd.Timestamp(x)))
    if not series.dt.tz:
        series = series.dt.tz_localize(timezone_)
    return series.dt.tz_convert(pytz.UTC)
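def _demo_coerce_dates():
    # Usage sketch (assumption: _coerce_dates is a method, so it is called
    # through a hypothetical _Holder stand-in here; pytz must be installed).
    import pandas as pd
    import pytz

    class _Holder:
        _coerce_dates = _coerce_dates  # reuse the method defined above

    naive = pd.Series(["2021-06-01 12:00", "2021-06-02 12:00"])
    utc = _Holder()._coerce_dates(naive, pytz.timezone("Europe/Berlin"))
    print(utc.dt.tz)  # UTC; 12:00 Berlin summer time becomes 10:00 UTC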
def assert_filled_in_dates(ser):
    """
    Assert that a pandas.Series with datetime index has values for all days
    in between the minimum and maximum one.

    Parameters
    ----------
    ser : pandas.Series
        Series with datetime index
    """
    assert is_datetime(ser.index), 'ser index must be of dtype datetime'
    ndays = (ser.index.max() - ser.index.min()).days + 1
    assert ndays == len(ser), "there are gaps in dates index"
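def _demo_assert_filled_in_dates():
    # Usage sketch: a contiguous daily index passes, a gapped one raises.
    import pandas as pd
    full = pd.Series([1, 2, 3], index=pd.date_range("2021-01-01", periods=3))
    assert_filled_in_dates(full)  # passes: 3-day span, 3 values
    gapped = full.drop(pd.Timestamp("2021-01-02"))
    try:
        assert_filled_in_dates(gapped)
    except AssertionError as e:
        print(e)  # "there are gaps in dates index"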
def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        if col == 'id':
            df[col] = df[col].astype(str)
            continue
        if is_datetime(df[col]):
            # skip datetime type
            continue
        col_type = df[col].dtype
        if col_type == list:
            continue
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
def test_import_utils(test, data):
    output, renamed, missing = test._import_utils(name=deepcopy(data))
    assert output.shape == (3, 3)
    assert "ModelName" in output.columns
    assert list(output["ModelID"]) == list(data["pymodelid"])
    assert is_datetime(output["SnapshotTime"])
    assert "Performance" not in output.columns
    assert renamed == {
        "ModelID": "ModelID",
        "Name": "ModelName",
        "SnapshotTime": "SnapshotTime",
    }
    assert "Junk" not in output.columns
    assert "Treatment" not in output.columns