def Gis_DataWithLabel(self, labels: Union[str, Sequence]) -> None:
    """
    Build the labelled GIS analyzer and store it on the instance.

    The columns named in ``self._gis_features`` are selected from
    ``self.input`` and joined column-wise with the label columns. ``labels``
    is treated as a data type (all columns of that type are selected) when it
    is contained in ``self.__dtypes``; otherwise it is treated as a column
    name or a list of column names to keep.

    :param labels: a data type known to the class, or the name(s) of the
        label column(s)
    :type labels: Union[str, Sequence]
    :return: None; the resulting ``GisAnalyzerWithClusterLabel`` is stored in
        ``self._data_grouped``
    """
    gis_frame = NameSelector(self._gis_features).fit_transform(self.input)

    # Select by type when ``labels`` is a known data type, by name otherwise.
    if labels in self.__dtypes:
        selector = TypeSelector(labels)
    else:
        selector = NameSelector(labels)
    label_frame = selector.fit_transform(self.input)

    combined = pd.concat([gis_frame, label_frame], axis=1)

    # NOTE(review): ``_data_grouped`` is also the attribute written by
    # Data_Mixed_Types -- confirm that sharing it is intended.
    # The first two GIS features are handed over as the coordinate columns.
    self._data_grouped = GisAnalyzerWithClusterLabel(
        combined,
        self._gis_features[0],
        self._gis_features[1],
        labels,
    )
def String_Data(self):
    """
    Build a Categorical Analyzer over all string columns.

    The columns listed in ``self.string_features`` are selected from
    ``self.input`` and wrapped in a Categorical Analyzer.

    :return: a Categorical Analyzer with all string features, or ``None``
        when ``self.len_string_features`` is falsy
    :rtype: CategoricalAnalyzer
    """
    # Without string features there is nothing to wrap.
    if not self.len_string_features:
        return None

    string_frame = NameSelector(self.string_features).fit_transform(self.input)
    log.info(
        f"Created string DataFrame of size {self.len_string_features} with the following columns: {self.string_features}"
    )
    return CategoricalAnalyzer(string_frame)
def Numerical_Data(self) -> NumericalAnalyzer:
    """
    Build a Numerical Analyzer over all numerical columns.

    The columns listed in ``self.numerical_features`` are selected from
    ``self.input`` and used as input of a ``NumericalAnalyzer``.

    :return: a numerical data analyzer, or ``None`` when
        ``self.len_numerical_features`` is falsy
    :rtype: NumericalAnalyzer
    """
    # Without numerical features there is nothing to analyze.
    if not self.len_numerical_features:
        return None

    numeric_selector = NameSelector(self.numerical_features)
    numeric_frame = numeric_selector.fit_transform(self.input)
    log.info(
        f"Setting numerical DataFrame of size {self.len_numerical_features} with the following columns: {self.numerical_features}"
    )
    return NumericalAnalyzer(numeric_frame)
def Date_By_Data(self, dtype: Union[type, float, int, str, Sequence]) -> None:
    """
    Build the data-by-date analyzer for the requested columns and store it.

    When date features exist, the date columns are selected from
    ``self.input`` and joined column-wise with the columns described by
    ``dtype``. ``dtype`` is treated as a data type (all columns of that type
    are selected) when it is contained in ``self.__dtypes``; otherwise it is
    treated as a column name or a list of column names found in the
    DataFrame.

    When ``self.len_date_features`` is falsy nothing is built and a
    previously stored analyzer is left untouched.

    :param dtype: the data which shall be analyzed along the date column(s):
        either one of the data types defined in the class or a str/list of
        column names
    :type dtype: Union[type, float, int, str, Sequence]
    :return: None; the resulting ``DateByAnalyzer`` is stored in
        ``self._date_by_data``
    """
    if not self.len_date_features:
        return

    date_data = NameSelector(self.date_features).fit_transform(self.input)

    # Select by type when ``dtype`` is a known data type, by name otherwise.
    x_transformer = (TypeSelector(dtype)
                     if dtype in self.__dtypes else NameSelector(dtype))
    x_data = x_transformer.fit_transform(self.input)

    result = pd.concat([date_data, x_data], axis=1)
    log.info(
        f"Created Date DataFrame of size {self.len_date_features} with the following columns: {self.date_features}"
    )
    self._date_by_data = DateByAnalyzer(result, list(x_data.columns),
                                        self.date_features)
def Boolean_Data(self) -> CategoricalAnalyzer:
    """
    Build a Categorical Analyzer over all boolean columns.

    The columns listed in ``self.bool_features`` are selected from
    ``self.input`` and wrapped in a Categorical Analyzer.

    :return: a Categorical Analyzer containing the boolean values, or
        ``None`` when ``self.len_bool_features`` is falsy
    :rtype: CategoricalAnalyzer
    """
    # Without boolean features there is nothing to wrap.
    if not self.len_bool_features:
        return None

    bool_frame = NameSelector(self.bool_features).fit_transform(self.input)
    log.info(
        f"Created boolean DataFrame of size {self.len_bool_features} with the following columns: {self.bool_features}"
    )
    return CategoricalAnalyzer(bool_frame)
def Categorical_Data(self) -> CategoricalAnalyzer:
    """
    Build a Categorical Analyzer over all categorical columns.

    The columns listed in ``self.categorical_features`` are selected from
    ``self.input`` and wrapped in a Categorical Analyzer.

    :return: a Categorical Data Analyzer, or ``None`` when
        ``self.len_categorical_features`` is falsy
    :rtype: CategoricalAnalyzer
    """
    # Without categorical features there is nothing to wrap.
    if not self.len_categorical_features:
        return None

    categorical_selector = NameSelector(self.categorical_features)
    categorical_frame = categorical_selector.fit_transform(self.input)
    log.info(
        f"Created categorical DataFrame of size {self.len_categorical_features} with the following columns: {self.categorical_features}"
    )
    return CategoricalAnalyzer(categorical_frame)
def GIS_Data(self) -> GisAnalyzer:
    """
    Build a GIS Analyzer from the first two entries of the GIS feature list.

    The first entry of ``self._gis_features`` is reported as the latitude
    column and the second entry as the longitude column; both are handed to
    the ``GisAnalyzer`` together with the selected columns.

    :return: a GIS Analyzer over the GIS columns, or ``None`` (after logging
        an error) when fewer than two GIS features are defined
    :rtype: GisAnalyzer
    """
    has_coordinate_pair = self.len_gis_features >= 2
    if not has_coordinate_pair:
        log.error(
            "No features have been defined as GIS data. You first need to set them using gis_features"
        )
        return None

    gis_frame = NameSelector(self._gis_features).fit_transform(self.input)
    # The logged size is the number of rows of the selected frame.
    log.info(
        f"Setting the GIS DataFrame of size {len(gis_frame)} with {self._gis_features[0]} as the latitude column and {self._gis_features[1]} as the longitude column."
        f"Note that you can reset them if necessary")
    return GisAnalyzer(gis_frame, self._gis_features[0],
                       self._gis_features[1])
def Data_Mixed_Types(self, dtype: Union[Sequence, str, type]):
    """
    Build a Mixed Data Type analyzer and store it in ``self._data_grouped``.

    There are several ways to set it:

    1) as a tuple: each of the two entries is checked against the data types
    known to the class. If an entry is a data type, all columns of this data
    type are selected. If not, the columns named by the entry are kept.

    Example to receive a Mixed Data Type for all boolean columns on the one
    side and the columns "income" and "revenue" on the other side:

    > # Assume an ``explorer`` object named "explorer"
    > explorer.Data_Mixed_Types = explorer.dtype_bool, ["income", "revenue"]

    Or to get all boolean and numeric types:

    > # Assume an ``explorer`` object named "explorer"
    > explorer.Data_Mixed_Types = explorer.dtype_bool, explorer.dtype_num

    2) as a string or a single data type: the first dimension defaults to
    the numerical columns. The argument selects the second dimension: all
    columns of a data type if it is a valid data type, a single column
    otherwise.

    For example, to gain all numerical features by the boolean column
    "Male":

    > # Assume an ``explorer`` object named "explorer"
    > explorer.Data_Mixed_Types = "Male"

    Or

    > # Assume an ``explorer`` object named "explorer"
    > explorer.Data_Mixed_Types = bool

    3) as a list: the first dimension defaults to the numerical columns. The
    list names the columns kept as the second dimension.

    Example to receive a Mixed Data Type for all numerical columns on the
    one side and the columns "gender" and "married" on the other side:

    > # Assume an ``explorer`` object named "explorer"
    > explorer.Data_Mixed_Types = ["gender", "married"]

    Any other argument is reported through ``log.error`` and leaves the
    stored analyzer untouched.

    :param dtype: a tuple, a string, a data type or a list as described above
    :type dtype: Union[Sequence, str, type]
    :return: None; the resulting ``MixedDataAnalyser`` is stored in
        ``self._data_grouped``
    """

    def select(spec):
        # All columns of a data type known to the class, else the named column(s).
        selector = (TypeSelector(spec)
                    if spec in self.__dtypes else NameSelector(spec))
        return selector.fit_transform(self.input)

    if isinstance(dtype, tuple):
        # First case: the two entries describe the two feature dimensions.
        log.info(
            f'Received the following two inputs "{dtype[0]}" and "{dtype[1]}". Deriving the data frame to be generated out of it. '
        )
        subset_1, subset_2 = dtype[0], dtype[1]
        df_1 = select(subset_1)
        df_2 = select(subset_2)
        result = pd.concat([df_1, df_2], axis=1)
        self._data_grouped = MixedDataAnalyser(result, list(df_1.columns),
                                               list(df_2.columns))
    elif isinstance(dtype, list):
        # Second case: a list of column names; the first dimension is the
        # numerical features. Lists are always selected by name.
        log.info(
            f"Received a single input containing a list of features. The first data type dimension is automatically set to numerical. "
            f'The second is derived from the input "{dtype}"')
        data_num = TypeSelector(self.dtype_num).fit_transform(self.input)
        data_grouped = NameSelector(dtype).fit_transform(self.input)
        result = pd.concat([data_num, data_grouped], axis=1)
        self._data_grouped = MixedDataAnalyser(result,
                                               self.numerical_features,
                                               list(data_grouped.columns))
    elif isinstance(dtype, str) or dtype in self.__dtypes:
        # Third case: a single column name or a single data type (e.g. the
        # documented ``= bool``); the first dimension is the numerical
        # features.
        log.info(
            f"Received a single input. The first data type dimension is automatically set to numerical. "
            f'The second is derived from the input "{dtype}"')
        data_num = TypeSelector(self.dtype_num).fit_transform(self.input)
        data_grouped = select(dtype)
        result = pd.concat([data_num, data_grouped], axis=1)
        self._data_grouped = MixedDataAnalyser(result,
                                               self.numerical_features,
                                               list(data_grouped.columns))
    else:
        # Nothing was built: do not fall through to the summary below, which
        # would describe a stale (or missing) analyzer.
        log.error(
            f"Unsupported argument for Data_Mixed_Types: {dtype!r}. "
            "Expected a tuple, a list, a column name or a known data type.")
        return

    log.info(
        f"Setting Grouped DataFrame of size {len(self._data_grouped.data)} where the first feature dimension contains columns"
        f": {list(self._data_grouped.nums)} and the second dimension {list(self._data_grouped.group)}"
    )