class PreProcess:
    def __init__(self, output_path: str):
        """Initialise the objects and constants.
        :param output_path: the output path.
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)
        self.__output_path = output_path
        self.__readers_writers = ReadersWriters()

    def stats_discrete_df(self, df: PandasDataFrame, includes: List, file_name: str) -> PandasDataFrame:
        """Produce a frequency summary for each included discrete feature and its categorical states.
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param file_name: the name of the summary output file.
        :return: the summary output.
        """
        self.__logger.debug("Produce statistics for discrete features.")
        summaries = None
        self.__readers_writers.save_csv(path=self.__output_path, title=file_name, data=[], append=False)
        for f_name in includes:
            if f_name in df:
                self.__readers_writers.save_csv(path=self.__output_path, title=file_name,
                                                data=["Feature Name", f_name], append=True)
                # value/frequency pairs (scipy.stats.itemfreq is no longer available in recent SciPy releases)
                values, freqs = np.unique(df[f_name], return_counts=True)
                summaries = pd.DataFrame({"value": values, "freq": freqs})
                summaries = summaries.sort_values("freq", ascending=False)
                self.__readers_writers.save_csv(path=self.__output_path, title=file_name,
                                                data=summaries, append=True, header=True)
        return summaries

    def stats_continuous_df(self, df: PandasDataFrame, includes: List, file_name: str) -> PandasDataFrame:
        """Calculate descriptive statistics for all the included continuous features.
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param file_name: the name of the summary output file.
        :return: the summary output.
        """
        self.__logger.debug("Produce statistics for continuous features.")
        summaries = None
        self.__readers_writers.save_csv(path=self.__output_path, title=file_name, data=[], append=False)
        for f_name in includes:
            if f_name in df:
                self.__readers_writers.save_csv(path=self.__output_path, title=file_name,
                                                data=["Feature Name", f_name], append=True)
                summaries = df[f_name].apply(pd.to_numeric).describe(
                    percentiles=[0.05, 0.25, 0.5, 0.75, 0.95]).transpose()
                summaries = pd.Series.to_frame(summaries).transpose()
                self.__readers_writers.save_csv(path=self.__output_path, title=file_name,
                                                data=summaries, append=True, header=True)
        return summaries

    def factoring_group_wise(self, df: PandasDataFrame, categories_dic: Dict, labels_dic: Dict,
                             dtypes_dic: Dict, threaded: bool = False) -> PandasDataFrame:
        """Categorise groups of features that are selected.
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :param dtypes_dic: the dictionary of the dtypes of the categorised features.
        :param threaded: indicates if it is multi-threaded.
        :return: the inputted dataframe with categorised features (if applicable).
""" self.__logger.debug("Categorise groups of features.") categories_dic = OrderedDict(categories_dic) if threaded is not True: pool_df_encoded = self.__factoring_group_wise_series( df, categories_dic, labels_dic) else: pool_df_encoded = self.__factoring_group_wise_threaded( df, categories_dic, labels_dic) # encoded labels labels_encoded = [] for label_group in categories_dic.keys(): labels_encoded += list(categories_dic[label_group].keys()) # preserve types dtype_orig = {**df.dtypes.to_dict(), **dtypes_dic} dtype_orig = pd.DataFrame(dtype_orig, index=[0]).dtypes for label in labels_encoded: del dtype_orig[label] # combine df = df.drop(labels_encoded, axis=1) df = pd.concat([df] + pool_df_encoded, axis=1) df = df.astype(dtype_orig) return df def __factoring_group_wise_series(self, df: PandasDataFrame, categories_dic: Dict, labels_dic: Dict) -> List: """Categorise a group of features that are selected (single-threaded). :param df: the features dataframe. :param categories_dic: the dictionary of the categorical states for the included features. :param labels_dic: the dictionary of the features names of the categorised features. :return: the categorised features. """ self.__logger.debug("Categorise groups of features (single-threaded).") factoring_thread = FactoringThread(df, categories_dic, labels_dic) pool_df_encoded = [] try: for label_group in categories_dic.keys(): pool_df_encoded.append( factoring_thread.factor_arr_multiple(label_group)) except ValueError as exception: self.__logger.error(__name__ + " - Invalid configuration(s): " + str(exception)) sys.exit() return pool_df_encoded def __factoring_group_wise_threaded(self, df: PandasDataFrame, categories_dic: Dict, labels_dic: Dict) -> List: """Categorise a group of features that are selected (multi-threaded). :param df: the features dataframe. :param categories_dic: the dictionary of the categorical states for the included features. :param labels_dic: the dictionary of the features names of the categorised features. :return: the categorised features. """ self.__logger.debug("Categorise groups of features (multi-threaded).") factoring_thread = FactoringThread(df, categories_dic, labels_dic) try: with mp.Pool(processes=(mp.cpu_count() - 1)) as pool: pool_df_encoded = pool.map( partial(factoring_thread.factor_arr_multiple), categories_dic.keys()) except ValueError as exception: self.__logger.error(__name__ + " - Invalid configuration(s): " + str(exception)) sys.exit() return pool_df_encoded def factoring_feature_wise(self, df: PandasDataFrame, categories_dic: Dict, labels_dic: Dict, dtypes_dic: Dict, threaded: bool = False) -> PandasDataFrame: """Categorise features that are selected. :param df: the features dataframe. :param categories_dic: the dictionary of the categorical states for the included features. :param labels_dic: the dictionary of the features names of the categorised features. :param dtypes_dic: the dictionary of the dtypes of the categorised features. :param threaded: indicates if it is multi-threaded. :return: the inputted dataframe with categorised features (if applicable). 
""" self.__logger.debug("Categorise.") categories_dic = OrderedDict(categories_dic) if threaded is not True: pool_df_encoded = self.__factoring_feature_wise_series( df, categories_dic, labels_dic) else: pool_df_encoded = self.__factoring_feature_wise_threaded( df, categories_dic, labels_dic) # encoded labels labels_encoded = list(categories_dic.keys()) # preserve types dtype_orig = {**df.dtypes.to_dict(), **dtypes_dic} dtype_orig = pd.DataFrame(dtype_orig, index=[0]).dtypes for label in labels_encoded: del dtype_orig[label] # combine df = df.drop(labels_encoded, axis=1) df = pd.concat([df] + pool_df_encoded, axis=1) df = df.astype(dtype_orig) return df def __factoring_feature_wise_series(self, df: PandasDataFrame, categories_dic: Dict, labels_dic: Dict) -> List: """Categorise features that are selected (single-threaded). :param df: the features dataframe. :param categories_dic: the dictionary of the categorical states for the included features. :param labels_dic: the dictionary of the features names of the categorised features. :return: the categorised features. """ self.__logger.debug("Categorise (single-threaded).") factoring_thread = FactoringThread(df, categories_dic, labels_dic) pool_df_encoded = [] try: for label_group in categories_dic.keys(): pool_df_encoded.append( factoring_thread.factor_arr(label_group)) except ValueError as exception: self.__logger.error(__name__ + " - Invalid configuration(s): " + str(exception)) sys.exit() return pool_df_encoded def __factoring_feature_wise_threaded(self, df: PandasDataFrame, categories_dic: Dict, labels_dic: Dict) -> List: """Categorise features that are selected (multi-threaded). :param df: the features dataframe. :param categories_dic: the dictionary of the categorical states for the included features. :param labels_dic: the dictionary of the features names of the categorised features. :return: the categorised features. """ self.__logger.debug("Categorise (multi-threaded).") factoring_thread = FactoringThread(df, categories_dic, labels_dic) try: with mp.Pool() as pool: pool_df_encoded = pool.map( partial(factoring_thread.factor_arr), categories_dic.keys()) except ValueError as exception: self.__logger.error(__name__ + " - Invalid configuration(s): " + str(exception)) sys.exit() return pool_df_encoded def transform_df(self, df: PandasDataFrame, excludes: List, transform_type: str, threaded: bool = False, method_args: Dict = None, **kwargs: Any) -> [PandasDataFrame, Dict]: """Transform the included features, using the selected and configured method. :param df: the features dataframe. :param excludes: the name of excluded features. :param transform_type: the transformation type (options: 'scale', 'robust_scale', 'max_abs_scalar', 'normalizer', 'kernel_centerer', 'yeo_johnson', 'box_cox') :param threaded: indicates if it is multi-threaded. :param method_args: the transformation arguments, which needs to preserved if it is applied to more than one data set. :param kwargs: the input argument for the selected transformation function. :return: the inputted dataframe with transformed features (if applicable). 
""" self.__logger.info("Transform Features.") excludes = set(excludes) includes = [ label for label in df.columns.values if label not in excludes ] method_args = dict() if method_args is None else method_args # preserve types dtype_orig = df.dtypes.to_dict() for label in includes: dtype_orig[label] = 'f8' dtype_orig = pd.DataFrame(dtype_orig, index=[0]).dtypes df = df.astype(dtype_orig) # transform if threaded is False: df, method_args = self.__transform_df_series( df, includes, transform_type, **kwargs) else: df, method_args = self.__transform_df_threaded( df, includes, transform_type, method_args, **kwargs) return df, method_args def __transform_df_series(self, df: PandasDataFrame, includes: List, transform_type: str, method_args: Dict = None, **kwargs: Any) -> [PandasDataFrame, Dict]: """Transform the included features, using the selected and configured method (single-threaded). :param df: the features dataframe. :param includes: the name of included features. :param transform_type: the transformation type (options: 'scale', 'robust_scale', 'max_abs_scalar', 'normalizer', 'kernel_centerer', 'yeo_johnson', 'box_cox') :param method_args: the transformation arguments, which needs to preserved if it is applied to more than one data set. :param kwargs: the input argument for the selected transformation function. :return: the transformed feature. """ self.__logger.debug("Transform features (single-threaded).") transform_thread = TransformThread(**kwargs) method_args = dict() if method_args is None else method_args try: if transform_type == "scale": for name in includes: transform_thread.transform_scale_arr(df, method_args, name) elif transform_type == "robust_scale": for name in includes: transform_thread.transform_robust_scale_arr( df, method_args, name) elif transform_type == "max_abs_scalar": for name in includes: transform_thread.transform_max_abs_scalar_arr( df, method_args, name) elif transform_type == "normalizer": for name in includes: transform_thread.transform_normalizer_arr( df, method_args, name) elif transform_type == "kernel_centerer": for name in includes: transform_thread.transform_kernel_centerer_arr( df, method_args, name) elif transform_type == "yeo_johnson": for name in includes: transform_thread.transform_yeo_johnson_arr( df, method_args, name) elif transform_type == "box_cox": for name in includes: transform_thread.transform_box_cox_arr( df, method_args, name) else: raise Exception(transform_type) except ValueError as exception: self.__logger.error(__name__ + " - Invalid configuration(s): " + str(exception)) sys.exit() return df, method_args def __transform_df_threaded(self, df: PandasDataFrame, includes: List, transform_type: str, method_args: Dict = None, **kwargs: Any) -> [PandasDataFrame, Dict]: """Transform the included features, using the selected and configured method (multi-threaded). :param df: the features dataframe. :param includes: the name of included features. :param transform_type: the transformation arguments, which needs to preserved if it is applied to more than one data set. :param method_args: the transformation arguments, which needs to preserved if it is applied to more than one data set. :param kwargs: the input argument for the selected transformation function. :return: the transformed feature. 
""" self.__logger.debug("Transform features (multi-threaded).") manager = mp.Manager() dt = manager.dict( list(zip(df[includes].columns, df[includes].T.values.tolist()))) transform_thread = TransformThread(**kwargs) method_args = dict() if method_args is None else method_args # run try: with mp.Pool(processes=(mp.cpu_count() - 1)) as pool: if transform_type == "scale": pool.map( partial(transform_thread.transform_scale_arr, dt, method_args), includes) elif transform_type == "robust_scale": pool.map( partial(transform_thread.transform_robust_scale_arr, dt, method_args), includes) elif transform_type == "max_abs_scalar": pool.map( partial(transform_thread.transform_max_abs_scalar_arr, dt, method_args), includes) elif transform_type == "normalizer": pool.map( partial(transform_thread.transform_normalizer_arr, dt, method_args), includes) elif transform_type == "kernel_centerer": pool.map( partial(transform_thread.transform_kernel_centerer_arr, dt, method_args), includes) elif transform_type == "yeo_johnson": pool.map( partial(transform_thread.transform_yeo_johnson_arr, dt, method_args), includes) elif transform_type == "box_cox": pool.map( partial(transform_thread.transform_box_cox_arr, dt, method_args), includes) else: raise Exception(transform_type) except ValueError as exception: self.__logger.error(__name__ + " - Invalid configuration(s): " + str(exception)) sys.exit() # set for k, v in dt.items(): df[k] = v return df, method_args def high_linear_correlation_df( self, df: PandasDataFrame, excludes: List, file_name: str, thresh_corr_cut: float = 0.95, to_search: bool = True ) -> [PandasDataFrame, CollectionsOrderedDict]: """Find and optionally remove the selected highly linearly correlated features. The Pearson correlation coefficient was calculated for all the pair of variables to measure linear dependence between them. :param df: the features dataframe. :param excludes: the name of excluded features. :param file_name: the name of the summary output file. :param thresh_corr_cut: the numeric value for the pair-wise absolute correlation cutoff. e.g. 0.95. :param to_search: to search or use the saved configuration. :return: the inputted dataframe with exclusion of features that were selected to be removed. """ self.__logger.debug( "Remove features with high linear correlation (if applicable).") corr = None df_excludes = df[excludes] excludes = set(excludes) matches = [] summaries = OrderedDict() # search if to_search is True: corr = df[[col for col in df.columns if col not in excludes]].corr(method='pearson') for label in corr.columns.values: matches_temp = list( corr[abs(corr[label]) >= thresh_corr_cut].index) if len(matches_temp) > 1: # set matches try: matches_temp.remove(label) except ValueError and AttributeError: pass # not in some-list! OR not behaving like a list! 
                    matches = np.union1d(matches, matches_temp)
                    # summaries
                    for match in matches_temp:
                        if match in summaries.keys():
                            matches_temp.remove(match)
                    if len(matches_temp) > 0:
                        summaries[label] = matches_temp
                        self.__logger.info("High Linear Correlation: " + label + " ~ " + str(matches_temp))

        # delete
        df = self.__remove(df, summaries, to_search, os.path.join(self.__output_path, file_name + ".ini"))
        for name in excludes:
            df[name] = df_excludes[name]
        if any(np.isnan(df.index)):
            df = df.reset_index(drop=True)

        # summaries
        if to_search is True:
            summaries["Features Matches"] = matches
            summaries["Correlation Matrix"] = corr
        return df, summaries

    def near_zero_var_df_sklearn(self, df: PandasDataFrame, excludes: List, file_name: str,
                                 thresh_variance: float = 0.05,
                                 to_search: bool = True) -> [PandasDataFrame, CollectionsOrderedDict]:
        """Find and optionally remove the selected near-zero-variance features (scikit-learn algorithm).
        This feature selector removes all low-variance features. The algorithm looks only at the features (X),
        not the desired outputs (y), and can thus be used for unsupervised learning.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param file_name: the name of the summary output file.
        :param thresh_variance: features with a training-set variance lower than this threshold will be removed.
            The default is to keep all features with non-zero variance, i.e. remove the features that have the
            same value in all samples.
        :param to_search: to search or use the saved configuration.
        :return: the inputted dataframe with exclusion of features that were selected to be removed.
        """
        self.__logger.debug("Remove features with near-zero-variance (if applicable), using the scikit-learn algorithm.")
        df_excludes = df[excludes]
        excludes = set(excludes)
        matches = []
        indices = OrderedDict()
        summaries = OrderedDict()

        # find indices
        for label in df.columns.values:
            indices[df.columns.get_loc(label)] = label

        # search
        if to_search is True:
            variances_ = feature_selection.VarianceThreshold(thresh_variance)
            # fit the selector, then invert the support mask: get_support() flags the retained
            # (sufficient-variance) features, so the near-zero-variance matches are the remaining columns
            variances_.fit(df)
            matches_indices = np.where(~variances_.get_support())[0]
            matches_labels = [indices[index] for index in matches_indices]
            for match in matches_labels:
                if match not in excludes:
                    matches += [match]

        # delete
        df = self.__remove(df, {'NZV': list(matches)}, to_search,
                           os.path.join(self.__output_path, file_name + ".ini"))
        for name in excludes:
            df[name] = df_excludes[name]
        if any(np.isnan(df.index)):
            df = df.reset_index(drop=True)

        # summaries
        if to_search is True:
            summaries["Features Matches"] = matches
        return df, summaries

    def near_zero_var_df(self, df: PandasDataFrame, excludes: List, file_name: str,
                         thresh_unique_cut: float = 100, thresh_freq_cut: float = 1000,
                         to_search: bool = True) -> [PandasDataFrame, CollectionsOrderedDict]:
        """Find and optionally remove the selected near-zero-variance features (custom algorithm).
        Features whose constant counts are less than or equal to a threshold may be filtered out, to exclude
        highly constant and near-zero-variance features. The rules are as follows:
        - Frequency ratio: the frequency of the most prevalent value over the second most frequent value must
          be greater than a threshold;
        - Percent of unique values: the number of unique values divided by the total number of samples must be
          greater than the threshold.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param file_name: the name of the summary output file.
:param thresh_unique_cut: the cutoff for the percentage of distinct values out of the number of total samples (upper limit). e.g. 10 * 100 / 100. :param thresh_freq_cut: the cutoff for the ratio of the most common value to the second most common value (lower limit). e.g. 95/5. :param to_search: to search or use the saved configuration. :return: the inputted dataframe with exclusion of features that were selected to be removed. """ self.__logger.debug( "Remove features with near-zero-variance (if applicable), using custom algorithm." ) df_excludes = df[excludes] excludes = set(excludes) matches = [] summaries = OrderedDict() # search if to_search is True: for label in df.columns.values: # set match and summaries # check of NaN if not isinstance(df[label].iloc[0], (int, np.int, float, np.float)) \ or np.isnan(np.sum(df[label])): matches += [label] continue # check of near zero variance match, summaries[label] = self.__near_zero_var( df[label], label, excludes, thresh_unique_cut, thresh_freq_cut) if match is True: matches += [label] self.__logger.info("Near Zero Variance: " + label) # to_remove df = self.__remove( df, {'NZV': list(matches)}, to_search, os.path.join(self.__output_path, file_name + ".ini")) for name in excludes: df[name] = df_excludes[name] if any(np.isnan(df.index)): df = df.reset_index(drop=True) # summaries if to_search is True: summaries["Features Matches"] = matches return df, summaries def __near_zero_var(self, arr: List, label: str, excludes: set, thresh_unique_cut: float, thresh_freq_cut: float) -> [bool, Dict]: """Assess a single feature for near-zero-variance (custom algorithm). The features that had constant counts less than or equal a threshold may be filtered out, to exclude highly constants and near-zero variances. Rules are as the following: - Frequency ratio: The frequency of the most prevalent value over the second most frequent value to be greater than a threshold; - Percent of unique values: The number of unique values divided by the total number of samples to be greater than the threshold. :param arr: the feature value. :param label: the feature name. :param excludes: the name of excluded features. :param thresh_unique_cut: the cutoff for the percentage of distinct values out of the number of total samples (upper limit). e.g. 10 * 100 / 100. :param thresh_freq_cut: the cutoff for the ratio of the most common value to the second most common value (lower limit). e.g. 95/5. :return: indicates if the feature has near-zero-variance. """ self.__logger.debug( "Find near-zero-variance (if applicable), using custom algorithm.") unique, counts = np.unique(arr, return_counts=True) if len(counts) == 1: return True, {'unique': list(unique), 'counts': list(counts)} else: counts = sorted(counts, reverse=True) if label not in excludes and (len(unique) * 100) / float( len(arr)) > thresh_unique_cut: return True, {'unique': list(unique), 'counts': list(counts)} if label not in excludes and counts[0] / float( counts[1]) > thresh_freq_cut: return True, {'unique': list(unique), 'counts': list(counts)} else: return False, {'unique': list(unique), 'counts': list(counts)} def __remove(self, df: PandasDataFrame, dict_matches: Dict, to_search: bool, path: str, section: str = "features") -> PandasDataFrame: """Confirm removals and if confirmed, then re-read the selected features, then remove :param df: the features dataframe. :param dict_matches: the matched features. :param to_search: to search or use the saved configuration. :param path: the file path to the configuration file. 
:param section: the section name in the configuration file. :return: the updated features. """ self.__logger.debug("Confirm removals and implement removal process.") config = PyConfigParser(path, CONSTANTS.app_name) if to_search is True: # write to config config.reset() config.write_dict(dict_matches, section) # confirm response = self.__readers_writers.question_overwrite( "the features defined in the following file to be removed: " + path) if response is False: config.reset() return df # if to_search is False or response was yes then read from config config.refresh() dict_matches = config.read_dict(section) # remove self.__logger.debug("The feature removal list: " + ",".join(dict_matches)) labels = [ label for label_group in dict_matches.values() for label in label_group if label in df ] if len(labels) > 0: df = df.drop(labels, axis=1) return df
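# The sketch below is a hedged usage example (not from the source): it assumes a features
# dataframe "df" and the illustrative column names "patientID" and "label"; the output file
# titles are placeholders. It shows the typical PreProcess flow: descriptive statistics,
# scaling, then correlation and near-zero-variance filtering.
preprocess = PreProcess("outputs")
preprocess.stats_discrete_df(df, includes=["gender"], file_name="stats_discrete")
preprocess.stats_continuous_df(df, includes=["age"], file_name="stats_continuous")

df, method_args = preprocess.transform_df(df, excludes=["patientID", "label"], transform_type="scale")
df, _ = preprocess.high_linear_correlation_df(df, excludes=["patientID", "label"],
                                              file_name="corr", thresh_corr_cut=0.95)
df, _ = preprocess.near_zero_var_df(df, excludes=["patientID", "label"], file_name="nzv")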
class FeatureParser: def __init__(self, variables_settings: PandasDataFrame, output_path: str, output_table: str): """Initialise the objects and constants. :param variables_settings: :param output_path: the output path. :param output_table: the output table name. """ self.__logger = logging.getLogger(CONSTANTS.app_name) self.__logger.debug(__name__) self.__variables_settings = variables_settings self.__output_path = output_path self.__output_table = output_table self.__readers_writers = ReadersWriters() self.__FeatureParserThread = FeatureParserThread() def generate(self, history_table: str, features: PandasDataFrame, variables: PandasDataFrame, prevalence: Dict) -> PandasDataFrame: """ :param history_table: the source table alias name (a.k.a. history table name) that features belong to (e.g. inpatient, or outpatient). :param features: the output features. :param variables: the input variables. :param prevalence: the prevalence dictionary of values for all the variables. :return: the output features. """ variables_settings = self.__variables_settings[ self.__variables_settings["Table_History_Name"] == history_table] for _, row in variables_settings.iterrows(): self.__logger.info("variable: " + row["Variable_Name"] + " ...") if not pd.isnull(row["Variable_Aggregation"]): postfixes = row["Variable_Aggregation"].replace(' ', '').split(',') # aggregate stats features_temp = self.__aggregate( variables[row["Variable_Name"]], row["Variable_Type_Original"], postfixes, prevalence[row["Variable_Name"]]) for p in range(len(postfixes)): # feature name feature_name = row["Variable_Name"] + "_" + postfixes[p] # set features[feature_name] = features_temp[:, p] else: # init and replace none by zero features_temp = np.nan_to_num(variables[row["Variable_Name"]]) features_temp = np.where(features_temp == np.array(None), 0, features_temp) # set features[row["Variable_Name"]] = features_temp return features def __aggregate(self, variable: PandasDataFrame, variable_type: str, postfixes: str, prevalence: Dict) -> NumpyNdarray: """ :param variable: the input variable. :param variable_type: the type of input variable. :param postfixes: name of the aggregation functions. :param prevalence: the prevalence dictionary of values for all the variables. :return: the aggregated variable. """ try: with mp.Pool() as pool: features_temp = pool.map( partial(self.__FeatureParserThread.aggregate_cell, postfixes, variable_type, prevalence), variable) except ValueError as exception: self.__logger.error(__name__ + " - Invalid configuration(s): " + str(exception)) sys.exit() features_temp = np.asarray(features_temp) return features_temp def prevalence(self, variable: PandasDataFrame, variable_name: str) -> List: """ :param variable: the input variable. :param variable_name: the name of the input variable. :return: the prevalence of values for all the variables. """ try: with mp.Pool() as pool: prevalence_temp = pool.map( partial(self.__FeatureParserThread.prevalence_cell), variable) except ValueError as exception: self.__logger.error(__name__ + " - Invalid configuration(s): " + str(exception)) sys.exit() prevalence_temp = [sub2 for sub1 in prevalence_temp for sub2 in sub1] prevalence = Counter(prevalence_temp).most_common() self.__readers_writers.save_text( self.__output_path, self.__output_table, [ variable_name, '; '.join( [str(p[0]) + ":" + str(p[1]) for p in prevalence]) ], append=True, ext="txt") prevalence = [p[0] for p in prevalence] return prevalence
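# A hedged sketch of driving FeatureParser directly (the Variables class below wires this up
# against MySQL): "variables_settings", "variables_df", "feature_names" and the "inpatient"
# history-table alias are illustrative placeholders, not names taken from the source.
feature_parser = FeatureParser(variables_settings, "outputs", "features_output")

# prevalence of the raw values of one variable, also written to the output text file
prevalence = {"diagnosis_code": feature_parser.prevalence(variables_df["diagnosis_code"],
                                                          "diagnosis_code")}

# generate the aggregated features for one history table into a zero-initialised frame
features = pd.DataFrame(0, index=range(len(variables_df)), columns=feature_names)
features = feature_parser.generate("inpatient", features, variables_df, prevalence)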
class Variables: def __init__(self, model_features_table: str, input_path: str, output_path: str, input_features_configs: str, output_table: str): """Initialise the objects and constants. :param model_features_table: the feature table name. :param input_path: the input path. :param output_path: the output path. :param input_features_configs: the input features' configuration file. :param output_table: the output table name. """ self.__logger = logging.getLogger(CONSTANTS.app_name) self.__logger.debug(__name__) self.__model_features_table = model_features_table self.__output_path = output_path self.__output_table = output_table self.__readers_writers = ReadersWriters() # initialise settings self.__variables_settings = self.__init_settings( input_path, input_features_configs) self.__features_dic_names = self.__init_features_names() self.__features_dic_dtypes = self.__init_features_dtypes() self.__init_output(output_path, output_table) def set(self, input_schemas: List, input_tables: List, history_tables: List, column_index: str, query_batch_size: int): """Set the variables by reading the selected features from MySQL database. :param input_schemas: the mysql database schemas. :param input_tables: the mysql table names. :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to (e.g. inpatient, or outpatient). :param column_index: the name of index column (unique integer value) in the database table, which is used for batch reading the input. :param query_batch_size: the number of rows to be read in each batch. :return: """ self.__logger.debug(__name__) query_batch_start, query_batch_max = self.__init_batch( input_schemas[0], input_tables[0]) features_names, features_dtypes = self.__set_features_names_types() self.__validate_mysql_names(input_schemas, input_tables) prevalence = self.__init_prevalence(input_schemas, input_tables, history_tables) self.__set_batch(features_names, features_dtypes, input_schemas, input_tables, history_tables, column_index, prevalence, query_batch_start, query_batch_max, query_batch_size) def __init_settings(self, input_path: str, input_features_configs: str) -> PandasDataFrame: """Read and set the settings of input variables that are selected. :param input_path: the path of the input file. :param input_features_configs: the input features' configuration file. :return: the input variables settings. """ self.__logger.debug(__name__) variables_settings = self.__readers_writers.load_csv( input_path, input_features_configs, 0, True) variables_settings = variables_settings.loc[ (variables_settings["Selected"] == 1) & (variables_settings["Table_Reference_Name"] == self.__model_features_table)] variables_settings = variables_settings.reset_index() return variables_settings def __init_features_names(self) -> Dict: """Generate the features names, based on variable name, source table alias name (a.k.a. history table name), and the aggregation function name. :return: the name of features. 
""" self.__logger.debug(__name__) table_history_names = set( self.__variables_settings["Table_History_Name"]) features_names = dict( zip(table_history_names, [[] for _ in range(len(table_history_names))])) for _, row in self.__variables_settings.iterrows(): if not pd.isnull(row["Variable_Aggregation"]): postfixes = row["Variable_Aggregation"].replace(' ', '').split(',') for postfix in postfixes: features_names[row["Table_History_Name"]].append( row["Variable_Name"] + "_" + postfix) else: features_names[row["Table_History_Name"]].append( row["Variable_Name"]) return features_names def __init_features_dtypes(self) -> Dict: """Generate the features types, based on the input configuration file. :return: the dtypes of features. """ self.__logger.debug(__name__) table_history_names = set( self.__variables_settings["Table_History_Name"]) features_dtypes = dict( zip(table_history_names, [[] for _ in range(len(table_history_names))])) for _, row in self.__variables_settings.iterrows(): feature_types = row["Variable_dType"].replace(' ', '').split(',') for feature_type in feature_types: features_dtypes[row["Table_History_Name"]].append(feature_type) return features_dtypes def __init_output(self, output_path: str, output_table: str): """Initialise the output file by writing the header row. :param output_path: the output path. :param output_table: the output table name. """ self.__logger.debug(__name__) keys = sorted(self.__features_dic_names.keys()) features_names = [ f for k in keys for f in self.__features_dic_names[k] ] self.__readers_writers.reset_csv(output_path, output_table) self.__readers_writers.save_csv(output_path, output_table, features_names, append=False) def __init_prevalence(self, input_schemas: List, input_tables: List, history_tables: List) -> Dict: """Generate the prevalence dictionary of values for all the variables. :param input_schemas: the mysql database schemas. :param input_tables: the mysql table names. :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to (e.g. inpatient, or outpatient). :return: the prevalence dictionary of values for all the variables. 
""" self.__readers_writers.save_text( self.__output_path, self.__output_table, ["Feature Name", "Top Prevalence Feature Name"], append=False, ext="ini") self.__readers_writers.save_text( self.__output_path, self.__output_table, ["Feature Name", "Prevalence & Freq."], append=False, ext="txt") feature_parser = FeatureParser(self.__variables_settings, self.__output_path, self.__output_table) prevalence = dict() # for tables for table_i in range(len(input_schemas)): variables_settings = self.__variables_settings[ self.__variables_settings["Table_History_Name"] == history_tables[table_i]] prevalence[input_tables[table_i]] = dict() # for features for _, row in variables_settings.iterrows(): self.__logger.info("Prevalence: " + row["Variable_Name"] + " ...") if not pd.isnull(row["Variable_Aggregation"]): # read features variables = self.__init_prevalence_read( input_schemas[table_i], input_tables[table_i], row["Variable_Name"]) # validate if variables is None or len(variables) == 0: continue # prevalence prevalence[input_tables[table_i]][row["Variable_Name"]] = \ feature_parser.prevalence(variables[row["Variable_Name"]], row["Variable_Name"]) # for sub features postfixes = row["Variable_Aggregation"].replace( ' ', '').split(',') for p in range(len(postfixes)): feature_name = row["Variable_Name"] + "_" + postfixes[p] if len(postfixes[p] ) > 11 and postfixes[p][0:11] == "prevalence_": index = int(postfixes[p].split('_')[1]) - 1 feature_name_prevalence = "None" if index < len(prevalence[input_tables[table_i]][ row["Variable_Name"]]): feature_name_prevalence = \ feature_name + "_" + \ str(prevalence[input_tables[table_i]][row["Variable_Name"]][index]) # save prevalence self.__readers_writers.save_text( self.__output_path, self.__output_table, [feature_name, feature_name_prevalence], append=True, ext="ini") return prevalence def __init_prevalence_read(self, input_schema: str, input_table: str, variable_name: str) -> PandasDataFrame: """Read a variable from database, to calculate the prevalence of the values. :param input_schema: the mysql database schema. :param input_table: the mysql database table. :param variable_name: the variable name. :return: the selected variable. """ query = "SELECT `" + variable_name + "` FROM `" + input_table + "`;" return self.__readers_writers.load_mysql_query(query, input_schema, dataframing=True) def __init_batch(self, input_schema: str, input_table: str) -> [int, int]: """Find the minimum and maximum value of the index column, to use when reading mysql tables in batches. :param input_schema: the mysql database schema. :param input_table: the mysql database table. :return: the minimum and maximum of the index column. """ self.__logger.debug(__name__) query = "select min(localID), max(localID) from `" + input_table + "`;" output = list( self.__readers_writers.load_mysql_query(query, input_schema, dataframing=False)) if [r[0] for r in output][0] is None: self.__logger.error(__name__ + " No data is found: " + query) sys.exit() query_batch_start = int([r[0] for r in output][0]) query_batch_max = int([r[1] for r in output][0]) return query_batch_start, query_batch_max def __set_features_names_types(self): """Produce the sorted lists of features names and features dtypes. :return: the sorted lists of features names and features dtypes. 
""" self.__logger.debug(__name__) keys = sorted(self.__features_dic_names.keys()) features_names = [ f for k in keys for f in self.__features_dic_names[k] ] features_dtypes = [ pd.Series(dtype=f) for k in keys for f in self.__features_dic_dtypes[k] ] features_dtypes = pd.DataFrame( dict(zip(features_names, features_dtypes))).dtypes return features_names, features_dtypes def __set_batch(self, features_names: list, features_dtypes: Dict, input_schemas: List, input_tables: List, history_tables: List, column_index: str, prevalence: Dict, query_batch_start: int, query_batch_max: int, query_batch_size: int): """Using batch processing first read variables, then generate features and write them into output. :param features_names: the name of features that are selected. :param features_dtypes: the dtypes of features that are selected. :param input_schemas: the mysql database schemas. :param input_tables: the mysql table names. :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to (e.g. inpatient, or outpatient). :param column_index: the name of index column (unique integer value) in the database table, which is used for batch reading the input. :param prevalence: the prevalence dictionary of values for all the variables. :param query_batch_start: the minimum value of the column index. :param query_batch_max: the maximum value of the column index. :param query_batch_size: the number of rows to be read in each batch. """ self.__logger.debug(__name__) feature_parser = FeatureParser(self.__variables_settings, self.__output_path, self.__output_table) step = -1 batch_break = False while not batch_break: step += 1 features = None for table_i in range(len(input_schemas)): self.__logger.info("Batch: " + str(step) + "; Table: " + input_tables[table_i]) # read job variables = self.__set_batch_read(input_schemas[table_i], input_tables[table_i], step, column_index, query_batch_start, query_batch_max, query_batch_size) # validate if variables is None: batch_break = True break elif len(variables) == 0: continue # process job if features is None: features = pd.DataFrame(0, index=range(len(variables)), columns=features_names) features = features.astype(dtype=features_dtypes) features = self.__set_batch_process( feature_parser, history_tables[table_i], features, variables, prevalence[input_tables[table_i]]) # write job if features is not None: features = features.astype(dtype=features_dtypes) self.__set_batch_write(features) def __set_batch_read( self, input_schema: str, input_table: str, step: int, column_index: str, query_batch_start: int, query_batch_max: int, query_batch_size: int) -> Callable[[PandasDataFrame, None], None]: """Read the queried variables. :param input_schema: the mysql database schema. :param input_table: the mysql database table. :param step: the batch id. :param column_index: the name of index column (unique integer value) in the database table, which is used for batch reading the input. :param query_batch_start: the minimum value of the column index. :param query_batch_max: the maximum value of the column index. :param query_batch_size: the number of rows to be read in each batch. :return: the queried variables. 
""" step_start = query_batch_start + step * query_batch_size step_end = step_start + query_batch_size if step_start >= query_batch_max: return None # read query = "SELECT * FROM `" + input_table + \ "` WHERE `" + str(column_index) + "` >= " + str(step_start) + \ " AND `" + str(column_index) + "` < " + str(step_end) + ";" return self.__readers_writers.load_mysql_query(query, input_schema, dataframing=True) def __set_batch_process(self, feature_parser: FeaturesFeatureParser, history_table: str, features: PandasDataFrame, variables: PandasDataFrame, prevalence: List) -> PandasDataFrame: """Process variables and generate features. :param feature_parser: :param history_table: the source table alias name (a.k.a. history table name) that features belong to (e.g. inpatient, or outpatient). :param features: the output features. :param variables: the input variables. :param prevalence: the prevalence dictionary of values for all the variables. :return: the generated features. """ return feature_parser.generate(history_table, features, variables, prevalence) def __set_batch_write(self, features: PandasDataFrame): """Write the features into an output file. :param features: the output features. """ self.__readers_writers.save_csv(self.__output_path, self.__output_table, features, append=True) def __validate_mysql_names(self, input_schemas: List, history_tables: List): """Validate mysql tables and their columns, and generate exception if table/column name is invalid. :param input_schemas: the mysql database schemas. :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to (e.g. inpatient, or outpatient). """ # for tables for table_i in range(len(input_schemas)): variables_settings = self.__variables_settings[ self.__variables_settings["Table_History_Name"] == history_tables[table_i]] # validate table name if not self.__readers_writers.exists_mysql( input_schemas[table_i], history_tables[table_i]): self.__logger.error(__name__ + " - Table does not exist: " + history_tables[table_i]) sys.exit() # for features for _, row in variables_settings.iterrows(): # validate column name if not self.__readers_writers.exists_mysql_column( input_schemas[table_i], history_tables[table_i], row["Variable_Name"]): self.__logger.error(__name__ + " - Column does not exist: " + row["Variable_Name"]) sys.exit()
class TrainingMethod: def __init__(self, method_name: str, path: str = None, title: str = None): """Initialise the objects and constants. :param method_name: the training method that will be used (options: {'lr': Logistic Regression, 'lr_cv': Logistic Regression with Cross-Validation, 'mlm': Mixed Linear Model, 'rfc': Random Forest Classifier, 'gbc': Gradient Boosting Classifier, 'dtc' Decision Tree Classifier, 'knc': K-Nearest Neighbors Classifier, 'nb': Multinomial Naive Bayes, 'nn': Multi-Layer Perceptron (MLP) Neural Network}). :param path: the directory path of the saved trained model file, using this application (if applicable). :param title: the file name of the saved trained model file, using this application """ self.__logger = logging.getLogger(CONSTANTS.app_name) self.__logger.debug(__name__) self.__readers_writers = ReadersWriters() self.__method = None self.method_name = method_name self.model_labels = None self.model_train = None self.model_predict = dict() self.model_cross_validate = None if method_name is not None: self.__init__method(method_name) else: self.load(path, title) def __init__method(self, method_name: str, model_labels: List = None, model_train: Any = None, model_predict: Dict = None, model_cross_validate: NumpyNDArray = None): """Initialise the selected training method. :param method_name: the training method that will be used (options: {'lr': Logistic Regression, 'lr_cv': Logistic Regression with Cross-Validation, 'mlm': Mixed Linear Model, 'rfc': Random Forest Classifier, 'gbc': Gradient Boosting Classifier, 'dtc' Decision Tree Classifier, 'knc': K-Nearest Neighbors Classifier, 'nb': Multinomial Naive Bayes, 'nn': Multi-Layer Perceptron (MLP) Neural Network}). :param model_labels: the features names to be inputted into the model. Note: the order of features will be preserved internally. :param model_train: the training model. :param model_predict: the prediction outputs. :param model_cross_validate: the cross-validation model. """ self.__logger.debug("Initialise the training method.") if method_name == "lr": self.__method = _LogisticRegression() elif method_name == "lr_cv": self.__method = _LogisticRegressionCV() elif method_name == "mlm": self.__method = _MixedLinearModel() elif method_name == "rfc": self.__method = _RandomForestClassifier() elif method_name == "gbc": self.__method = _GradientBoostingClassifier() elif method_name == "dtc": self.__method = _DecisionTreeClassifier() elif method_name == "knc": self.__method = _KNeighborsClassifier() elif method_name == "nb": self.__method = _NaiveBayes() elif method_name == "nn": self.__method = _NeuralNetwork() else: self.__logger.error(__name__ + " - Invalid training method: " + str(method_name)) sys.exit() self.model_labels = model_labels self.model_train = model_train self.model_predict = dict() if model_predict is None else model_predict self.model_cross_validate = model_cross_validate def train(self, features_indep_df: PandasDataFrame, feature_target: List, **kwargs: Any) -> Any: """Perform the training, using the selected method. :param features_indep_df: the independent features, which are inputted into the model. :param feature_target: the target feature, which is being estimated. :param kwargs: the training method's argument. :return: the trained model. 
""" self.__logger.debug("Train.") self.model_labels = list(features_indep_df.columns.values) self.model_train = self.__method.train( features_indep_df[self.model_labels], feature_target, self.model_labels, **kwargs) return self.model_train def plot(self) -> Any: """Plot the tree diagram. :return: the model graph. """ self.__logger.debug("Plot.") return self.__method.plot(self.model_train, self.model_labels, ["True", "False"]) def train_summaries(self) -> Any: """ Produce the training summary. :return: the training summary. """ self.__logger.debug("Summarise training model.") return self.__method.train_summaries(self.model_train) def predict(self, features_indep_df: PandasDataFrame, sample_name: str) -> PandasDataFrame: """Predict probability of labels, using the training model. :param features_indep_df: the independent features, which are inputted into the model. :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate'). :return: the predicted probabilities, and the predicted labels. """ self.__logger.debug("Predict.") self.model_predict[sample_name] = self.__method.predict( self.model_train, features_indep_df[self.model_labels]) return self.model_predict[sample_name] def predict_summaries(self, feature_target: List, sample_name: str) -> CollectionsOrderedDict: """roduce summary statistics for the prediction performance. :param feature_target: the target feature, which is being estimated. :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate'). :return: the prediction summaries. """ self.__logger.debug("Summarise predictions.") self.model_predict[sample_name]['target'] = feature_target return self.__method.predict_summaries(self.model_predict[sample_name], feature_target) def predict_summaries_risk_bands( self, feature_target: List, sample_name: str, cutoffs: List = np.arange(0, 1.05, 0.05) ) -> CollectionsOrderedDict: """Produce a summary statistics table for a range of cut-off points. :param feature_target: the target feature, which is being estimated. :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate'). :param cutoffs: a list of risk cut-off points. :return: the summary statistics table for the cut-off points. """ self.__logger.debug("Summarise predictions.") self.model_predict[sample_name]['target'] = feature_target return self.__method.predict_summaries_cutoffs_table( self.model_predict[sample_name]['score'], feature_target, cutoffs) def cross_validate(self, features_indep_df: PandasDataFrame, feature_target: List, scoring: str = "neg_mean_squared_error", cv: int = 10) -> Any: """Evaluate the model by performing cross-validation. :param features_indep_df: the independent features, which are inputted into the model. :param feature_target: the target feature, which is being estimated. :param scoring: the scoring method (default: 'neg_mean_squared_error'). :param cv: the cross-validation splitting strategy (optional). :return: the cross-validation summary """ self.__logger.info("Cross-Validate") self.model_cross_validate = cross_val_score( self.model_train, features_indep_df[self.model_labels], feature_target, scoring=scoring, cv=cv) return self.model_cross_validate def cross_validate_summaries(self) -> Any: """Produce a summary of the applied cross-validation :return: the cross-validation summary """ return self.model_cross_validate def save_model(self, path: str, title: str): """Save (pickle) the training model, as well as predictions and cross-validations. Note: summaries statistics won't not saved. 
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application.
        """
        self.__logger.info("Saving model")
        objects = dict()
        objects['method_name'] = self.method_name
        objects['model_labels'] = self.model_labels
        objects['model_train'] = self.model_train
        objects['model_predict'] = self.model_predict
        objects['model_cross_validate'] = self.model_cross_validate
        self.__readers_writers.save_serialised(path, title, objects=objects)

    def save_model_compressed(self, path: str, title: str):
        """Save (pickle) and compress the trained model, as well as the predictions and cross-validations.
        Note: the summary statistics are not saved.
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application.
        """
        self.__logger.debug("Save model.")
        objects = dict()
        objects['method_name'] = self.method_name
        objects['model_labels'] = self.model_labels
        objects['model_train'] = self.model_train
        objects['model_predict'] = self.model_predict
        objects['model_cross_validate'] = self.model_cross_validate
        self.__readers_writers.save_serialised_compressed(path, title, objects=objects)

    def load(self, path: str, title: str):
        """Load (unpickle) the model, which was saved using this application.
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application.
        """
        self.__logger.debug("Load model.")
        objects = self.__readers_writers.load_serialised(path, title)
        try:
            self.__init__method(method_name=objects['method_name'],
                                model_labels=objects['model_labels'],
                                model_train=objects['model_train'],
                                model_predict=objects['model_predict'],
                                model_cross_validate=objects['model_cross_validate'])
        except (KeyError, TypeError):
            # missing or malformed fields in the serialised object raise KeyError / TypeError
            self.__logger.error(__name__ + " - Invalid field(s) in the model file: " + path)
            sys.exit()
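# A hedged sketch of the TrainingMethod life cycle: train a random forest classifier, predict
# on a held-out sample, summarise, persist and reload. "X_train", "y_train", "X_test", "y_test"
# and the file title are assumptions for illustration.
training = TrainingMethod("rfc")
training.train(X_train, y_train)
training.predict(X_test, sample_name="test")
summaries = training.predict_summaries(y_test, sample_name="test")
training.save_model(path="outputs", title="model_rfc")

# passing method_name=None makes __init__ load a previously saved model instead
training_reloaded = TrainingMethod(None, path="outputs", title="model_rfc")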
os.makedirs(io_path, exist_ok=True)
logger = Logger(path=io_path, app_name=app_name, ext="log")
logger = logging.getLogger(app_name)

# Initialise constants and some of the classes

# In[ ]:

# Initialise constants
CONSTANTS.set(io_path, app_name)

# In[ ]:

# Initialise other classes
readers_writers = ReadersWriters()
preprocess = PreProcess(io_path)

# In[ ]:

# Set print settings
pd.set_option('display.width', 1600, 'display.max_colwidth', 800)
pp = pprint.PrettyPrinter(indent=4)

# ### 1.2. Initialise Features Metadata
# Read the input features' configuration file & store the features metadata

# In[ ]:

# variables settings
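# A hedged sketch of what this cell might contain, mirroring Variables.__init_settings: load
# the features configuration CSV and keep only the rows selected for the reference table.
# The file name and table name below are placeholders.
variables_settings = readers_writers.load_csv(io_path, "input_features_configs", 0, True)
variables_settings = variables_settings.loc[
    (variables_settings["Selected"] == 1)
    & (variables_settings["Table_Reference_Name"] == "features_inpatient")]
variables_settings = variables_settings.reset_index()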