Example #1

# Standard-library and third-party imports that the snippet below relies on.
# The remaining names (CONSTANTS, ReadersWriters, FeatureParserThread, and the
# type aliases PandasDataFrame / NumpyNdarray for pandas.DataFrame and
# numpy.ndarray) come from the surrounding project and are assumed to be
# importable.
import logging
import multiprocessing as mp
import sys
from collections import Counter
from functools import partial
from typing import Dict, List

import numpy as np
import pandas as pd

class FeatureParser:
    def __init__(self, variables_settings: PandasDataFrame, output_path: str,
                 output_table: str):
        """Initialise the objects and constants.
        :param variables_settings: the settings of the selected input variables.
        :param output_path: the output path.
        :param output_table: the output table name.
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)
        self.__variables_settings = variables_settings
        self.__output_path = output_path
        self.__output_table = output_table
        self.__readers_writers = ReadersWriters()
        self.__FeatureParserThread = FeatureParserThread()

    def generate(self, history_table: str, features: PandasDataFrame,
                 variables: PandasDataFrame,
                 prevalence: Dict) -> PandasDataFrame:
        """

        :param history_table: the source table alias name (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param features: the output features.
        :param variables: the input variables.
        :param prevalence: the prevalence dictionary of values for all the variables.
        :return: the output features.
        """
        variables_settings = self.__variables_settings[
            self.__variables_settings["Table_History_Name"] == history_table]

        for _, row in variables_settings.iterrows():
            self.__logger.info("variable: " + row["Variable_Name"] + " ...")

            if not pd.isnull(row["Variable_Aggregation"]):
                postfixes = row["Variable_Aggregation"].replace(' ',
                                                                '').split(',')
                # aggregate stats
                features_temp = self.__aggregate(
                    variables[row["Variable_Name"]],
                    row["Variable_Type_Original"], postfixes,
                    prevalence[row["Variable_Name"]])
                for p, postfix in enumerate(postfixes):
                    # feature name
                    feature_name = row["Variable_Name"] + "_" + postfix
                    # set the feature column
                    features[feature_name] = features_temp[:, p]
            else:
                # replace NaN and None values with zero
                features_temp = np.nan_to_num(variables[row["Variable_Name"]])
                features_temp = np.where(features_temp == np.array(None), 0,
                                         features_temp)
                # set
                features[row["Variable_Name"]] = features_temp
        return features

    def __aggregate(self, variable: PandasDataFrame, variable_type: str,
                    postfixes: List, prevalence: Dict) -> NumpyNdarray:
        """

        :param variable: the input variable.
        :param variable_type: the type of input variable.
        :param postfixes: name of the aggregation functions.
        :param prevalence: the prevalence dictionary of values for all the variables.
        :return: the aggregated variable.
        """
        try:
            with mp.Pool() as pool:
                features_temp = pool.map(
                    partial(self.__FeatureParserThread.aggregate_cell,
                            postfixes, variable_type, prevalence), variable)
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()

        features_temp = np.asarray(features_temp)
        return features_temp

    def prevalence(self, variable: PandasDataFrame,
                   variable_name: str) -> List:
        """
        :param variable: the input variable.
        :param variable_name: the name of the input variable.
        :return: the prevalence of values for all the variables.
        """
        try:
            with mp.Pool() as pool:
                prevalence_temp = pool.map(
                    partial(self.__FeatureParserThread.prevalence_cell),
                    variable)
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()

        prevalence_temp = [sub2 for sub1 in prevalence_temp for sub2 in sub1]
        prevalence = Counter(prevalence_temp).most_common()
        self.__readers_writers.save_text(
            self.__output_path,
            self.__output_table, [
                variable_name, '; '.join(
                    [str(p[0]) + ":" + str(p[1]) for p in prevalence])
            ],
            append=True,
            ext="txt")
        prevalence = [p[0] for p in prevalence]
        return prevalence
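
A minimal, standalone sketch of the prevalence computation above, assuming each
worker returns a list of values per cell (as prevalence_cell is expected to):
the nested lists are flattened and Counter ranks the values by frequency, which
mirrors what the prevalence method returns and saves.

from collections import Counter

# toy per-cell outputs, standing in for pool.map(prevalence_cell, variable)
per_cell_values = [["a", "b"], ["a"], ["b", "a", "c"]]
flat = [value for sub in per_cell_values for value in sub]
ranked = Counter(flat).most_common()
print(ranked)                          # [('a', 3), ('b', 2), ('c', 1)]
print([value for value, _ in ranked])  # ['a', 'b', 'c'], the returned order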
Example #2

# This snippet relies on the same imports and project names as Example #1,
# plus Tuple from typing; FeatureParser is the class defined in Example #1.
from typing import Tuple

class Variables:
    def __init__(self, model_features_table: str, input_path: str,
                 output_path: str, input_features_configs: str,
                 output_table: str):
        """Initialise the objects and constants.
        :param model_features_table: the feature table name.
        :param input_path: the input path.
        :param output_path: the output path.
        :param input_features_configs: the input features' configuration file.
        :param output_table: the output table name.
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)
        self.__model_features_table = model_features_table
        self.__output_path = output_path
        self.__output_table = output_table
        self.__readers_writers = ReadersWriters()
        # initialise settings
        self.__variables_settings = self.__init_settings(
            input_path, input_features_configs)
        self.__features_dic_names = self.__init_features_names()
        self.__features_dic_dtypes = self.__init_features_dtypes()
        self.__init_output(output_path, output_table)

    def set(self, input_schemas: List, input_tables: List,
            history_tables: List, column_index: str, query_batch_size: int):
        """Set the variables by reading the selected features from MySQL database.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param column_index: the name of index column (unique integer value) in the database table, which is used
            for batch reading the input.
        :param query_batch_size: the number of rows to be read in each batch.
        """
        self.__logger.debug(__name__)
        query_batch_start, query_batch_max = self.__init_batch(
            input_schemas[0], input_tables[0])
        features_names, features_dtypes = self.__set_features_names_types()
        self.__validate_mysql_names(input_schemas, input_tables)
        prevalence = self.__init_prevalence(input_schemas, input_tables,
                                            history_tables)
        self.__set_batch(features_names, features_dtypes, input_schemas,
                         input_tables, history_tables, column_index,
                         prevalence, query_batch_start, query_batch_max,
                         query_batch_size)

    def __init_settings(self, input_path: str,
                        input_features_configs: str) -> PandasDataFrame:
        """Read and set the settings of input variables that are selected.
        :param input_path: the path of the input file.
        :param input_features_configs: the input features' configuration file.
        :return: the input variables settings.
        """
        self.__logger.debug(__name__)
        variables_settings = self.__readers_writers.load_csv(
            input_path, input_features_configs, 0, True)
        variables_settings = variables_settings.loc[
            (variables_settings["Selected"] == 1)
            & (variables_settings["Table_Reference_Name"] ==
               self.__model_features_table)]
        variables_settings = variables_settings.reset_index()
        return variables_settings

    def __init_features_names(self) -> Dict:
        """Generate the features names, based on variable name, source table alias name (a.k.a. history table
            name), and the aggregation function name.
        :return: the name of features.
        """
        self.__logger.debug(__name__)
        table_history_names = set(
            self.__variables_settings["Table_History_Name"])
        features_names = {name: [] for name in table_history_names}
        for _, row in self.__variables_settings.iterrows():
            if not pd.isnull(row["Variable_Aggregation"]):
                postfixes = row["Variable_Aggregation"].replace(' ',
                                                                '').split(',')
                for postfix in postfixes:
                    features_names[row["Table_History_Name"]].append(
                        row["Variable_Name"] + "_" + postfix)
            else:
                features_names[row["Table_History_Name"]].append(
                    row["Variable_Name"])
        return features_names

    def __init_features_dtypes(self) -> Dict:
        """Generate the features types, based on the input configuration file.
        :return: the dtypes of features.
        """
        self.__logger.debug(__name__)
        table_history_names = set(
            self.__variables_settings["Table_History_Name"])
        features_dtypes = {name: [] for name in table_history_names}
        for _, row in self.__variables_settings.iterrows():
            feature_types = row["Variable_dType"].replace(' ', '').split(',')
            for feature_type in feature_types:
                features_dtypes[row["Table_History_Name"]].append(feature_type)
        return features_dtypes

    def __init_output(self, output_path: str, output_table: str):
        """Initialise the output file by writing the header row.
        :param output_path: the output path.
        :param output_table: the output table name.
        """
        self.__logger.debug(__name__)
        keys = sorted(self.__features_dic_names.keys())
        features_names = [
            f for k in keys for f in self.__features_dic_names[k]
        ]
        self.__readers_writers.reset_csv(output_path, output_table)
        self.__readers_writers.save_csv(output_path,
                                        output_table,
                                        features_names,
                                        append=False)

    def __init_prevalence(self, input_schemas: List, input_tables: List,
                          history_tables: List) -> Dict:
        """Generate the prevalence dictionary of values for all the variables.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :return: the prevalence dictionary of values for all the variables.
        """
        self.__readers_writers.save_text(
            self.__output_path,
            self.__output_table,
            ["Feature Name", "Top Prevalence Feature Name"],
            append=False,
            ext="ini")
        self.__readers_writers.save_text(
            self.__output_path,
            self.__output_table, ["Feature Name", "Prevalence & Freq."],
            append=False,
            ext="txt")
        feature_parser = FeatureParser(self.__variables_settings,
                                       self.__output_path, self.__output_table)
        prevalence = dict()

        # for tables
        for table_i in range(len(input_schemas)):
            variables_settings = self.__variables_settings[
                self.__variables_settings["Table_History_Name"] ==
                history_tables[table_i]]
            prevalence[input_tables[table_i]] = dict()

            # for features
            for _, row in variables_settings.iterrows():
                self.__logger.info("Prevalence: " + row["Variable_Name"] +
                                   " ...")
                if not pd.isnull(row["Variable_Aggregation"]):
                    # read features
                    variables = self.__init_prevalence_read(
                        input_schemas[table_i], input_tables[table_i],
                        row["Variable_Name"])

                    # validate
                    if variables is None or len(variables) == 0:
                        continue

                    # prevalence
                    prevalence[input_tables[table_i]][row["Variable_Name"]] = \
                        feature_parser.prevalence(variables[row["Variable_Name"]], row["Variable_Name"])

                    # for sub features
                    postfixes = row["Variable_Aggregation"].replace(
                        ' ', '').split(',')
                    for postfix in postfixes:
                        feature_name = row["Variable_Name"] + "_" + postfix
                        if len(postfix) > 11 and postfix.startswith(
                                "prevalence_"):
                            index = int(postfix.split('_')[1]) - 1
                            feature_name_prevalence = "None"
                            if index < len(prevalence[input_tables[table_i]][
                                    row["Variable_Name"]]):
                                feature_name_prevalence = \
                                    feature_name + "_" + \
                                    str(prevalence[input_tables[table_i]][row["Variable_Name"]][index])
                            # save prevalence
                            self.__readers_writers.save_text(
                                self.__output_path,
                                self.__output_table,
                                [feature_name, feature_name_prevalence],
                                append=True,
                                ext="ini")
        return prevalence

    def __init_prevalence_read(self, input_schema: str, input_table: str,
                               variable_name: str) -> PandasDataFrame:
        """Read a variable from database, to calculate the prevalence of the values.
        :param input_schema: the mysql database schema.
        :param input_table: the mysql database table.
        :param variable_name: the variable name.
        :return: the selected variable.
        """
        query = "SELECT `" + variable_name + "` FROM `" + input_table + "`;"
        return self.__readers_writers.load_mysql_query(query,
                                                       input_schema,
                                                       dataframing=True)

    def __init_batch(self, input_schema: str,
                     input_table: str) -> Tuple[int, int]:
        """Find the minimum and maximum value of the index column, to use when reading mysql tables in
            batches.
        :param input_schema: the mysql database schema.
        :param input_table: the mysql database table.
        :return: the minimum and maximum of the index column.
        """
        self.__logger.debug(__name__)
        query = "select min(localID), max(localID) from `" + input_table + "`;"
        output = list(
            self.__readers_writers.load_mysql_query(query,
                                                    input_schema,
                                                    dataframing=False))
        if output[0][0] is None:
            self.__logger.error(__name__ + " - No data was found: " + query)
            sys.exit()

        query_batch_start = int(output[0][0])
        query_batch_max = int(output[0][1])
        return query_batch_start, query_batch_max

    def __set_features_names_types(self):
        """Produce the sorted lists of features names and features dtypes.
        :return: the sorted lists of features names and features dtypes.
        """
        self.__logger.debug(__name__)
        keys = sorted(self.__features_dic_names.keys())
        features_names = [
            f for k in keys for f in self.__features_dic_names[k]
        ]
        features_dtypes = [
            pd.Series(dtype=f) for k in keys
            for f in self.__features_dic_dtypes[k]
        ]
        features_dtypes = pd.DataFrame(
            dict(zip(features_names, features_dtypes))).dtypes
        return features_names, features_dtypes

    def __set_batch(self, features_names: List, features_dtypes: Dict,
                    input_schemas: List, input_tables: List,
                    history_tables: List, column_index: str, prevalence: Dict,
                    query_batch_start: int, query_batch_max: int,
                    query_batch_size: int):
        """Using batch processing first read variables, then generate features and write them into output.
        :param features_names: the name of features that are selected.
        :param features_dtypes: the dtypes of features that are selected.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param column_index: the name of index column (unique integer value) in the database table, which is used
            for batch reading the input.
        :param prevalence: the prevalence dictionary of values for all the variables.
        :param query_batch_start: the minimum value of the column index.
        :param query_batch_max: the maximum value of the column index.
        :param query_batch_size: the number of rows to be read in each batch.
        """
        self.__logger.debug(__name__)
        feature_parser = FeatureParser(self.__variables_settings,
                                       self.__output_path, self.__output_table)
        step = -1
        batch_break = False

        while not batch_break:
            step += 1
            features = None
            for table_i in range(len(input_schemas)):
                self.__logger.info("Batch: " + str(step) + "; Table: " +
                                   input_tables[table_i])

                # read job
                variables = self.__set_batch_read(input_schemas[table_i],
                                                  input_tables[table_i], step,
                                                  column_index,
                                                  query_batch_start,
                                                  query_batch_max,
                                                  query_batch_size)

                # validate
                if variables is None:
                    batch_break = True
                    break
                elif len(variables) == 0:
                    continue

                # process job
                if features is None:
                    features = pd.DataFrame(0,
                                            index=range(len(variables)),
                                            columns=features_names)
                    features = features.astype(dtype=features_dtypes)
                features = self.__set_batch_process(
                    feature_parser, history_tables[table_i], features,
                    variables, prevalence[input_tables[table_i]])

            # write job
            if features is not None:
                features = features.astype(dtype=features_dtypes)
                self.__set_batch_write(features)

    def __set_batch_read(self, input_schema: str, input_table: str, step: int,
                         column_index: str, query_batch_start: int,
                         query_batch_max: int,
                         query_batch_size: int) -> PandasDataFrame:
        """Read the queried variables.
        :param input_schema: the mysql database schema.
        :param input_table: the mysql database table.
        :param step: the batch id.
        :param column_index: the name of index column (unique integer value) in the database table, which is used
            for batch reading the input.
        :param query_batch_start: the minimum value of the column index.
        :param query_batch_max: the maximum value of the column index.
        :param query_batch_size: the number of rows to be read in each batch.
        :return: the queried variables, or None once the window start passes query_batch_max.
        """
        step_start = query_batch_start + step * query_batch_size
        step_end = step_start + query_batch_size
        if step_start >= query_batch_max:
            return None
        # read
        query = "SELECT * FROM `" + input_table + \
                "` WHERE `" + str(column_index) + "` >= " + str(step_start) + \
                " AND `" + str(column_index) + "` < " + str(step_end) + ";"
        return self.__readers_writers.load_mysql_query(query,
                                                       input_schema,
                                                       dataframing=True)

    def __set_batch_process(self, feature_parser: FeatureParser,
                            history_table: str, features: PandasDataFrame,
                            variables: PandasDataFrame,
                            prevalence: Dict) -> PandasDataFrame:
        """Process the variables and generate the features.
        :param feature_parser: the feature parser object.
        :param history_table: the source table alias name (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param features: the output features.
        :param variables: the input variables.
        :param prevalence: the prevalence dictionary of values for the variables of this table.
        :return: the generated features.
        """
        return feature_parser.generate(history_table, features, variables,
                                       prevalence)

    def __set_batch_write(self, features: PandasDataFrame):
        """Write the features into an output file.
        :param features: the output features.
        """
        self.__readers_writers.save_csv(self.__output_path,
                                        self.__output_table,
                                        features,
                                        append=True)

    def __validate_mysql_names(self, input_schemas: List,
                               input_tables: List):
        """Validate the mysql tables and their columns, and exit if a table or column name is
            invalid.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names (assumed to match the history alias names in the
            variables settings, as at the call site in the set method).
        """
        # for tables
        for table_i in range(len(input_schemas)):
            variables_settings = self.__variables_settings[
                self.__variables_settings["Table_History_Name"] ==
                input_tables[table_i]]
            # validate the table name
            if not self.__readers_writers.exists_mysql(
                    input_schemas[table_i], input_tables[table_i]):
                self.__logger.error(__name__ + " - Table does not exist: " +
                                    input_tables[table_i])
                sys.exit()

            # for features
            for _, row in variables_settings.iterrows():
                # validate the column name
                if not self.__readers_writers.exists_mysql_column(
                        input_schemas[table_i], input_tables[table_i],
                        row["Variable_Name"]):
                    self.__logger.error(__name__ +
                                        " - Column does not exist: " +
                                        row["Variable_Name"])
                    sys.exit()
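
A standalone sketch of the batch-windowing scheme implemented by __set_batch
and __set_batch_read above, with illustrative bounds: the index column is
scanned in half-open windows of query_batch_size rows until the window start
reaches query_batch_max.

# generate the [step_start, step_end) windows that the batched queries cover
def batch_windows(query_batch_start, query_batch_max, query_batch_size):
    step = 0
    while True:
        step_start = query_batch_start + step * query_batch_size
        if step_start >= query_batch_max:
            return
        yield step_start, step_start + query_batch_size
        step += 1

print(list(batch_windows(1, 10, 4)))  # [(1, 5), (5, 9), (9, 13)]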