Esempio n. 1
0
    def evaluate_model(self, evaluation_metric_list, predicted_target,
                       actual_target):
        """
        Validate the inputs, then delegate metric computation to
        ``ModelEvaluationHandler.perform_model_evaluation``.

        :param evaluation_metric_list: A list of the metrics to be evaluated.
        :param predicted_target: A list containing the predicted outcome of the model.
        :param actual_target: A list containing the actual outcome values.
        :return: The result of ``perform_model_evaluation`` (described upstream
            as a Dataframe holding the metrics named in 'evaluation_metric_list').
        :raises CommonBaseException: re-raised untouched when already of that
            type; every other exception is wrapped in it.
        """
        try:
            self.logger.warning("Validating the arguments")
            # Both validators and the handler take the same three arguments.
            evaluation_args = (evaluation_metric_list, predicted_target,
                               actual_target)
            self.__argument_none_validation(*evaluation_args)
            self.__argument_type_validation(*evaluation_args)

            # :todo: Have to evaluate against model code

            self.logger.info('Going to perform Model evaluation')
            handler = ModelEvaluationHandler()
            evaluation_result = handler.perform_model_evaluation(
                *evaluation_args)
            self.logger.warning('Model evaluation completed successfully')
            return evaluation_result

        except MissingMandatoryFieldException as exp:
            raise CommonBaseException(exp)
        except CommonBaseException:
            raise
        except Exception as exp:
            self.logger.error(
                'Exception occured while evaluating model with evaluation '
                f'metric list : {evaluation_metric_list!s}')
            raise CommonBaseException(exp)
    def load_feature_mapping_dict_from_hdfs(cls, path, model_mapping_features_list):
        """
        Load feature mappings stored as JSON below an HDFS path.

        Every name in ``model_mapping_features_list`` that does not end with
        ``'_histogram'`` is read from ``path + name`` through the shared Spark
        session and collected into a map.

        :param path: hdfs path prefix the mapping name is appended to
        :param model_mapping_features_list: iterable of mapping names to load
        :return: dict of mapping name -> collected map (histogram names are skipped)
        :raises CommonBaseException: when the feature list is None, or wrapping
            any other failure
        """
        try:
            loaded_mappings = {}
            cls.logger.warning(
                f"Going to load feature mappings for: {model_mapping_features_list!s}"
                f" from hdfs path: {path!s}")
            if model_mapping_features_list is None:
                message = f"Model mapping feature list is empty/null for path: {path!s}"
                cls.logger.error(message)
                raise CommonBaseException(message)

            for mapping_name in model_mapping_features_list:
                # Histogram entries are not loaded by this method.
                if mapping_name.endswith('_histogram'):
                    continue
                cls.logger.warning(
                    f"Going to load mapping: {mapping_name!s} from path: {path!s}")
                mapping_frame = CommonConstants.SPARK_SESSION.read.json(path + mapping_name)
                loaded_mappings[mapping_name] = mapping_frame.rdd.collectAsMap()
            return loaded_mappings
        except HDFSException as exp:
            raise CommonBaseException(exp)
        except CommonBaseException:
            raise
        except Exception as exp:
            cls.logger.error(f'Exception occured while loading dict from hdfs {path!s}')
            raise CommonBaseException(exp)
    def perform_model_testing(self, test_data, model_parameters,
                              trained_model):
        """
        Run a trained model against test data through the test driver.

        :param test_data: test data forwarded to the driver's ``test_model``.
        :param model_parameters: model configuration; the entry under
            ``CommonConstants.MODEL_NAME_TAG`` is read for logging.
        :param trained_model: trained model instance forwarded to the driver.
        :return: whatever the driver's ``test_model`` returns (described
            upstream as an array of the target predictions).
        :raises CommonBaseException: re-raised untouched when already of that
            type; ``SQLException`` and any other exception are wrapped in it.
        """
        try:
            # todo: Persist the predicted results in the DB against the model and version id's.
            model_name = model_parameters[CommonConstants.MODEL_NAME_TAG]
            self.logger.warning(
                f"Getting an instance of the test driver for model name: {model_name!s}")
            driver = AbstractTestDriverFactory.get_instance()
            self.logger.warning("Going to test the trained model on test data")
            return driver.test_model(test_data=test_data,
                                     model_params=model_parameters,
                                     trained_model=trained_model)
        except CommonBaseException:
            raise
        except SQLException as exp:
            raise CommonBaseException(exp)
        except Exception as exp:
            self.logger.error(
                'Exception occured while performing model testing')
            raise CommonBaseException(exp)
Esempio n. 4
0
    def test_model(self, test_data, model_params, trained_model):
        """
        Validate the inputs and predict on ``test_data`` with a trained model.

        :param test_data: dataframe of the test data features
        :param model_params: configurations of the model; the entries under
            ``CommonConstants.FEATURE_LIST_TAG`` and
            ``CommonConstants.MODEL_NAME_TAG`` are read here
        :param trained_model: a trained model instance to be used on the test data
        :return: the value returned by the model class' ``predict`` (described
            upstream as an array of the target predictions)
        :raises CommonBaseException: re-raised untouched when already of that
            type; every other exception is wrapped in it
        """
        try:
            self.logger.info("Validating input test data and model params")
            self.__validate_arguments(test_data, model_params)
            CommonValidations.validate_model_params(model_params)

            self.logger.info(
                "Validating test data features are present in the required features' list"
            )
            required_features = model_params[CommonConstants.FEATURE_LIST_TAG]
            CommonValidations.validate_data_features_against_model(
                test_data, required_features)

            # TODO: validate the trained model instance itself; validation for
            # the different model APIs is still pending.

            self.logger.warning(
                "Going to predict using trainied model for test data on columns: "
                f"{test_data.columns!s}")

            model_name = model_params[CommonConstants.MODEL_NAME_TAG]
            self.logger.info(
                f"Getting model class implementation for: {model_name!s}")
            model_class = AbstractMachineLearningModelFactory.get_instance(
                model_name=model_name)
            self.logger.info(
                "Model class implementation successfully obtained")

            predictions = model_class.predict(model=trained_model,
                                              data=test_data,
                                              params_dict=model_params)
            self.logger.warning("Predictions successfully made")

            self.logger.warning("Returning predicted data")
            return predictions

        except (MissingMandatoryFieldException, InvalidInfoException,
                DataFrameException) as exp:
            raise CommonBaseException(exp)
        except CommonBaseException:
            raise
        except Exception as exp:
            self.logger.error(
                f'Exception occured while testing model_params = {model_params!s}'
                f' model = {trained_model!s}')
            raise CommonBaseException(exp)
    def filter_dataframe_by_column_list(cls, df, read_data_dict, model_id,
                                        version_id,
                                        train_test_prediction_mode):
        """
        Select the model-specific columns from ``df``.

        The column list is read from ``read_data_dict`` under
        ``CommonConstants.READ_DATA_COLUMNS_TAG``. ``CommonConstants.KEY_IDENTIFIER``
        is always appended; ``CommonConstants.LABEL_COLUMN`` is appended as well
        when ``train_test_prediction_mode`` is one of ``'0'``, ``'1'``, ``'2'``.

        :param df: dataframe exposing ``select(list_of_columns)``
        :param read_data_dict: read-data configuration dict
        :param model_id: model id, used in log and error messages
        :param version_id: version id, used in log and error messages
        :param train_test_prediction_mode: mode flag compared against
            ``'0'``, ``'1'``, ``'2'``
        :return: result of ``df.select`` over the assembled column list
        :raises CommonBaseException: wrapping a MissingMandatoryFieldException
            when the configuration or its column list is missing, or wrapping
            any other failure
        """
        try:
            model_ref = f"model_id: {model_id!s} version_id: {version_id!s}"
            cls.logger.warning(
                "Going to filter dataframe for given column list for " + model_ref)

            if read_data_dict is None:
                cls.logger.error(
                    "MissingMandatoryFieldException : read data configs are not defined for "
                    + model_ref)
                raise MissingMandatoryFieldException(
                    "read data configs are not defined for " + model_ref)

            # Check for a missing entry BEFORE copying: list(None) would raise a
            # TypeError and hide the specific missing-field error below.
            configured_columns = read_data_dict.get(
                CommonConstants.READ_DATA_COLUMNS_TAG)

            if configured_columns is None:
                cls.logger.error(
                    "MissingMandatoryFieldException : Read Data columns are not defined for "
                    + model_ref)
                raise MissingMandatoryFieldException(
                    "Read Data columns are not defined for " + model_ref)

            # Work on a copy so the appends never mutate the caller's config.
            model_specific_data_columns = list(configured_columns)
            model_specific_data_columns.append(CommonConstants.KEY_IDENTIFIER)
            if train_test_prediction_mode in ('0', '1', '2'):
                model_specific_data_columns.append(
                    CommonConstants.LABEL_COLUMN)

            cls.logger.warning("filtering dataframe for given column list: " +
                               str(model_specific_data_columns))
            return df.select(model_specific_data_columns)
        except MissingMandatoryFieldException as exp:
            raise CommonBaseException(exp)
        except Exception as exp:
            cls.logger.error(
                'Exception occured while fetching model specific configurations'
            )
            raise CommonBaseException(exp)
Esempio n. 6
0
 def engineer_feature(self, args):
     """
     Run the feature-engineering operation when validation succeeds.

     :param args: arguments forwarded to both the validation and the operation
     :return: result of ``engineer_feature_operation(args)``, or None when
         ``engineer_feature_validation(args)`` is falsy
     :raises CommonBaseException: wrapping missing-field, invalid-info and
         dataframe exceptions
     """
     try:
         if not self.engineer_feature_validation(args):
             return None
         return self.engineer_feature_operation(args)
     except (MissingMandatoryFieldException, InvalidInfoException,
             DataFrameException) as exp:
         raise CommonBaseException(exp)
    def predict(cls, model, data, params_dict):
        """
        Predict on ``data`` with a BERT-based model.

        Test vectors are built with ``cls.make_test_vectors`` and wrapped in an
        input function from ``bert.run_classifier.input_fn_builder``, which is
        then handed to ``model.predict``.

        :param model: model object exposing ``predict(input_fn=...)``
        :param data: dataframe with the data to predict on
        :param params_dict: model configuration; must contain
            ``CommonConstants.MAX_SEQ_LENGTH``
        :return: the first of the two values unpacked from ``model.predict``
        :raises CommonBaseException: wrapping any failure
        """
        try:
            cls.logger.info("Testing model")
            cls.logger.debug("Data frame columns: " + str(data.columns))

            test_vectors = cls.make_test_vectors(data, params_dict)

            test_input_fn = bert.run_classifier.input_fn_builder(
                features=test_vectors,
                seq_length=params_dict[CommonConstants.MAX_SEQ_LENGTH],
                is_training=False,
                drop_remainder=False)

            # NOTE(review): this unpacking requires model.predict to yield
            # exactly two items - confirm against the model implementation.
            result, output = model.predict(input_fn=test_input_fn)

            # Diagnostics go through the class logger instead of stdout, in
            # line with the rest of this method.
            cls.logger.debug("Result: " + str(result))
            cls.logger.debug("Output: " + str(output))

            return result

        except Exception as exp:
            cls.logger.error(
                'Exception occured while testing data to model : ' +
                str(model))
            raise CommonBaseException(exp)
    def train(cls, model, data, target, params_dict):
        """
        Fit a BERT-based model on ``data``.

        Train vectors are built with ``cls.make_train_vectors`` and wrapped in
        an input function from ``bert.run_classifier.input_fn_builder``.

        :param model: model object exposing ``train(input_fn=..., steps=...)``
        :param data: dataframe of the training data
        :param target: not read by this implementation
        :param params_dict: model configuration; must contain
            ``CommonConstants.MAX_SEQ_LENGTH`` and ``CommonConstants.NUM_TRAIN_STEPS``
        :return: the value returned by ``model.train``
        :raises CommonBaseException: wrapping any failure
        """
        try:
            cls.logger.info("Fitting model")
            cls.logger.debug(f"Data frame columns: {data.columns!s}")

            features = cls.make_train_vectors(data, params_dict)

            input_fn = bert.run_classifier.input_fn_builder(
                features=features,
                seq_length=params_dict[CommonConstants.MAX_SEQ_LENGTH],
                is_training=True,
                drop_remainder=False)

            step_count = params_dict[CommonConstants.NUM_TRAIN_STEPS]
            fitted_model = model.train(input_fn=input_fn, steps=step_count)
            cls.logger.info("Model fitted")

            return fitted_model

        except Exception as exp:
            cls.logger.error(
                f'Exception occured while training data to model : {model!s}')
            raise CommonBaseException(exp)
    def perform_model_training(self, train_df, target, model_parameters,
                               model_hyper_parameters,
                               model_cross_validator_params):
        """
        Train a model through the train driver.

        :param train_df: training data forwarded to the driver as ``data``
        :param target: training target forwarded to the driver
        :param model_parameters: model configuration; the entry under
            ``CommonConstants.MODEL_NAME_TAG`` is read for logging
        :param model_hyper_parameters: forwarded as ``model_hyper_params``
        :param model_cross_validator_params: forwarded unchanged
        :return: the value returned by the driver's ``train_model`` (described
            upstream as a trained model instance)
        :raises SQLException: re-raised untouched
        :raises CommonBaseException: re-raised untouched when already of that
            type; any other exception is wrapped in it
        """
        try:
            # todo: save trained model instance against the model and version id's on hdfs and its path in informix
            model_name = model_parameters[CommonConstants.MODEL_NAME_TAG]
            self.logger.warning(
                f"Getting an instance of the train driver for model name: {model_name!s}")
            driver = AbstractTrainDriverFactory.get_instance()
            self.logger.warning("Going to train model on train_df")
            return driver.train_model(
                data=train_df,
                target=target,
                model_params=model_parameters,
                model_hyper_params=model_hyper_parameters,
                model_cross_validator_params=model_cross_validator_params)

        # NOTE(review): SQLException escapes unwrapped here - confirm that
        # callers expect the raw exception rather than a CommonBaseException.
        except (SQLException, CommonBaseException):
            raise
        except Exception as exp:
            self.logger.error(
                'Exception occured while performing model training')
            raise CommonBaseException(exp)
Esempio n. 10
0
    def load_model_from_hdfs(cls, path, model_name):
        """
        Load a model from an HDFS path.

        The model class for ``model_name`` supplies a loadable object, which the
        HDFS DAO then fills from ``path``.

        :param path: hdfs path
        :param model_name: the actual model name (decision_tree, random_forest etc)
        :return: the value returned by the HDFS DAO's ``load``
        :raises CommonBaseException: re-raised untouched when already of that
            type; ``HDFSException`` and any other exception are wrapped in it
        """
        try:
            log = cls.logger

            log.info("Getting object of DAO Factory class")
            dao_factory = AbstractMachineLearningModelDAOFactory()
            log.info("Object retrieved")

            log.info(f"Getting dao instance for database: {CommonConstants.DATABASE_HDFS!s}")
            hdfs_dao = dao_factory.get_machine_learning_model_dao(CommonConstants.DATABASE_HDFS)
            log.info("Instance retrieved")

            log.info("Getting object of Machine Learning Model Factory class")
            model_factory = AbstractMachineLearningModelFactory()
            log.info("Object retrieved")

            log.info(f"Getting model class implementation for : {model_name!s}")
            model_class = model_factory.get_instance(model_name)
            log.info("Model class implementation successfully obtained")

            log.info(f"Going to load pipeline model {model_name!s} from the HDFS path: {path!s}")
            loadable = model_class.get_loadable_object()
            log.info("Model load object successfully retrieved")

            log.warning(f"Going to load model: {model_name!s} from the HDFS path: {path!s}")
            loaded_model = hdfs_dao.load(path, loadable)
            log.warning(f"Model of: {model_name!s} successfully loaded from path: {path!s}")

            return loaded_model
        except HDFSException as exp:
            raise CommonBaseException(exp)
        except CommonBaseException:
            raise
        except Exception as exp:
            cls.logger.error(f'Exception occured while loading model {model_name!s} from hdfs {path!s}')
            raise CommonBaseException(exp)
Esempio n. 11
0
    def evaluate_metric(self, predicted_target, actual_target):
        """
        Compute the recall score of the predictions.

        :param predicted_target: An array containing the predicted outcome of the model.
        :param actual_target: An array containing the actual outcome values.
        :return: Result of ``recall_score(actual_target, predicted_target)``.
        :raises CommonBaseException: wrapping any failure.
        """
        try:
            self.logger.info('Going to calculate recall score')
            recall = recall_score(actual_target, predicted_target)
            return recall

        except Exception as exp:
            self.logger.error('Exception occured while calculating recall metric')
            raise CommonBaseException(exp)
    def validate_model_id_and_version(cls, model_code, model_id, version_id):
        """
        Ensure both a model id and a version id are present for a model code.

        :param model_code: model code, used in log and error messages
        :param model_id: must not be None
        :param version_id: must not be None
        :return: None
        :raises CommonBaseException: when ``model_id`` or ``version_id`` is None
        """
        if model_id is None or version_id is None:
            reason = (
                'model_id is None or version_id is None against model_code: '
                f'{model_code!s}')
            cls.logger.error('CommonBaseException : ' + reason)
            raise CommonBaseException(reason)

        cls.logger.warning(
            f"For model_code: {model_code!s} model_id: {model_id!s}"
            f" version_id: {version_id!s}")
Esempio n. 13
0
    def perform_data_preprocessing(self,
                                   postgre_spark_prop,
                                   raw_data_df,
                                   driver_config_dict,
                                   preprocessing_flow_version,
                                   preprocessing_model_specific_dict,
                                   persist_query,
                                   model_id,
                                   version_id,
                                   model_code,
                                   train_test_prediction_mode,
                                   database_metadata_dict,
                                   engineered_features_list,
                                   is_db_save='1'):
        """
        Placeholder for the data preprocessing step.

        The body currently performs no work: none of the parameters is read and
        the method returns None.

        :param postgre_spark_prop: spark connection
        :param raw_data_df: financial df
        :return: None (the preprocessing call is not implemented yet)
        """
        try:
            # Planned flow, not implemented yet: call the run function of the
            # PreProcessing Service and pass it the following configurations:
            #   2. Per Model Requirements - preprocess operations required by
            #      each model in their specific order.
            #        i. model-id: lowercase, lemmatize, stem
            #   3. Raw Input Data - passed as input
            #   4. Mode - Predict, or test/train
            pass

        except CommonBaseException:
            raise
        except SQLException as exp:
            raise CommonBaseException(exp)
        except Exception as exp:
            self.logger.error(
                'Exception occured while performing data preprocessing')
            raise CommonBaseException(exp)
    def perform_metrics_evaluation(self, evaluation_metric_list, predicted_target, actual_target):
        """
        Evaluate every requested metric and collect the scores in a DataFrame.

        Metrics listed in ``Constants.ConfusionMatrixTags`` share a single
        evaluation: the first such metric triggers it, and the returned values
        are paired with the tags and reused for the remaining ones.

        :param evaluation_metric_list: List of metrics to be evaluated.
        :param predicted_target: An array containing the predicted outcome of the model.
        :param actual_target: An array containing the actual outcome values.
        :return: Transposed DataFrame with one row per metric and a single
            column named "".
        :raises CommonBaseException: wrapping any failure.
        """
        scores_by_metric = {}
        confusion_scores = None
        try:
            confusion_tags = set(Constants.ConfusionMatrixTags)
            for metric_id in evaluation_metric_list:
                self.logger.info(f'Performing evaluation for metricID = {metric_id!s}')
                self.logger.info(
                    'Saving evaluation score in metrics evaluation dictionary '
                    f'for metricID = {metric_id!s}')

                if metric_id not in confusion_tags:
                    score = self.perform_evaluation_on_given_metric(
                        metric_id, predicted_target, actual_target)
                    scores_by_metric[metric_id] = [score]
                    continue

                if confusion_scores is None:
                    # One evaluation yields the values for all confusion tags.
                    confusion_values = self.perform_evaluation_on_given_metric(
                        metric_id, predicted_target, actual_target)
                    confusion_scores = dict(
                        zip(Constants.ConfusionMatrixTags, confusion_values))
                scores_by_metric[metric_id] = [confusion_scores[metric_id]]

            metric_results_df = DataFrame(scores_by_metric).T
            metric_results_df.columns = [""]
            return metric_results_df

        except DataFrameException as exp:
            raise CommonBaseException(exp)
        except Exception as exp:
            self.logger.error(
                'Exception occured while performing metrics evaluations')
            raise CommonBaseException(exp)
    def evaluate_metric(self, predicted_target, actual_target):
        """
        Compute the flattened confusion matrix of the predictions.

        :param predicted_target: An array containing the predicted outcome of the model.
        :param actual_target: An array containing the actual outcome values.
        :return: ``confusion_matrix(actual_target, predicted_target)`` after ``ravel()``.
        :raises CommonBaseException: wrapping any failure.
        """
        try:
            self.logger.info('Going to calculate the Confusion Matrix')
            matrix = confusion_matrix(actual_target, predicted_target)
            return matrix.ravel()

        except Exception as exp:
            self.logger.error(
                'Exception occured while getting the confusion matrix')
            raise CommonBaseException(exp)
    def preprocess_validation(self, args):
        """
        Check that every configured input/data element of ``args`` has dtype ``str``.

        Runs the parent class' validation first; when that is falsy the method
        returns None without further checks.

        :param args: mapping from element name to an object exposing ``dtype``
        :return: True when all checks pass, None when the parent validation fails
        :raises CommonBaseException: wrapping missing-field and invalid-info
            exceptions
        """
        try:
            if not super().preprocess_validation(args):
                return None

            # req_input is checked before req_data; a None group is skipped.
            for group_name in ('req_input', 'req_data'):
                groups = getattr(self.config_pattern.properties, group_name)
                if groups is None:
                    continue
                for arr in groups:
                    for elem in arr:
                        given_dtype = args[elem].dtype
                        if given_dtype != str:
                            # NOTE(review): the details are passed as extra
                            # positional arguments to the logger - confirm the
                            # logger in use formats them as intended.
                            self.logger.error(
                                InvalidInfoException.__name__, 'Given:',
                                given_dtype, 'Required:', str)
                            raise InvalidInfoException(
                                'Given:', given_dtype, 'Required:', str)

            return True

        except (MissingMandatoryFieldException, InvalidInfoException) as exp:
            raise CommonBaseException(exp)
    def convert_dict_values_to_list(cls, params_dict):
        """
        Wrap every non-list value of ``params_dict`` in a single-element list.

        The entry stored under ``CommonConstants.NUM_FOLDS_TAG`` is left
        untouched. The dict is modified in place and also returned.

        :param params_dict: dictionary with original values
        :return: the same dict object, updated
        :raises CommonBaseException: wrapping any failure
        """
        try:
            # Only existing keys are reassigned, so the dict keeps its size
            # while it is being iterated.
            for key, value in params_dict.items():
                # Compare by value: identity ('is not') on strings depends on
                # interning and may wrongly wrap the NUM_FOLDS entry.
                if key != CommonConstants.NUM_FOLDS_TAG and not isinstance(value, list):
                    params_dict[key] = [value]
            return params_dict

        except Exception as exp:
            cls.logger.error(
                'Exception occured while converting dict values to list ')
            raise CommonBaseException(exp)
    def perform_cross_validation(cls, model, data, target, params_dict):
        """
        Tune ``model`` with a grid search cross validator.

        :param model: the estimator to be cross-validated
        :param data: dataframe of the training data features
        :param target: outcome associated with the training data features
        :param params_dict: contains both the model hyper parameter grid and
            the grid search parameters (scoring, number of folds, number of
            cores, iid) plus the target column entry used for logging
        :return: the value returned by ``GridSearchCV.fit(data, target)``
        :raises CommonBaseException: wrapping any failure
        """
        try:
            cls.logger.warning(
                "Building a Parameter grid for tuning model on a set of hyper parameters "
            )
            param_grid = cls.build_param_grid(cls, params_dict)

            cls.logger.info("Instantiating Grid Search Cross Validator object")
            # NOTE(review): `iid` is not accepted by recent scikit-learn
            # releases - confirm the pinned version before upgrading.
            search_options = {
                'estimator': model,
                'param_grid': param_grid,
                'scoring': params_dict[CommonConstants.SCORING_TAG],
                'cv': params_dict[CommonConstants.NUM_FOLDS_TAG],
                'n_jobs': params_dict[CommonConstants.NUM_CORES_TAG],
                'iid': params_dict[CommonConstants.IID_TAG],
            }
            grid_search = GridSearchCV(**search_options)
            cls.logger.info("Instantiated  Grid Search Cross Validator object")

            cls.logger.warning("Fitting models using Cross Validator")
            logged_columns = (
                data.columns.to_list() +
                params_dict[CommonConstants.TARGET_COLUMN_TAG].tolist())
            cls.logger.warning(f"Data frame columns: {logged_columns!s}")

            tuned_model = grid_search.fit(data, target)
            cls.logger.warning("Model fitted")

            return tuned_model

        except Exception as exp:
            cls.logger.error(
                "Exception: Cross validation couldn't be performed")
            raise CommonBaseException(exp)
Esempio n. 19
0
    def get_dialogue_preprocessor(self, preprocessor_type):
        """
        Look up a dialogue preprocessor instance by class name.

        Every direct subclass of ``AbstractDialoguePreProcessor`` and each of
        their direct subclasses is instantiated and registered under its class
        name.

        :param preprocessor_type: class name of the wanted preprocessor
        :return: the matching instance; None when no class carries that name or
            when ``self.validation(preprocessor_type)`` is falsy
        :raises CommonBaseException: wrapping missing-field and invalid-info
            exceptions
        """
        try:
            if not self.validation(preprocessor_type):
                return None

            registry = {}
            base_class = AbstractDialoguePreProcessor().__class__
            for subclass in base_class.__subclasses__():
                instance = subclass()
                registry[subclass.__name__] = instance
                # Second level: subclasses of the class just registered.
                for nested in instance.__class__.__subclasses__():
                    registry[nested.__name__] = nested()

            return registry.get(preprocessor_type)

        except (MissingMandatoryFieldException, InvalidInfoException) as exp:
            raise CommonBaseException(exp)
    def predict(cls, model, data, params_dict):
        """
        Predict on ``data`` with a trained model object.

        :param model: model object exposing ``predict(data)``
        :param data: dataframe with data to make predictions on
        :param params_dict: not read by this implementation
        :return: the value returned by ``model.predict(data)``
        :raises CommonBaseException: wrapping any failure
        """
        try:
            cls.logger.info("Predicting on data")
            cls.logger.debug(f"Dataframe columns: {data.columns!s}")
            predictions = model.predict(data)
            cls.logger.info("Predictions made")
            return predictions

        except Exception as exp:
            cls.logger.error(
                f'Exception occured while predicting data from model : {model!s}')
            raise CommonBaseException(exp)
    def train(cls, model, data, target, params_dict):
        """
        Fit a model on the provided data.

        :param model: model object exposing ``fit(data, target)``
        :param data: dataframe of the training data features
        :param target: outcome associated with the training data features
        :param params_dict: not read by this implementation
        :return: the value returned by ``model.fit(data, target)``
        :raises CommonBaseException: wrapping any failure
        """
        try:
            cls.logger.info("Fitting model")
            cls.logger.debug(f"Data frame columns: {data.columns!s}")
            fitted_model = model.fit(data, target)
            cls.logger.info("Model fitted")
            return fitted_model

        except Exception as exp:
            cls.logger.error(
                f'Exception occured while training data to model : {model!s}')
            raise CommonBaseException(exp)
    def preprocess(self, args):
        """
        Apply ``self.preprocess_operation`` to the configured elements of ``args``.

        Elements named in ``req_input`` are processed first, then those in
        ``req_data``. Each processed element replaces the previous result, so
        the value returned belongs to the last element processed.

        :param args: mapping from element name to an object exposing ``apply``;
            it also holds the extra argument stored under the configured
            ``req_args`` key
        :return: result of the last ``apply`` call, or an empty DataFrame when
            nothing was processed
        :raises CommonBaseException: wrapping missing-field, invalid-info and
            dataframe exceptions
        """
        try:
            # Validation is intentionally not invoked here (it was disabled upstream).

            def run_operation(value):
                # The extra argument is looked up per value, on demand.
                return self.preprocess_operation(
                    value, args[self.config_pattern.properties.req_args])

            res = pd.DataFrame()

            for group_name in ('req_input', 'req_data'):
                groups = getattr(self.config_pattern.properties, group_name)
                if groups is None:
                    continue
                for arr in groups:
                    for elem in arr:
                        res = args[elem].apply(run_operation)

            return res

        except (MissingMandatoryFieldException, InvalidInfoException,
                DataFrameException) as exp:
            raise CommonBaseException(exp)
Esempio n. 23
0
    def _load_mapping_feature_names(cls, hdfs_path):
        """
        List the entry names found under an HDFS path.

        The host is the first dotted-quad IP found in ``hdfs_path``; the
        directory listed is the part of ``hdfs_path`` that follows
        ``CommonConstants.HDFS_PORT``.

        :param hdfs_path: HDFS path on which feature mapping is stored
        :return: comma separated string of the ``pathSuffix`` of every listed entry
        :raises CommonBaseException: wrapping any failure
        """
        path = ""
        try:
            cls.logger.warning("Getting ip address from base hdfs path")
            host_matches = re.findall(r'[0-9]+(?:\.[0-9]+){3}', hdfs_path)
            cls.logger.info("Making HDFS client object")
            client = PyWebHdfsClient(host=host_matches[0], port=CommonConstants.HDFS_WEB_PORT, user_name='hdfs')
            cls.logger.warning("Parsing hdfs path to get the exacr path with IP and PORT")
            # Assigned in two steps so the error log below reports the split
            # result when the port marker is absent from the path.
            path = hdfs_path.split(CommonConstants.HDFS_PORT)
            path = path[1]
            cls.logger.warning(f"Going to get directory status of hdfs path {path!s}")
            listing = client.list_dir(path)
            cls.logger.warning(f"Going to get status of each file in path {path!s}")
            entries = listing['FileStatuses']['FileStatus']
            cls.logger.warning("Going to get names of each file ")

            # Joining yields the same string as appending "name," per entry and
            # then dropping the final comma.
            mapping_feature_names = ','.join(entry['pathSuffix'] for entry in entries)
            cls.logger.warning("Removing last comma from comma separated string ")

            cls.logger.info("File names loaded successfully.")
            cls.logger.info(f"File names : {mapping_feature_names!s}")

            return mapping_feature_names
        except Exception as exp:
            cls.logger.error(f'Exception occured while loading feature mapping names from hdfs path {path!s}')
            raise CommonBaseException(exp)
    def check_feature_mappings_existance(self, hdfs_path_mapping):
        """
        author: [email protected]
        Check whether the feature-mapping sub directory shows up in its parent directory.

        The parent directory is derived from ``hdfs_path_mapping`` by taking the
        part after ``CommonConstants.HDFS_PORT`` and removing
        ``CommonConstants.SUB_DIR_MAPPING_TAG + "/"`` from it; that directory is
        then listed through WebHDFS.

        :param hdfs_path_mapping: hdfs path of the feature mappings
        :return: True if ``CommonConstants.SUB_DIR_MAPPING_TAG`` occurs in the
                 comma separated names of the listed entries, else False
        :raises CommonBaseException: wraps any exception raised during the check
        """
        try:
            host_matches = re.findall(r'[0-9]+(?:\.[0-9]+){3}', hdfs_path_mapping)
            self.logger.info("Making HDFS client object")
            client = PyWebHdfsClient(host=host_matches[0],
                                     port=CommonConstants.HDFS_WEB_PORT,
                                     user_name='hdfs')
            self.logger.warning(
                "Parsing hdfs path to get the exacr path with IP and PORT")
            after_port = hdfs_path_mapping.split(CommonConstants.HDFS_PORT)[1]
            # Drop the mapping sub directory so that its parent is the one listed.
            parent_dir = after_port.replace(
                CommonConstants.SUB_DIR_MAPPING_TAG + "/", "")
            self.logger.warning("Going to get directory status of hdfs path " +
                                str(parent_dir))
            listing = client.list_dir(parent_dir)
            self.logger.warning("Going to get status of each file in path " +
                                str(parent_dir))
            entries = listing['FileStatuses']['FileStatus']
            self.logger.warning("Going to get names of each file ")
            joined_names = ''.join(
                entry['pathSuffix'] + ',' for entry in entries)

            # NOTE(review): this is a substring test on the joined names, so an
            # entry whose name merely contains the tag also counts - confirm
            # whether an exact name match is intended.
            return CommonConstants.SUB_DIR_MAPPING_TAG in joined_names

        except Exception as exp:
            self.logger.error(
                'Exception occured while loading feature mapping names from hdfs path '
                + str(hdfs_path_mapping))
            raise CommonBaseException(exp)
    def merge_and_overwrite_dicts(cls, *dict_args):
        """
        author: [email protected]
        Shallow-merge any number of dictionaries into a brand new dictionary.

        The dictionaries are applied from left to right, so for a key present in
        several of them the value from the right-most dictionary wins. None of
        the input dictionaries is modified.

        :param dict_args: dictionaries to merge, in increasing order of precedence
        :return: the new merged dictionary with updated/overwritten values
        :raises CommonBaseException: wraps any exception raised while merging
        """
        try:
            cls.logger.info("Merging the dictionaries: ")

            merged = {}
            for overrides in dict_args:
                # Later dictionaries overwrite keys set by earlier ones.
                merged.update(overrides)

            cls.logger.info("Merged dictionary: " + str(merged))
            return merged
        except Exception as exp:
            cls.logger.error(
                'Exception occured while merging and overwriting dicts ')
            raise CommonBaseException(exp)
    def perform_model_evaluation(self, evaluation_metric_list, predicted_target, actual_target):
        """
        Evaluate the requested metrics for a set of predictions.

        When ``evaluation_metric_list`` is empty the default metric list from
        ``EvaluationUtils`` is used instead. The metrics are validated against
        that default list before being evaluated.

        :param evaluation_metric_list: List of metrics to be evaluated.
        :param predicted_target: An array containing the predicted outcome of the model.
        :param actual_target: An array containing the actual outcome values.
        :return: A Dataframe containing the evaluated metrics specified in the 'evaluation_metric_list'.
        :raises CommonBaseException: re-raised unchanged, or wrapping a DataFrameException
        """
        try:
            admissible_metrics = EvaluationUtils.get_default_evaluation_metric_list()

            requested_metrics = evaluation_metric_list
            if not requested_metrics:
                self.logger.warning('evaluation_metrics_list is empty, going to fetch default metrics list')
                requested_metrics = admissible_metrics

            self.logger.warning("Validating the 'evaluation_metric_list' values for admissible metric types")
            self.__validate_evaluation_metric_type(requested_metrics, admissible_metrics)

            evaluated_metrics = self.perform_metrics_evaluation(
                requested_metrics, predicted_target, actual_target)
            self.logger.info('Metrics Evaluated successfully')
            return evaluated_metrics

        except CommonBaseException as exp:
            # Already the public exception type; hand it to the caller as is.
            raise exp
        except DataFrameException as exp:
            raise CommonBaseException(exp)
    def get_max_unique_count(cls, df, feature_list):
        """
        author: [email protected]
        Get the largest distinct-value count among the given columns of a dataframe.

        One ``countDistinct`` aggregate is computed per name in ``feature_list``
        and the maximum of the resulting counts is returned.

        :param df: dataframe holding the columns named in ``feature_list``
        :param feature_list: names of the columns whose distinct values are counted
        :return: the maximum distinct-value count over the listed columns
        :raises CommonBaseException: wraps any exception raised while aggregating
        """
        try:
            cls.logger.info(
                "Getting maximum unique value count from given dataframe and feature list "
                + str(feature_list))

            # One aggregate per feature, aliased back to the feature's own name.
            distinct_count_exprs = [
                countDistinct(col(feature)).alias(feature)
                for feature in feature_list
            ]
            distinct_counts = df.agg(*distinct_count_exprs).head()
            cls.logger.warning("Count distinct categories: " +
                               str(distinct_counts))

            highest_count = max(distinct_counts)
            cls.logger.warning("Maximum unique count: " + str(highest_count))
            return highest_count

        except Exception as exp:
            cls.logger.error(
                "Exception occured while getting max unique value count ")
            raise CommonBaseException(exp)
Esempio n. 28
0
    def save_to_hdfs_and_informix(cls, base_path, model_code, model_id, model_version, model, informix_multi_instance):
        """
        author: [email protected]
        Save a model to HDFS and record its HDFS path in the Informix DB.

        When ``model`` is a ``PipelineModel`` it is saved under the pipeline
        sub directory built from ``base_path``, ``model_id`` and
        ``model_version``. The feature-mapping path is then built from the same
        values; if feature mappings exist there their names are loaded,
        otherwise ``None`` is used. Finally the pipeline path (for a
        ``PipelineModel``) or the mapping path (for any other model) is written
        to Informix together with ``model_code`` and the mapping feature names.

        :param base_path: base/root path from HDFS
        :param model_code: model code passed to the Informix DAO along with the path
        :param model_id: a unique identifier for the model made
        :param model_version: version of the model, used when building the HDFS paths
        :param model: trained model object; only written to HDFS when it is a PipelineModel
        :param informix_multi_instance: informix mi instance
        :return: None
        :raises SQLException: re-raised unchanged
        :raises CommonBaseException: re-raised unchanged, or wrapping an
                 HDFSException or any other unexpected exception
        """
        # Always holds the most recent HDFS path being worked on, so that the
        # generic error log at the bottom reports a real path instead of None.
        hdfs_path = None
        try:
            cls.logger.info("Getting object of DAO Factory class")
            dao_obj = AbstractMachineLearningModelDAOFactory()
            cls.logger.info("Object retrieved")

            cls.logger.info("Getting dao instance for database: " + str(CommonConstants.DATABASE_HDFS))
            # The HDFS DAO is still requested as before, but its result is not
            # needed here: the model persists itself through model.save() below.
            dao_obj.get_machine_learning_model_dao(CommonConstants.DATABASE_HDFS)
            cls.logger.info("Instance retrieved")

            hdfs_path_pipeline = ''
            cls.logger.warning("Checking if model is of type Pipeline model ")
            is_pipeline_model = isinstance(model, PipelineModel)
            if is_pipeline_model:
                cls.logger.info("Going to make HDFS path to save model")
                hdfs_path_pipeline = HDFSUtils._make_hdfs_path(base_path,
                                                               CommonConstants.HDFS_MACHINE_LEARNING_MODELS_PATH_TAG,
                                                               model_id, model_version,
                                                               CommonConstants.SUB_DIR_PIPELINE_TAG)
                hdfs_path = hdfs_path_pipeline
                cls.logger.info("HDFS path made: " + str(hdfs_path_pipeline))

                cls.logger.warning("Going to save model to the HDFS path: " + str(hdfs_path_pipeline))
                model.save(hdfs_path_pipeline)
                cls.logger.warning("Model successfully saved to the HDFS path: " + str(hdfs_path_pipeline))

            cls.logger.info("Going to make HDFS path to get feature mappings")
            hdfs_path_mapping = HDFSUtils._make_hdfs_path(base_path,
                                                          CommonConstants.HDFS_MACHINE_LEARNING_MODELS_PATH_TAG,
                                                          model_id, model_version,
                                                          CommonConstants.SUB_DIR_MAPPING_TAG)
            hdfs_path = hdfs_path_mapping

            cls.logger.warning("Checking if feature mappings exists in hdfs path: " + str(hdfs_path_mapping) + "  for model_id: " + str(model_id) +" version_id: " + str(model_version))

            if CommonUtilities.check_feature_mappings_existance(hdfs_path_mapping):
                cls.logger.warning("Going to load all mapping_feature names from the HDFS path: " + str(hdfs_path_mapping))
                mapping_feature_names = HDFSUtils._load_mapping_feature_names(hdfs_path_mapping)
            else:
                cls.logger.warning("Feature mappings does not exist in hdfs path: " + str(
                    hdfs_path_mapping) + "  for model_id: " + str(model_id) + " version_id: " + str(model_version))
                mapping_feature_names = None

            cls.logger.info("Getting dao instance for database: " + str(CommonConstants.DATABASE_INFORMIX))
            informix_dao = dao_obj.get_machine_learning_model_dao(CommonConstants.DATABASE_INFORMIX)
            cls.logger.info("Instance retrieved")

            # Pipeline models are registered under their pipeline path; every
            # other model is registered under the feature-mapping path.
            path_to_record = hdfs_path_pipeline if is_pipeline_model else hdfs_path_mapping
            hdfs_path = path_to_record
            cls.logger.warning("Going to write hdfs model path to Informix DB: " + str(path_to_record))
            informix_dao.write_path_and_mapping_features(path_to_record, model_code, mapping_feature_names,
                                                         informix_multi_instance)
            cls.logger.warning("Model path successfully saved to Informix DB: " + str(path_to_record))

        except SQLException as exp:
            raise exp
        except HDFSException as exp:
            raise CommonBaseException(exp)
        except CommonBaseException as exp:
            raise exp
        except Exception as exp:
            cls.logger.error('Exception occured while saving model to hdfs ' + str(hdfs_path))
            raise CommonBaseException(exp)
    def train_model(self, data, target, model_params, model_hyper_params,
                    model_cross_validator_params):
        """
        Train a model, or select one through cross validation, from the given configuration.

        The arguments and ``model_params`` are validated, the model class is
        looked up from ``model_params[CommonConstants.MODEL_NAME_TAG]`` and the
        parameter dictionaries are merged on top of the class defaults (they are
        passed to the merge helper with ``model_params`` last). Cross validation
        is performed only when ``model_params[CommonConstants.ENABLE_CV_TAG]``
        equals ``'Y'``; in that case ``model_cross_validator_params`` takes part
        in the merge as well. Otherwise the model is trained directly.

        :param data: dataframe of the training data features
        :param target: outcome associated with the training data features
        :param model_params: The model configurations
        :param model_hyper_params: Hyper-parameter settings to train the model on
        :param model_cross_validator_params: cross validation settings, merged
               into the parameters only when cross validation is enabled
        :return: trained model
        :raises CommonBaseException: propagated unchanged when raised by a
                 callee, or wrapping any other exception raised during training
        """
        try:

            self.logger.warning("Validating the arguments")
            self.__validate_arguments(data, model_params, model_hyper_params,
                                      model_cross_validator_params)

            self.logger.warning("Validating the model parameters")
            CommonValidations.validate_model_params(model_params)

            model_name = model_params[CommonConstants.MODEL_NAME_TAG]
            self.logger.info("Getting model class implementation for: " +
                             str(model_name))
            model_class = AbstractMachineLearningModelFactory.get_instance(
                model_name=model_name)
            self.logger.info(
                "Model class implementation successfully obtained")

            self.logger.info(
                "Getting default params dict for the specific class")
            default_params_dict = model_class.get_default_params()
            self.logger.info("Default params retrieved: " +
                             str(default_params_dict))

            # Cross validation runs only when the flag is present and exactly
            # 'Y'; the decision is taken once and reused for merging and training.
            cross_validation_enabled = (
                model_params.get(CommonConstants.ENABLE_CV_TAG) == 'Y')

            if cross_validation_enabled:
                self.logger.info(
                    "Going to merge these dictionaries for model parameters: "
                    + "default params dictionary: " +
                    str(default_params_dict) +
                    "cross_validation_hyper_params: " +
                    str(model_cross_validator_params) + "model_params: " +
                    str(model_params))
                merged_params_dict = DictionaryUtils.merge_and_overwrite_dicts(
                    default_params_dict, model_hyper_params,
                    model_cross_validator_params, model_params)
            else:
                self.logger.info(
                    "Going to merge these dictionaries for model parameters: "
                    + "default params dictionary: " +
                    str(default_params_dict) + "model_hyper_params: " +
                    str(model_hyper_params) + "model_params: " +
                    str(model_params))
                merged_params_dict = DictionaryUtils.merge_and_overwrite_dicts(
                    default_params_dict, model_hyper_params, model_params)
                self.logger.info("Merged params dictionary: " +
                                 str(merged_params_dict))

            self.logger.warning(
                "Validating merged_params_dict for empty values")
            CommonValidations.validate_dict_for_empty_values(
                merged_params_dict)

            self.logger.warning(
                "Initializing model object from class implementation of: " +
                str(model_name) +
                " with parameters: " + str(merged_params_dict))
            model = model_class.initialize_model(
                params_dict=merged_params_dict)
            self.logger.warning("Model object initialized" +
                                " with target column: " + "'" +
                                CommonConstants.TARGET_COLUMN_TAG + "'" +
                                " with parameters: " +
                                str(model_class.get_model_params(model)))

            if cross_validation_enabled:
                self.logger.warning(
                    "Going to perform Cross Validation for selecting the best model"
                )
                trained_model = model_class.perform_cross_validation(
                    model=model,
                    data=data,
                    target=target,
                    params_dict=merged_params_dict)
                self.logger.warning(
                    "Cross Validation for the model " +
                    str(model_name) +
                    " completed successfully")
            else:
                self.logger.warning("Going to train the model")
                trained_model = model_class.train(
                    model=model,
                    data=data,
                    target=target,
                    params_dict=merged_params_dict)
                self.logger.warning("model successfully trained")

            # TODO: return and save trained_pipeline on hdfs
            return trained_model

        except (MissingMandatoryFieldException, InvalidInfoException,
                DataFrameException, InitializationException) as exp:
            raise CommonBaseException(exp)
        except CommonBaseException:
            # Already the public exception type: propagate it untouched, as the
            # sibling methods do, instead of wrapping it a second time.
            raise
        except Exception as exp:
            self.logger.error(
                'Exception occured while training model on model_params = ' +
                str(model_params) + 'model_hyper_params= ' +
                str(model_hyper_params))
            raise CommonBaseException(exp)