class PropertyCheckerLogger(PropertyChecker):
    """
    PropertyChecker variant that reports a failed property check through
    the CAPICE logging framework.
    """

    def __init__(self):
        super().__init__()
        # Shared project logger; used by the _talk_to_logger hook below.
        self.log = Logger().logger

    def _talk_to_logger(self, error_message):
        # Hook invoked by PropertyChecker when a check fails: escalate the
        # message as critical before the base class acts on it.
        self.log.critical(error_message)
class ManualAnnotator:
    """
    Dynamically discovers the VEP annotation processor modules and applies
    every usable one to an input dataset.
    """

    def __init__(self):
        self.log = Logger().logger
        self.vep_annotators = []
        # Directory containing the individual VEP processor modules.
        self.location = os.path.join(get_project_root_dir(), 'src', 'main',
                                     'python', 'resources', 'annotaters',
                                     'vep')
        self._load_vep_annotators()

    def _load_vep_annotators(self):
        # Discover candidate python files, import them, then keep only the
        # modules that expose a `name` and flag themselves as usable.
        candidates = load_modules(self.location)
        self._check_n_modules(candidates)
        imported = importer(candidates, path=self.location)
        self._check_n_modules(imported)
        self.vep_annotators.extend(
            module for module in imported
            if 'name' in dir(module) and module.usable
        )
        self._check_n_modules(self.vep_annotators)

    def _check_n_modules(self, modules_list):
        # Guard: an empty module list means the resource directory is
        # missing or was moved — that is unrecoverable.
        if not modules_list:
            error_message = 'Unable to locate VEP Processors at {}, ' \
                            'was the directory moved?'.format(self.location)
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)

    def process(self, dataset: pd.DataFrame):
        """
        Apply every loaded VEP processor to dataset.

        Processors whose source column is absent (or that are not usable)
        are skipped with a warning; processors marked `drop` have their
        source column removed after processing.

        :param dataset: pandas DataFrame
        :return: pandas DataFrame
        """
        for processor in self.vep_annotators:
            applicable = (processor.name in dataset.columns
                          and processor.usable)
            if not applicable:
                self.log.warning(
                    'Could not use processor {} on input dataset!'.format(
                        processor.name
                    )
                )
                continue
            dataset = processor.process(dataset)
            if processor.drop:
                dataset.drop(columns=processor.name, inplace=True)
        return dataset
class TrainChecker:
    """
    Class specific to the train_model.py to check certain parts within
    it's process.
    """

    def __init__(self):
        self.log = Logger().logger

    def check_specified_defaults(self, loaded_defaults: dict):
        """
        Function to check if "learning_rate", "n_estimators" and
        "max_depth" are present within the specified defaults file.
        Also check if the variable type match the expected type.

        :param loaded_defaults: dict
        :raises KeyError: when a required argument is absent.
        :raises TypeError: when an argument has an unexpected type.
        """
        required_arguments = {
            'learning_rate': float,
            'n_estimators': int,
            'max_depth': int
        }
        for argument, expected_type in required_arguments.items():
            if argument not in loaded_defaults.keys():
                error_message = 'Argument {} is not found in the ' \
                                'specified defaults file!'.format(argument)
                self.log.critical(error_message)
                raise KeyError(error_message)
            if not isinstance(loaded_defaults[argument], expected_type):
                error_message = """
                For argument expected type: {},
                but got: {}
                """.format(expected_type,
                           type(loaded_defaults[argument]))
                self.log.critical(error_message)
                raise TypeError(error_message)

    def check_labels(self, dataset: pd.DataFrame, include_balancing=False):
        """
        Function to check if "binarized_label" and "sample_weight" are
        present within the columns of a given dataset. Set
        include_balancing to true if user wants to perform balancing
        algorithm.

        :param dataset: pandas DataFrame
        :param include_balancing: bool
        :raises KeyError: when a required column is missing.
        """
        required_columns = ['binarized_label', 'sample_weight']
        if include_balancing:
            required_columns += ['Consequence', 'MAX_AF']
        for col_name in required_columns:
            if col_name in dataset.columns:
                continue
            error_message = """
            Error locating label {} within dataset!
            """.format(col_name)
            self.log.critical(error_message)
            raise KeyError(error_message)
class TemplateImputeValues(metaclass=ABCMeta):
    """
    Abstract template class for new imputing files.
    """

    def __init__(self, name, usable, vep_version, grch_build):
        self.log = Logger().logger
        # Property checker that logs before raising on a bad value.
        self.property_checker = PropertyCheckerLogger()
        self.name = name
        self.usable = usable
        self.supported_vep_version = vep_version
        self.supported_grch_build = grch_build
        # Parsed contents of the JSON file this template points at.
        self.impute_data = self._get_impute_data()

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value='Template'):
        self.property_checker.check_property(value=value, expected_type=str)
        self._name = value

    @property
    def usable(self):
        return self._usable

    @usable.setter
    def usable(self, value=False):
        self.property_checker.check_property(value=value, expected_type=bool)
        self._usable = value

    @property
    def supported_vep_version(self):
        return self._vep_version

    @supported_vep_version.setter
    def supported_vep_version(self, value):
        self.property_checker.check_property(value=value, expected_type=float)
        self._vep_version = value

    @property
    def supported_grch_build(self):
        return self._grch_build

    @supported_grch_build.setter
    def supported_grch_build(self, value):
        self.property_checker.check_property(value=value, expected_type=int)
        self._grch_build = value

    def _get_impute_data(self):
        # Load the impute JSON once, at construction time.
        with open(self._json_loc()) as json_file:
            return json.load(json_file)

    def _json_loc(self):
        # Resolve the JSON filename supplied by the subclass; the literal
        # 'none' marks an unimplemented template.
        json_name = self._json_name()
        if json_name == 'none':
            error_message = 'Location of JSON must be specified!'
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)
        path = os.path.join(get_project_root_dir(), 'src', 'main', 'python',
                            'resources', 'data_files', 'json_data')
        return os.path.join(path, json_name)

    @staticmethod
    @abstractmethod
    def _json_name():
        """
        Abstract template function to define the location of where the
        imputing JSON is stored, containing the required columns for the
        input datafile.

        :return: path-like
        """
        return 'none'

    @property
    def annotation_features(self):
        """
        Property getter annotation_feature.
        Get the annotation features defined within the impute file.

        :return: list
        """
        return list(self.impute_data.keys())

    @property
    def impute_values(self):
        """
        Property impute_values getter.
        Get the default / impute values as defined within an impute file,
        skipping entries without a value.

        :return: dict
        """
        return {key: value for key, value in self.impute_data.items()
                if value is not None}
class PreProcessor:
    """
    Class to dynamically load in all model files for preprocessing and
    choosing the correct preprocessing file according to the given config
    arguments or parsed VEP file header. (or the --overwrite_model_file
    argument)
    """

    def __init__(self, is_train: bool = False):
        self.manager = CapiceManager()
        self.log = Logger().logger
        self.log.info('Preprocessor started.')
        self.overrule = self.manager.overwrite_model
        self.vep_version = self.manager.vep_version
        self.grch_build = self.manager.grch_build
        self.train = is_train
        self.preprocessors = []
        self.preprocessor = None
        self._prepare_preprocessor()

    def _prepare_preprocessor(self):
        """
        Function to see if the training protocol should be used or the
        preprocessors should be loaded in.
        """
        if self.train:
            from src.main.python.resources.models.training_preprocessor \
                import TrainPreprocessor
            self.preprocessor = TrainPreprocessor()
            return
        self._load_preprocessors()
        self._load_correct_preprocessor()
        self._check_preprocessor_is_applied()

    def _load_preprocessors(self):
        """
        Function to dynamically load in the preprocessors modules, but must
        have the following properties: name, supported_vep_version and
        supported_genomebuild_version.
        """
        self.log.info('Identifying preprocessing files.')
        directory = os.path.join(get_project_root_dir(), 'src', 'main',
                                 'python', 'resources', 'models')
        usable_modules = load_modules(directory)
        if len(usable_modules) < 1:
            self._raise_no_module_found_error()
        imported_modules = importer(
            usable_modules=usable_modules,
            path=directory
        )
        # A module only qualifies when it exposes all three attributes.
        required = ('name', 'supported_vep_version', 'supported_grch_build')
        self.preprocessors = [
            module for module in imported_modules
            if all(attribute in dir(module) for attribute in required)
        ]
        if len(self.preprocessors) < 1:
            self._raise_no_module_found_error()
        self.log.info(
            'Succesfully loaded {} preprocessors.'.format(
                len(self.preprocessors)
            )
        )

    def _raise_no_module_found_error(self):
        """
        Specialized function to be used into _load_preprocessors() and
        _load_correct_preprocessor() to be raised when no preprocessing
        files can be found.
        """
        error_message = 'No usable python files are ' \
                        'found within the model directory!'
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _load_correct_preprocessor(self):
        """
        Function to check the dynamically loaded preprocessors to match
        either the overrule argument or the vep version and genome build.
        """
        for candidate in self.preprocessors:
            if self.overrule and candidate.name == self.overrule:
                self.log.info(
                    'Overrule successful for: {} , located at: {}'.format(
                        self.overrule,
                        inspect.getfile(candidate.__class__)
                    )
                )
                self.preprocessor = candidate
                break
            elif candidate.supported_vep_version == self.vep_version and \
                    candidate.supported_grch_build == self.grch_build:
                self.log.info("""
                Preprocessing and model file successfully found: {} ,
                Located at: {}
                """.format(
                    candidate.name,
                    inspect.getfile(candidate.__class__)
                ).strip()
                )
                self.preprocessor = candidate
                break

    def _check_preprocessor_is_applied(self):
        # Selection failed entirely: raise with a message matching the
        # selection mode (overrule vs. version matching) that was tried.
        if self.preprocessor is not None:
            return
        if self.overrule:
            error_message = 'No model data file found for overrule: ' \
                            '{}'.format(self.overrule)
        else:
            error_message = """
            No model data file found for
            VEP version: {} and
            genome build: {}""".format(
                self.vep_version,
                self.grch_build
            ).strip()
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def preprocess(self, datafile: pd.DataFrame):
        """
        Callable function for external modules to start call the
        preprocessor of the correctly chosen module.

        :param datafile: unprocessed pandas DataFrame
        :return: processed pandas Dataframe
        """
        return self.preprocessor.preprocess(
            dataset=datafile,
            is_train=self.train
        )

    def predict(self, datafile: pd.DataFrame):
        """
        Callable function for external modules to start the call to the
        predict of the correctly chosen module.

        :param datafile: preprocessed pandas DataFrame
        :return: predicted pandas DataFrame
        """
        return self.preprocessor.predict(data=datafile)

    def get_model_features(self):
        # The chosen module carries the feature list; guard against being
        # called before _prepare_preprocessor() succeeded.
        if self.preprocessor is None:
            error_message = "Preprocessor has to be initialized before " \
                            "model features can be requested."
            self.log.critical(error_message)
            raise InitializationError(error_message)
        return self.preprocessor.model_features
class CapiceImputing:
    """
    Class to dynamically load in all imputing files and identify the file
    suitable for the run's use case.
    """

    def __init__(self):
        self.manager = CapiceManager()
        self.vep_version = self.manager.vep_version
        self.grch_build = self.manager.grch_build
        self.log = Logger().logger
        self.log.info('Imputer started.')
        self.overrule = self.manager.overwrite_impute
        self.modules = []
        self.module = None
        self._load_modules()
        self._is_correct_datafile_present()
        self._check_if_imputer_is_applied()
        # Per-run state, filled in when impute() is called.
        self.columns = []
        self.annotation_columns_present = []
        self.impute_values = {}
        self.pre_dtypes = {}
        self.dtypes = {}

    def _load_modules(self):
        """
        Method to dynamically load in all python files containing a class
        that contains the properties name and _json_name. If at the end of
        this function the list of impute files is empty, will throw the
        module not found error.
        """
        self.log.info('Identifying imputing files.')
        directory = os.path.join(get_project_root_dir(), 'src', 'main',
                                 'python', 'resources', 'data_files',
                                 'imputing')
        usable_modules = load_modules(directory)
        if len(usable_modules) < 1:
            self._raise_no_module_found_error()
        loaded_modules = importer(usable_modules=usable_modules,
                                  path=directory)
        for module in loaded_modules:
            if "name" in dir(module) and "_json_name" in dir(module):
                self.modules.append(module)
        if len(self.modules) < 1:
            self._raise_no_module_found_error()
        self.log.info(
            'Identified {} files available for usage in imputing.'.format(
                len(self.modules)))

    def _raise_no_module_found_error(self):
        """
        Function to raise when no suitable impute files are found.
        Put into a function since 2 other functions within this module
        will use it.
        """
        error_message = 'No usable python files are found ' \
                        'within the imputing directory!'
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _is_correct_datafile_present(self):
        """
        Function to check the VEP version and GRCh build
        (or --overwrite_impute_file) match the impute file.
        """
        for module in self.modules:
            if self.overrule and module.name == self.overrule:
                self.log.info(
                    'Overrule successful for: {} , located at: {}'.format(
                        self.overrule, inspect.getfile(module.__class__)))
                self.module = module
                break
            else:
                module_vep_version = module.supported_vep_version
                module_grch_build = module.supported_grch_build
                if module_vep_version == self.vep_version and \
                        module_grch_build == self.grch_build:
                    self.log.info('Impute data file successfully found: {} , '
                                  'located at: {}'.format(
                                      module.name,
                                      inspect.getfile(module.__class__)))
                    self.module = module
                    break

    def _check_if_imputer_is_applied(self):
        # Checking if self.module is assigned; if not, no impute file
        # matched the run and CAPICE cannot continue.
        if self.module is None:
            if self.overrule:
                error_message = 'No imputing data file found for overrule: ' \
                                '{}'.format(self.overrule)
            else:
                error_message = 'No imputing data file found for ' \
                                'VEP version: {} and ' \
                                'GRCh build: {}'.format(self.vep_version,
                                                        self.grch_build
                                                        )
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)

    def _load_values(self, dataset: pd.DataFrame):
        """
        Function to be called right when impute() is called, gets the
        input datafile features, imputes values from the impute file and
        saves the datafile features to the manager.
        """
        self.columns = self.module.annotation_features
        # NOTE(review): annotation_columns_present is appended to without
        # being reset; calling impute() twice on one instance would
        # accumulate duplicates — confirm single-use is intended.
        for col in self.columns:
            if col in dataset.columns:
                self.annotation_columns_present.append(col)
        self.manager.annotation_features = self.columns
        self.impute_values = self.module.impute_values

    def impute(self, datafile: pd.DataFrame):
        """
        Function to call the CapiceImputing to start imputing.

        :return: pandas DataFrame
        """
        self._load_values(datafile)
        datafile = self._check_chrom_pos(datafile)
        self._get_nan_ratio_per_column(dataset=datafile)
        self._get_full_nan_row(dataset=datafile)
        # Fix: a previous `datafile.dropna(how='all', subset=...)` call
        # here discarded its return value (no inplace=True) and was a
        # no-op; the CAPICE_drop_out filter below removes exactly those
        # rows whose annotation columns are all NaN, so the dead statement
        # is removed.
        datafile = datafile[~datafile['CAPICE_drop_out']]
        datafile.drop(columns=['CAPICE_drop_out'], inplace=True)
        self._correct_dtypes(datafile=datafile)
        datafile.fillna(self.impute_values, inplace=True)
        # Two-step cast: integer targets go through float first so columns
        # that held NaN can be cast safely before the final dtype.
        datafile = datafile.astype(dtype=self.pre_dtypes, copy=False)
        datafile = datafile.astype(dtype=self.dtypes, copy=False)
        datafile = self._add_missing_columns(datafile)
        self.log.info('Imputing successfully performed.')
        return datafile

    @deprecated
    def _add_missing_columns(self, datafile: pd.DataFrame):
        # Backfill any impute-file column absent from the input with its
        # default value.
        for key, value in self.impute_values.items():
            if key not in datafile.columns:
                datafile[key] = value
        return datafile

    def _correct_dtypes(self, datafile: pd.DataFrame):
        """
        Function to correct the dtypes that originate from the lookup
        annotator according to the dtypes specified within the data json.
        """
        for key, item in self.impute_values.items():
            if key in datafile.columns:
                self._save_dtypes(key=key, item=item)

    def _save_dtypes(self, key, item):
        # Integers are staged as float (NaN-compatible) before the final
        # cast back to int; other types cast directly.
        if isinstance(item, int):
            self.pre_dtypes[key] = float
        else:
            self.pre_dtypes[key] = type(item)
        self.dtypes[key] = type(item)

    def _check_chrom_pos(self, dataset: pd.DataFrame):
        """
        Function to check if all values of the columns Chr and Pos are
        present.

        :param dataset: not imputed pandas DataFrame
        :return: pandas DataFrame containing no NaN or gaps for Chr and
            Pos columns.
        """
        chrom_is_float = False
        if dataset['Chr'].isnull().values.any():
            if dataset.dtypes['Chr'] == np.float64:
                chrom_is_float = True
            n_delete = dataset['Chr'].isnull().values.sum()
            self.log.warning('Detected NaN in the Chromosome column! '
                             'Deleting {} row(s).'.format(n_delete))
            dataset = dataset[~dataset['Chr'].isnull()]
        if dataset['Pos'].isnull().values.any():
            n_delete = dataset['Pos'].isnull().values.sum()
            # Fix: message previously read "NaN is the Position column".
            self.log.warning('Detected NaN in the Position column! '
                             'Deleting {} row(s).'.format(n_delete))
            dataset = dataset[~dataset['Pos'].isnull()]
        dataset.index = range(0, dataset.shape[0])
        if chrom_is_float:
            dataset['Chr'] = dataset['Chr'].astype(int)
            dataset['Chr'] = dataset['Chr'].astype(str)
        dataset['Pos'] = dataset['Pos'].astype(int)
        return dataset

    def _get_nan_ratio_per_column(self, dataset: pd.DataFrame):
        """
        Generic function to get the percentage of gaps per column

        :param dataset: not imputed pandas DataFrame
        """
        for column in dataset.columns:
            series = dataset[column]
            self._calculate_percentage_nan(column=series)

    def _calculate_percentage_nan(self, column):
        # Debug-log the NaN ratio of a single pandas Series.
        n_nan = column.isnull().sum()
        if n_nan > 0:
            n_samples = column.size
            p_nan = round((n_nan / n_samples) * 100, ndigits=2)
            self.log.debug(
                'NaN detected in column {}, percentage: {}%.'.format(
                    column.name, p_nan))

    def _get_full_nan_row(self, dataset: pd.DataFrame):
        """
        Function to get the samples of which absolutely no prediction is
        possible due to all non chr pos ref alt rows being gaps.

        :param dataset: not imputed pandas DataFrame
        """
        n_samples = dataset.shape[0]
        dataset.index = range(1, n_samples + 1)
        dataset['CAPICE_drop_out'] = dataset[
            self.annotation_columns_present].isnull().values.all(axis=1)
        samples_dropped_out = dataset[dataset['CAPICE_drop_out']]
        if samples_dropped_out.shape[0] > 0:
            # Fix: the message previously claimed the index starts at 0,
            # but the index is explicitly set to start at 1 above.
            self.log.warning(
                'The following samples are filtered out due to '
                'missing values: '
                '(the index shown starts at 1). \n {}'.format(
                    samples_dropped_out[[
                        'Chr', 'Pos', 'Ref', 'Alt', 'FeatureID'
                    ]]))
        else:
            self.log.info(
                'No samples are filtered out due to too many NaN values.')
class InputVersionChecker:
    """
    Class to check the given VEP config argument and file VEP version
    match. Class is self running.
    """

    def __init__(self, config_vep_version: float, file_vep_version: float,
                 config_grch_build: int, file_grch_build: int):
        """
        Class to check the given VEP config argument and the header of the
        VEP file match.

        Arguments that could not be determined are expected to be passed
        as False (see _check_individual_argument).

        :param config_vep_version: float, VEP version from the config
            argument
        :param file_vep_version: float, VEP version parsed from the file
            header
        :param config_grch_build: int, GRCh build from the config argument
        :param file_grch_build: int, GRCh build parsed from the file header
        """
        self.config_vep_version = config_vep_version
        self.file_vep_version = file_vep_version
        self.config_grch_build = config_grch_build
        self.file_grch_build = file_grch_build
        self.manager = CapiceManager()
        self.export_vep_version = None
        self.export_grch_build = None
        self.check_match = []
        self.unable_check = []
        self.check_overrule = False
        self.log = Logger().logger
        self._check_all_present()
        if self.check_overrule:
            self._check_overrule()
        self._check_version_match()
        self._set_global_vep_version()
        self._set_global_grch_build()

    def _set_global_vep_version(self):
        """
        Function to provide the CapiceManager with the VEP version to be
        used globally later on in CAPICE.
        """
        self.manager.vep_version = self.export_vep_version
        self.log.info('VEP version set to: {}'.format(self.export_vep_version))

    def _set_global_grch_build(self):
        """
        Function to provide the CapiceManager with the Genome Build
        version to be used globally later on in CAPICE.
        """
        self.manager.grch_build = self.export_grch_build
        self.log.info('GRCh build set to: {}'.format(self.export_grch_build))

    def _check_overrule(self):
        """
        Function called when either the VEP version or GRCh build can not
        be determined. Overrule must be present for both impute and model,
        since it can not determine what file to use without VEP or GRCh
        argument.

        :raises InputError: when neither overwrite is configured.
        """
        if self.manager.overwrite_impute is False and \
                self.manager.overwrite_model is False:
            error_message = """
            VEP version or GRCh build not specified and both overwrites are
            not set!
            Not able to find a correct impute or processing file!
            """.strip()
            self.log.critical(error_message)
            raise InputError(error_message)

    def _check_all_present(self):
        """
        Function to check if both the VEP version and GRCh build are
        present within either the config arguments or within the file.
        """
        dict_of_all_present = {
            'VEP': [self.file_vep_version, self.config_vep_version],
            'GRCh': [self.file_grch_build, self.config_grch_build]
        }
        for type_of_check in dict_of_all_present.keys():
            to_check = dict_of_all_present[type_of_check]
            self._check_individual_argument(to_check=to_check,
                                            type_of_check=type_of_check)

    def _check_individual_argument(self, to_check, type_of_check):
        """
        Function belonging to _check_all_present to check if a VEP version
        and GRCh build can be set globally.

        :param to_check: list
        :param type_of_check: string
        """
        if False in to_check:
            if to_check.count(False) == len(to_check):
                self._turn_on_check_overrule(type_of_check=type_of_check)
            for argument in to_check:
                self._apply_export_version(argument=argument,
                                           type_of_check=type_of_check)
        else:
            self.check_match.append(type_of_check)

    def _turn_on_check_overrule(self, type_of_check):
        """
        Function to turn on the overrule check if no VEP or GRCh arguments
        are passed.
        """
        # Fix: check_overrule was first assigned type_of_check and then
        # immediately overwritten with True; the dead store is removed.
        self.log.warning(
            'Unable to obtain {} version from file or config file!'.format(
                type_of_check))
        self.check_overrule = True

    def _apply_export_version(self, argument, type_of_check):
        """
        Function to set the global VEP version or GRCh build.

        :param argument: int or float
        """
        if argument is not False:
            if type_of_check == 'VEP':
                self.export_vep_version = argument
            else:
                self.export_grch_build = argument

    def _check_version_match(self):
        """
        Function to check if the Config Argument and the file header
        specified VEP versions match. If not: use the config argument as
        form of "overwrite" and warn.
        """
        if len(self.check_match) > 0:
            for check_match in self.check_match:
                if check_match == 'VEP':
                    self._check_vep_match(check_match=check_match)
                elif check_match == 'GRCh':
                    self._check_grch_match(check_match=check_match)

    def _check_vep_match(self, check_match):
        # Fix: version_cla and version_file were swapped here, so the
        # mismatch warning attributed the file version to the CLA and
        # vice versa; _check_grch_match shows the intended order.
        if self.file_vep_version != self.config_vep_version:
            self._raise_version_mismatch(
                type_of_mismatch=check_match,
                version_cla=self.config_vep_version,
                version_file=self.file_vep_version)
        else:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         match_successful=True)
        self.export_vep_version = self.file_vep_version

    def _check_grch_match(self, check_match):
        if self.config_grch_build != self.file_grch_build:
            self._raise_version_mismatch(
                type_of_mismatch=check_match,
                version_cla=self.config_grch_build,
                version_file=self.file_grch_build)
        else:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         match_successful=True)
        self.export_grch_build = self.file_grch_build

    def _raise_version_mismatch(self, type_of_mismatch, version_cla=None,
                                version_file=None, match_successful=False):
        # Either confirm the match or emit both a warnings.warn and a log
        # warning describing the CLA/file discrepancy.
        if match_successful:
            self.log.info(
                'Successfully matched CLA and file versions for {}.'.format(
                    type_of_mismatch))
        else:
            warning_message = """
            Warning matching {} versions.
            CLA version supplied: {} does not match
            file version: {} !""".format(
                type_of_mismatch, version_cla, version_file).strip()
            warnings.warn(warning_message)
            self.log.warning(warning_message)