def capture_stdout_call(self):
    old_stdout = sys.stdout
    listener = io.StringIO()
    sys.stdout = listener
    log = Logger().logger
    log.info('SomeString')
    log.debug('SomeString')
    out = listener.getvalue()
    sys.stdout = old_stdout
    self.assertGreater(len(out), 0)
    return out

class LoadFilePostProcessor:
    def __init__(self, dataset: pd.DataFrame):
        self.log = Logger().logger
        self.dataset = dataset

    def process(self):
        """
        Function to start the LoadFilePostProcessor to correct the input
        file: strips the leading % (or #) from each column that starts with
        it and renames certain columns, like #CHROM to chr.

        Returns
        -------
        dataset : pandas.DataFrame
            Processed dataset with corrected % sign and renamed columns.
        """
        self.log.debug('Starting correcting % sign.')
        self._correct_percentage_sign()
        self.log.debug('% sign corrected, starting renaming of columns.')
        self._col_renamer()
        self.log.info('LoadFilePostProcessor successful.')
        return self.dataset

    def _correct_percentage_sign(self):
        new_columns = []
        for column in self.dataset.columns:
            if column.startswith('%'):
                new_columns.append(column.split('%')[1])
            elif column.startswith('#'):
                new_columns.append(column.split('#')[1])
            else:
                new_columns.append(column)
        self.dataset.columns = new_columns

    def _col_renamer(self):
        """
        Function to rename "Gene, Feature, SYMBOL, INTRON and EXON" to
        "GeneID, FeatureID, GeneName, Intron and Exon".
        """
        self.dataset.rename(columns={
            'CHROM': Column.chr.value,
            'POS': Column.pos.value,
            'REF': Column.ref.value,
            'ALT': Column.alt.value,
            'Gene': Column.gene_id.value,
            'SYMBOL_SOURCE': Column.id_source.value,
            'Feature': Column.feature.value,
            'Feature_type': Column.feature_type.value,
            'SYMBOL': Column.gene_name.value,
            'INTRON': 'Intron',
            'EXON': 'Exon',
            'MAX_AF': 'max_AF'
        }, inplace=True)

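# Hedged usage sketch (not part of the CAPICE source; the input values are made up):
# shows that process() first strips the leading '%'/'#' from the column names and then
# renames them to the Column enum values (e.g. '#CHROM' -> Column.chr.value).
import pandas as pd

example_input = pd.DataFrame({
    '#CHROM': ['1'],
    'POS': [100],
    '%REF': ['A'],
    '%ALT': ['T'],
    '%Gene': ['ENSG00000000001']
})
example_output = LoadFilePostProcessor(dataset=example_input).process()
# example_output.columns now holds the Column.chr/pos/ref/alt/gene_id values.
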
def test_stderr(self):
    print('Levels INFO and DEBUG not present in stderr')
    self.manager.loglevel = 10
    old_stderr = sys.stderr
    listener = io.StringIO()
    sys.stderr = listener
    log = Logger().logger
    log.info(self.not_present_string)
    log.debug(self.not_present_string)
    out = listener.getvalue()
    sys.stderr = old_stderr
    self.assertNotIn(self.not_present_string, out)

class ManualVEPProcessor:
    """
    Class ManualVEPProcessor, to process the not-yet-usable VEP-like
    features into features that are more usable.
    """

    def __init__(self):
        self.log = Logger().logger

    def process(self, dataset: pd.DataFrame):
        """
        Callable method for the ManualVEPProcessor to start processing.
        Loads all the VEP processors dynamically from /src/main/python/vep.

        :param dataset: pandas.DataFrame: loaded pandas dataframe of the
            user provided input TSV.
        :return: pandas.DataFrame: dataframe with processed features
        """
        self.log.info('Starting manual VEP feature processing.')
        vep_annotators = self._load_vep_processors()
        dropping_columns = []
        n_feats_processed = 0
        for processor in vep_annotators:
            if processor.name in dataset.columns and processor.usable:
                self.log.debug('Processing: %s', processor.name)
                dataset = processor.process(dataset)
                if processor.drop and processor.name not in dropping_columns:
                    dropping_columns.append(processor.name)
                n_feats_processed += 1
            else:
                self.log.warning(
                    'Could not use processor %s on input dataset!',
                    processor.name)
        self.log.debug('Property drop was set True for columns: %s',
                       ', '.join(dropping_columns))
        dataset.drop(columns=dropping_columns, inplace=True)
        self.log.info('Processing successful.')
        self.log.debug('Processed %d features.', n_feats_processed)
        return dataset

    def _load_vep_processors(self):
        location = os.path.join(get_project_root_dir(), 'vep')
        self.log.debug('Loading modules at %s', location)
        loader = DynamicLoader(required_attributes=['name', 'process'],
                               path=location)
        loaded_modules = loader.load_manual_annotators()
        self.log.debug('Loaded %d modules.', len(loaded_modules))
        return loaded_modules

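# Sketch of the duck-typed interface ManualVEPProcessor expects from the modules
# loaded out of /src/main/python/vep (this example annotator and the SIFT parsing
# are illustrative assumptions, not one of the shipped processors): 'name' must
# match an input column, 'usable' gates processing, 'drop' marks the source column
# for removal after processing.
import pandas as pd


class ExampleSiftAnnotator:
    name = 'SIFT'
    usable = True
    drop = True

    def process(self, dataset: pd.DataFrame) -> pd.DataFrame:
        # Derive a numeric score column from a 'label(score)' styled string column.
        dataset['SIFT_score'] = dataset[self.name].str.extract(
            r'\((.*)\)', expand=False).astype(float)
        return dataset
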
class CapiceImputing:
    """
    Class to perform the imputing on a fully VEP processed pandas dataframe.
    """

    def __init__(self, impute_values: dict):
        """
        :param impute_values: dict, Dictionary containing all features to be
            imputed as keys and the fill value as value. Can come from
            either the model or a loaded json.
        """
        self.log = Logger().logger
        self.log.info('Imputer started.')
        self.impute_values = impute_values
        self.pre_dtypes = {}
        self.dtypes = {}

    def impute(self, datafile: pd.DataFrame):
        """
        Function to call the CapiceImputing to start imputing.

        :param datafile: pandas DataFrame to be imputed
        :return: pandas DataFrame
        """
        # Get the amount of NaN per column
        self._get_nan_ratio_per_column(dataset=datafile)
        self._correct_dtypes(datafile=datafile)
        datafile.fillna(self.impute_values, inplace=True)
        datafile = datafile.astype(dtype=self.pre_dtypes, copy=False)
        datafile = datafile.astype(dtype=self.dtypes, copy=False)
        self.log.info('Imputing successfully performed.')
        return datafile

    def _correct_dtypes(self, datafile: pd.DataFrame):
        """
        Function to correct the dtypes that originate from the lookup
        annotator according to the dtypes specified within the data json.
        """
        # First, correct the Chromosome column, then the rest.
        datafile[Column.chr.value] = datafile[Column.chr.value].astype(str)
        for key, item in self.impute_values.items():
            if key in datafile.columns:
                # Required, see pydoc of _save_dtypes()
                self._save_dtypes(key=key, item=item)

    def _save_dtypes(self, key, item):
        """
        Pre-dtypes are required since converting to an integer requires a
        float first.
        """
        if isinstance(item, int):
            self.pre_dtypes[key] = float
        else:
            self.pre_dtypes[key] = type(item)
        self.dtypes[key] = type(item)

    def _get_nan_ratio_per_column(self, dataset: pd.DataFrame):
        """
        Generic function to get the percentage of gaps per column.

        :param dataset: not imputed pandas DataFrame
        """
        for column in dataset.columns:
            series = dataset[column]
            self._calculate_percentage_nan(column=series)

    @staticmethod
    def _calculate_percentage(value, total):
        return round((value / total) * 100, ndigits=2)

    def _calculate_percentage_nan(self, column):
        n_nan = column.isnull().sum()
        if n_nan > 0:
            n_samples = column.size
            p_nan = self._calculate_percentage(n_nan, n_samples)
            self.log.debug('NaN detected in column %s, percentage: %s%%.',
                           column.name, p_nan)

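# Minimal usage sketch (feature names and fill values are assumptions, not taken from
# a real CAPICE model): integer fill values go through a float intermediate dtype
# (see _save_dtypes), so 'cDNA_position' ends up as an integer column after imputing.
import numpy as np
import pandas as pd

example_impute_values = {'phyloP': 0.0, 'cDNA_position': 0}
frame = pd.DataFrame({
    Column.chr.value: [1, 2],
    'phyloP': [np.nan, 1.2],
    'cDNA_position': [10, np.nan]
})
imputed = CapiceImputing(impute_values=example_impute_values).impute(frame)
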
class CapiceExplain(Main):
    def __init__(self, model, output_path, output_given):
        super().__init__(input_path=None,
                         output_path=output_path,
                         output_given=output_given)
        self.model = model
        self.output = output_path
        self.log = Logger().logger

    def run(self):
        gain_importances = self._extract_features_importances_gain(self.model)
        total_gain_importances = self._extract_features_importances_total_gain(self.model)
        weight_importances = self._extract_features_importances_weight(self.model)
        cover_importances = self._extract_features_importances_cover(self.model)
        total_cover_importances = self._extract_features_importances_total_cover(self.model)
        importances = self._convert_importances_to_dataframe(gain_importances,
                                                             total_gain_importances,
                                                             weight_importances,
                                                             cover_importances,
                                                             total_cover_importances)
        self._order_importances(importances)
        self._export(importances, self.output)

    def _extract_features_importances_gain(self, model: xgb.XGBClassifier):
        self.log.info('Extracting gain from model.')
        feature_importances = model.get_booster().get_score(importance_type='gain')
        self.log.debug('Extracted %d gain features from model.',
                       len(feature_importances.keys()))
        return feature_importances

    def _extract_features_importances_total_gain(self, model: xgb.XGBClassifier):
        self.log.info('Extracting total gain from model.')
        feature_importances = model.get_booster().get_score(importance_type='total_gain')
        self.log.debug('Extracted %d total_gain features from model.',
                       len(feature_importances.keys()))
        return feature_importances

    def _extract_features_importances_weight(self, model: xgb.XGBClassifier):
        self.log.info('Extracting weight from model.')
        feature_importances = model.get_booster().get_score(importance_type='weight')
        self.log.debug('Extracted %d weight features from model.',
                       len(feature_importances.keys()))
        return feature_importances

    def _extract_features_importances_cover(self, model: xgb.XGBClassifier):
        self.log.info('Extracting cover from model.')
        feature_importances = model.get_booster().get_score(importance_type='cover')
        self.log.debug('Extracted %d cover features from model.',
                       len(feature_importances.keys()))
        return feature_importances

    def _extract_features_importances_total_cover(self, model: xgb.XGBClassifier):
        self.log.info('Extracting total cover from model.')
        feature_importances = model.get_booster().get_score(importance_type='total_cover')
        self.log.debug('Extracted %d total_cover features from model.',
                       len(feature_importances.keys()))
        return feature_importances

    def _convert_importances_to_dataframe(self, gain: dict, total_gain: dict,
                                          weight: dict, cover: dict,
                                          total_cover: dict):
        self.log.info('Converting importances to dataframe.')
        feature_importances = pd.DataFrame(data=[gain.keys(), gain.values()],
                                           index=['feature', 'gain']).T
        feature_importances['total_gain'] = feature_importances['feature'].map(total_gain)
        feature_importances['weight'] = feature_importances['feature'].map(weight)
        feature_importances['cover'] = feature_importances['feature'].map(cover)
        feature_importances['total_cover'] = feature_importances['feature'].map(total_cover)
        self.log.debug('Converted %d features into the dataframe',
                       feature_importances.shape[0])
        self.log.debug('Converted all %d importance types into the dataframe',
                       feature_importances.shape[1])
        return feature_importances

    def _order_importances(self, importances: pd.DataFrame):
        self.log.info('Ordering feature importances.')
        importances.sort_values(by='gain', ascending=False, inplace=True)

    def _export(self, dataset, output):
        output_path = os.path.join(output, CapiceManager().output_filename)
        dataset.to_csv(output_path, compression='gzip', index=False, sep='\t')
        if not self.output_given:
            print(f'Successfully exported explain to: {output_path}')

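# Hedged usage sketch (the toy model and output path are assumptions; a real run would
# pass the fitted classifier loaded from a CAPICE model file, and assumes
# CapiceManager().output_filename has been set): extracts the five importance types,
# orders them by gain and writes a gzipped TSV to the output directory.
import numpy as np
import xgboost as xgb

toy_model = xgb.XGBClassifier(n_estimators=2)
toy_model.fit(np.random.rand(20, 3), np.tile([0, 1], 10))
CapiceExplain(model=toy_model,
              output_path='/tmp/capice_out',
              output_given=True).run()
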
class PreProcessor:
    """
    Class to preprocess the data before predicting or training to separate
    categorical columns.
    """

    def __init__(self, exclude_features: list, model_features: list = None):
        """
        :param exclude_features: list,
            all the features that the preprocessor should not process.
            Features that are already excluded include:
            chr_pos_ref_alt, chr and pos.
        :param model_features: list (default None),
            a list containing all the features present within a model file.
        """
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.log.info('Preprocessor started.')
        self.train = False
        self.exclude_features = [
            Column.chr_pos_ref_alt.value,
            Column.chr.value,
            Column.pos.value
        ]
        self.exclude_features += exclude_features
        self.model_features = model_features
        self.objects = []

    def _is_train(self):
        if self.model_features is None:
            self.train = True

    def preprocess(self, dataset: pd.DataFrame):
        """
        Callable function for the preprocessor to start preprocessing.

        :param dataset: unprocessed pandas DataFrame
        :return: processed pandas DataFrame
        """
        self._is_train()
        dataset = self._create_preservation_col(dataset)
        self._get_categorical_columns(dataset)
        processed_dataset = self._process_objects(dataset)
        if not self.train:
            processed_dataset = self._ensure_columns_present(processed_dataset)
        self.log.info('Successfully preprocessed data.')
        return processed_dataset

    @staticmethod
    def _create_preservation_col(dataset):
        """
        Function to create the chr_pos_ref_alt column so that it doesn't get
        lost in preprocessing.

        :param dataset: unprocessed pandas DataFrame
        :return: unprocessed pandas DataFrame
            containing column 'chr_pos_ref_alt'
        """
        dataset[Column.chr_pos_ref_alt.value] = dataset[
            [Column.chr.value, Column.pos.value,
             Column.ref.value, Column.alt.value]
        ].astype(str).agg(UniqueSeparator.unique_separator.value.join, axis=1)
        return dataset

    def _get_categorical_columns(self, dataset: pd.DataFrame):
        """
        Function to get the categorical columns that are within the supplied
        annotation features of the imputing file.

        :param dataset: pandas DataFrame
        """
        for feature in dataset.select_dtypes(include=["O"]).columns:
            if feature not in self.exclude_features:
                self.objects.append(feature)
        self.log.debug('Converting the categorical columns: %s.',
                       ', '.join(self.objects))

    def _process_objects(self, dataset: pd.DataFrame):
        """
        (If train) will create a dictionary telling the processor how many
        categories are within a certain column.
        If not train: will look up each annotation feature from the impute
        file within the columns of the datafile (either in full name or the
        column starts with the feature from the impute file).
        This dictionary is then passed to the actual processor.

        :param dataset: unprocessed pandas DataFrame
        :return: processed pandas DataFrame
        """
        annotation_feats_dict = {}
        if self.train:
            hardcoded_features = [Column.ref.value, Column.alt.value]
            for feature in hardcoded_features:
                annotation_feats_dict[feature] = 5
            self.log.info('Training protocol, creating new categorical '
                          'conversion identifiers.')
            for feat in self.objects:
                if feat not in annotation_feats_dict.keys():
                    annotation_feats_dict[feat] = 5
        else:
            for feature in self.objects:
                annotation_feats_dict = self._process_objects_no_train(
                    feature=feature,
                    annotation_features_dict=annotation_feats_dict
                )
        processed_data = self._process_categorical_vars(
            dataset=dataset,
            annotation_feats_dict=annotation_feats_dict
        )
        return processed_data

    def _process_objects_no_train(self, feature: str,
                                  annotation_features_dict: dict):
        for model_feature in self.model_features:
            if model_feature.startswith(feature):
                extension = model_feature.split(''.join([feature, '_']))[-1]
                if feature in annotation_features_dict.keys():
                    annotation_features_dict[feature].append(extension)
                else:
                    annotation_features_dict[feature] = [extension]
        return annotation_features_dict

    def _process_categorical_vars(self, dataset: pd.DataFrame,
                                  annotation_feats_dict: dict):
        """
        Processor of categorical columns. Will create new columns based on
        the quantity of a value within a column.

        :param dataset: unprocessed pandas DataFrame
        :param annotation_feats_dict: dictionary that is to contain the
            levels for each categorical feature
        :return: processed pandas DataFrame
        """
        if self.train:
            for annotation_feature in annotation_feats_dict.keys():
                feature_names = self._get_top_n_cats(
                    column=dataset[annotation_feature],
                    return_num=annotation_feats_dict[annotation_feature]
                )
                dataset[annotation_feature] = np.where(
                    dataset[annotation_feature].isin(feature_names),
                    dataset[annotation_feature],
                    'other'
                )
        else:
            for annotation_feature in annotation_feats_dict.keys():
                feature_names = annotation_feats_dict[annotation_feature]
                self.log.debug('For feature: %s loaded %s levels: %s',
                               annotation_feature,
                               len(feature_names),
                               feature_names)
                dataset[annotation_feature] = np.where(
                    dataset[annotation_feature].isin(feature_names),
                    dataset[annotation_feature],
                    'other'
                )
        dataset = pd.get_dummies(dataset,
                                 columns=list(annotation_feats_dict.keys()))
        return dataset

    def _get_top_n_cats(self, column: pd.Series, return_num: int):
        """
        Function for when a training file is preprocessed to get the top
        return_num quantity values within a categorical column.
        Some converting is done for the logger to be able to print them.

        :param column: pandas Series
        :param return_num: integer
        :return: numpy array containing the top return_num values of the
            column
        """
        value_counts = column.value_counts().index[:return_num].values
        printable_value_counts = []
        for value in value_counts:
            if not isinstance(value, str):
                value = str(value)
            printable_value_counts.append(value)
        message = 'For feature: %s saved the following values: %s'
        self.log.info(message, column.name, ', '.join(printable_value_counts))
        return value_counts

    def _ensure_columns_present(self, dataset):
        """
        Function to ensure that for the prediction all prediction columns
        are present. If a column is not present, add it as a full column
        of NaN.
        """
        column_utils = ColumnUtils()
        column_utils.set_specified_columns(self.model_features)
        missing = column_utils.get_missing_diff_with(dataset.columns)
        for feature in missing:
            message = ('Detected column %s not present in columns. '
                       'Adding full column of NaN')
            self.log.debug(message, feature)
            dataset[feature] = np.nan
        return dataset

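# Hedged usage sketch of the predict protocol (the feature names are illustrative
# assumptions, not taken from a real model file): passing model_features makes the
# PreProcessor look the categorical levels up from the model columns instead of
# learning them, and _ensure_columns_present adds any model column that the dummy
# encoding did not produce as a column of NaN.
import pandas as pd

model_feats = ['Consequence_missense_variant', 'Consequence_other', 'phyloP']
frame = pd.DataFrame({
    Column.chr.value: ['1'],
    Column.pos.value: [100],
    Column.ref.value: ['A'],
    Column.alt.value: ['T'],
    'Consequence': ['missense_variant'],
    'phyloP': [0.5]
})
preprocessor = PreProcessor(exclude_features=[], model_features=model_feats)
processed = preprocessor.preprocess(frame)
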
class Main(ABC):
    """
    Main class of CAPICE that contains methods to help the different modes
    to function.
    """

    def __init__(self, input_path, output_path, output_given):
        # Assumes CapiceManager has been initialized & filled.
        self.manager = CapiceManager()
        self.log = Logger().logger
        self.log.info('Initiating selected mode.')

        # Input file.
        self.infile = input_path
        self.log.debug('Input argument -i / --input confirmed: %s',
                       self.infile)

        # Output file.
        self.output = output_path
        self.log.debug('Output directory -o / --output confirmed: %s',
                       self.output)
        self.output_given = output_given

        # Preprocessor global exclusion features.
        # Overwrite in specific module if features are incorrect.
        self.exclude_features = [Column.gene_name.value,
                                 Column.gene_id.value,
                                 Column.id_source.value,
                                 Column.feature.value,
                                 Column.feature_type.value]

    @abstractmethod
    def run(self):
        pass

    def _load_file(self, additional_required_features: list = None):
        """
        Function to load the input TSV file into main.

        :return: pandas DataFrame
        """
        input_parser = InputParser()
        input_file = input_parser.parse(input_file_path=self.infile)
        post_load_processor = LoadFilePostProcessor(dataset=input_file)
        input_file = post_load_processor.process()
        validator = PostFileParseValidator()
        # Individual calls to the validator for error readability
        validator.validate_variants_present(input_file)
        validator.validate_chrom_pos(input_file)
        validator.validate_n_columns(input_file)
        validator.validate_minimally_required_columns(
            input_file,
            additional_required_features=additional_required_features
        )
        return input_file

    @staticmethod
    def process(loaded_data):
        """
        Function to process the VEP features to CAPICE features.
        """
        processor = ManualVEPProcessor()
        processed_data = processor.process(dataset=loaded_data)
        return processed_data

    @staticmethod
    def impute(loaded_data, impute_values):
        """
        Function to perform imputing over the loaded data.
        The model can be None, but impute_values has to be supplied in that
        case.
        """
        capice_imputer = CapiceImputing(impute_values=impute_values)
        capice_data = capice_imputer.impute(loaded_data)
        return capice_data

    def preprocess(self, loaded_data, model_features=None):
        """
        Function to perform the preprocessing of the loaded data to convert
        categorical columns.

        :param loaded_data: Pandas dataframe of the imputed CAPICE data
        :param model_features: list (default None),
            a list containing all the features present within a model file.
            When set to None, PreProcessor will activate the train protocol.

        Note: please adjust self.exclude_features to include all of the
        features that the preprocessor should NOT process.
        Features chr_pos_ref_alt, chr and pos are hardcoded and thus do not
        have to be included.
        """
        preprocessor = PreProcessor(
            exclude_features=self.exclude_features,
            model_features=model_features)
        capice_data = preprocessor.preprocess(loaded_data)
        return capice_data

    def _export(self, dataset, output):
        """
        Function to prepare the data to be exported.
        """
        CapiceExporter(
            file_path=output,
            output_given=self.output_given
        ).export_capice_prediction(datafile=dataset)

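# Hedged sketch (not an actual CAPICE mode; impute_values and model_features stand in
# for values a real mode reads from the model file): a minimal concrete Main subclass
# illustrating the intended order of the pipeline helpers in run().
class ExampleMode(Main):
    def __init__(self, input_path, output_path, output_given,
                 impute_values, model_features):
        super().__init__(input_path, output_path, output_given)
        self.impute_values = impute_values
        self.model_features = model_features

    def run(self):
        data = self._load_file()
        data = self.process(loaded_data=data)
        data = self.impute(loaded_data=data, impute_values=self.impute_values)
        data = self.preprocess(loaded_data=data,
                               model_features=self.model_features)
        self._export(data, self.output)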