Example 1
    def capture_stdout_call(self):
        old_stdout = sys.stdout
        listener = io.StringIO()
        sys.stdout = listener
        log = Logger().logger
        log.info('SomeString')
        log.debug('SomeString')
        out = listener.getvalue()
        sys.stdout = old_stdout
        self.assertGreater(len(out), 0)
        return out
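
The helper above captures logger output by swapping sys.stdout for an io.StringIO buffer before the logger is instantiated. A minimal, self-contained sketch of the same idea, assuming a plain logging.Logger as a stand-in for the project's Logger() singleton, attaches a handler to the buffer directly:

import io
import logging

# Stand-in for Logger().logger: a plain logger writing into an in-memory buffer.
listener = io.StringIO()
log = logging.getLogger('capture_demo')
log.setLevel(logging.DEBUG)
handler = logging.StreamHandler(stream=listener)
log.addHandler(handler)

log.info('SomeString')
log.debug('SomeString')

captured = listener.getvalue()
assert 'SomeString' in captured  # both records ended up in the buffer
log.removeHandler(handler)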
Example 2
class LoadFilePostProcessor:
    def __init__(self, dataset: pd.DataFrame):
        self.log = Logger().logger
        self.dataset = dataset

    def process(self):
        """
        Function to start the LoadFilePostProcessor: strips the leading % (or
        #) from each column name in the input file and renames certain
        columns, e.g. #CHROM to chr.

        Returns
        -------
        dataset :   pandas.DataFrame
                    Processed dataset with corrected % sign and renamed columns.
        """
        self.log.debug('Starting correcting % sign.')
        self._correct_percentage_sign()
        self.log.debug('% sign corrected, starting renaming of columns.')
        self._col_renamer()
        self.log.info('LoadFilePostProcessor successful.')
        return self.dataset

    def _correct_percentage_sign(self):
        new_columns = []
        for column in self.dataset.columns:
            if column.startswith('%'):
                new_columns.append(column.split('%')[1])
            elif column.startswith('#'):
                new_columns.append(column.split('#')[1])
            else:
                new_columns.append(column)
        self.dataset.columns = new_columns

    def _col_renamer(self):
        """
        Function to rename VEP-style column names (such as CHROM, Gene,
        Feature, SYMBOL, INTRON and EXON) to their CAPICE equivalents
        (chr, GeneID, FeatureID, GeneName, Intron and Exon).
        """
        self.dataset.rename(columns={
            'CHROM': Column.chr.value,
            'POS': Column.pos.value,
            'REF': Column.ref.value,
            'ALT': Column.alt.value,
            'Gene': Column.gene_id.value,
            'SYMBOL_SOURCE': Column.id_source.value,
            'Feature': Column.feature.value,
            'Feature_type': Column.feature_type.value,
            'SYMBOL': Column.gene_name.value,
            'INTRON': 'Intron',
            'EXON': 'Exon',
            'MAX_AF': 'max_AF'
        }, inplace=True)
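
As a quick, hypothetical illustration of the clean-up this post-processor performs (plain string column names stand in for the Column enum values used in the CAPICE codebase):

import pandas as pd

# Toy input with VEP-style headers; the real input comes from the parsed TSV.
df = pd.DataFrame({'#CHROM': [1], '%POS': [100], '%REF': ['A'], '%ALT': ['T']})

# Strip the leading % / # markers, then rename to the CAPICE column names.
df.columns = [c.lstrip('%#') for c in df.columns]
df.rename(columns={'CHROM': 'chr', 'POS': 'pos', 'REF': 'ref', 'ALT': 'alt'},
          inplace=True)
print(df.columns.tolist())  # ['chr', 'pos', 'ref', 'alt']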
Example 3
    def test_stderr(self):
        print('Levels INFO and DEBUG not present in stderr')
        self.manager.loglevel = 10

        old_stderr = sys.stderr
        listener = io.StringIO()
        sys.stderr = listener

        log = Logger().logger
        log.info(self.not_present_string)
        log.debug(self.not_present_string)

        out = listener.getvalue()
        sys.stderr = old_stderr
        self.assertNotIn(self.not_present_string, out)
Example 4
class ManualVEPProcessor:
    """
    Class ManualVEPProcessor, to process the raw (not directly usable) VEP
    features into features that are more usable.
    """
    def __init__(self):
        self.log = Logger().logger

    def process(self, dataset: pd.DataFrame):
        """
        Callable method for the ManualVEPProcessor to start processing.
        Loads all the VEP processors dynamically from /src/main/python/vep.
        :param dataset: pandas.DataFrame: loaded pandas dataframe of the user
        provided input TSV.
        :return: pandas.DataFrame: dataframe with processed features
        """
        self.log.info('Starting manual VEP feature processing.')
        vep_annotators = self._load_vep_processors()
        dropping_columns = []
        n_feats_processed = 0
        for processor in vep_annotators:
            if processor.name in dataset.columns and processor.usable:
                self.log.debug('Processing: %s', processor.name)
                dataset = processor.process(dataset)
                if processor.drop and processor.name not in dropping_columns:
                    dropping_columns.append(processor.name)
                n_feats_processed += 1
            else:
                self.log.warning(
                    'Could not use processor %s on input dataset!',
                    processor.name)
        self.log.debug('Property drop was set True for columns: %s',
                       ', '.join(dropping_columns))
        dataset.drop(columns=dropping_columns, inplace=True)
        self.log.info('Processing successful.')
        self.log.debug('Processed %d features.', n_feats_processed)
        return dataset

    def _load_vep_processors(self):
        location = os.path.join(get_project_root_dir(), 'vep')
        self.log.debug('Loading modules at %s', location)
        loader = DynamicLoader(required_attributes=['name', 'process'],
                               path=location)
        loaded_modules = loader.load_manual_annotators()
        self.log.debug('Loaded %d modules.', len(loaded_modules))
        return loaded_modules
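
The processors returned by the DynamicLoader are duck-typed: ManualVEPProcessor only relies on a name, a usable flag, a drop flag and a process() method. A hypothetical annotator module could therefore look like the sketch below (the feature name and the transformation are illustrative, not taken from the repository):

import pandas as pd

class ExampleAnnotator:
    name = 'SomeVEPFeature'   # column in the input dataset (illustrative)
    usable = True             # whether this processor may run
    drop = True               # drop the source column once processed

    def process(self, dataset: pd.DataFrame) -> pd.DataFrame:
        # Derive a new feature from the raw VEP column (toy transformation).
        dataset[f'{self.name}_processed'] = dataset[self.name].astype(float).clip(0, 1)
        return dataset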
Example 5
class CapiceImputing:
    """
    Class to perform the imputing on a fully VEP processed pandas dataframe.
    """
    def __init__(self, impute_values: dict):
        """
        :param impute_values: dict, Dictionary containing all features to be
        imputed as keys and the fill value as value. Can come from either the
        model or a loaded json.
        """
        self.log = Logger().logger
        self.log.info('Imputer started.')
        self.impute_values = impute_values
        self.pre_dtypes = {}
        self.dtypes = {}

    def impute(self, datafile: pd.DataFrame):
        """
        Function to call the CapiceImputing to start imputing.
        :param datafile: pandas DataFrame over which imputing should be performed
        :return: pandas DataFrame
        """
        # Get the amount of NaN per column
        self._get_nan_ratio_per_column(dataset=datafile)

        self._correct_dtypes(datafile=datafile)
        datafile.fillna(self.impute_values, inplace=True)
        datafile = datafile.astype(dtype=self.pre_dtypes, copy=False)
        datafile = datafile.astype(dtype=self.dtypes, copy=False)
        self.log.info('Imputing successfully performed.')
        return datafile

    def _correct_dtypes(self, datafile: pd.DataFrame):
        """
        Function to correct the dtypes that originate from the lookup annotator
        according to the dtypes specified within the data json.
        """
        # First, correct the Chromosome column, then the rest.
        datafile[Column.chr.value] = datafile[Column.chr.value].astype(str)
        for key, item in self.impute_values.items():
            if key in datafile.columns:
                # Required, see pydoc of _save_dtypes()
                self._save_dtypes(key=key, item=item)

    def _save_dtypes(self, key, item):
        """
        Pre-dtypes are required because an object column cannot be cast
        straight to integer; it first has to pass through float.
        """
        if isinstance(item, int):
            self.pre_dtypes[key] = float
        else:
            self.pre_dtypes[key] = type(item)
        self.dtypes[key] = type(item)

    def _get_nan_ratio_per_column(self, dataset: pd.DataFrame):
        """
        Generic function to get the percentage of gaps per column
        :param dataset: not imputed pandas DataFrame
        """
        for column in dataset.columns:
            series = dataset[column]
            self._calculate_percentage_nan(column=series)

    @staticmethod
    def _calculate_percentage(value, total):
        return round((value / total) * 100, ndigits=2)

    def _calculate_percentage_nan(self, column):
        n_nan = column.isnull().sum()
        if n_nan > 0:
            n_samples = column.size
            p_nan = self._calculate_percentage(n_nan, n_samples)
            self.log.debug('NaN detected in column %s, percentage: %s%%.',
                           column.name, p_nan)
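
The reason for the pre-dtype pass becomes clear with a toy example (values assumed, not taken from the CAPICE data json): an object column holding '5.0'-style strings cannot be cast straight to int, but it can after an intermediate cast to float.

import pandas as pd

df = pd.DataFrame({'feature': ['5.0', None, '3.0']})
impute_values = {'feature': 4}            # int fill value -> pre_dtype float, dtype int

df.fillna(impute_values, inplace=True)    # ['5.0', 4, '3.0'], still dtype object
df = df.astype({'feature': float})        # pre-dtype pass: object -> float
df = df.astype({'feature': int})          # final pass: float -> int
print(df['feature'].tolist())             # [5, 4, 3]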
Example 6
class CapiceExplain(Main):
    def __init__(self, model, output_path, output_given):
        super().__init__(input_path=None, output_path=output_path, output_given=output_given)
        self.model = model
        self.output = output_path
        self.log = Logger().logger

    def run(self):
        gain_importances = self._extract_features_importances_gain(self.model)
        total_gain_importances = self._extract_features_importances_total_gain(self.model)
        weight_importances = self._extract_features_importances_weight(self.model)
        cover_importances = self._extract_features_importances_cover(self.model)
        total_cover_importances = self._extract_features_importances_total_cover(self.model)
        importances = self._convert_importances_to_dataframe(gain_importances,
                                                             total_gain_importances,
                                                             weight_importances,
                                                             cover_importances,
                                                             total_cover_importances)
        self._order_importances(importances)
        self._export(importances, self.output)

    def _extract_features_importances_gain(self, model: xgb.XGBClassifier):
        self.log.info('Extracting gain from model.')
        feature_importances = model.get_booster().get_score(importance_type='gain')
        self.log.debug('Extracted %d gain features from model.', len(feature_importances.keys()))
        return feature_importances

    def _extract_features_importances_total_gain(self, model: xgb.XGBClassifier):
        self.log.info('Extracting total gain from model.')
        feature_importances = model.get_booster().get_score(importance_type='total_gain')
        self.log.debug('Extracted %d total_gain features from model.',
                       len(feature_importances.keys()))
        return feature_importances

    def _extract_features_importances_weight(self, model: xgb.XGBClassifier):
        self.log.info('Extracting weight from model.')
        feature_importances = model.get_booster().get_score(importance_type='weight')
        self.log.debug('Extracted %d weight features from model.',
                       len(feature_importances.keys()))
        return feature_importances

    def _extract_features_importances_cover(self, model: xgb.XGBClassifier):
        self.log.info('Extracting cover from model.')
        feature_importances = model.get_booster().get_score(importance_type='cover')
        self.log.debug('Extracted %d cover features from model.',
                       len(feature_importances.keys()))
        return feature_importances

    def _extract_features_importances_total_cover(self, model: xgb.XGBClassifier):
        self.log.info('Extracting total cover from model.')
        feature_importances = model.get_booster().get_score(importance_type='total_cover')
        self.log.debug('Extracted %d total_cover features from model.',
                       len(feature_importances.keys()))
        return feature_importances

    def _convert_importances_to_dataframe(self, gain: dict, total_gain: dict, weight: dict,
                                          cover: dict, total_cover: dict):
        self.log.info('Converting importances to dataframe.')
        feature_importances = pd.DataFrame(data=[gain.keys(), gain.values()],
                                           index=['feature', 'gain']).T
        feature_importances['total_gain'] = feature_importances['feature'].map(total_gain)
        feature_importances['weight'] = feature_importances['feature'].map(weight)
        feature_importances['cover'] = feature_importances['feature'].map(cover)
        feature_importances['total_cover'] = feature_importances['feature'].map(total_cover)
        self.log.debug('Converted %d features into the dataframe', feature_importances.shape[0])
        self.log.debug('Converted all %d importance types into the dataframe',
                       feature_importances.shape[1])
        return feature_importances

    def _order_importances(self, importances: pd.DataFrame):
        self.log.info('Ordering feature importances.')
        importances.sort_values(by='gain', ascending=False, inplace=True)

    def _export(self, dataset, output):
        output_path = os.path.join(output, CapiceManager().output_filename)
        dataset.to_csv(output_path, compression='gzip', index=False, sep='\t')
        if not self.output_given:
            print(f'Successfully exported explain to: {output_path}')
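
All five extraction methods rest on the same XGBoost call; a minimal sketch with synthetic data (not CAPICE's model) shows that get_score() returns a {feature_name: value} dict per importance type:

import numpy as np
import pandas as pd
import xgboost as xgb

X = pd.DataFrame(np.random.rand(100, 3), columns=['feat_a', 'feat_b', 'feat_c'])
y = np.random.randint(0, 2, size=100)
model = xgb.XGBClassifier(n_estimators=5).fit(X, y)

booster = model.get_booster()
gain = booster.get_score(importance_type='gain')
weight = booster.get_score(importance_type='weight')
print(gain)    # e.g. {'feat_a': 1.02, 'feat_b': 0.87, 'feat_c': 0.95}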
Example 7
class PreProcessor:
    """
    Class to preprocess the data before predicting or training to separate
    categorical columns.
    """

    def __init__(self, exclude_features: list, model_features: list = None):
        """
        :param exclude_features: list,
            all the features that the preprocessor should not process.
        Features that are already excluded include:
            chr_pos_ref_alt, chr and pos.
        :param model_features: list (default None), a list containing all
        the features present within a model file.
        """
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.log.info('Preprocessor started.')
        self.train = False
        self.exclude_features = [
            Column.chr_pos_ref_alt.value,
            Column.chr.value,
            Column.pos.value
        ]
        self.exclude_features += exclude_features
        self.model_features = model_features
        self.objects = []

    def _is_train(self):
        if self.model_features is None:
            self.train = True

    def preprocess(self, dataset: pd.DataFrame):
        """
        Callable function for the preprocessor to start preprocessing.
        :param dataset: unprocessed pandas DataFrame
        :return: processed pandas DataFrame
        """
        self._is_train()
        dataset = self._create_preservation_col(dataset)
        self._get_categorical_columns(dataset)
        processed_dataset = self._process_objects(dataset)
        if not self.train:
            processed_dataset = self._ensure_columns_present(processed_dataset)
        self.log.info('Successfully preprocessed data.')
        return processed_dataset

    @staticmethod
    def _create_preservation_col(dataset):
        """
        Function to create the chr_pos_ref_alt column so that it doesn't get
        lost in preprocessing.
        :param dataset: unprocessed pandas DataFrame
        :return: unprocessed pandas DataFrame
            containing column 'chr_pos_ref_alt'
        """
        dataset[Column.chr_pos_ref_alt.value] = dataset[
            [Column.chr.value, Column.pos.value, Column.ref.value, Column.alt.value]
        ].astype(str).agg(UniqueSeparator.unique_separator.value.join, axis=1)
        return dataset

    def _get_categorical_columns(self, dataset: pd.DataFrame):
        """
        Function to get the categorical (object dtype) columns within the
        dataset, excluding the columns in self.exclude_features.
        :param dataset: pandas DataFrame
        """
        for feature in dataset.select_dtypes(include=["O"]).columns:
            if feature not in self.exclude_features:
                self.objects.append(feature)
        self.log.debug('Converting the categorical columns: %s.', ', '.join(self.objects))

    def _process_objects(self, dataset: pd.DataFrame):
        """
        If training: creates a dictionary telling the processor how many
        categories to keep for each categorical column.
        If not training: looks up each annotation feature from the impute file
        within the columns of the datafile (matching either the full name or
        columns that start with the feature name from the impute file).
        This dictionary is then passed to the actual processor.
        :param dataset: unprocessed pandas DataFrame
        :return: processed pandas DataFrame
        """
        annotation_feats_dict = {}
        if self.train:
            hardcoded_features = [Column.ref.value, Column.alt.value]
            for feature in hardcoded_features:
                annotation_feats_dict[feature] = 5
            self.log.info('Training protocol, creating new categorical conversion identifiers.')
            for feat in self.objects:
                if feat not in annotation_feats_dict.keys():
                    annotation_feats_dict[feat] = 5
        else:
            for feature in self.objects:
                annotation_feats_dict = self._process_objects_no_train(
                    feature=feature,
                    annotation_features_dict=annotation_feats_dict
                )
        processed_data = self._process_categorical_vars(
            dataset=dataset,
            annotation_feats_dict=annotation_feats_dict
        )
        return processed_data

    def _process_objects_no_train(self, feature: str, annotation_features_dict: dict):
        for model_feature in self.model_features:
            if model_feature.startswith(feature):
                extension = model_feature.split(''.join([feature, '_']))[-1]
                if feature in annotation_features_dict.keys():
                    annotation_features_dict[feature].append(extension)
                else:
                    annotation_features_dict[feature] = [extension]
        return annotation_features_dict

    def _process_categorical_vars(self, dataset: pd.DataFrame, annotation_feats_dict: dict):
        """
        Processor of categorical columns. One-hot encodes each categorical
        column, keeping only the allowed levels per column (the most frequent
        ones during training, or those known to the model otherwise) and
        mapping every other value to 'other'.
        :param dataset: unprocessed pandas DataFrame
        :param annotation_feats_dict:
            dictionary that is to contain the levels for each categorical
            feature
        :return: processed pandas DataFrame
        """
        if self.train:
            for annotation_feature in annotation_feats_dict.keys():
                feature_names = self._get_top_n_cats(
                    column=dataset[annotation_feature],
                    return_num=annotation_feats_dict[annotation_feature]
                )
                dataset[annotation_feature] = np.where(
                    dataset[annotation_feature].isin(feature_names),
                    dataset[annotation_feature],
                    'other'
                )
        else:
            for annotation_feature in annotation_feats_dict.keys():
                feature_names = annotation_feats_dict[annotation_feature]
                self.log.debug('For feature: %s loaded %s levels: %s',
                               annotation_feature,
                               len(feature_names),
                               feature_names
                               )
                dataset[annotation_feature] = np.where(
                    dataset[annotation_feature].isin(feature_names),
                    dataset[annotation_feature],
                    'other'
                )
        dataset = pd.get_dummies(dataset, columns=list(annotation_feats_dict.keys()))

        return dataset

    def _get_top_n_cats(self, column: pd.Series, return_num: int):
        """
        Function used during training to get the return_num most frequent
        values within a categorical column.
        Some converting is done for the logger to be able to print them.
        :param column: pandas Series
        :param return_num: integer
        :return: numpy.ndarray of the most frequent values
        """
        value_counts = column.value_counts().index[:return_num].values
        printable_value_counts = []
        for value in value_counts:
            if not isinstance(value, str):
                value = str(value)
            printable_value_counts.append(value)
        message = 'For feature: %s saved the following values: %s'
        self.log.info(message, column.name, ', '.join(printable_value_counts))
        return value_counts

    def _ensure_columns_present(self, dataset):
        """
        Function to ensure that all prediction columns are present before
        predicting. If a column is not present, it is added as a full
        column of NaN.
        """
        column_utils = ColumnUtils()
        column_utils.set_specified_columns(self.model_features)
        missing = column_utils.get_missing_diff_with(dataset.columns)
        for feature in missing:
            message = 'Detected column %s not present in columns. Adding full column of NaN'
            self.log.debug(message, feature)
            dataset[feature] = np.nan
        return dataset
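
The core of _process_categorical_vars is the "keep the allowed levels, collapse the rest into 'other', then one-hot encode" pattern. A stand-alone sketch on a toy column (the column name and the top-2 cut-off are assumptions for the demo):

import numpy as np
import pandas as pd

col = pd.Series(['A', 'A', 'B', 'B', 'C', 'D'], name='SomeCategory')
top = col.value_counts().index[:2].values               # two most frequent levels
collapsed = pd.Series(np.where(col.isin(top), col, 'other'), name=col.name)
print(pd.get_dummies(collapsed, prefix=col.name))
# Columns: SomeCategory_A, SomeCategory_B, SomeCategory_other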
Example 8
class Main(ABC):
    """
    Main class of CAPICE that contains helper methods shared by the
    different modes.
    """

    def __init__(self, input_path, output_path, output_given):
        # Assumes CapiceManager has been initialized & filled.
        self.manager = CapiceManager()
        self.log = Logger().logger

        self.log.info('Initiating selected mode.')

        # Input file.
        self.infile = input_path
        self.log.debug('Input argument -i / --input confirmed: %s', self.infile)

        # Output file.
        self.output = output_path
        self.log.debug('Output directory -o / --output confirmed: %s', self.output)
        self.output_given = output_given

        # Preprocessor global exclusion features
        # Overwrite in specific module if features are incorrect
        self.exclude_features = [Column.gene_name.value,
                                 Column.gene_id.value,
                                 Column.id_source.value,
                                 Column.feature.value,
                                 Column.feature_type.value]

    @abstractmethod
    def run(self):
        pass

    def _load_file(self, additional_required_features: list = None):
        """
        Function to load the input TSV file into main.
        :param additional_required_features: list (default None), additional
        columns that have to be present besides the minimally required ones.
        :return: pandas DataFrame
        """
        input_parser = InputParser()
        input_file = input_parser.parse(input_file_path=self.infile)
        post_load_processor = LoadFilePostProcessor(dataset=input_file)
        input_file = post_load_processor.process()
        validator = PostFileParseValidator()
        # Individual calls to the validator for error readability
        validator.validate_variants_present(input_file)
        validator.validate_chrom_pos(input_file)
        validator.validate_n_columns(input_file)
        validator.validate_minimally_required_columns(
            input_file,
            additional_required_features=additional_required_features
        )
        return input_file

    @staticmethod
    def process(loaded_data):
        """
        Function to process the VEP features to CAPICE features.
        """
        processor = ManualVEPProcessor()
        processed_data = processor.process(dataset=loaded_data)
        return processed_data

    @staticmethod
    def impute(loaded_data, impute_values):
        """
        Function to perform imputing over the loaded data.
        The calling mode's model can be None, but an impute json has to be
        provided in that case.
        """
        capice_imputer = CapiceImputing(impute_values=impute_values)
        capice_data = capice_imputer.impute(loaded_data)
        return capice_data

    def preprocess(self, loaded_data, model_features=None):
        """
        Function to perform the preprocessing of the loaded data to convert
        categorical columns.
        :param loaded_data: Pandas dataframe of the imputed CAPICE data
        :param model_features: list (default None), a list containing all
        the features present within a model file. When set to None,
        PreProcessor will activate the train protocol.

        Note: please adjust self.exclude_features to include all of the
        features that the preprocessor should NOT process.
        Features chr_pos_ref_alt, chr and pos are hardcoded and
        thus do not have to be included.
        """
        preprocessor = PreProcessor(
            exclude_features=self.exclude_features,
            model_features=model_features)
        capice_data = preprocessor.preprocess(loaded_data)
        return capice_data

    def _export(self, dataset, output):
        """
        Function to export the prediction dataset to the output location.
        """
        CapiceExporter(file_path=output, output_given=self.output_given).export_capice_prediction(
            datafile=dataset)
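
A hypothetical subclass sketch of how a concrete mode could chain these helpers; the class name and the way the model exposes its impute values and feature names are assumptions for illustration, not taken from the repository:

class ExamplePredictMode(Main):
    def __init__(self, input_path, output_path, output_given, model):
        super().__init__(input_path, output_path, output_given)
        self.model = model

    def run(self):
        data = self._load_file()
        data = self.process(loaded_data=data)
        data = self.impute(loaded_data=data,
                           impute_values=self.model.impute_values)  # assumed attribute
        data = self.preprocess(loaded_data=data,
                               model_features=self.model.get_booster().feature_names)
        self._export(data, self.output)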