class FastaLookupAnnotator:
    """
    Class to look up reference sequences in the Fasta file that is
    configured as reference genome in the CapiceManager.
    """

    def __init__(self):
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.fasta_loc = self.manager.reference_genome
        self.fasta = None
        self._load_fasta()

    def _load_fasta(self):
        """
        Function to open the Fasta file at self.fasta_loc with pysam and store
        the handle in self.fasta.
        """
        self.log.info('Loading in Fasta file, this may take a moment.')
        self.fasta = pysam.FastaFile(self.fasta_loc)
        self.log.info('Succesfully loaded Fasta file at: {}'.format(
            self.fasta_loc))

    def get_reference_sequence(self, chromosome: str, start: int, end: int):
        """
        Function to obtain a sequence from the reference Fasta file.

        A negative start is clamped to 0 and the returned sequence is padded
        on the left with abs(start) times 'N'.

        :param chromosome: string, chromosome to get the reference sequence
            from.
        :param start: Chromosomal position at what point the sequence should
            be obtained.
        :param end: Chromosomal position at what point the obtained sequence
            should end.
        :return: string, obtained reference sequence. None if the fetch raised
            a KeyError.
        """
        self.log.debug('Obtaining reference sequence for: '
                       '[Chromosome: {}], [start: {}], [stop: {}]'.format(
                           chromosome, start, end))
        n_padding = 0
        if start < 0:
            n_padding = abs(start)
            start = 0
        # Only the fetch itself is guarded, so an unrelated KeyError can not
        # be mistaken for a missing contig.
        try:
            sequence = self.fasta.fetch(chromosome, start, end)
        except KeyError:
            self.log.warning(
                'Unable to obtain sequence for: '
                '[Chromosome: {}], [start: {}], [stop: {}], '
                'did you supply a reference with contigs 1-22 + x,y,mt?'.
                format(chromosome, start, end))
            return None
        if n_padding:
            sequence = '{}{}'.format('N' * n_padding, sequence)
        return sequence

    def close_connection(self):
        """
        Function to tell pysam to close the connection to the Fasta file.
        Calling it again after the file has been closed does nothing.
        """
        # Compare against None explicitly instead of relying on the
        # truthiness of the file handle object.
        if self.fasta is not None:
            self.fasta.close()
            self.fasta = None
class ManualAnnotator:
    """
    Class that loads the VEP processor modules found in the resources
    directory and applies them one by one to a dataset.
    """

    def __init__(self):
        self.log = Logger().logger
        self.vep_annotators = []
        self.location = os.path.join(
            get_project_root_dir(),
            'src', 'main', 'python', 'resources', 'annotaters', 'vep'
        )
        self._load_vep_annotators()

    def _load_vep_annotators(self):
        """
        Function to locate and import the processor modules and to keep the
        ones that have a name and are marked usable. Raises FileNotFoundError
        as soon as one of the steps leaves nothing to work with.
        """
        module_files = load_modules(self.location)
        self._check_n_modules(module_files)
        imported_modules = importer(module_files, path=self.location)
        self._check_n_modules(imported_modules)
        for candidate in imported_modules:
            if 'name' not in dir(candidate):
                continue
            if candidate.usable:
                self.vep_annotators.append(candidate)
        self._check_n_modules(self.vep_annotators)

    def _check_n_modules(self, modules_list):
        """
        Function to log and raise FileNotFoundError if modules_list is empty.

        :param modules_list: list
        """
        if len(modules_list) >= 1:
            return
        error_message = 'Unable to locate VEP Processors at {}, ' \
                        'was the directory moved?'.format(self.location)
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def process(self, dataset: pd.DataFrame):
        """
        Function to run every loaded processor of which the name is a column
        of the dataset. The processed column is dropped afterwards if the
        processor asks for it. Processors that can not be applied are logged.

        :param dataset: pandas DataFrame
        :return: pandas DataFrame
        """
        for annotator in self.vep_annotators:
            can_apply = annotator.name in dataset.columns and annotator.usable
            if not can_apply:
                self.log.warning(
                    'Could not use processor {} on input dataset!'.format(
                        annotator.name
                    )
                )
                continue
            dataset = annotator.process(dataset)
            if annotator.drop:
                dataset.drop(columns=annotator.name, inplace=True)
        return dataset
class Exporter:
    """
    Class to export files and to come up with filenames that do not clash
    with files that are already present.
    """

    def __init__(self, file_path):
        """
        :param file_path: directory in which the exported files are placed
        """
        self.log = Logger().logger
        self.force = CapiceManager().force
        self.now = CapiceManager().now
        self.capice_filename = CapiceManager().output_filename
        self.file_path = file_path
        self.export_cols = [
            Column.chr_pos_ref_alt.value,
            Column.GeneName.value,
            Column.FeatureID.value,
            Column.Consequence.value,
            Column.probabilities.value,
        ]

    def export_capice_prediction(self, datafile: pd.DataFrame):
        """
        Function to export the export columns of the dataset created in the
        prediction pathway as gzipped tsv.

        :param datafile: prediction pandas DataFrame
        """
        target = self._export_filename_ready(file_name=self.capice_filename,
                                             check_extension=False)
        to_export = datafile[self.export_cols]
        to_export.to_csv(target, sep='\t', compression='gzip', index=False)
        self.log.info(
            'Successfully exported CAPICE datafile to: {}'.format(target))

    def export_capice_training_dataset(self, datafile: pd.DataFrame,
                                       name: str, feature: str):
        """
        Function to export a (split) dataset coming from the training pathway
        as gzipped tsv.

        :param datafile: pandas DataFrame
        :param name: Name of the export file
        :param feature: Name of what is exported, only used in the log message
        """
        target = self._export_filename_ready(file_name=name)
        datafile.to_csv(target, sep='\t', compression='gzip', index=False)
        self.log.info('Exported {} with shape {} to: {}'.format(
            feature, datafile.shape, target))

    def export_capice_model(self, model, model_type):
        """
        Function to pickle a newly created CAPICE model to a file named after
        the model type and the timestamp stored in self.now.

        :param model: RandomizedSearchCV or XGBClassifier instance
        :param model_type: either "XGBClassifier" or "RandomizedSearchCV"
        """
        name_per_type = {
            'XGBClassifier': 'xgb_classifier',
            'RandomizedSearchCV': 'randomized_search_cv',
        }
        # Any other model type results in an empty name prefix.
        prefix = name_per_type.get(model_type, "")
        timestamp = self.now.strftime("%H%M%S%f_%d%m%Y")
        target = self._export_filename_ready(
            file_name='{}_{}'.format(prefix, timestamp),
            type_export='model'
        )
        with open(target, 'wb') as model_dump:
            pickle.dump(model, model_dump)

    def _export_filename_ready(self, file_name, type_export='dataset',
                               check_extension=True):
        """
        Function to build the full export path. An existing file is removed
        when force is turned on, otherwise a counter is added to the filename
        until a path is found that does not exist yet.

        :param file_name: Name of the to be created file
        :param type_export: "dataset" for the export of datasets or "model"
            for the export of models. Only used if check_extension is True.
        :param check_extension: Boolean if the extension belonging to
            type_export should be appended when file_name does not end with it.
        :return: full export path
        """
        # Directory of the name as supplied, before an extension is appended.
        basedir = os.path.dirname(os.path.join(self.file_path, file_name))

        if check_extension:
            required_extension = {
                'dataset': '.tsv.gz',
                'model': '.pickle.dat'
            }[type_export]
            if not file_name.endswith(required_extension):
                file_name = file_name + required_extension
        full_path = os.path.join(self.file_path, file_name)

        if not check_file_exists(full_path):
            self.log.info(
                'No file found at {}, save to create.'.format(full_path))
            return full_path

        if self.force and check_file_exists(full_path):
            self.log.warning(
                'Found existing file at {}, '
                'removing file for overwriting.'.format(full_path))
            os.remove(full_path)
            return full_path

        self.log.info('Found existing file at {}, '
                      'not able to overwrite. '
                      'Creating new filename.'.format(full_path))
        stem, suffix = get_filename_and_extension(full_path)
        counter = 1
        while True:
            candidate = os.path.join(
                basedir,
                stem + "_{}.".format(counter) + suffix)
            if not check_file_exists(candidate):
                self.log.info('Able to create {}'.format(candidate))
                return candidate
            counter += 1
class CapiceImputing:
    """
    Class to dynamically load in all imputing files and identify the file
    suitable for the run's use case.
    """

    def __init__(self):
        self.manager = CapiceManager()
        self.vep_version = self.manager.vep_version
        self.grch_build = self.manager.grch_build
        self.log = Logger().logger
        self.log.info('Imputer started.')
        self.overrule = self.manager.overwrite_impute
        self.modules = []
        self.module = None
        self._load_modules()
        self._is_correct_datafile_present()
        self._check_if_imputer_is_applied()
        self.columns = []
        self.annotation_columns_present = []
        self.impute_values = {}
        self.pre_dtypes = {}
        self.dtypes = {}

    def _load_modules(self):
        """
        Method to dynamically load in all python files containing a class
        that contains the properties name and _json_name.
        If at the end of this function, the list of impute files is empty,
        will throw the module not found error.
        """
        self.log.info('Identifying imputing files.')
        directory = os.path.join(get_project_root_dir(),
                                 'src',
                                 'main',
                                 'python',
                                 'resources',
                                 'data_files',
                                 'imputing')
        usable_modules = load_modules(directory)
        if len(usable_modules) < 1:
            self._raise_no_module_found_error()
        loaded_modules = importer(usable_modules=usable_modules,
                                  path=directory)
        for module in loaded_modules:
            if "name" in dir(module) and "_json_name" in dir(module):
                self.modules.append(module)
        if len(self.modules) < 1:
            self._raise_no_module_found_error()
        self.log.info(
            'Identified {} files available for usage in imputing.'.format(
                len(self.modules)))

    def _raise_no_module_found_error(self):
        """
        Function to raise when no suitable impute files are found.
        Put into a function since _load_modules needs it at 2 places.
        """
        error_message = 'No usable python files are found ' \
                        'within the imputing directory!'
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _is_correct_datafile_present(self):
        """
        Function to check the VEP version and GRCh build
        (or --overwrite_impute_file) match the impute file.
        The first module that matches is stored in self.module.
        """
        for module in self.modules:
            if self.overrule and module.name == self.overrule:
                self.log.info(
                    'Overrule successful for: {} , located at: {}'.format(
                        self.overrule, inspect.getfile(module.__class__)))
                self.module = module
                break
            else:
                module_vep_version = module.supported_vep_version
                module_grch_build = module.supported_grch_build
                if module_vep_version == self.vep_version and \
                        module_grch_build == self.grch_build:
                    self.log.info('Impute data file successfully found: {} , '
                                  'located at: {}'.format(
                                      module.name,
                                      inspect.getfile(module.__class__)))
                    self.module = module
                    break

    def _check_if_imputer_is_applied(self):
        """
        Function to raise FileNotFoundError if no impute module has been
        assigned to self.module.
        """
        if self.module is None:
            if self.overrule:
                error_message = 'No imputing data file found for overrule: ' \
                                '{}'.format(self.overrule)
            else:
                error_message = 'No imputing data file found for ' \
                                'VEP version: {} and ' \
                                'GRCh build: {}'.format(self.vep_version,
                                                        self.grch_build
                                                        )
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)

    def _load_values(self, dataset: pd.DataFrame):
        """
        Function to be called right when impute() is called,
        gets the input datafile features,
        imputes values from the impute file and
        saves the datafile features to the manager.
        """
        self.columns = self.module.annotation_features
        # Rebuilt on every call, so calling impute() more than once does not
        # leave duplicate or stale column names behind.
        self.annotation_columns_present = [
            col for col in self.columns if col in dataset.columns
        ]
        self.manager.annotation_features = self.columns
        self.impute_values = self.module.impute_values

    def impute(self, datafile: pd.DataFrame):
        """
        Function to call the CapiceImputing to start imputing.

        :param datafile: not imputed pandas DataFrame
        :return: pandas DataFrame
        """
        self._load_values(datafile)
        datafile = self._check_chrom_pos(datafile)
        self._get_nan_ratio_per_column(dataset=datafile)
        self._get_full_nan_row(dataset=datafile)
        # Rows without a single annotation value are flagged in the column
        # CAPICE_drop_out by _get_full_nan_row. Filtering on that flag is what
        # removes them; drop() returns a new frame, so the in place
        # operations below do not act on a slice of the input.
        datafile = datafile[~datafile['CAPICE_drop_out']].drop(
            columns=['CAPICE_drop_out'])
        self._correct_dtypes(datafile=datafile)
        datafile.fillna(self.impute_values, inplace=True)
        datafile = datafile.astype(dtype=self.pre_dtypes, copy=False)
        datafile = datafile.astype(dtype=self.dtypes, copy=False)
        datafile = self._add_missing_columns(datafile)
        self.log.info('Imputing successfully performed.')
        return datafile

    @deprecated
    def _add_missing_columns(self, datafile: pd.DataFrame):
        """
        Function to add every impute key that is not a column of the datafile
        as a new column filled with its impute value.
        """
        for key, value in self.impute_values.items():
            if key not in datafile.columns:
                datafile[key] = value
        return datafile

    def _correct_dtypes(self, datafile: pd.DataFrame):
        """
        Function to correct the dtypes that originate from the lookup
        annotator according to the dtypes specified within the data json.
        """
        # Start clean, a dtype for a column that is not in this datafile
        # would make astype() fail.
        self.pre_dtypes = {}
        self.dtypes = {}
        for key, item in self.impute_values.items():
            if key in datafile.columns:
                self._save_dtypes(key=key, item=item)

    def _save_dtypes(self, key, item):
        """
        Function to store the type of the impute value as dtype of column key.
        Integer columns are cast to float first (pre_dtypes) before being cast
        to their final dtype.
        """
        if isinstance(item, int):
            self.pre_dtypes[key] = float
        else:
            self.pre_dtypes[key] = type(item)
        self.dtypes[key] = type(item)

    def _check_chrom_pos(self, dataset: pd.DataFrame):
        """
        Function to check if all values of the columns Chr and Pos are present.

        :param dataset: not imputed pandas DataFrame
        :return: pandas DataFrame containing no NaN or gaps for Chr and Pos
            columns.
        """
        chrom_is_float = False
        if dataset['Chr'].isnull().values.any():
            if dataset.dtypes['Chr'] == np.float64:
                chrom_is_float = True
            n_delete = dataset['Chr'].isnull().values.sum()
            self.log.warning('Detected NaN in the Chromosome column! '
                             'Deleting {} row(s).'.format(n_delete))
            # Explicit copy, the frame is modified further down.
            dataset = dataset[~dataset['Chr'].isnull()].copy()
        if dataset['Pos'].isnull().values.any():
            n_delete = dataset['Pos'].isnull().values.sum()
            self.log.warning('Detected NaN is the Position column! '
                             'Deleting {} row(s).'.format(n_delete))
            dataset = dataset[~dataset['Pos'].isnull()].copy()
        dataset.index = range(0, dataset.shape[0])
        if chrom_is_float:
            dataset['Chr'] = dataset['Chr'].astype(int)
            dataset['Chr'] = dataset['Chr'].astype(str)
        dataset['Pos'] = dataset['Pos'].astype(int)
        return dataset

    def _get_nan_ratio_per_column(self, dataset: pd.DataFrame):
        """
        Generic function to get the percentage of gaps per column

        :param dataset: not imputed pandas DataFrame
        """
        for column in dataset.columns:
            series = dataset[column]
            self._calculate_percentage_nan(column=series)

    def _calculate_percentage_nan(self, column):
        """
        Function to log the percentage of NaN of a single column, if any.

        :param column: pandas Series
        """
        n_nan = column.isnull().sum()
        if n_nan > 0:
            n_samples = column.size
            p_nan = round((n_nan / n_samples) * 100, ndigits=2)
            self.log.debug(
                'NaN detected in column {}, percentage: {}%.'.format(
                    column.name, p_nan))

    def _get_full_nan_row(self, dataset: pd.DataFrame):
        """
        Function to get the samples of which absolutely no prediction is
        possible due to all non chr pos ref alt rows being gaps.
        Re-indexes the dataset starting at 1 and adds the boolean column
        CAPICE_drop_out to it.

        :param dataset: not imputed pandas DataFrame
        """
        n_samples = dataset.shape[0]
        dataset.index = range(1, n_samples + 1)
        dataset['CAPICE_drop_out'] = dataset[
            self.annotation_columns_present].isnull().values.all(axis=1)
        samples_dropped_out = dataset[dataset['CAPICE_drop_out']]
        if samples_dropped_out.shape[0] > 0:
            # The index was just set to start at 1, the message has to agree
            # with the index that is printed.
            self.log.warning(
                'The following samples are filtered out due to missing values: '
                '(indexing is 1-based, '
                'so the index starts at 1). \n {}'.format(samples_dropped_out[[
                    'Chr', 'Pos', 'Ref', 'Alt', 'FeatureID'
                ]]))
        else:
            self.log.info(
                'No samples are filtered out due to too many NaN values.')
class TemplateSetup(metaclass=ABCMeta):
    """
    Abstract class to act as template for new models that might be
    added in future patches of CAPICE.
    Contains the necessary steps for preprocessing as well.
    """

    def __init__(self, name, usable, vep_version, grch_build):
        self.log = Logger().logger
        self.property_checker = PropertyCheckerLogger()
        self.name = name
        self.usable = usable
        self.supported_vep_version = vep_version
        self.supported_grch_build = grch_build
        self.annotation_features = CapiceManager().annotation_features
        self.train = False
        self.model = None
        self.annotation_object = []
        self.model_features = None

    @property
    def name(self):
        """
        Property getter name, to get the init defined name of the model
        module.

        :return: str
        """
        return self._name

    @name.setter
    def name(self, value='Template'):
        """
        Property setter name, to set a name for a model module.
        The value is passed to the property checker to verify that it is a
        string.

        :param value: str
        """
        self.property_checker.check_property(value=value, expected_type=str)
        self._name = value

    @property
    def usable(self):
        """
        Property getter usable, to get the boolean value of a model module
        whenever it can be used for preprocessing and prediction.

        :return: bool
        """
        return self._usable

    @usable.setter
    def usable(self, value=False):
        """
        Property setter usable, to set the boolean value of a model module
        whenever it should be used for preprocessing and prediction.
        The value is passed to the property checker to verify that it is a
        boolean.

        :param value: bool
        """
        self.property_checker.check_property(value=value, expected_type=bool)
        self._usable = value

    @property
    def supported_vep_version(self):
        """
        Property getter supported_vep_version, to get the float VEP version
        of a model/prediction file that is supported within the module.

        :return: float or None
        """
        return self._vep_version

    @supported_vep_version.setter
    def supported_vep_version(self, value):
        """
        Property setter supported_vep_version, to set the float VEP version
        of a model/prediction file that is supported within the module.
        The value is passed to the property checker to verify that it is a
        float or None.

        :param value: float or None
        """
        self.property_checker.check_property(
            value=value,
            expected_type=float,
            include_none=True
        )
        self._vep_version = value

    @property
    def supported_grch_build(self):
        """
        Property getter supported_grch_build, to get the integer grch_build
        value that defines what genome build is supported by the
        model/prediction module.

        :return: integer or None
        """
        return self._grch_build

    @supported_grch_build.setter
    def supported_grch_build(self, value):
        """
        Property setter supported_grch_build, to set the integer value
        grch_build that defines what genome build is supported by the
        model/prediction module.
        The value is passed to the property checker to verify that it is an
        integer or None.

        :param value: integer or None
        """
        self.property_checker.check_property(
            value=value,
            expected_type=int,
            include_none=True
        )
        self._grch_build = value

    def preprocess(self, dataset: pd.DataFrame, is_train: bool):
        """
        Callable function to start the preprocessing of a dataset.

        :param dataset: imputed pandas DataFrame
        :param is_train: boolean
        :return: processed pandas DataFrame
        """
        self.train = is_train
        self._load_model()
        if not self.train:
            self._load_model_features()
        dataset = self._duplicate_chr_pos_ref_alt(dataset=dataset)
        self._get_categorical_columns(dataset=dataset)
        processed_dataset = self._process_objects(dataset=dataset)
        if not self.train:
            processed_dataset = self._check_all_model_features_present(
                processed_dataset
            )
        self.log.info('Successfully preprocessed data.')
        return processed_dataset

    @deprecated
    def _check_all_model_features_present(self, dataset: pd.DataFrame):
        """
        Function to add every model feature that is not a column of the
        dataset as a column filled with 0.
        """
        for feature in self.model_features:
            if feature not in dataset.columns:
                dataset[feature] = 0
        return dataset

    def _get_categorical_columns(self, dataset: pd.DataFrame):
        """
        Function to get the categorical columns that are within the supplied
        annotation features of the imputing file.

        :param dataset: pandas DataFrame
        """
        for feature in dataset.select_dtypes(include=["O"]).columns:
            # Features that are already known are skipped, so preprocessing
            # more than once does not register a feature twice.
            if feature in self.annotation_features and \
                    feature not in self.annotation_object:
                self.annotation_object.append(feature)
        self.log.debug(
            'Converting the categorical columns: {}.'.format(
                ", ".join(self.annotation_object)
            )
        )

    @staticmethod
    def _duplicate_chr_pos_ref_alt(dataset):
        """
        Function to create the chr_pos_ref_alt column so that it doesn't get
        lost in preprocessing.

        :param dataset: unprocessed pandas DataFrame
        :return: unprocessed pandas DataFrame
            containing column 'chr_pos_ref_alt'
        """
        dataset['chr_pos_ref_alt'] = dataset[
            ['Chr', 'Pos', 'Ref', 'Alt']].astype(str).agg('_'.join, axis=1)
        return dataset

    @property
    def model_features(self):
        """
        Property getter model_features, the features stored by the setter.
        """
        return self._model_features

    @model_features.setter
    def model_features(self, value):
        """
        Property setter model_features, stores the value as is.
        """
        self._model_features = value

    def _process_objects(self, dataset: pd.DataFrame):
        """
        (If train) will create a dictionary telling the processor how many
        categories are within a certain column.
        If not train: Will look up each annotation feature from the impute
        file within the model features (the model feature has to start with
        the annotation feature followed by an underscore) to collect the
        levels per annotation feature.
        This dictionary is then passed to the actual processor.

        :param dataset: unprocessed pandas DataFrame
        :return: processed pandas DataFrame
        """
        annotation_feats_dict = {}
        if self.train:
            hardcoded_features = ['Ref', 'Alt', 'Domain']
            for feature in hardcoded_features:
                annotation_feats_dict[feature] = 5
            self.log.info(
                'Training protocol, '
                'creating new categorical conversion identifiers.'
            )
            for feat in self.annotation_object:
                if feat not in annotation_feats_dict:
                    annotation_feats_dict[feat] = 5
        else:
            for feature in self.annotation_object:
                annotation_feats_dict = self._process_objects_no_train(
                    feature=feature,
                    annotation_features_dict=annotation_feats_dict
                )
        processed_data = self._process_categorical_vars(
            dataset=dataset,
            annotation_feats_dict=annotation_feats_dict
        )
        return processed_data

    def _process_objects_no_train(self, feature: str,
                                  annotation_features_dict: dict):
        """
        Function to collect the levels of a categorical feature from the
        model features. A model feature 'feature_level' adds 'level' to the
        list stored under 'feature'.

        :param feature: name of the categorical feature
        :param annotation_features_dict: dictionary to add the levels to
        :return: the supplied dictionary
        """
        # The underscore is part of the match: 'Ref' must not pick up a model
        # feature like 'RefSeq', and slicing off the prefix keeps levels that
        # happen to contain the prefix themselves intact.
        prefix = ''.join([feature, '_'])
        for model_feature in self.model_features:
            if model_feature.startswith(prefix):
                extension = model_feature[len(prefix):]
                annotation_features_dict.setdefault(feature, []).append(
                    extension)
        return annotation_features_dict

    def _load_model_features(self):
        """
        Function to access the protected member of the XGBoost _Booster class
        to get the features that the model is trained on and store them in
        self.model_features.
        """
        self.log.info('Using features saved within the model.')
        self.model_features = self.model._Booster.feature_names

    def _process_categorical_vars(self, dataset: pd.DataFrame,
                                  annotation_feats_dict: dict):
        """
        Processor of categorical columns. Every value of a categorical column
        that is not one of the levels of that column is replaced by 'other',
        after which the columns are converted to dummy columns.

        :param dataset: unprocessed pandas DataFrame
        :param annotation_feats_dict:
            dictionary containing per categorical feature either the number
            of levels to keep (train) or the list of levels (not train)
        :return: processed pandas DataFrame
        """
        for annotation_feature, levels in annotation_feats_dict.items():
            if self.train:
                feature_names = self._get_top10_or_less_cats(
                    column=dataset[annotation_feature],
                    return_num=levels
                )
            else:
                feature_names = levels
                self.log.debug('For feature: {} loaded {} levels: {}'.format(
                    annotation_feature,
                    len(feature_names),
                    feature_names
                ))
            dataset[annotation_feature] = np.where(
                dataset[annotation_feature].isin(feature_names),
                dataset[annotation_feature],
                'other'
            )
        dataset = pd.get_dummies(
            dataset, columns=list(annotation_feats_dict.keys())
        )

        # Checking if all annotation features are processed.
        # If not, add a column containing all "false" (0)
        for annotation_feature in annotation_feats_dict:
            dataset = self._check_all_annotation_features_processed(
                current_annotation_feature=annotation_feature,
                dataset=dataset,
                annotation_features_dict=annotation_feats_dict
            )
        return dataset

    def _check_all_annotation_features_processed(self,
                                                 current_annotation_feature,
                                                 dataset: pd.DataFrame,
                                                 annotation_features_dict):
        """
        Function to add, when not training, a column filled with 0 for every
        level of the annotation feature that did not get a dummy column.

        :return: pandas DataFrame
        """
        if not self.train:
            afd = annotation_features_dict
            for processed_feature in afd[current_annotation_feature]:
                col_be_present = "_".join(
                    [current_annotation_feature, processed_feature])
                if col_be_present not in dataset.columns:
                    self.log.warning(
                        'Of annotation feature {},'
                        ' detected {} not present in columns.'.format(
                            current_annotation_feature,
                            processed_feature))
                    dataset[col_be_present] = 0
        return dataset

    def _get_top10_or_less_cats(self, column: pd.Series, return_num: int):
        """
        Function for when a training file is preprocessed to get the top
        return_num quantity values within a categorical column.
        Some converting is done for the logger to be able to print them.

        :param column: pandas Series
        :param return_num: integer
        :return: numpy array of the most frequent values
        """
        value_counts = column.value_counts().index[:return_num].values
        printable_value_counts = [str(value) for value in value_counts]
        self.log.info('For feature: {} saved the following values: {}'.format(
            column.name,
            ', '.join(printable_value_counts)
        ))
        return value_counts

    # Model stuff

    def predict(self, data: pd.DataFrame):
        """
        Function to load the model and predict the CAPICE scores.
        Can be overwritten in case of legacy support.

        :return: pandas DataFrame with the added column 'probabilities'
        """
        self.log.info('Predicting for {} samples.'.format(data.shape[0]))
        self._load_model()
        self._load_model_features()
        data['probabilities'] = self._predict(
            self._create_input_matrix(dataset=data))
        self.log.info('Predicting successful.')
        return data

    def _predict(self, predict_data):
        """
        Prediction function, returns the second column of the output of
        predict_proba of the model. Can be overwritten for models that need a
        different call.

        :param predict_data: preprocessed pandas DataFrame
        :return: numpy array
        """
        return self.model.predict_proba(predict_data)[:, 1]

    def _create_input_matrix(self, dataset: pd.DataFrame):
        """
        Also a template function, which can be overwritten to be compatible
        with first generation CAPICE.

        :param dataset: pandas DataFrame
        :return: the model feature columns of the dataset
        """
        return dataset[self.model_features]

    def _load_model(self):
        """
        Template method to load in the model once supported values are
        correct. Stores the unpickled model in self.model, or None when
        training.
        """
        model = None
        if not self.train:
            # NOTE(review): pickle.load executes code stored in the file,
            # only load model files from a trusted location.
            with open(self._get_model_loc(), 'rb') as model_file:
                model = pickle.load(model_file)
            self.log.info('Successfully loaded model at: {}'.format(
                self._get_model_loc()))
        self.model = model

    @staticmethod
    @abstractmethod
    def _get_model_loc():
        """
        Template to mark the directory where the model is located.
        Use of os.path.join is required. You may use the
        get_project_root_dir() from utilities if the model is within this
        project directory.

        :return: path-like or None if no model has been created yet.
        """
        pass
class InputVersionChecker:
    """
    Class to check the given VEP config argument and file VEP version match.
    Class is self running.
    """

    def __init__(self,
                 config_vep_version: float,
                 file_vep_version: float,
                 config_grch_build: int,
                 file_grch_build: int):
        """
        Class to check the given VEP config argument and
        the header of the VEP file match.

        An argument that is not available is expected to be False.

        :param config_vep_version: float,
            config argument for the used VEP version
        :param file_vep_version: float,
            VEP version obtained from the input file
        :param config_grch_build: int,
            config argument for the used GRCh build
        :param file_grch_build: int,
            GRCh build obtained from the input file
        """
        self.config_vep_version = config_vep_version
        self.file_vep_version = file_vep_version
        self.config_grch_build = config_grch_build
        self.file_grch_build = file_grch_build
        self.manager = CapiceManager()
        self.export_vep_version = None
        self.export_grch_build = None
        self.check_match = []
        self.unable_check = []
        self.check_overrule = False
        self.log = Logger().logger
        self._check_all_present()
        if self.check_overrule:
            self._check_overrule()
        self._check_version_match()
        self._set_global_vep_version()
        self._set_global_grch_build()

    def _set_global_vep_version(self):
        """
        Function to provide the CapiceManager with the VEP version to be used
        globally later on in CAPICE.
        """
        self.manager.vep_version = self.export_vep_version
        self.log.info('VEP version set to: {}'.format(self.export_vep_version))

    def _set_global_grch_build(self):
        """
        Function to provide the CapiceManager with the Genome Build version to
        be used globally later on in CAPICE.
        """
        self.manager.grch_build = self.export_grch_build
        self.log.info('GRCh build set to: {}'.format(self.export_grch_build))

    def _check_overrule(self):
        """
        Function called when either the VEP version or GRCh build can not be
        determined. Raises InputError when neither the impute overwrite nor
        the model overwrite is set.
        """
        # NOTE(review): this only raises when BOTH overwrites are missing.
        # If both are required whenever a version is unknown, the condition
        # should use "or" - confirm the intended behaviour.
        if self.manager.overwrite_impute is False and \
                self.manager.overwrite_model is False:
            error_message = (
                'VEP version or GRCh build not specified and both overwrites '
                'are not set! '
                'Not able to find a correct impute or processing file!'
            )
            self.log.critical(error_message)
            raise InputError(error_message)

    def _check_all_present(self):
        """
        Function to check if both the VEP version and GRCh build are present
        within either the config arguments or within the file.
        """
        dict_of_all_present = {
            'VEP': [self.file_vep_version, self.config_vep_version],
            'GRCh': [self.file_grch_build, self.config_grch_build]
        }
        for type_of_check, to_check in dict_of_all_present.items():
            self._check_individual_argument(to_check=to_check,
                                            type_of_check=type_of_check)

    def _check_individual_argument(self, to_check, type_of_check):
        """
        Function belonging to _check_all_present to check if a VEP version and
        GRCh build can be set globally. If at least one of the values is
        missing, the value that is present (if any) is exported. If both are
        present, the type is registered to be checked for a match.

        :param to_check: list
        :param type_of_check: string
        """
        if False in to_check:
            if to_check.count(False) == len(to_check):
                self._turn_on_check_overrule(type_of_check=type_of_check)
            for argument in to_check:
                self._apply_export_version(argument=argument,
                                           type_of_check=type_of_check)
        else:
            self.check_match.append(type_of_check)

    def _turn_on_check_overrule(self, type_of_check):
        """
        Function to turn on the overrule check if no VEP or GRCh arguments
        are passed.

        :param type_of_check: string, only used in the warning
        """
        self.log.warning(
            'Unable to obtain {} version from file or config file!'.format(
                type_of_check))
        self.check_overrule = True

    def _apply_export_version(self, argument, type_of_check):
        """
        Function to set the global VEP version or GRCh build.

        :param argument: int or float, ignored if False
        :param type_of_check: string, 'VEP' to set the VEP version,
            anything else sets the GRCh build
        """
        if argument is not False:
            if type_of_check == 'VEP':
                self.export_vep_version = argument
            else:
                self.export_grch_build = argument

    def _check_version_match(self):
        """
        Function to check if the Config Argument and
        the file header specified versions match, for every type that has
        both values present. A mismatch results in a warning.
        """
        for check_match in self.check_match:
            if check_match == 'VEP':
                self._check_vep_match(check_match=check_match)
            elif check_match == 'GRCh':
                self._check_grch_match(check_match=check_match)

    def _check_vep_match(self, check_match):
        """
        Function to compare the config and file VEP version, warn in case of
        a mismatch and export the file VEP version.
        """
        if self.file_vep_version != self.config_vep_version:
            # CLA is the config argument, same as in _check_grch_match.
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         version_cla=self.config_vep_version,
                                         version_file=self.file_vep_version)
        else:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         match_successful=True)
        # NOTE(review): the file version is exported, also on a mismatch.
        # Confirm that the config argument is not supposed to win.
        self.export_vep_version = self.file_vep_version

    def _check_grch_match(self, check_match):
        """
        Function to compare the config and file GRCh build, warn in case of
        a mismatch and export the file GRCh build.
        """
        if self.config_grch_build != self.file_grch_build:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         version_cla=self.config_grch_build,
                                         version_file=self.file_grch_build)
        else:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         match_successful=True)
        self.export_grch_build = self.file_grch_build

    def _raise_version_mismatch(self, type_of_mismatch,
                                version_cla=None,
                                version_file=None,
                                match_successful=False):
        """
        Function to report the result of a version comparison. Does not raise
        an exception: a match is logged as info, a mismatch is both logged
        and issued through the warnings module.

        :param type_of_mismatch: string, what has been compared
        :param version_cla: version supplied as command line argument
        :param version_file: version obtained from the file
        :param match_successful: boolean
        """
        if match_successful:
            self.log.info(
                'Successfully matched CLA and file versions for {}.'.format(
                    type_of_mismatch))
        else:
            warning_message = (
                'Warning matching {} versions. '
                'CLA version supplied: {} '
                'does not match file version: {} !'
            ).format(type_of_mismatch, version_cla, version_file)
            warnings.warn(warning_message)
            self.log.warning(warning_message)
class InputHeaderParser:
    """
    Autonomous class to parse just the header of the input file to get the
    amount of comment lines that pandas should skip when reading.
    """

    def __init__(self, is_gzipped: bool, input_file_loc: str):
        """
        :param is_gzipped: boolean, whether the input file has to be opened
            with gzip
        :param input_file_loc: string, location of the input file
        """
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.log.info('Starting to parse input file header.')
        self.is_gzipped = is_gzipped
        self.input_file_loc = input_file_loc
        self.header = ''
        # Both stay False when the header does not provide them.
        self.header_build = False
        self.header_version = False
        self.header_present = False
        self.file_type = None
        self.skip_rows = 0
        self._parse_header()
        if self.header_present:
            self.log.info(
                "Input file header successfully identified: {}".format(
                    self.header.strip()))
            self._get_file_type()
        else:
            self.log.warning(
                'Unable to parse input file header, header not located. '
                'Does the header start with "##"?')

    def _parse_header(self):
        """
        Function to read the leading lines of the input file that start with
        "##". Reading stops at the first line that does not.
        """
        opener = gzip.open if self.is_gzipped else open
        # The with statement closes the file, also when parsing a line fails.
        with opener(self.input_file_loc, mode='rt') as file_handle:
            for line in file_handle:
                if not line.startswith('##'):
                    break
                self._check_vep_version(line=line)
                self._add_skip_row(line=line)

    def _add_skip_row(self, line):
        """
        Function to count a header line. The first header line is stored as
        the header of the file.
        """
        if self.skip_rows == 0:
            self.header_present = True
            self.header = line
        self.skip_rows += 1

    def _check_vep_version(self, line):
        """
        Function to parse the line if it is the VEP line of the header.
        """
        if line.startswith('##VEP="'):
            self._parse_vep_version(line)

    def _parse_vep_version(self, line):
        """
        Function to obtain the VEP version and the GRCh build from the space
        separated annotations of the VEP header line.
        """
        for annotation in line.split(' '):
            if annotation.startswith('##VEP'):
                self.header_version = float(
                    annotation.split('v')[1].split('"')[0])
                self.log.info('Header VEP version identified: {}'.format(
                    self.header_version))
            elif annotation.startswith('assembly'):
                # Everything between the 'h' of GRCh and the first dot. The
                # closing quote and line ending are removed for an assembly
                # without a patch suffix, such as assembly="GRCh38".
                build = annotation.split('h')[1].split('.')[0]
                self.header_build = int(build.strip().strip('"'))
                self.log.info('Header GRCh build identified: {}'.format(
                    self.header_build))

    def _get_file_type(self):
        """
        Function to warn when the header does not start with the line the
        VEP VCF to CAPICE tsv converter is expected to write.
        """
        if not self.header.startswith('## VEP VCF to CAPICE tsv converter'):
            warning_message = 'Unable to recognize origin of input file.'
            self.log.warning(warning_message)
            warnings.warn(warning_message)

    def get_skip_rows(self):
        """
        Function to return the integer value of how many rows
        pandas.read_csv() should skip to reach the data.

        :return: int
        """
        return self.skip_rows

    def get_vep_version(self):
        """
        Function to return the float value of the VEP version used to
        generate the input file.

        :return: float, or False if the header did not contain it
        """
        return self.header_version

    def get_grch_build(self):
        """
        Function to return the integer value of the GRCh build used to
        generate the input file.

        :return: int, or False if the header did not contain it
        """
        return self.header_build