def test_standardise_error(self):
    """ check that a KeyError is raised when a required column is absent
    """

    # strip out the 'pp_dnm' column before attempting to standardise
    without_pp_dnm = self.initial.drop("pp_dnm", axis=1)
    self.initial = without_pp_dnm

    with self.assertRaises(KeyError):
        standardise_columns(self.initial)
def test_standardise_error(self):
    ''' check that a KeyError is raised when a required column is absent
    '''

    # strip out the 'pp_dnm' column before attempting to standardise
    trimmed = self.initial.drop('pp_dnm', axis=1)
    self.initial = trimmed

    with self.assertRaises(KeyError):
        standardise_columns(self.initial)
def test_standardise_columns(self):
    """ check that standardise_columns() gives the expected table
    """

    self.initial["child_ref_F"] = [23, 35]
    observed = standardise_columns(self.initial)
    self.compare_tables(observed, self.expected)

    # once the expected table has been altered, the comparison should fail
    self.expected["pp_dnm"] = [0.0, 0.0]
    with self.assertRaises(AssertionError):
        # keep the call inside the context, as any AssertionError raised
        # while standardising also counts towards this check
        observed = standardise_columns(self.initial)
        self.compare_tables(observed, self.expected)
def test_standardise_columns(self):
    ''' check that standardise_columns() gives the expected table
    '''

    self.initial['child_ref_F'] = [23, 35]
    result = standardise_columns(self.initial)
    self.compare_tables(result, self.expected)

    # once the expected table has been altered, the comparison should fail
    self.expected['pp_dnm'] = [0.0, 0.0]
    with self.assertRaises(AssertionError):
        # keep the call inside the context, as any AssertionError raised
        # while standardising also counts towards this check
        result = standardise_columns(self.initial)
        self.compare_tables(result, self.expected)
def test_standardise_columns_with_pass(self):
    """ check standardise_columns() when the table includes a 'pass' column
    """

    # add the extra columns to the input table
    for column, values in (("pass", [True, False]), ("child_ref_F", [23, 35])):
        self.initial[column] = values

    # the same 'pass' values are expected in the output
    self.expected["pass"] = [True, False]

    observed = standardise_columns(self.initial)
    self.compare_tables(observed, self.expected)
def test_standardise_columns_with_pass(self):
    ''' check standardise_columns() when the table includes a 'pass' column
    '''

    # add the extra columns to the input table
    for column, values in (('pass', [True, False]), ('child_ref_F', [23, 35])):
        self.initial[column] = values

    # the same 'pass' values are expected in the output
    self.expected['pass'] = [True, False]

    result = standardise_columns(self.initial)
    self.compare_tables(result, self.expected)
def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False, build='grch37'):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and
            therefore all of their candidates need to be excluded. One sample
            per line; surrounding whitespace is stripped. If None, no samples
            are treated as failing.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). Called with
            the candidates table and the combined preliminary/segdup status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates missing
            these.
        annotate_only: whether to include a column indicating pass status,
            rather than excluding all candidates which fail the filtering.
        build: whether to use the 'grch37' or 'grch38' build to get missing
            symbols.

    Returns:
        pandas DataFrame of candidate de novo mutations (as returned by
        standardise_columns()), or None if de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # read within a context manager, so the file handle is closed as soon
        # as the sample IDs have been read, rather than being left open
        with open(fails_path) as handle:
            sample_fails = [x.strip() for x in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos, build)

    # a candidate passes only if it clears the preliminary filter, the segdup
    # check, and the caller-supplied filter function
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)
def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and
            therefore all of their candidates need to be excluded. One sample
            per line; surrounding whitespace is stripped. If None, no samples
            are treated as failing.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). Called with
            the candidates table and the combined preliminary/segdup status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates missing
            these.
        annotate_only: whether to include a column indicating pass status,
            rather than excluding all candidates which fail the filtering.

    Returns:
        pandas DataFrame of candidate de novo mutations (as returned by
        standardise_columns()), or None if de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # read within a context manager, so the file handle is closed as soon
        # as the sample IDs have been read, rather than being left open
        with open(fails_path) as handle:
            sample_fails = [x.strip() for x in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos)

    # a candidate passes only if it clears the preliminary filter, the segdup
    # check, and the caller-supplied filter function
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)