def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False, build='grch37'):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and
            therefore all of their candidates need to be excluded. One sample
            per line; surrounding whitespace is stripped. If None, no samples
            are treated as failed.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). It is
            called with the candidates and the combined preliminary/segdup
            pass status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates missing
            these.
        annotate_only: whether to include a column indicating pass status,
            rather than excluding all candidates which fail the filtering.
        build: whether to use the 'grch37' or 'grch38' build to get missing
            symbols.

    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # use a context manager so the file handle is closed deterministically
        # rather than being left for the garbage collector
        with open(fails_path) as handle:
            sample_fails = [x.strip() for x in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    # symbols are filled in before filter_function runs, so the filter sees
    # the updated 'symbol' column
    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos, build)

    # a candidate passes only if it clears the preliminary screen, the segdup
    # check and the caller-supplied filter
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)
def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and
            therefore all of their candidates need to be excluded. One sample
            per line; surrounding whitespace is stripped. If None, no samples
            are treated as failed.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). It is
            called with the candidates and the combined preliminary/segdup
            pass status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates missing
            these.
        annotate_only: whether to include a column indicating pass status,
            rather than excluding all candidates which fail the filtering.

    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # use a context manager so the file handle is closed deterministically
        # rather than being left for the garbage collector
        with open(fails_path) as handle:
            sample_fails = [x.strip() for x in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    # symbols are filled in before filter_function runs, so the filter sees
    # the updated 'symbol' column
    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos)

    # a candidate passes only if it clears the preliminary screen, the segdup
    # check and the caller-supplied filter
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)
def test_fix_missing_gene_symbols(self):
    ''' check that fix_missing_gene_symbols works correctly
    '''

    # expected symbols for the fixture variants held in self.variants
    # NOTE(review): the second entry looks like a generated placeholder for
    # a variant without a known symbol - confirm against the fixture.
    expected = ['ARID1B', 'fake_symbol.2_129119889']

    observed = list(fix_missing_gene_symbols(self.variants))

    self.assertEqual(observed, expected)