def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False, build='grch37'):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and
            therefore all of their candidates need to be excluded. One sample
            per line; surrounding whitespace is stripped. May be None, in
            which case no samples are treated as QC failures.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). Called with
            the candidates and the combined preliminary/segdup status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates missing
            these.
        annotate_only: whether to include a column indicating pass status,
            rather than excluding all candidates which fail the filtering.
        build: whether to use the 'grch37' or 'grch38' build to get missing
            symbols.

    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # read inside a context manager so the file handle is always closed,
        # rather than relying on garbage collection of an anonymous handle
        with open(fails_path) as handle:
            sample_fails = [line.strip() for line in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos, build)

    # the filter function only sees sites that passed both earlier checks, and
    # its result is combined with those checks to give the final pass status
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        # keep every candidate, but record whether each one passed
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)
def test_exclude_segdups(self):
    ''' check that check_segdups() gives the expected status per variant
    '''

    # positions chosen to lie around the boundaries of a segdup region; only
    # the first and last are expected to give True
    positions = [1379893, 1379895, 1379894, 1384309, 1384310, 1384311]
    count = len(positions)

    variants = DataFrame({
        'person_stable_id': ['a'] * count,
        'chrom': ['1'] * count,
        'pos': positions,
        'ref': ['A', 'G'] * 3,
        'alt': ['C', 'T'] * 3,
        'symbol': ['TEST1'] * count,
    })

    # NOTE(review): this comparison assumes check_segdups() returns something
    # that compares equal to a plain list - confirm against its return type
    expected = [True, False, False, False, False, True]
    self.assertEqual(check_segdups(variants), expected)
def test_exclude_segdups(self):
    ''' check the per-variant status from check_segdups() near a segdup
    '''

    # six variants for one person on chrom 1, placed around the boundaries of
    # a segdup region (positions are deliberately not in sorted order)
    ref_alleles = ['A', 'G', 'A', 'G', 'A', 'G']
    alt_alleles = ['C', 'T', 'C', 'T', 'C', 'T']
    columns = {
        'person_stable_id': ['a' for _ in ref_alleles],
        'chrom': ['1' for _ in ref_alleles],
        'pos': [1379893, 1379895, 1379894, 1384309, 1384310, 1384311],
        'ref': ref_alleles,
        'alt': alt_alleles,
        'symbol': ['TEST1' for _ in ref_alleles],
    }
    variants = DataFrame(columns)

    # only the outermost two positions are expected to give True
    expected = [True, False, False, False, False, True]
    observed = check_segdups(variants)
    self.assertEqual(observed, expected)
def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and
            therefore all of their candidates need to be excluded. One sample
            per line; surrounding whitespace is stripped. May be None, in
            which case no samples are treated as QC failures.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). Called with
            the candidates and the combined preliminary/segdup status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates missing
            these.
        annotate_only: whether to include a column indicating pass status,
            rather than excluding all candidates which fail the filtering.

    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # read inside a context manager so the file handle is always closed,
        # rather than relying on garbage collection of an anonymous handle
        with open(fails_path) as handle:
            sample_fails = [line.strip() for line in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos)

    # the filter function only sees sites that passed both earlier checks, and
    # its result is combined with those checks to give the final pass status
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        # keep every candidate, but record whether each one passed
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)