def screen_candidates(de_novos_path,
                      fails_path,
                      filter_function,
                      maf=0.01,
                      fix_symbols=True,
                      annotate_only=False,
                      build='grch37'):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC (one sample
            per line), and therefore all of their candidates need to be
            excluded. Can be None, in which case no samples are excluded
            on this basis.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). It is
            called with the candidates and the combined preliminary pass
            status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates
            missing these.
        annotate_only: whether to include a column indicating pass status, rather
            than excluding all candidates which fail the filtering.
        build: whether to use the 'grch37' or 'grch38' build to get
            missing symbols.

    Returns:
        pandas DataFrame of candidate de novo mutations (after passing through
        standardise_columns()), or None if de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # use a context manager so the file handle is closed as soon as the
        # sample IDs have been read, rather than whenever it gets collected
        with open(fails_path) as handle:
            sample_fails = [x.strip() for x in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos, build)

    # a candidate only passes if the preliminary status, the segdup check and
    # the supplied filter function all pass
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)
 def test_exclude_segdups(self):
     ''' check that check_segdups() gives the expected status for variants
     around the boundaries of a segdup region
     '''
     
     n_sites = 6
     
     # positions that lie around the boundaries of a segdup region
     positions = [1379893, 1379895, 1379894, 1384309, 1384310, 1384311]
     
     sites = DataFrame({'person_stable_id': ['a'] * n_sites,
         'chrom': ['1'] * n_sites,
         'pos': positions,
         'ref': ['A', 'G'] * 3,
         'alt': ['C', 'T'] * 3,
         'symbol': ['TEST1'] * n_sites,
         })
     
     # only the first and last sites are expected to be True
     passing = [True, False, False, False, False, True]
     
     self.assertEqual(check_segdups(sites), passing)
    def test_exclude_segdups(self):
        ''' check that check_segdups() gives the expected status for variants
        around the boundaries of a segdup region
        '''

        # one column per key, six variants that lie around the boundaries of
        # a segdup region
        columns = {
            'person_stable_id': ['a'] * 6,
            'chrom': ['1'] * 6,
            'pos': [1379893, 1379895, 1379894, 1384309, 1384310, 1384311],
            'ref': ['A', 'G'] * 3,
            'alt': ['C', 'T'] * 3,
            'symbol': ['TEST1'] * 6,
        }
        sites = DataFrame(columns)

        # only the first and last sites are expected to be True
        passing = [True, False, False, False, False, True]

        observed = check_segdups(sites)
        self.assertEqual(observed, passing)
def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC (one sample
            per line), and therefore all of their candidates need to be
            excluded. Can be None, in which case no samples are excluded
            on this basis.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). It is
            called with the candidates and the combined preliminary pass
            status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates
            missing these.
        annotate_only: whether to include a column indicating pass status, rather
            than excluding all candidates which fail the filtering.

    Returns:
        pandas DataFrame of candidate de novo mutations (after passing through
        standardise_columns()), or None if de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # use a context manager so the file handle is closed as soon as the
        # sample IDs have been read, rather than whenever it gets collected
        with open(fails_path) as handle:
            sample_fails = [x.strip() for x in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos)

    # a candidate only passes if the preliminary status, the segdup check and
    # the supplied filter function all pass
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)