Ejemplo n.º 1
0
def screen_candidates(de_novos_path,
                      fails_path,
                      filter_function,
                      maf=0.01,
                      fix_symbols=True,
                      annotate_only=False,
                      build='grch37'):
    """ load and optionally filter candidate de novo mutations.
    
    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and therefore
            all of their candidates need to be excluded. One sample per line;
            surrounding whitespace is stripped. If None, no samples are
            treated as failed.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). It is called
            with the candidates table and the combined preliminary/segdup
            pass mask.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates
            missing these (overwrites the 'symbol' column).
        annotate_only: whether to include a column indicating pass status, rather
            than excluding all candidates which fail the filtering.
        build: whether to use the 'grch37' or 'grch38' build to get
            missing symbols.
    
    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # use a context manager so the file handle is closed promptly, rather
        # than being left for the garbage collector
        with open(fails_path) as handle:
            sample_fails = [line.strip() for line in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos, build)

    # the filter function gets the sites passing so far, and a site only
    # passes overall if it clears all three checks
    initial_pass = status & segdup
    pass_status = filter_function(de_novos, initial_pass) & status & segdup

    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)
Ejemplo n.º 2
0
def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False):
    """ load and optionally filter candidate de novo mutations.
    
    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and therefore
            all of their candidates need to be excluded. One sample per line;
            surrounding whitespace is stripped. If None, no samples are
            treated as failed.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). It is called
            with the candidates table and the combined preliminary/segdup
            pass mask.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates
            missing these (overwrites the 'symbol' column).
        annotate_only: whether to include a column indicating pass status, rather
            than excluding all candidates which fail the filtering.
    
    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """
    
    if de_novos_path is None:
        return None
    
    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # use a context manager so the file handle is closed promptly, rather
        # than being left for the garbage collector
        with open(fails_path) as handle:
            sample_fails = [ line.strip() for line in handle ]
    
    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)
    
    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos)
    
    # the filter function gets the sites passing so far, and a site only
    # passes overall if it clears all three checks
    initial_pass = status & segdup
    pass_status = filter_function(de_novos, initial_pass) & status & segdup
    
    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]
    
    return standardise_columns(de_novos)
 def test_fix_missing_gene_symbols(self):
     ''' check that fix_missing_gene_symbols works correctly
     '''
     
     # expected symbols for the variants in self.variants; the second entry
     # is presumably a placeholder generated for a variant lacking a gene
     # symbol - confirm against fix_missing_gene_symbols()
     expected = ['ARID1B', 'fake_symbol.2_129119889']
     
     observed = fix_missing_gene_symbols(self.variants)
     self.assertEqual(list(observed), expected)