def test_standardise_error(self):
        """ check that standardising raises a KeyError when a required
        column ('pp_dnm') has been removed from the input table
        """

        # strip out the 'pp_dnm' column before attempting to standardise
        without_dnm = self.initial.drop("pp_dnm", axis=1)
        self.initial = without_dnm

        with self.assertRaises(KeyError):
            standardise_columns(without_dnm)
Exemple #2
0
    def test_standardise_error(self):
        ''' check that a table lacking the 'pp_dnm' column cannot be
        standardised, and that the failure surfaces as a KeyError
        '''

        # drop the column first, then keep the trimmed table on the fixture
        trimmed = self.initial.drop('pp_dnm', axis=1)
        self.initial = trimmed

        with self.assertRaises(KeyError):
            standardise_columns(trimmed)
    def test_standardise_columns(self):
        """ check that standardise_columns() gives the expected table, and
        that compare_tables() flags a table which differs from it
        """

        self.initial["child_ref_F"] = [23, 35]
        standardised = standardise_columns(self.initial)
        self.compare_tables(standardised, self.expected)

        # once the expected 'pp_dnm' values are altered, the comparison
        # has to fail with an AssertionError
        self.expected["pp_dnm"] = [0.0, 0.0]
        with self.assertRaises(AssertionError):
            mismatched = standardise_columns(self.initial)
            self.compare_tables(mismatched, self.expected)
Exemple #4
0
    def test_standardise_columns(self):
        ''' check the standardised table matches the expected one, then
        check that a changed expectation is reported as a mismatch
        '''

        self.initial['child_ref_F'] = [23, 35]
        observed = standardise_columns(self.initial)
        self.compare_tables(observed, self.expected)

        # overwrite the expected 'pp_dnm' values, so the tables now differ
        # and the comparison must raise
        self.expected['pp_dnm'] = [0.0, 0.0]
        with self.assertRaises(AssertionError):
            observed = standardise_columns(self.initial)
            self.compare_tables(observed, self.expected)
    def test_standardise_columns_with_pass(self):
        """ check that a 'pass' column in the input table is still present
        in the standardised output
        """

        # add the 'pass' column (plus 'child_ref_F') to the input table
        self.initial["pass"] = [True, False]
        self.initial["child_ref_F"] = [23, 35]

        # the expected table gains the same 'pass' values
        self.expected["pass"] = [True, False]

        standardised = standardise_columns(self.initial)
        self.compare_tables(standardised, self.expected)
Exemple #6
0
    def test_standardise_columns_with_pass(self):
        ''' check that standardising keeps a 'pass' column when the input
        table has one
        '''

        # input table: add 'pass' first, then 'child_ref_F'
        self.initial['pass'] = [True, False]
        self.initial['child_ref_F'] = [23, 35]

        # expected table: same 'pass' values as the input
        self.expected['pass'] = [True, False]

        observed = standardise_columns(self.initial)
        self.compare_tables(observed, self.expected)
def screen_candidates(de_novos_path,
                      fails_path,
                      filter_function,
                      maf=0.01,
                      fix_symbols=True,
                      annotate_only=False,
                      build='grch37'):
    """ load and optionally filter candidate de novo mutations.
    
    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this
            is None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and therefore
            all of their candidates need to be excluded. One sample per line.
            If None, no samples are treated as failing.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). Called
            with the candidates and the combined preliminary/segdup status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates
            missing these.
        annotate_only: whether to include a column indicating pass status, rather
            than excluding all candidates which fail the filtering.
        build: whether to use the 'grch37' or 'grch38' build to get
            missing symbols.
    
    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # read via a context manager, so the file handle is closed as soon as
        # the sample IDs have been read, rather than being left open
        with open(fails_path) as handle:
            sample_fails = [line.strip() for line in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos, build)

    # a candidate only passes if it clears the preliminary screen, the segdup
    # check and the caller-supplied filter
    screened = status & segdup
    pass_status = filter_function(de_novos, screened) & status & segdup

    if annotate_only:
        # keep every candidate, but record whether it passed
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)
def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False):
    """ load and optionally filter candidate de novo mutations.
    
    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this
            is None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and therefore
            all of their candidates need to be excluded. One sample per line.
            If None, no samples are treated as failing.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). Called
            with the candidates and the combined preliminary/segdup status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates
            missing these.
        annotate_only: whether to include a column indicating pass status, rather
            than excluding all candidates which fail the filtering.
    
    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """
    
    if de_novos_path is None:
        return None
    
    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # read via a context manager, so the file handle is closed as soon as
        # the sample IDs have been read, rather than being left open
        with open(fails_path) as handle:
            sample_fails = [line.strip() for line in handle]
    
    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)
    
    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos)
    
    # a candidate only passes if it clears the preliminary screen, the segdup
    # check and the caller-supplied filter
    screened = status & segdup
    pass_status = filter_function(de_novos, screened) & status & segdup
    
    if annotate_only:
        # keep every candidate, but record whether it passed
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]
    
    return standardise_columns(de_novos)