def test_get_most_severe(self):
    ''' check that get_most_severe picks the expected consequence
    '''

    # a set of consequences where 'missense_variant' is the expected pick
    consequences = ['missense_variant', 'protein_altering_variant',
        'splice_region_variant', 'incomplete_terminal_codon_variant']
    most_severe = get_most_severe(consequences)
    self.assertEqual(most_severe, 'missense_variant')

    # a second set, where 'stop_lost' is the expected pick
    consequences = ['stop_lost', 'start_lost', 'transcript_amplification',
        'conserved_exon_terminus_variant']
    most_severe = get_most_severe(consequences)
    self.assertEqual(most_severe, 'stop_lost')

    # with no consequences to choose from, an IndexError is expected
    with self.assertRaises(IndexError):
        get_most_severe([])
def test_get_most_severe(self):
    ''' check that get_most_severe returns the expected consequence
    '''

    # each entry pairs a list of consequences with the single consequence
    # that get_most_severe is expected to return for that list
    expectations = [
        (
            [
                'missense_variant',
                'protein_altering_variant',
                'splice_region_variant',
                'incomplete_terminal_codon_variant',
            ],
            'missense_variant',
        ),
        (
            [
                'stop_lost',
                'start_lost',
                'transcript_amplification',
                'conserved_exon_terminus_variant',
            ],
            'stop_lost',
        ),
    ]

    for consequences, expected in expectations:
        self.assertEqual(get_most_severe(consequences), expected)

    # passing an empty list is expected to raise an IndexError
    with self.assertRaises(IndexError):
        get_most_severe([])
def person_recurrence(de_novos):
    """ identify de novos recurrent in a gene within individuals.

    Find the de novos that are recurrent within a single individual in a
    single gene. We shall treat these as a single de novo event. Prioritise
    including the most severe event within a gene, then take the first variant
    left after that.

    Args:
        de_novos: dataframe of de novo variants. This function reads the
            "person_stable_id", "symbol" and "consequence" columns.

    Returns:
        pandas Series (boolean, indexed like de_novos) for whether each
        candidate is a duplicate or not. Within each recurrent person/gene
        group, exactly one variant (the first with the most severe
        consequence) is flagged False; the remainder are flagged True.
    """

    group_keys = ["person_stable_id", "symbol"]

    # find the variants which are recurrent within a person in a single gene.
    # keep=False flags every member of a recurrent group, not just the
    # repeats after the first (or before the last).
    person_dups = de_novos.duplicated(group_keys, keep=False)
    in_person_dups = de_novos[person_dups]

    # process gene by gene, noting the index label of the one variant per
    # person and gene that we keep: the first with the most severe consequence
    kept = []
    for _, gene in in_person_dups.groupby(group_keys):
        consequence = get_most_severe(gene["consequence"])
        kept.append(gene[gene["consequence"] == consequence].index[0])

    # every recurrent variant is already flagged True, so we only need to
    # clear the flag for the kept variants. This avoids accumulating a Series
    # with Series.append, which was removed in pandas 2.0.
    if kept:
        person_dups.loc[kept] = False

    return person_dups
def person_recurrence(de_novos):
    """ identify de novos recurrent in a gene within individuals.

    Find the de novos that are recurrent within a single individual in a
    single gene. We shall treat these as a single de novo event. Prioritise
    including the most severe event within a gene, then take the first variant
    left after that.

    Args:
        de_novos: dataframe of de novo variants. This function reads the
            "person_stable_id", "symbol" and "consequence" columns.

    Returns:
        pandas Series (boolean, indexed like de_novos) for whether each
        candidate is a duplicate or not. Within each recurrent person/gene
        group, exactly one variant (the first with the most severe
        consequence) is flagged False; the remainder are flagged True.
    """

    columns = ["person_stable_id", "symbol"]

    # find the variants which are recurrent within a person in a single gene.
    # The old take_last=True keyword is no longer accepted by pandas;
    # keep=False marks every member of a recurrent group in a single pass,
    # which matches OR-ing the "from start" and "from end" passes.
    person_dups = de_novos.duplicated(columns, keep=False)
    in_person_dups = de_novos[person_dups]

    # split the dataset, so we can process gene by gene
    genes = in_person_dups.groupby(columns)

    # pick a variant for each person, the first of the most severe consequence
    selected = []
    for _, gene in genes:
        consequence = get_most_severe(gene["consequence"])
        matches = gene[gene["consequence"] == consequence]
        selected.append(matches.index[0])

    # all recurrent variants start out flagged as duplicates, so un-flag the
    # selected ones. Setting these directly replaces the Series.append
    # accumulation, which pandas 2.0 no longer provides.
    if selected:
        person_dups.loc[selected] = False

    return person_dups