def test_get_de_novo_counts_loss_of_function(self):
    """ check the de novo counts for loss-of-function consequences

    Each loss-of-function SNV consequence is expected to give a count of 1
    in the "lof_snv" column, and a frameshift is expected to give a count of
    1 in the "lof_indel" column instead.
    """

    # SNV consequences that should all be tallied as loss-of-function
    snv_consequences = (
        "stop_gained",
        "splice_acceptor_variant",
        "splice_donor_variant",
        "initiator_codon_variant",
        "start_lost",
        "conserved_exon_terminus_variant",
    )

    self.expected["lof_snv"] = 1
    for consequence in snv_consequences:
        self.de_novos["consequence"] = consequence
        self.compare_tables(get_de_novo_counts(self.de_novos), self.expected)

    # frameshifts are the only loss-of-function indel consequence checked
    # here; the count should move from the SNV column to the indel column.
    # NOTE(review): the two-base ref allele presumably makes the variant be
    # classified as an indel - confirm against get_de_novo_counts.
    self.expected["lof_snv"] = 0
    self.expected["lof_indel"] = 1
    self.de_novos["ref"] = "GG"
    for consequence in ("frameshift_variant",):
        self.de_novos["consequence"] = consequence
        self.compare_tables(get_de_novo_counts(self.de_novos), self.expected)
def test_get_de_novo_counts_loss_of_function(self):
    """ check the de novo counts for loss-of-function consequences

    The SNV consequences are checked first (expecting "lof_snv" == 1), then
    the frameshift consequence (expecting "lof_indel" == 1).
    """

    def check_each(consequences):
        # count the fixture variant once per consequence and compare the
        # result with whatever self.expected currently holds
        for consequence in consequences:
            self.de_novos["consequence"] = consequence
            counts = get_de_novo_counts(self.de_novos)
            self.compare_tables(counts, self.expected)

    # loss-of-function-equivalent SNV VEP consequences
    self.expected["lof_snv"] = 1
    check_each([
        "stop_gained",
        "splice_acceptor_variant",
        "splice_donor_variant",
        "initiator_codon_variant",
        "start_lost",
        "conserved_exon_terminus_variant",
    ])

    # loss-of-function-equivalent indel VEP consequences (frameshifts).
    # NOTE(review): ref is switched to "GG", presumably so the variant is
    # treated as an indel rather than an SNV - confirm.
    self.expected["lof_snv"] = 0
    self.expected["lof_indel"] = 1
    self.de_novos["ref"] = "GG"
    check_each(["frameshift_variant"])
def test_get_de_novo_counts_missense(self):
    """ check the de novo counts for missense-equivalent consequences

    Each missense-equivalent SNV consequence is expected to give a count of
    1 in the "missense_snv" column, and each inframe indel consequence a
    count of 1 in the "missense_indel" column instead.
    """

    # SNV consequences that should all be tallied as missense
    snv_consequences = (
        "missense_variant",
        "stop_lost",
        "coding_sequence_variant",
        "protein_altering_variant",
    )

    self.expected["missense_snv"] = 1
    for consequence in snv_consequences:
        self.de_novos["consequence"] = consequence
        self.compare_tables(get_de_novo_counts(self.de_novos), self.expected)

    # inframe insertions and deletions should be tallied in the indel
    # column rather than the SNV column.
    # NOTE(review): the two-base ref allele presumably makes the variant be
    # classified as an indel - confirm against get_de_novo_counts.
    self.expected["missense_snv"] = 0
    self.expected["missense_indel"] = 1
    self.de_novos["ref"] = "GG"
    for consequence in ("inframe_deletion", "inframe_insertion"):
        self.de_novos["consequence"] = consequence
        self.compare_tables(get_de_novo_counts(self.de_novos), self.expected)
def test_get_de_novo_counts_multiple(self):
    """ check the de novo counts for multiple variants

    Three scenarios are checked in turn, each building on the previous
    fixture state:
      1. two identical missense variants in one gene give a count of 2
      2. one missense and one stop_gained variant in one gene give a count
         of 1 in each of "missense_snv" and "lof_snv"
      3. the same two variants in different genes give one row per gene
    """
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # pandas.concat gives the same result and also exists on old releases.
    from pandas import concat

    # add another variant to the dataframe
    new_row = self.de_novos.copy()
    self.de_novos = concat([self.de_novos, new_row], ignore_index=True)

    # counting two of the same variant give a count of 2
    self.expected["missense_snv"] = 2
    self.de_novos["consequence"] = "missense_variant"
    computed = get_de_novo_counts(self.de_novos)
    self.compare_tables(computed, self.expected)

    # counting two different variant types gives two counts of 1
    self.expected["missense_snv"] = 1
    self.expected["lof_snv"] = 1
    self.de_novos.loc[0, "consequence"] = "stop_gained"
    computed = get_de_novo_counts(self.de_novos)
    self.compare_tables(computed, self.expected)

    # now check when we have two variants in different genes: the expected
    # table gets a second row, and row 0 is moved to the ARID1B gene
    new_row = self.expected.copy()
    self.expected = concat([self.expected, new_row], ignore_index=True)
    self.de_novos.loc[0, "hgnc"] = "ARID1B"
    self.expected.loc[0, "hgnc"] = "ARID1B"
    # row 0 (ARID1B) keeps only the stop_gained variant, and row 1 keeps
    # only the missense variant
    self.expected.loc[0, "missense_snv"] = 0
    self.expected.loc[1, "lof_snv"] = 0
    computed = get_de_novo_counts(self.de_novos)
    self.compare_tables(computed, self.expected)
def test_get_de_novo_counts_missense(self):
    """ check the de novo counts for missense-equivalent consequences

    The SNV consequences are checked first (expecting "missense_snv" == 1),
    then the inframe indel consequences (expecting "missense_indel" == 1).
    """

    def check_each(consequences):
        # count the fixture variant once per consequence and compare the
        # result with whatever self.expected currently holds
        for consequence in consequences:
            self.de_novos["consequence"] = consequence
            counts = get_de_novo_counts(self.de_novos)
            self.compare_tables(counts, self.expected)

    # missense-equivalent SNV VEP consequences
    self.expected["missense_snv"] = 1
    check_each([
        "missense_variant",
        "stop_lost",
        "coding_sequence_variant",
        "protein_altering_variant",
    ])

    # missense-equivalent inframe insertion and deletion VEP consequences.
    # NOTE(review): ref is switched to "GG", presumably so the variant is
    # treated as an indel rather than an SNV - confirm.
    self.expected["missense_snv"] = 0
    self.expected["missense_indel"] = 1
    self.de_novos["ref"] = "GG"
    check_each(["inframe_deletion", "inframe_insertion"])
def check_enrichment(constraint, de_novos, cache_dir, male, female, threshold, ratio):
    """ compare de novo enrichment between constrained and unconstrained regions

    Args:
        constraint: constraint data, passed through to
            get_rates_by_constraint and classify_de_novos_by_constraint
        de_novos: table of observed de novos; indexed with a boolean mask,
            so presumably a pandas DataFrame (confirm against callers)
        cache_dir: passed through to the rate and classification helpers
        male: passed to get_expected_mutations (presumably the number of
            male probands - confirm)
        female: passed to get_expected_mutations (presumably the number of
            female probands - confirm)
        threshold: passed through to the rate and classification helpers
        ratio: passed through to the rate and classification helpers

    Returns:
        dict with the keys 'constrained_enrich', 'unconstrained_enrich',
        'PTV diff' and 'PAV diff'.
    """
    # expected mutation counts for each class of region
    region_rates = get_rates_by_constraint(constraint, cache_dir, threshold, ratio)
    constrained_rates = prepare_rates(region_rates['constrained'])
    expected_in = get_expected_mutations(constrained_rates, male, female)
    unconstrained_rates = prepare_rates(region_rates['unconstrained'])
    expected_out = get_expected_mutations(unconstrained_rates, male, female)

    # observed counts, split by whether each de novo is in a constrained region
    is_constrained = classify_de_novos_by_constraint(
        constraint, de_novos, cache_dir, threshold, ratio)
    observed_in = get_de_novo_counts(de_novos[is_constrained])
    not_constrained = [not flag for flag in is_constrained]
    observed_out = get_de_novo_counts(de_novos[not_constrained])

    enriched_in = enrichment(observed_in, expected_in)
    enriched_out = enrichment(observed_out, expected_out)

    results = {
        'constrained_enrich': enriched_in,
        'unconstrained_enrich': enriched_out,
    }
    # compare the two region classes for each consequence category
    results['PTV diff'] = compare_regions(enriched_in, enriched_out, 'PTV')
    results['PAV diff'] = compare_regions(enriched_in, enriched_out, 'PAV')
    return results
def analyse_enrichment(de_novos, trios, rates=None, plot_path=None):
    """ analyse whether de novo mutations are enriched in genes

    Args:
        de_novos: data frame containing all the observed de novos for all the
            genes
        trios: dictionary of male and female proband counts in the population
            (read via the "male" and "female" keys)
        rates: gene-based mutation rates data frame, or None
        plot_path: path to save enrichment plots to, or None to skip plotting

    Returns:
        data frame containing results from testing for enrichment of de novos
        in each gene with de novos in it, without the "start_pos" column.
    """

    counts = get_de_novo_counts(de_novos)
    expected = get_expected_mutations(rates, trios["male"], trios["female"])

    # calculate p values for each gene using the mutation rates
    results = gene_enrichment(expected, counts)

    # make a manhattan plot of enrichment P values, when a path is given
    if plot_path is not None:
        plot_enrichment(results, 18500, plot_path, p_columns=["p_lof", "p_func"])

    # the position column is only needed to locate each gene on a chromosome
    # for the Manhattan plot, so drop it before returning
    del results["start_pos"]

    return results
def test_get_de_novo_counts_synonymous(self):
    """ check the de novo counts for synonymous consequences

    A synonymous variant is expected to give a count of 1 in the
    "synonymous_snv" column.
    """

    self.de_novos["consequence"] = 'synonymous_variant'
    self.expected["synonymous_snv"] = 1

    self.compare_tables(get_de_novo_counts(self.de_novos), self.expected)
def test_get_de_novo_counts_type_included(self):
    """ check the de novo counts when a type column is included

    The fixture variant gets an explicit "type" of "snv" before counting,
    and a count of 1 is expected in the "missense_snv" column.
    NOTE(review): this relies on the fixture's default consequence being
    missense-equivalent - confirm against setUp.
    """

    self.expected["missense_snv"] = 1

    # set the variant type before counting
    self.de_novos["type"] = "snv"

    self.compare_tables(get_de_novo_counts(self.de_novos), self.expected)
def test_get_de_novo_counts_nonfunctional(self):
    """ check the de novo counts for nonfunctional consequence types

    Two scenarios are checked for every nonfunctional consequence:
      1. a table holding only the nonfunctional variant: a KeyError is
         expected on pandas older than 0.18, and an empty table with the
         standard columns on newer pandas
      2. a table holding the nonfunctional variant plus one missense
         variant: only the missense variant is expected to be counted
    """

    snv = ["transcript_ablation", "transcript_amplification",
        "incomplete_terminal_codon_variant", "stop_retained_variant",
        "mature_miRNA_variant",
        "5_prime_UTR_variant", "3_prime_UTR_variant",
        "non_coding_transcript_exon_variant", "intron_variant",
        "NMD_transcript_variant", "non_coding_transcript_variant",
        "upstream_gene_variant", "downstream_gene_variant",
        "TFBS_ablation", "TFBS_amplification", "TF_binding_site_variant",
        "regulatory_region_ablation", "regulatory_region_amplification",
        "feature_elongation", "regulatory_region_variant",
        "feature_truncation", "intergenic_variant"]

    # compare the version numerically: as plain strings, '0.9.1' sorts after
    # '0.18.0' and would wrongly be treated as the newer release
    pandas_version = tuple(int(x) for x in pandas.__version__.split(".")[:2])

    if pandas_version < (0, 18):
        # a dataframe with only nonfunctional consequences should raise an
        # error, due to difficulties in counting rows
        for cq in snv:
            self.de_novos["consequence"] = cq
            with self.assertRaises(KeyError):
                get_de_novo_counts(self.de_novos)
    else:
        # newer pandas gives an empty table with the standard columns
        expected = DataFrame(columns=['hgnc', 'chrom', 'start_pos',
            'lof_indel', 'lof_snv', 'missense_indel', 'missense_snv',
            'synonymous_snv'])
        for cq in snv:
            self.de_novos["consequence"] = cq
            computed = get_de_novo_counts(self.de_novos)
            self.compare_tables(computed, expected)

    # add a functional variant to the dataframe, so that the counting
    # doesn't raise an error. DataFrame.append was removed in pandas 2.0, so
    # use pandas.concat, which also exists on old releases.
    new_row = self.de_novos.copy()
    new_row["consequence"] = "missense_variant"
    self.de_novos = pandas.concat([self.de_novos, new_row], ignore_index=True)
    self.expected["missense_snv"] = 1

    # row 0 cycles through the nonfunctional consequences, while row 1 stays
    # as the missense variant
    for cq in snv:
        self.de_novos.loc[0, "consequence"] = cq
        computed = get_de_novo_counts(self.de_novos)
        self.compare_tables(computed, self.expected)
def test_get_de_novo_counts_nonfunctional(self):
    """ check the de novo counts for nonfunctional consequence types

    First every nonfunctional consequence is counted on its own: pandas
    older than 0.18 is expected to raise a KeyError, while newer pandas is
    expected to give an empty table with the standard columns. Then a
    missense variant is added alongside, and only that variant is expected
    to be counted.
    """

    snv = [
        "transcript_ablation", "transcript_amplification",
        "incomplete_terminal_codon_variant", "stop_retained_variant",
        "mature_miRNA_variant",
        "5_prime_UTR_variant", "3_prime_UTR_variant",
        "non_coding_transcript_exon_variant", "intron_variant",
        "NMD_transcript_variant", "non_coding_transcript_variant",
        "upstream_gene_variant", "downstream_gene_variant",
        "TFBS_ablation", "TFBS_amplification", "TF_binding_site_variant",
        "regulatory_region_ablation", "regulatory_region_amplification",
        "feature_elongation", "regulatory_region_variant",
        "feature_truncation", "intergenic_variant",
    ]

    # a plain string comparison against '0.18.0' is lexicographic, so e.g.
    # '0.9.1' would count as newer; compare (major, minor) as integers
    major, minor = (int(x) for x in pandas.__version__.split(".")[:2])
    is_old_pandas = (major, minor) < (0, 18)

    if is_old_pandas:
        # a dataframe with only nonfunctional consequences should raise an
        # error, due to difficulties in counting rows
        for cq in snv:
            self.de_novos["consequence"] = cq
            with self.assertRaises(KeyError):
                get_de_novo_counts(self.de_novos)
    else:
        # otherwise an empty table with the standard columns is expected
        expected = DataFrame(columns=['hgnc', 'chrom', 'start_pos',
            'lof_indel', 'lof_snv', 'missense_indel', 'missense_snv',
            'synonymous_snv'])
        for cq in snv:
            self.de_novos["consequence"] = cq
            computed = get_de_novo_counts(self.de_novos)
            self.compare_tables(computed, expected)

    # add a functional variant to the dataframe, so that the counting
    # doesn't raise an error. pandas.concat replaces DataFrame.append, which
    # was deprecated in pandas 1.4 and removed in 2.0.
    new_row = self.de_novos.copy()
    new_row["consequence"] = "missense_variant"
    self.de_novos = pandas.concat([self.de_novos, new_row], ignore_index=True)
    self.expected["missense_snv"] = 1

    # only row 0 changes; row 1 remains the missense variant
    for cq in snv:
        self.de_novos.loc[0, "consequence"] = cq
        computed = get_de_novo_counts(self.de_novos)
        self.compare_tables(computed, self.expected)